You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/28 14:54:29 UTC

[01/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Repository: mahout
Updated Branches:
  refs/heads/branch-0.14.0 e0573de33 -> 410ed16af


http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java
new file mode 100644
index 0000000..19f78b5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator.sequencefile;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+/**
+ * Supplies some useful and repeatedly-used instances of {@link PathFilter}.
+ */
+public final class PathFilters {
+
+  // Accepts MapReduce output shards ("part-*") while skipping Hadoop ".crc" checksum files.
+  private static final PathFilter PART_FILE_INSTANCE = new PathFilter() {
+    @Override
+    public boolean accept(Path path) {
+      String name = path.getName();
+      return name.startsWith("part-") && !name.endsWith(".crc");
+    }
+  };
+  
+  /**
+   * PathFilter to read the final clustering file (directory names of the form "clusters-N-final").
+   */
+  private static final PathFilter CLUSTER_FINAL = new PathFilter() {
+    @Override
+    public boolean accept(Path path) {
+      String name = path.getName();
+      return name.startsWith("clusters-") && name.endsWith("-final");
+    }
+  };
+
+  // Rejects bookkeeping artifacts: ".crc" checksums, hidden files ("."), and Hadoop
+  // metadata such as "_SUCCESS" or "_logs" ("_"). Everything else is accepted.
+  private static final PathFilter LOGS_CRC_INSTANCE = new PathFilter() {
+    @Override
+    public boolean accept(Path path) {
+      String name = path.getName();
+      return !(name.endsWith(".crc") || name.startsWith(".") || name.startsWith("_"));
+    }
+  };
+
+  // Utility class: static factories only, no instances.
+  private PathFilters() {
+  }
+
+  /**
+   * @return {@link PathFilter} that accepts paths whose file name starts with "part-". Excludes
+   * ".crc" files.
+   */
+  public static PathFilter partFilter() {
+    return PART_FILE_INSTANCE;
+  }
+  
+  /**
+   * @return {@link PathFilter} that accepts paths whose file name starts with "clusters-" and ends with "-final".
+   */
+  public static PathFilter finalPartFilter() {
+    return CLUSTER_FINAL;
+  }
+
+  /**
+   * @return {@link PathFilter} that rejects paths whose file name starts with "_" (e.g. Cloudera
+   * _SUCCESS files or Hadoop _logs), or "." (e.g. local hidden files), or ends with ".crc"
+   */
+  public static PathFilter logsCRCFilter() {
+    return LOGS_CRC_INSTANCE;
+  }
+
+}


[40/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data b/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data
deleted file mode 100644
index 8885375..0000000
--- a/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data
+++ /dev/null
@@ -1,569 +0,0 @@
-842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
-842517,M,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902
-84300903,M,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
-84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
-84358402,M,20.29,14.34,135.1,1297,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575,0.1374,0.205,0.4,0.1625,0.2364,0.07678
-843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
-844359,M,18.25,19.98,119.6,1040,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
-84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,0.5835,1.377,3.856,50.96,0.008805,0.03029,0.02488,0.01448,0.01486,0.005412,17.06,28.14,110.6,897,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
-844981,M,13,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
-84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075
-845636,M,16.02,23.24,102.7,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,0.3795,1.187,2.466,40.51,0.004029,0.009269,0.01101,0.007591,0.0146,0.003042,19.19,33.88,123.8,1150,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452
-84610002,M,15.78,17.89,103.6,781,0.0971,0.1292,0.09954,0.06606,0.1842,0.06082,0.5058,0.9849,3.564,54.16,0.005771,0.04061,0.02791,0.01282,0.02008,0.004144,20.42,27.28,136.5,1299,0.1396,0.5609,0.3965,0.181,0.3792,0.1048
-846226,M,19.17,24.8,132.4,1123,0.0974,0.2458,0.2065,0.1118,0.2397,0.078,0.9555,3.568,11.07,116.2,0.003139,0.08297,0.0889,0.0409,0.04484,0.01284,20.96,29.94,151.7,1332,0.1037,0.3903,0.3639,0.1767,0.3176,0.1023
-846381,M,15.85,23.95,103.7,782.7,0.08401,0.1002,0.09938,0.05364,0.1847,0.05338,0.4033,1.078,2.903,36.58,0.009769,0.03126,0.05051,0.01992,0.02981,0.003002,16.84,27.66,112,876.5,0.1131,0.1924,0.2322,0.1119,0.2809,0.06287
-84667401,M,13.73,22.61,93.6,578.3,0.1131,0.2293,0.2128,0.08025,0.2069,0.07682,0.2121,1.169,2.061,19.21,0.006429,0.05936,0.05501,0.01628,0.01961,0.008093,15.03,32.01,108.8,697.7,0.1651,0.7725,0.6943,0.2208,0.3596,0.1431
-84799002,M,14.54,27.54,96.73,658.8,0.1139,0.1595,0.1639,0.07364,0.2303,0.07077,0.37,1.033,2.879,32.55,0.005607,0.0424,0.04741,0.0109,0.01857,0.005466,17.46,37.13,124.1,943.2,0.1678,0.6577,0.7026,0.1712,0.4218,0.1341
-848406,M,14.68,20.13,94.74,684.5,0.09867,0.072,0.07395,0.05259,0.1586,0.05922,0.4727,1.24,3.195,45.4,0.005718,0.01162,0.01998,0.01109,0.0141,0.002085,19.07,30.88,123.4,1138,0.1464,0.1871,0.2914,0.1609,0.3029,0.08216
-84862001,M,16.13,20.68,108.1,798.8,0.117,0.2022,0.1722,0.1028,0.2164,0.07356,0.5692,1.073,3.854,54.18,0.007026,0.02501,0.03188,0.01297,0.01689,0.004142,20.96,31.48,136.8,1315,0.1789,0.4233,0.4784,0.2073,0.3706,0.1142
-849014,M,19.81,22.15,130,1260,0.09831,0.1027,0.1479,0.09498,0.1582,0.05395,0.7582,1.017,5.865,112.4,0.006494,0.01893,0.03391,0.01521,0.01356,0.001997,27.32,30.88,186.8,2398,0.1512,0.315,0.5372,0.2388,0.2768,0.07615
-8510426,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259
-8510653,B,13.08,15.71,85.63,520,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,0.1852,0.7477,1.383,14.67,0.004097,0.01898,0.01698,0.00649,0.01678,0.002425,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
-8510824,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,0.06905,0.2773,0.9768,1.909,15.7,0.009606,0.01432,0.01985,0.01421,0.02027,0.002968,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773
-8511133,M,15.34,14.26,102.5,704.4,0.1073,0.2135,0.2077,0.09756,0.2521,0.07032,0.4388,0.7096,3.384,44.91,0.006789,0.05328,0.06446,0.02252,0.03672,0.004394,18.07,19.08,125.1,980.9,0.139,0.5954,0.6305,0.2393,0.4667,0.09946
-851509,M,21.16,23.04,137.2,1404,0.09428,0.1022,0.1097,0.08632,0.1769,0.05278,0.6917,1.127,4.303,93.99,0.004728,0.01259,0.01715,0.01038,0.01083,0.001987,29.17,35.59,188,2615,0.1401,0.26,0.3155,0.2009,0.2822,0.07526
-852552,M,16.65,21.38,110,904.6,0.1121,0.1457,0.1525,0.0917,0.1995,0.0633,0.8068,0.9017,5.455,102.6,0.006048,0.01882,0.02741,0.0113,0.01468,0.002801,26.46,31.56,177,2215,0.1805,0.3578,0.4695,0.2095,0.3613,0.09564
-852631,M,17.14,16.4,116,912.7,0.1186,0.2276,0.2229,0.1401,0.304,0.07413,1.046,0.976,7.276,111.4,0.008029,0.03799,0.03732,0.02397,0.02308,0.007444,22.25,21.4,152.4,1461,0.1545,0.3949,0.3853,0.255,0.4066,0.1059
-852763,M,14.58,21.53,97.41,644.8,0.1054,0.1868,0.1425,0.08783,0.2252,0.06924,0.2545,0.9832,2.11,21.05,0.004452,0.03055,0.02681,0.01352,0.01454,0.003711,17.62,33.21,122.4,896.9,0.1525,0.6643,0.5539,0.2701,0.4264,0.1275
-852781,M,18.61,20.25,122.1,1094,0.0944,0.1066,0.149,0.07731,0.1697,0.05699,0.8529,1.849,5.632,93.54,0.01075,0.02722,0.05081,0.01911,0.02293,0.004217,21.31,27.26,139.9,1403,0.1338,0.2117,0.3446,0.149,0.2341,0.07421
-852973,M,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,0.439,1.012,3.498,43.5,0.005233,0.03057,0.03576,0.01083,0.01768,0.002967,20.27,36.71,149.3,1269,0.1641,0.611,0.6335,0.2024,0.4027,0.09876
-853201,M,17.57,15.05,115,955.1,0.09847,0.1157,0.09875,0.07953,0.1739,0.06149,0.6003,0.8225,4.655,61.1,0.005627,0.03033,0.03407,0.01354,0.01925,0.003742,20.01,19.52,134.9,1227,0.1255,0.2812,0.2489,0.1456,0.2756,0.07919
-853401,M,18.63,25.11,124.8,1088,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197,0.8307,1.466,5.574,105,0.006248,0.03374,0.05196,0.01158,0.02007,0.00456,23.15,34.01,160.5,1670,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782
-853612,M,11.84,18.7,77.93,440.6,0.1109,0.1516,0.1218,0.05182,0.2301,0.07799,0.4825,1.03,3.475,41,0.005551,0.03414,0.04205,0.01044,0.02273,0.005667,16.82,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.1402
-85382601,M,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,0.6009,1.398,3.999,67.78,0.008268,0.03082,0.05042,0.01112,0.02102,0.003854,20.88,32.09,136.1,1344,0.1634,0.3559,0.5588,0.1847,0.353,0.08482
-854002,M,19.27,26.47,127.9,1162,0.09401,0.1719,0.1657,0.07593,0.1853,0.06261,0.5558,0.6062,3.528,68.17,0.005015,0.03318,0.03497,0.009643,0.01543,0.003896,24.15,30.9,161.4,1813,0.1509,0.659,0.6091,0.1785,0.3672,0.1123
-854039,M,16.13,17.88,107,807.2,0.104,0.1559,0.1354,0.07752,0.1998,0.06515,0.334,0.6857,2.183,35.03,0.004185,0.02868,0.02664,0.009067,0.01703,0.003817,20.21,27.26,132.7,1261,0.1446,0.5804,0.5274,0.1864,0.427,0.1233
-854253,M,16.74,21.59,110.1,869.5,0.0961,0.1336,0.1348,0.06018,0.1896,0.05656,0.4615,0.9197,3.008,45.19,0.005776,0.02499,0.03695,0.01195,0.02789,0.002665,20.01,29.02,133.5,1229,0.1563,0.3835,0.5409,0.1813,0.4863,0.08633
-854268,M,14.25,21.72,93.63,633,0.09823,0.1098,0.1319,0.05598,0.1885,0.06125,0.286,1.019,2.657,24.91,0.005878,0.02995,0.04815,0.01161,0.02028,0.004022,15.89,30.36,116.2,799.6,0.1446,0.4238,0.5186,0.1447,0.3591,0.1014
-854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,0.1839,2.342,1.17,14.16,0.004352,0.004899,0.01343,0.01164,0.02671,0.001777,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
-855133,M,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504,1.214,2.188,8.077,106,0.006883,0.01094,0.01818,0.01917,0.007882,0.001754,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504
-855138,M,13.48,20.82,88.4,559.2,0.1016,0.1255,0.1063,0.05439,0.172,0.06419,0.213,0.5914,1.545,18.52,0.005367,0.02239,0.03049,0.01262,0.01377,0.003187,15.53,26.02,107.3,740.4,0.161,0.4225,0.503,0.2258,0.2807,0.1071
-855167,M,13.44,21.58,86.18,563,0.08162,0.06031,0.0311,0.02031,0.1784,0.05587,0.2385,0.8265,1.572,20.53,0.00328,0.01102,0.0139,0.006881,0.0138,0.001286,15.93,30.25,102.5,787.9,0.1094,0.2043,0.2085,0.1112,0.2994,0.07146
-855563,M,10.95,21.35,71.9,371.1,0.1227,0.1218,0.1044,0.05669,0.1895,0.0687,0.2366,1.428,1.822,16.97,0.008064,0.01764,0.02595,0.01037,0.01357,0.00304,12.84,35.34,87.22,514,0.1909,0.2698,0.4023,0.1424,0.2964,0.09606
-855625,M,19.07,24.81,128.3,1104,0.09081,0.219,0.2107,0.09961,0.231,0.06343,0.9811,1.666,8.83,104.9,0.006548,0.1006,0.09723,0.02638,0.05333,0.007646,24.09,33.17,177.4,1651,0.1247,0.7444,0.7242,0.2493,0.467,0.1038
-856106,M,13.28,20.28,87.32,545.2,0.1041,0.1436,0.09847,0.06158,0.1974,0.06782,0.3704,0.8249,2.427,31.33,0.005072,0.02147,0.02185,0.00956,0.01719,0.003317,17.38,28,113.1,907.2,0.153,0.3724,0.3664,0.1492,0.3739,0.1027
-85638502,M,13.17,21.81,85.42,531.5,0.09714,0.1047,0.08259,0.05252,0.1746,0.06177,0.1938,0.6123,1.334,14.49,0.00335,0.01384,0.01452,0.006853,0.01113,0.00172,16.23,29.89,105.5,740.7,0.1503,0.3904,0.3728,0.1607,0.3693,0.09618
-857010,M,18.65,17.6,123.7,1076,0.1099,0.1686,0.1974,0.1009,0.1907,0.06049,0.6289,0.6633,4.293,71.56,0.006294,0.03994,0.05554,0.01695,0.02428,0.003535,22.82,21.32,150.6,1567,0.1679,0.509,0.7345,0.2378,0.3799,0.09185
-85713702,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,0.06503,0.1563,0.9567,1.094,8.205,0.008968,0.01646,0.01588,0.005917,0.02574,0.002582,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409
-85715,M,13.17,18.66,85.98,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,0.2871,0.8937,1.897,24.25,0.006532,0.02336,0.02905,0.01215,0.01743,0.003643,15.67,27.95,102.8,759.4,0.1786,0.4166,0.5006,0.2088,0.39,0.1179
-857155,B,12.05,14.63,78.04,449.3,0.1031,0.09092,0.06592,0.02749,0.1675,0.06043,0.2636,0.7294,1.848,19.87,0.005488,0.01427,0.02322,0.00566,0.01428,0.002422,13.76,20.7,89.88,582.6,0.1494,0.2156,0.305,0.06548,0.2747,0.08301
-857156,B,13.49,22.3,86.91,561,0.08752,0.07698,0.04751,0.03384,0.1809,0.05718,0.2338,1.353,1.735,20.2,0.004455,0.01382,0.02095,0.01184,0.01641,0.001956,15.15,31.82,99,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917
-857343,B,11.76,21.6,74.72,427.9,0.08637,0.04966,0.01657,0.01115,0.1495,0.05888,0.4062,1.21,2.635,28.47,0.005857,0.009758,0.01168,0.007445,0.02406,0.001769,12.98,25.72,82.98,516.5,0.1085,0.08615,0.05523,0.03715,0.2433,0.06563
-857373,B,13.64,16.34,87.21,571.8,0.07685,0.06059,0.01857,0.01723,0.1353,0.05953,0.1872,0.9234,1.449,14.55,0.004477,0.01177,0.01079,0.007956,0.01325,0.002551,14.67,23.19,96.08,656.7,0.1089,0.1582,0.105,0.08586,0.2346,0.08025
-857374,B,11.94,18.24,75.71,437.6,0.08261,0.04751,0.01972,0.01349,0.1868,0.0611,0.2273,0.6329,1.52,17.47,0.00721,0.00838,0.01311,0.008,0.01996,0.002635,13.1,21.33,83.67,527.2,0.1144,0.08906,0.09203,0.06296,0.2785,0.07408
-857392,M,18.22,18.7,120.3,1033,0.1148,0.1485,0.1772,0.106,0.2092,0.0631,0.8337,1.593,4.877,98.81,0.003899,0.02961,0.02817,0.009222,0.02674,0.005126,20.6,24.13,135.1,1321,0.128,0.2297,0.2623,0.1325,0.3021,0.07987
-857438,M,15.1,22.02,97.26,712.8,0.09056,0.07081,0.05253,0.03334,0.1616,0.05684,0.3105,0.8339,2.097,29.91,0.004675,0.0103,0.01603,0.009222,0.01095,0.001629,18.1,31.69,117.7,1030,0.1389,0.2057,0.2712,0.153,0.2675,0.07873
-85759902,B,11.52,18.75,73.34,409,0.09524,0.05473,0.03036,0.02278,0.192,0.05907,0.3249,0.9591,2.183,23.47,0.008328,0.008722,0.01349,0.00867,0.03218,0.002386,12.84,22.47,81.81,506.2,0.1249,0.0872,0.09076,0.06316,0.3306,0.07036
-857637,M,19.21,18.57,125.5,1152,0.1053,0.1267,0.1323,0.08994,0.1917,0.05961,0.7275,1.193,4.837,102.5,0.006458,0.02306,0.02945,0.01538,0.01852,0.002608,26.14,28.14,170.1,2145,0.1624,0.3511,0.3879,0.2091,0.3537,0.08294
-857793,M,14.71,21.59,95.55,656.9,0.1137,0.1365,0.1293,0.08123,0.2027,0.06758,0.4226,1.15,2.735,40.09,0.003659,0.02855,0.02572,0.01272,0.01817,0.004108,17.87,30.7,115.7,985.5,0.1368,0.429,0.3587,0.1834,0.3698,0.1094
-857810,B,13.05,19.31,82.61,527.2,0.0806,0.03789,0.000692,0.004167,0.1819,0.05501,0.404,1.214,2.595,32.96,0.007491,0.008593,0.000692,0.004167,0.0219,0.00299,14.23,22.25,90.24,624.1,0.1021,0.06191,0.001845,0.01111,0.2439,0.06289
-858477,B,8.618,11.79,54.34,224.5,0.09752,0.05272,0.02061,0.007799,0.1683,0.07187,0.1559,0.5796,1.046,8.322,0.01011,0.01055,0.01981,0.005742,0.0209,0.002788,9.507,15.4,59.9,274.9,0.1733,0.1239,0.1168,0.04419,0.322,0.09026
-858970,B,10.17,14.88,64.55,311.9,0.1134,0.08061,0.01084,0.0129,0.2743,0.0696,0.5158,1.441,3.312,34.62,0.007514,0.01099,0.007665,0.008193,0.04183,0.005953,11.02,17.45,69.86,368.6,0.1275,0.09866,0.02168,0.02579,0.3557,0.0802
-858981,B,8.598,20.98,54.66,221.8,0.1243,0.08963,0.03,0.009259,0.1828,0.06757,0.3582,2.067,2.493,18.39,0.01193,0.03162,0.03,0.009259,0.03357,0.003048,9.565,27.04,62.06,273.9,0.1639,0.1698,0.09001,0.02778,0.2972,0.07712
-858986,M,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,0.07292,0.7036,1.268,5.373,60.78,0.009407,0.07056,0.06899,0.01848,0.017,0.006113,17.67,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132
-859196,B,9.173,13.86,59.2,260.9,0.07721,0.08751,0.05988,0.0218,0.2341,0.06963,0.4098,2.265,2.608,23.52,0.008738,0.03938,0.04312,0.0156,0.04192,0.005822,10.01,19.23,65.59,310.1,0.09836,0.1678,0.1397,0.05087,0.3282,0.0849
-85922302,M,12.68,23.84,82.69,499,0.1122,0.1262,0.1128,0.06873,0.1905,0.0659,0.4255,1.178,2.927,36.46,0.007781,0.02648,0.02973,0.0129,0.01635,0.003601,17.09,33.47,111.8,888.3,0.1851,0.4061,0.4024,0.1716,0.3383,0.1031
-859283,M,14.78,23.94,97.4,668.3,0.1172,0.1479,0.1267,0.09029,0.1953,0.06654,0.3577,1.281,2.45,35.24,0.006703,0.0231,0.02315,0.01184,0.019,0.003224,17.31,33.39,114.6,925.1,0.1648,0.3416,0.3024,0.1614,0.3321,0.08911
-859464,B,9.465,21.01,60.11,269.4,0.1044,0.07773,0.02172,0.01504,0.1717,0.06899,0.2351,2.011,1.66,14.2,0.01052,0.01755,0.01714,0.009333,0.02279,0.004237,10.41,31.56,67.03,330.7,0.1548,0.1664,0.09412,0.06517,0.2878,0.09211
-859465,B,11.31,19.04,71.8,394.1,0.08139,0.04701,0.03709,0.0223,0.1516,0.05667,0.2727,0.9429,1.831,18.15,0.009282,0.009216,0.02063,0.008965,0.02183,0.002146,12.33,23.84,78,466.7,0.129,0.09148,0.1444,0.06961,0.24,0.06641
-859471,B,9.029,17.33,58.79,250.5,0.1066,0.1413,0.313,0.04375,0.2111,0.08046,0.3274,1.194,1.885,17.67,0.009549,0.08606,0.3038,0.03322,0.04197,0.009559,10.31,22.65,65.5,324.7,0.1482,0.4365,1.252,0.175,0.4228,0.1175
-859487,B,12.78,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.159,0.05653,0.2368,0.8732,1.471,18.33,0.007962,0.005612,0.01585,0.008662,0.02254,0.001906,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.0641
-859575,M,18.94,21.31,123.6,1130,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,0.7888,0.7975,5.486,96.05,0.004444,0.01652,0.02269,0.0137,0.01386,0.001698,24.86,26.58,165.9,1866,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589
-859711,B,8.888,14.64,58.79,244,0.09783,0.1531,0.08606,0.02872,0.1902,0.0898,0.5262,0.8522,3.168,25.44,0.01721,0.09368,0.05671,0.01766,0.02541,0.02193,9.733,15.67,62.56,284.4,0.1207,0.2436,0.1434,0.04786,0.2254,0.1084
-859717,M,17.2,24.52,114.2,929.4,0.1071,0.183,0.1692,0.07944,0.1927,0.06487,0.5907,1.041,3.705,69.47,0.00582,0.05616,0.04252,0.01127,0.01527,0.006299,23.32,33.82,151.6,1681,0.1585,0.7394,0.6566,0.1899,0.3313,0.1339
-859983,M,13.8,15.79,90.43,584.1,0.1007,0.128,0.07789,0.05069,0.1662,0.06566,0.2787,0.6205,1.957,23.35,0.004717,0.02065,0.01759,0.009206,0.0122,0.00313,16.57,20.86,110.3,812.4,0.1411,0.3542,0.2779,0.1383,0.2589,0.103
-8610175,B,12.31,16.52,79.19,470.9,0.09172,0.06829,0.03372,0.02272,0.172,0.05914,0.2505,1.025,1.74,19.68,0.004854,0.01819,0.01826,0.007965,0.01386,0.002304,14.11,23.21,89.71,611.1,0.1176,0.1843,0.1703,0.0866,0.2618,0.07609
-8610404,M,16.07,19.65,104.1,817.7,0.09168,0.08424,0.09769,0.06638,0.1798,0.05391,0.7474,1.016,5.029,79.25,0.01082,0.02203,0.035,0.01809,0.0155,0.001948,19.77,24.56,128.8,1223,0.15,0.2045,0.2829,0.152,0.265,0.06387
-8610629,B,13.53,10.94,87.91,559.2,0.1291,0.1047,0.06877,0.06556,0.2403,0.06641,0.4101,1.014,2.652,32.65,0.0134,0.02839,0.01162,0.008239,0.02572,0.006164,14.08,12.49,91.36,605.5,0.1451,0.1379,0.08539,0.07407,0.271,0.07191
-8610637,M,18.05,16.15,120.2,1006,0.1065,0.2146,0.1684,0.108,0.2152,0.06673,0.9806,0.5505,6.311,134.8,0.00794,0.05839,0.04658,0.0207,0.02591,0.007054,22.39,18.91,150.1,1610,0.1478,0.5634,0.3786,0.2102,0.3751,0.1108
-8610862,M,20.18,23.97,143.7,1245,0.1286,0.3454,0.3754,0.1604,0.2906,0.08142,0.9317,1.885,8.649,116.4,0.01038,0.06835,0.1091,0.02593,0.07895,0.005987,23.37,31.72,170.3,1623,0.1639,0.6164,0.7681,0.2508,0.544,0.09964
-8610908,B,12.86,18,83.19,506.3,0.09934,0.09546,0.03889,0.02315,0.1718,0.05997,0.2655,1.095,1.778,20.35,0.005293,0.01661,0.02071,0.008179,0.01748,0.002848,14.24,24.82,91.88,622.1,0.1289,0.2141,0.1731,0.07926,0.2779,0.07918
-861103,B,11.45,20.97,73.81,401.5,0.1102,0.09362,0.04591,0.02233,0.1842,0.07005,0.3251,2.174,2.077,24.62,0.01037,0.01706,0.02586,0.007506,0.01816,0.003976,13.11,32.16,84.53,525.1,0.1557,0.1676,0.1755,0.06127,0.2762,0.08851
-8611161,B,13.34,15.86,86.49,520,0.1078,0.1535,0.1169,0.06987,0.1942,0.06902,0.286,1.016,1.535,12.96,0.006794,0.03575,0.0398,0.01383,0.02134,0.004603,15.53,23.19,96.66,614.9,0.1536,0.4791,0.4858,0.1708,0.3527,0.1016
-8611555,M,25.22,24.91,171.5,1878,0.1063,0.2665,0.3339,0.1845,0.1829,0.06782,0.8973,1.474,7.382,120,0.008166,0.05693,0.0573,0.0203,0.01065,0.005893,30,33.62,211.7,2562,0.1573,0.6076,0.6476,0.2867,0.2355,0.1051
-8611792,M,19.1,26.29,129.1,1132,0.1215,0.1791,0.1937,0.1469,0.1634,0.07224,0.519,2.91,5.801,67.1,0.007545,0.0605,0.02134,0.01843,0.03056,0.01039,20.33,32.72,141.3,1298,0.1392,0.2817,0.2432,0.1841,0.2311,0.09203
-8612080,B,12,15.65,76.95,443.3,0.09723,0.07165,0.04151,0.01863,0.2079,0.05968,0.2271,1.255,1.441,16.16,0.005969,0.01812,0.02007,0.007027,0.01972,0.002607,13.67,24.9,87.78,567.9,0.1377,0.2003,0.2267,0.07632,0.3379,0.07924
-8612399,M,18.46,18.52,121.1,1075,0.09874,0.1053,0.1335,0.08795,0.2132,0.06022,0.6997,1.475,4.782,80.6,0.006471,0.01649,0.02806,0.0142,0.0237,0.003755,22.93,27.68,152.2,1603,0.1398,0.2089,0.3157,0.1642,0.3695,0.08579
-86135501,M,14.48,21.46,94.25,648.2,0.09444,0.09947,0.1204,0.04938,0.2075,0.05636,0.4204,2.22,3.301,38.87,0.009369,0.02983,0.05371,0.01761,0.02418,0.003249,16.21,29.25,108.4,808.9,0.1306,0.1976,0.3349,0.1225,0.302,0.06846
-86135502,M,19.02,24.59,122,1076,0.09029,0.1206,0.1468,0.08271,0.1953,0.05629,0.5495,0.6636,3.055,57.65,0.003872,0.01842,0.0371,0.012,0.01964,0.003337,24.56,30.41,152.9,1623,0.1249,0.3206,0.5755,0.1956,0.3956,0.09288
-861597,B,12.36,21.8,79.78,466.1,0.08772,0.09445,0.06015,0.03745,0.193,0.06404,0.2978,1.502,2.203,20.95,0.007112,0.02493,0.02703,0.01293,0.01958,0.004463,13.83,30.5,91.46,574.7,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261
-861598,B,14.64,15.24,95.77,651.9,0.1132,0.1339,0.09966,0.07064,0.2116,0.06346,0.5115,0.7372,3.814,42.76,0.005508,0.04412,0.04436,0.01623,0.02427,0.004841,16.34,18.24,109.4,803.6,0.1277,0.3089,0.2604,0.1397,0.3151,0.08473
-861648,B,14.62,24.02,94.57,662.7,0.08974,0.08606,0.03102,0.02957,0.1685,0.05866,0.3721,1.111,2.279,33.76,0.004868,0.01818,0.01121,0.008606,0.02085,0.002893,16.11,29.11,102.9,803.7,0.1115,0.1766,0.09189,0.06946,0.2522,0.07246
-861799,M,15.37,22.76,100.2,728.2,0.092,0.1036,0.1122,0.07483,0.1717,0.06097,0.3129,0.8413,2.075,29.44,0.009882,0.02444,0.04531,0.01763,0.02471,0.002142,16.43,25.84,107.5,830.9,0.1257,0.1997,0.2846,0.1476,0.2556,0.06828
-861853,B,13.27,14.76,84.74,551.7,0.07355,0.05055,0.03261,0.02648,0.1386,0.05318,0.4057,1.153,2.701,36.35,0.004481,0.01038,0.01358,0.01082,0.01069,0.001435,16.36,22.35,104.5,830.6,0.1006,0.1238,0.135,0.1001,0.2027,0.06206
-862009,B,13.45,18.3,86.6,555.1,0.1022,0.08165,0.03974,0.0278,0.1638,0.0571,0.295,1.373,2.099,25.22,0.005884,0.01491,0.01872,0.009366,0.01884,0.001817,15.1,25.94,97.59,699.4,0.1339,0.1751,0.1381,0.07911,0.2678,0.06603
-862028,M,15.06,19.83,100.3,705.6,0.1039,0.1553,0.17,0.08815,0.1855,0.06284,0.4768,0.9644,3.706,47.14,0.00925,0.03715,0.04867,0.01851,0.01498,0.00352,18.23,24.23,123.5,1025,0.1551,0.4203,0.5203,0.2115,0.2834,0.08234
-86208,M,20.26,23.03,132.4,1264,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,0.7576,1.509,4.554,87.87,0.006016,0.03482,0.04232,0.01269,0.02657,0.004411,24.22,31.59,156.1,1750,0.119,0.3539,0.4098,0.1573,0.3689,0.08368
-86211,B,12.18,17.84,77.79,451.1,0.1045,0.07057,0.0249,0.02941,0.19,0.06635,0.3661,1.511,2.41,24.44,0.005433,0.01179,0.01131,0.01519,0.0222,0.003408,12.83,20.92,82.14,495.2,0.114,0.09358,0.0498,0.05882,0.2227,0.07376
-862261,B,9.787,19.94,62.11,294.5,0.1024,0.05301,0.006829,0.007937,0.135,0.0689,0.335,2.043,2.132,20.05,0.01113,0.01463,0.005308,0.00525,0.01801,0.005667,10.92,26.29,68.81,366.1,0.1316,0.09473,0.02049,0.02381,0.1934,0.08988
-862485,B,11.6,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.0335,0.162,0.06582,0.2315,0.5391,1.475,15.75,0.006153,0.0133,0.01693,0.006884,0.01651,0.002551,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756
-862548,M,14.42,19.77,94.48,642.5,0.09752,0.1141,0.09388,0.05839,0.1879,0.0639,0.2895,1.851,2.376,26.85,0.008005,0.02895,0.03321,0.01424,0.01462,0.004452,16.33,30.86,109.5,826.4,0.1431,0.3026,0.3194,0.1565,0.2718,0.09353
-862717,M,13.61,24.98,88.05,582.7,0.09488,0.08511,0.08625,0.04489,0.1609,0.05871,0.4565,1.29,2.861,43.14,0.005872,0.01488,0.02647,0.009921,0.01465,0.002355,16.99,35.27,108.6,906.5,0.1265,0.1943,0.3169,0.1184,0.2651,0.07397
-862722,B,6.981,13.43,43.79,143.5,0.117,0.07568,0,0,0.193,0.07818,0.2241,1.508,1.553,9.833,0.01019,0.01084,0,0,0.02659,0.0041,7.93,19.54,50.41,185.2,0.1584,0.1202,0,0,0.2932,0.09382
-862965,B,12.18,20.52,77.22,458.7,0.08013,0.04038,0.02383,0.0177,0.1739,0.05677,0.1924,1.571,1.183,14.68,0.00508,0.006098,0.01069,0.006797,0.01447,0.001532,13.34,32.84,84.58,547.8,0.1123,0.08862,0.1145,0.07431,0.2694,0.06878
-862980,B,9.876,19.4,63.95,298.3,0.1005,0.09697,0.06154,0.03029,0.1945,0.06322,0.1803,1.222,1.528,11.77,0.009058,0.02196,0.03029,0.01112,0.01609,0.00357,10.76,26.83,72.22,361.2,0.1559,0.2302,0.2644,0.09749,0.2622,0.0849
-862989,B,10.49,19.29,67.41,336.1,0.09989,0.08578,0.02995,0.01201,0.2217,0.06481,0.355,1.534,2.302,23.13,0.007595,0.02219,0.0288,0.008614,0.0271,0.003451,11.54,23.31,74.22,402.8,0.1219,0.1486,0.07987,0.03203,0.2826,0.07552
-863030,M,13.11,15.56,87.21,530.2,0.1398,0.1765,0.2071,0.09601,0.1925,0.07692,0.3908,0.9238,2.41,34.66,0.007162,0.02912,0.05473,0.01388,0.01547,0.007098,16.31,22.4,106.4,827.2,0.1862,0.4099,0.6376,0.1986,0.3147,0.1405
-863031,B,11.64,18.33,75.17,412.5,0.1142,0.1017,0.0707,0.03485,0.1801,0.0652,0.306,1.657,2.155,20.62,0.00854,0.0231,0.02945,0.01398,0.01565,0.00384,13.14,29.26,85.51,521.7,0.1688,0.266,0.2873,0.1218,0.2806,0.09097
-863270,B,12.36,18.54,79.01,466.7,0.08477,0.06815,0.02643,0.01921,0.1602,0.06066,0.1199,0.8944,0.8484,9.227,0.003457,0.01047,0.01167,0.005558,0.01251,0.001356,13.29,27.49,85.56,544.1,0.1184,0.1963,0.1937,0.08442,0.2983,0.07185
-86355,M,22.27,19.67,152.8,1509,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,1.215,1.545,10.05,170,0.006515,0.08668,0.104,0.0248,0.03112,0.005037,28.4,28.01,206.8,2360,0.1701,0.6997,0.9608,0.291,0.4055,0.09789
-864018,B,11.34,21.26,72.48,396.5,0.08759,0.06575,0.05133,0.01899,0.1487,0.06529,0.2344,0.9861,1.597,16.41,0.009113,0.01557,0.02443,0.006435,0.01568,0.002477,13.01,29.15,83.99,518.1,0.1699,0.2196,0.312,0.08278,0.2829,0.08832
-864033,B,9.777,16.99,62.5,290.2,0.1037,0.08404,0.04334,0.01778,0.1584,0.07065,0.403,1.424,2.747,22.87,0.01385,0.02932,0.02722,0.01023,0.03281,0.004638,11.05,21.47,71.68,367,0.1467,0.1765,0.13,0.05334,0.2533,0.08468
-86408,B,12.63,20.76,82.15,480.4,0.09933,0.1209,0.1065,0.06021,0.1735,0.0707,0.3424,1.803,2.711,20.48,0.01291,0.04042,0.05101,0.02295,0.02144,0.005891,13.33,25.47,89,527.4,0.1287,0.225,0.2216,0.1105,0.2226,0.08486
-86409,B,14.26,19.65,97.83,629.9,0.07837,0.2233,0.3003,0.07798,0.1704,0.07769,0.3628,1.49,3.399,29.25,0.005298,0.07446,0.1435,0.02292,0.02566,0.01298,15.3,23.73,107,709,0.08949,0.4193,0.6783,0.1505,0.2398,0.1082
-864292,B,10.51,20.19,68.64,334.2,0.1122,0.1303,0.06476,0.03068,0.1922,0.07782,0.3336,1.86,2.041,19.91,0.01188,0.03747,0.04591,0.01544,0.02287,0.006792,11.16,22.75,72.62,374.4,0.13,0.2049,0.1295,0.06136,0.2383,0.09026
-864496,B,8.726,15.83,55.84,230.9,0.115,0.08201,0.04132,0.01924,0.1649,0.07633,0.1665,0.5864,1.354,8.966,0.008261,0.02213,0.03259,0.0104,0.01708,0.003806,9.628,19.62,64.48,284.4,0.1724,0.2364,0.2456,0.105,0.2926,0.1017
-864685,B,11.93,21.53,76.53,438.6,0.09768,0.07849,0.03328,0.02008,0.1688,0.06194,0.3118,0.9227,2,24.79,0.007803,0.02507,0.01835,0.007711,0.01278,0.003856,13.67,26.15,87.54,583,0.15,0.2399,0.1503,0.07247,0.2438,0.08541
-864726,B,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,0.3132,0.9789,3.28,16.94,0.01835,0.0676,0.09263,0.02308,0.02384,0.005601,9.414,17.07,63.34,270,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722
-864729,M,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,0.4266,0.9489,2.989,41.18,0.006985,0.02563,0.03011,0.01271,0.01602,0.003884,18.81,27.37,127.1,1095,0.1878,0.448,0.4704,0.2027,0.3585,0.1065
-864877,M,15.78,22.91,105.7,782.6,0.1155,0.1752,0.2133,0.09479,0.2096,0.07331,0.552,1.072,3.598,58.63,0.008699,0.03976,0.0595,0.0139,0.01495,0.005984,20.19,30.5,130.3,1272,0.1855,0.4925,0.7356,0.2034,0.3274,0.1252
-865128,M,17.95,20.01,114.2,982,0.08402,0.06722,0.07293,0.05596,0.2129,0.05025,0.5506,1.214,3.357,54.04,0.004024,0.008422,0.02291,0.009863,0.05014,0.001902,20.58,27.83,129.2,1261,0.1072,0.1202,0.2249,0.1185,0.4882,0.06111
-865137,B,11.41,10.82,73.34,403.3,0.09373,0.06685,0.03512,0.02623,0.1667,0.06113,0.1408,0.4607,1.103,10.5,0.00604,0.01529,0.01514,0.00646,0.01344,0.002206,12.82,15.97,83.74,510.5,0.1548,0.239,0.2102,0.08958,0.3016,0.08523
-86517,M,18.66,17.12,121.4,1077,0.1054,0.11,0.1457,0.08665,0.1966,0.06213,0.7128,1.581,4.895,90.47,0.008102,0.02101,0.03342,0.01601,0.02045,0.00457,22.25,24.9,145.4,1549,0.1503,0.2291,0.3272,0.1674,0.2894,0.08456
-865423,M,24.25,20.2,166.2,1761,0.1447,0.2867,0.4268,0.2012,0.2655,0.06877,1.509,3.12,9.807,233,0.02333,0.09806,0.1278,0.01822,0.04547,0.009875,26.02,23.99,180.9,2073,0.1696,0.4244,0.5803,0.2248,0.3222,0.08009
-865432,B,14.5,10.89,94.28,640.7,0.1101,0.1099,0.08842,0.05778,0.1856,0.06402,0.2929,0.857,1.928,24.19,0.003818,0.01276,0.02882,0.012,0.0191,0.002808,15.7,15.98,102.8,745.5,0.1313,0.1788,0.256,0.1221,0.2889,0.08006
-865468,B,13.37,16.39,86.1,553.5,0.07115,0.07325,0.08092,0.028,0.1422,0.05823,0.1639,1.14,1.223,14.66,0.005919,0.0327,0.04957,0.01038,0.01208,0.004076,14.26,22.75,91.99,632.1,0.1025,0.2531,0.3308,0.08978,0.2048,0.07628
-86561,B,13.85,17.21,88.44,588.7,0.08785,0.06136,0.0142,0.01141,0.1614,0.0589,0.2185,0.8561,1.495,17.91,0.004599,0.009169,0.009127,0.004814,0.01247,0.001708,15.49,23.58,100.3,725.9,0.1157,0.135,0.08115,0.05104,0.2364,0.07182
-866083,M,13.61,24.69,87.76,572.6,0.09258,0.07862,0.05285,0.03085,0.1761,0.0613,0.231,1.005,1.752,19.83,0.004088,0.01174,0.01796,0.00688,0.01323,0.001465,16.89,35.64,113.2,848.7,0.1471,0.2884,0.3796,0.1329,0.347,0.079
-866203,M,19,18.91,123.4,1138,0.08217,0.08028,0.09271,0.05627,0.1946,0.05044,0.6896,1.342,5.216,81.23,0.004428,0.02731,0.0404,0.01361,0.0203,0.002686,22.32,25.73,148.2,1538,0.1021,0.2264,0.3207,0.1218,0.2841,0.06541
-866458,B,15.1,16.39,99.58,674.5,0.115,0.1807,0.1138,0.08534,0.2001,0.06467,0.4309,1.068,2.796,39.84,0.009006,0.04185,0.03204,0.02258,0.02353,0.004984,16.11,18.33,105.9,762.6,0.1386,0.2883,0.196,0.1423,0.259,0.07779
-866674,M,19.79,25.12,130.4,1192,0.1015,0.1589,0.2545,0.1149,0.2202,0.06113,0.4953,1.199,2.765,63.33,0.005033,0.03179,0.04755,0.01043,0.01578,0.003224,22.63,33.58,148.7,1589,0.1275,0.3861,0.5673,0.1732,0.3305,0.08465
-866714,B,12.19,13.29,79.08,455.8,0.1066,0.09509,0.02855,0.02882,0.188,0.06471,0.2005,0.8163,1.973,15.24,0.006773,0.02456,0.01018,0.008094,0.02662,0.004143,13.34,17.81,91.38,545.2,0.1427,0.2585,0.09915,0.08187,0.3469,0.09241
-8670,M,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,0.4743,0.7859,3.094,48.31,0.00624,0.01484,0.02813,0.01093,0.01397,0.002461,19.26,26,124.9,1156,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019
-86730502,M,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,0.4332,1.265,2.844,43.68,0.004877,0.01952,0.02219,0.009231,0.01535,0.002373,19.47,31.68,129.7,1175,0.1395,0.3055,0.2992,0.1312,0.348,0.07619
-867387,B,15.71,13.93,102,761.7,0.09462,0.09462,0.07135,0.05933,0.1816,0.05723,0.3117,0.8155,1.972,27.94,0.005217,0.01515,0.01678,0.01268,0.01669,0.00233,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071
-867739,M,18.45,21.91,120.2,1075,0.0943,0.09709,0.1153,0.06847,0.1692,0.05727,0.5959,1.202,3.766,68.35,0.006001,0.01422,0.02855,0.009148,0.01492,0.002205,22.52,31.39,145.6,1590,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761
-868202,M,12.77,22.47,81.72,506.3,0.09055,0.05761,0.04711,0.02704,0.1585,0.06065,0.2367,1.38,1.457,19.87,0.007499,0.01202,0.02332,0.00892,0.01647,0.002629,14.49,33.37,92.04,653.6,0.1419,0.1523,0.2177,0.09331,0.2829,0.08067
-868223,B,11.71,16.67,74.72,423.6,0.1051,0.06095,0.03592,0.026,0.1339,0.05945,0.4489,2.508,3.258,34.37,0.006578,0.0138,0.02662,0.01307,0.01359,0.003707,13.33,25.48,86.16,546.7,0.1271,0.1028,0.1046,0.06968,0.1712,0.07343
-868682,B,11.43,15.39,73.06,399.8,0.09639,0.06889,0.03503,0.02875,0.1734,0.05865,0.1759,0.9938,1.143,12.67,0.005133,0.01521,0.01434,0.008602,0.01501,0.001588,12.32,22.02,79.93,462,0.119,0.1648,0.1399,0.08476,0.2676,0.06765
-868826,M,14.95,17.57,96.85,678.1,0.1167,0.1305,0.1539,0.08624,0.1957,0.06216,1.296,1.452,8.419,101.9,0.01,0.0348,0.06577,0.02801,0.05168,0.002887,18.55,21.43,121.4,971.4,0.1411,0.2164,0.3355,0.1667,0.3414,0.07147
-868871,B,11.28,13.39,73,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,0.06072,0.3384,1.343,1.851,26.33,0.01127,0.03498,0.02187,0.01965,0.0158,0.003442,11.92,15.77,76.53,434,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
-868999,B,9.738,11.97,61.24,288.5,0.0925,0.04102,0,0,0.1903,0.06422,0.1988,0.496,1.218,12.26,0.00604,0.005656,0,0,0.02277,0.00322,10.62,14.1,66.53,342.9,0.1234,0.07204,0,0,0.3105,0.08151
-869104,M,16.11,18.05,105.1,813,0.09721,0.1137,0.09447,0.05943,0.1861,0.06248,0.7049,1.332,4.533,74.08,0.00677,0.01938,0.03067,0.01167,0.01875,0.003434,19.92,25.27,129,1233,0.1314,0.2236,0.2802,0.1216,0.2792,0.08158
-869218,B,11.43,17.31,73.66,398,0.1092,0.09486,0.02031,0.01861,0.1645,0.06562,0.2843,1.908,1.937,21.38,0.006664,0.01735,0.01158,0.00952,0.02282,0.003526,12.78,26.76,82.66,503,0.1413,0.1792,0.07708,0.06402,0.2584,0.08096
-869224,B,12.9,15.92,83.74,512.2,0.08677,0.09509,0.04894,0.03088,0.1778,0.06235,0.2143,0.7712,1.689,16.64,0.005324,0.01563,0.0151,0.007584,0.02104,0.001887,14.48,21.82,97.17,643.8,0.1312,0.2548,0.209,0.1012,0.3549,0.08118
-869254,B,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,0.2525,1.239,1.806,17.74,0.006547,0.01781,0.02018,0.005612,0.01671,0.00236,11.95,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769
-869476,B,11.9,14.65,78.11,432.8,0.1152,0.1296,0.0371,0.03003,0.1995,0.07839,0.3962,0.6538,3.021,25.03,0.01017,0.04741,0.02789,0.0111,0.03127,0.009423,13.15,16.51,86.26,509.6,0.1424,0.2517,0.0942,0.06042,0.2727,0.1036
-869691,M,11.8,16.58,78.99,432,0.1091,0.17,0.1659,0.07415,0.2678,0.07371,0.3197,1.426,2.281,24.72,0.005427,0.03633,0.04649,0.01843,0.05628,0.004635,13.74,26.38,91.93,591.7,0.1385,0.4092,0.4504,0.1865,0.5774,0.103
-86973701,B,14.95,18.77,97.84,689.5,0.08138,0.1167,0.0905,0.03562,0.1744,0.06493,0.422,1.909,3.271,39.43,0.00579,0.04877,0.05303,0.01527,0.03356,0.009368,16.25,25.47,107.1,809.7,0.0997,0.2521,0.25,0.08405,0.2852,0.09218
-86973702,B,14.44,15.18,93.97,640.1,0.0997,0.1021,0.08487,0.05532,0.1724,0.06081,0.2406,0.7394,2.12,21.2,0.005706,0.02297,0.03114,0.01493,0.01454,0.002528,15.85,19.85,108.6,766.9,0.1316,0.2735,0.3103,0.1599,0.2691,0.07683
-869931,B,13.74,17.91,88.12,585,0.07944,0.06376,0.02881,0.01329,0.1473,0.0558,0.25,0.7574,1.573,21.47,0.002838,0.01592,0.0178,0.005828,0.01329,0.001976,15.34,22.46,97.19,725.9,0.09711,0.1824,0.1564,0.06019,0.235,0.07014
-871001501,B,13,20.78,83.51,519.4,0.1135,0.07589,0.03136,0.02645,0.254,0.06087,0.4202,1.322,2.873,34.78,0.007017,0.01142,0.01949,0.01153,0.02951,0.001533,14.16,24.11,90.82,616.7,0.1297,0.1105,0.08112,0.06296,0.3196,0.06435
-871001502,B,8.219,20.7,53.27,203.9,0.09405,0.1305,0.1321,0.02168,0.2222,0.08261,0.1935,1.962,1.243,10.21,0.01243,0.05416,0.07753,0.01022,0.02309,0.01178,9.092,29.72,58.08,249.8,0.163,0.431,0.5381,0.07879,0.3322,0.1486
-8710441,B,9.731,15.34,63.78,300.2,0.1072,0.1599,0.4108,0.07857,0.2548,0.09296,0.8245,2.664,4.073,49.85,0.01097,0.09586,0.396,0.05279,0.03546,0.02984,11.02,19.49,71.04,380.5,0.1292,0.2772,0.8216,0.1571,0.3108,0.1259
-87106,B,11.15,13.08,70.87,381.9,0.09754,0.05113,0.01982,0.01786,0.183,0.06105,0.2251,0.7815,1.429,15.48,0.009019,0.008985,0.01196,0.008232,0.02388,0.001619,11.99,16.3,76.25,440.8,0.1341,0.08971,0.07116,0.05506,0.2859,0.06772
-8711002,B,13.15,15.34,85.31,538.9,0.09384,0.08498,0.09293,0.03483,0.1822,0.06207,0.271,0.7927,1.819,22.79,0.008584,0.02017,0.03047,0.009536,0.02769,0.003479,14.77,20.5,97.67,677.3,0.1478,0.2256,0.3009,0.09722,0.3849,0.08633
-8711003,B,12.25,17.94,78.27,460.3,0.08654,0.06679,0.03885,0.02331,0.197,0.06228,0.22,0.9823,1.484,16.51,0.005518,0.01562,0.01994,0.007924,0.01799,0.002484,13.59,25.22,86.6,564.2,0.1217,0.1788,0.1943,0.08211,0.3113,0.08132
-8711202,M,17.68,20.74,117.4,963.7,0.1115,0.1665,0.1855,0.1054,0.1971,0.06166,0.8113,1.4,5.54,93.91,0.009037,0.04954,0.05206,0.01841,0.01778,0.004968,20.47,25.11,132.9,1302,0.1418,0.3498,0.3583,0.1515,0.2463,0.07738
-8711216,B,16.84,19.46,108.4,880.2,0.07445,0.07223,0.0515,0.02771,0.1844,0.05268,0.4789,2.06,3.479,46.61,0.003443,0.02661,0.03056,0.0111,0.0152,0.001519,18.22,28.07,120.3,1032,0.08774,0.171,0.1882,0.08436,0.2527,0.05972
-871122,B,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,0.01963,0.159,0.05907,0.1822,0.7285,1.171,13.25,0.005528,0.009789,0.008342,0.006273,0.01465,0.00253,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898
-871149,B,10.9,12.96,68.69,366.8,0.07515,0.03718,0.00309,0.006588,0.1442,0.05743,0.2818,0.7614,1.808,18.54,0.006142,0.006134,0.001835,0.003576,0.01637,0.002665,12.36,18.2,78.07,470,0.1171,0.08294,0.01854,0.03953,0.2738,0.07685
-8711561,B,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,0.06453,0.5018,1.693,3.926,38.34,0.009433,0.02405,0.04167,0.01152,0.03397,0.005061,13.32,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987
-8711803,M,19.19,15.94,126.3,1157,0.08694,0.1185,0.1193,0.09667,0.1741,0.05176,1,0.6336,6.971,119.3,0.009406,0.03055,0.04344,0.02794,0.03156,0.003362,22.03,17.81,146.6,1495,0.1124,0.2016,0.2264,0.1777,0.2443,0.06251
-871201,M,19.59,18.15,130.7,1214,0.112,0.1666,0.2508,0.1286,0.2027,0.06082,0.7364,1.048,4.792,97.07,0.004057,0.02277,0.04029,0.01303,0.01686,0.003318,26.73,26.39,174.9,2232,0.1438,0.3846,0.681,0.2247,0.3643,0.09223
-8712064,B,12.34,22.22,79.85,464.5,0.1012,0.1015,0.0537,0.02822,0.1551,0.06761,0.2949,1.656,1.955,21.55,0.01134,0.03175,0.03125,0.01135,0.01879,0.005348,13.58,28.68,87.36,553,0.1452,0.2338,0.1688,0.08194,0.2268,0.09082
-8712289,M,23.27,22.04,152.1,1686,0.08439,0.1145,0.1324,0.09702,0.1801,0.05553,0.6642,0.8561,4.603,97.85,0.00491,0.02544,0.02822,0.01623,0.01956,0.00374,28.01,28.22,184.2,2403,0.1228,0.3583,0.3948,0.2346,0.3589,0.09187
-8712291,B,14.97,19.76,95.5,690.2,0.08421,0.05352,0.01947,0.01939,0.1515,0.05266,0.184,1.065,1.286,16.64,0.003634,0.007983,0.008268,0.006432,0.01924,0.00152,15.98,25.82,102.3,782.1,0.1045,0.09995,0.0775,0.05754,0.2646,0.06085
-87127,B,10.8,9.71,68.77,357.6,0.09594,0.05736,0.02531,0.01698,0.1381,0.064,0.1728,0.4064,1.126,11.48,0.007809,0.009816,0.01099,0.005344,0.01254,0.00212,11.6,12.02,73.66,414,0.1436,0.1257,0.1047,0.04603,0.209,0.07699
-8712729,M,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,0.1893,0.05534,0.599,1.391,4.129,67.34,0.006123,0.0247,0.02626,0.01604,0.02091,0.003493,20.05,26.3,130.7,1260,0.1168,0.2119,0.2318,0.1474,0.281,0.07228
-8712766,M,17.47,24.68,116.1,984.6,0.1049,0.1603,0.2159,0.1043,0.1538,0.06365,1.088,1.41,7.337,122.3,0.006174,0.03634,0.04644,0.01569,0.01145,0.00512,23.14,32.33,155.3,1660,0.1376,0.383,0.489,0.1721,0.216,0.093
-8712853,B,14.97,16.95,96.22,685.9,0.09855,0.07885,0.02602,0.03781,0.178,0.0565,0.2713,1.217,1.893,24.28,0.00508,0.0137,0.007276,0.009073,0.0135,0.001706,16.11,23,104.6,793.7,0.1216,0.1637,0.06648,0.08485,0.2404,0.06428
-87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,0.05955,0.236,0.6656,1.67,17.43,0.008045,0.0118,0.01683,0.01241,0.01924,0.002248,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
-87163,M,13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,0.4697,1.147,3.142,43.4,0.006003,0.01063,0.02151,0.009443,0.0152,0.001868,17.98,29.87,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371
-87164,M,15.46,11.89,102.5,736.9,0.1257,0.1555,0.2032,0.1097,0.1966,0.07069,0.4209,0.6583,2.805,44.64,0.005393,0.02321,0.04303,0.0132,0.01792,0.004168,18.79,17.04,125,1102,0.1531,0.3583,0.583,0.1827,0.3216,0.101
-871641,B,11.08,14.71,70.21,372.7,0.1006,0.05743,0.02363,0.02583,0.1566,0.06669,0.2073,1.805,1.377,19.08,0.01496,0.02121,0.01453,0.01583,0.03082,0.004785,11.35,16.82,72.01,396.5,0.1216,0.0824,0.03938,0.04306,0.1902,0.07313
-871642,B,10.66,15.15,67.49,349.6,0.08792,0.04302,0,0,0.1928,0.05975,0.3309,1.925,2.155,21.98,0.008713,0.01017,0,0,0.03265,0.001002,11.54,19.2,73.2,408.3,0.1076,0.06791,0,0,0.271,0.06164
-872113,B,8.671,14.45,54.42,227.2,0.09138,0.04276,0,0,0.1722,0.06724,0.2204,0.7873,1.435,11.36,0.009172,0.008007,0,0,0.02711,0.003399,9.262,17.04,58.36,259.2,0.1162,0.07057,0,0,0.2592,0.07848
-872608,B,9.904,18.06,64.6,302.4,0.09699,0.1294,0.1307,0.03716,0.1669,0.08116,0.4311,2.261,3.132,27.48,0.01286,0.08808,0.1197,0.0246,0.0388,0.01792,11.26,24.39,73.07,390.2,0.1301,0.295,0.3486,0.0991,0.2614,0.1162
-87281702,M,16.46,20.11,109.3,832.9,0.09831,0.1556,0.1793,0.08866,0.1794,0.06323,0.3037,1.284,2.482,31.59,0.006627,0.04094,0.05371,0.01813,0.01682,0.004584,17.79,28.45,123.5,981.2,0.1415,0.4667,0.5862,0.2035,0.3054,0.09519
-873357,B,13.01,22.22,82.01,526.4,0.06251,0.01938,0.001595,0.001852,0.1395,0.05234,0.1731,1.142,1.101,14.34,0.003418,0.002252,0.001595,0.001852,0.01613,0.0009683,14,29.02,88.18,608.8,0.08125,0.03432,0.007977,0.009259,0.2295,0.05843
-873586,B,12.81,13.06,81.29,508.8,0.08739,0.03774,0.009193,0.0133,0.1466,0.06133,0.2889,0.9899,1.778,21.79,0.008534,0.006364,0.00618,0.007408,0.01065,0.003351,13.63,16.15,86.7,570.7,0.1162,0.05445,0.02758,0.0399,0.1783,0.07319
-873592,M,27.22,21.87,182.1,2250,0.1094,0.1914,0.2871,0.1878,0.18,0.0577,0.8361,1.481,5.82,128.7,0.004631,0.02537,0.03109,0.01241,0.01575,0.002747,33.12,32.85,220.8,3216,0.1472,0.4034,0.534,0.2688,0.2856,0.08082
-873593,M,21.09,26.57,142.7,1311,0.1141,0.2832,0.2487,0.1496,0.2395,0.07398,0.6298,0.7629,4.414,81.46,0.004253,0.04759,0.03872,0.01567,0.01798,0.005295,26.68,33.48,176.5,2089,0.1491,0.7584,0.678,0.2903,0.4098,0.1284
-873701,M,15.7,20.31,101.2,766.6,0.09597,0.08799,0.06593,0.05189,0.1618,0.05549,0.3699,1.15,2.406,40.98,0.004626,0.02263,0.01954,0.009767,0.01547,0.00243,20.11,32.82,129.3,1269,0.1414,0.3547,0.2902,0.1541,0.3437,0.08631
-873843,B,11.41,14.92,73.53,402,0.09059,0.08155,0.06181,0.02361,0.1167,0.06217,0.3344,1.108,1.902,22.77,0.007356,0.03728,0.05915,0.01712,0.02165,0.004784,12.37,17.7,79.12,467.2,0.1121,0.161,0.1648,0.06296,0.1811,0.07427
-873885,M,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,0.1727,0.06317,0.2054,0.4956,1.344,19.53,0.00329,0.01395,0.01774,0.006009,0.01172,0.002575,17.8,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772
-874158,B,10.08,15.11,63.76,317.5,0.09267,0.04695,0.001597,0.002404,0.1703,0.06048,0.4245,1.268,2.68,26.43,0.01439,0.012,0.001597,0.002404,0.02538,0.00347,11.87,21.18,75.39,437,0.1521,0.1019,0.00692,0.01042,0.2933,0.07697
-874217,M,18.31,18.58,118.6,1041,0.08588,0.08468,0.08169,0.05814,0.1621,0.05425,0.2577,0.4757,1.817,28.92,0.002866,0.009181,0.01412,0.006719,0.01069,0.001087,21.31,26.36,139.2,1410,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
-874373,B,11.71,17.19,74.68,420.3,0.09774,0.06141,0.03809,0.03239,0.1516,0.06095,0.2451,0.7655,1.742,17.86,0.006905,0.008704,0.01978,0.01185,0.01897,0.001671,13.01,21.39,84.42,521.5,0.1323,0.104,0.1521,0.1099,0.2572,0.07097
-874662,B,11.81,17.39,75.27,428.9,0.1007,0.05562,0.02353,0.01553,0.1718,0.0578,0.1859,1.926,1.011,14.47,0.007831,0.008776,0.01556,0.00624,0.03139,0.001988,12.57,26.48,79.57,489.5,0.1356,0.1,0.08803,0.04306,0.32,0.06576
-874839,B,12.3,15.9,78.83,463.7,0.0808,0.07253,0.03844,0.01654,0.1667,0.05474,0.2382,0.8355,1.687,18.32,0.005996,0.02212,0.02117,0.006433,0.02025,0.001725,13.35,19.59,86.65,546.7,0.1096,0.165,0.1423,0.04815,0.2482,0.06306
-874858,M,14.22,23.12,94.37,609.9,0.1075,0.2413,0.1981,0.06618,0.2384,0.07542,0.286,2.11,2.112,31.72,0.00797,0.1354,0.1166,0.01666,0.05113,0.01172,15.74,37.18,106.4,762.4,0.1533,0.9327,0.8488,0.1772,0.5166,0.1446
-875093,B,12.77,21.41,82.02,507.4,0.08749,0.06601,0.03112,0.02864,0.1694,0.06287,0.7311,1.748,5.118,53.65,0.004571,0.0179,0.02176,0.01757,0.03373,0.005875,13.75,23.5,89.04,579.5,0.09388,0.08978,0.05186,0.04773,0.2179,0.06871
-875099,B,9.72,18.22,60.73,288.1,0.0695,0.02344,0,0,0.1653,0.06447,0.3539,4.885,2.23,21.69,0.001713,0.006736,0,0,0.03799,0.001688,9.968,20.83,62.25,303.8,0.07117,0.02729,0,0,0.1909,0.06559
-875263,M,12.34,26.86,81.15,477.4,0.1034,0.1353,0.1085,0.04562,0.1943,0.06937,0.4053,1.809,2.642,34.44,0.009098,0.03845,0.03763,0.01321,0.01878,0.005672,15.65,39.34,101.7,768.9,0.1785,0.4706,0.4425,0.1459,0.3215,0.1205
-87556202,M,14.86,23.21,100.4,671.4,0.1044,0.198,0.1697,0.08878,0.1737,0.06672,0.2796,0.9622,3.591,25.2,0.008081,0.05122,0.05551,0.01883,0.02545,0.004312,16.08,27.78,118.6,784.7,0.1316,0.4648,0.4589,0.1727,0.3,0.08701
-875878,B,12.91,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,0.1829,0.05667,0.1942,0.9086,1.493,15.75,0.005298,0.01587,0.02321,0.00842,0.01853,0.002152,13.88,22,90.81,600.6,0.1097,0.1506,0.1764,0.08235,0.3024,0.06949
-875938,M,13.77,22.29,90.63,588.9,0.12,0.1267,0.1385,0.06526,0.1834,0.06877,0.6191,2.112,4.906,49.7,0.0138,0.03348,0.04665,0.0206,0.02689,0.004306,16.39,34.01,111.6,806.9,0.1737,0.3122,0.3809,0.1673,0.308,0.09333
-877159,M,18.08,21.84,117.4,1024,0.07371,0.08642,0.1103,0.05778,0.177,0.0534,0.6362,1.305,4.312,76.36,0.00553,0.05296,0.0611,0.01444,0.0214,0.005036,19.76,24.7,129.1,1228,0.08822,0.1963,0.2535,0.09181,0.2369,0.06558
-877486,M,19.18,22.49,127.5,1148,0.08523,0.1428,0.1114,0.06772,0.1767,0.05529,0.4357,1.073,3.833,54.22,0.005524,0.03698,0.02706,0.01221,0.01415,0.003397,23.36,32.06,166.4,1688,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221
-877500,M,14.45,20.22,94.49,642.7,0.09872,0.1206,0.118,0.0598,0.195,0.06466,0.2092,0.6509,1.446,19.42,0.004044,0.01597,0.02,0.007303,0.01522,0.001976,18.33,30.12,117.9,1044,0.1552,0.4056,0.4967,0.1838,0.4753,0.1013
-877501,B,12.23,19.56,78.54,461,0.09586,0.08087,0.04187,0.04107,0.1979,0.06013,0.3534,1.326,2.308,27.24,0.007514,0.01779,0.01401,0.0114,0.01503,0.003338,14.44,28.36,92.15,638.4,0.1429,0.2042,0.1377,0.108,0.2668,0.08174
-877989,M,17.54,19.32,115.1,951.6,0.08968,0.1198,0.1036,0.07488,0.1506,0.05491,0.3971,0.8282,3.088,40.73,0.00609,0.02569,0.02713,0.01345,0.01594,0.002658,20.42,25.84,139.5,1239,0.1381,0.342,0.3508,0.1939,0.2928,0.07867
-878796,M,23.29,26.67,158.9,1685,0.1141,0.2084,0.3523,0.162,0.22,0.06229,0.5539,1.56,4.667,83.16,0.009327,0.05121,0.08958,0.02465,0.02175,0.005195,25.12,32.68,177,1986,0.1536,0.4167,0.7892,0.2733,0.3198,0.08762
-87880,M,13.81,23.75,91.56,597.8,0.1323,0.1768,0.1558,0.09176,0.2251,0.07421,0.5648,1.93,3.909,52.72,0.008824,0.03108,0.03112,0.01291,0.01998,0.004506,19.2,41.85,128.5,1153,0.2226,0.5209,0.4646,0.2013,0.4432,0.1086
-87930,B,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,0.3961,1.044,2.497,30.29,0.006953,0.01911,0.02701,0.01037,0.01782,0.003586,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875
-879523,M,15.12,16.68,98.78,716.6,0.08876,0.09588,0.0755,0.04079,0.1594,0.05986,0.2711,0.3621,1.974,26.44,0.005472,0.01919,0.02039,0.00826,0.01523,0.002881,17.77,20.24,117.7,989.5,0.1491,0.3331,0.3327,0.1252,0.3415,0.0974
-879804,B,9.876,17.27,62.92,295.4,0.1089,0.07232,0.01756,0.01952,0.1934,0.06285,0.2137,1.342,1.517,12.33,0.009719,0.01249,0.007975,0.007527,0.0221,0.002472,10.42,23.22,67.08,331.6,0.1415,0.1247,0.06213,0.05588,0.2989,0.0738
-879830,M,17.01,20.26,109.7,904.3,0.08772,0.07304,0.0695,0.0539,0.2026,0.05223,0.5858,0.8554,4.106,68.46,0.005038,0.01503,0.01946,0.01123,0.02294,0.002581,19.8,25.05,130,1210,0.1111,0.1486,0.1932,0.1096,0.3275,0.06469
-8810158,B,13.11,22.54,87.02,529.4,0.1002,0.1483,0.08705,0.05102,0.185,0.0731,0.1931,0.9223,1.491,15.09,0.005251,0.03041,0.02526,0.008304,0.02514,0.004198,14.55,29.16,99.48,639.3,0.1349,0.4402,0.3162,0.1126,0.4128,0.1076
-8810436,B,15.27,12.91,98.17,725.5,0.08182,0.0623,0.05892,0.03157,0.1359,0.05526,0.2134,0.3628,1.525,20,0.004291,0.01236,0.01841,0.007373,0.009539,0.001656,17.38,15.92,113.7,932.7,0.1222,0.2186,0.2962,0.1035,0.232,0.07474
-881046502,M,20.58,22.14,134.7,1290,0.0909,0.1348,0.164,0.09561,0.1765,0.05024,0.8601,1.48,7.029,111.7,0.008124,0.03611,0.05489,0.02765,0.03176,0.002365,23.24,27.84,158.3,1656,0.1178,0.292,0.3861,0.192,0.2909,0.05865
-8810528,B,11.84,18.94,75.51,428,0.08871,0.069,0.02669,0.01393,0.1533,0.06057,0.2222,0.8652,1.444,17.12,0.005517,0.01727,0.02045,0.006747,0.01616,0.002922,13.3,24.99,85.22,546.3,0.128,0.188,0.1471,0.06913,0.2535,0.07993
-8810703,M,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525,2.873,1.476,21.98,525.6,0.01345,0.02772,0.06389,0.01407,0.04783,0.004476,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525
-881094802,M,17.42,25.56,114.5,948,0.1006,0.1146,0.1682,0.06597,0.1308,0.05866,0.5296,1.667,3.767,58.53,0.03113,0.08555,0.1438,0.03927,0.02175,0.01256,18.07,28.07,120.4,1021,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818
-8810955,M,14.19,23.81,92.87,610.7,0.09463,0.1306,0.1115,0.06462,0.2235,0.06433,0.4207,1.845,3.534,31,0.01088,0.0371,0.03688,0.01627,0.04499,0.004768,16.86,34.85,115,811.3,0.1559,0.4059,0.3744,0.1772,0.4724,0.1026
-8810987,M,13.86,16.93,90.96,578.9,0.1026,0.1517,0.09901,0.05602,0.2106,0.06916,0.2563,1.194,1.933,22.69,0.00596,0.03438,0.03909,0.01435,0.01939,0.00456,15.75,26.93,104.4,750.1,0.146,0.437,0.4636,0.1654,0.363,0.1059
-8811523,B,11.89,18.35,77.32,432.2,0.09363,0.1154,0.06636,0.03142,0.1967,0.06314,0.2963,1.563,2.087,21.46,0.008872,0.04192,0.05946,0.01785,0.02793,0.004775,13.25,27.1,86.2,531.2,0.1405,0.3046,0.2806,0.1138,0.3397,0.08365
-8811779,B,10.2,17.48,65.05,321.2,0.08054,0.05907,0.05774,0.01071,0.1964,0.06315,0.3567,1.922,2.747,22.79,0.00468,0.0312,0.05774,0.01071,0.0256,0.004613,11.48,24.47,75.4,403.7,0.09527,0.1397,0.1925,0.03571,0.2868,0.07809
-8811842,M,19.8,21.56,129.7,1230,0.09383,0.1306,0.1272,0.08691,0.2094,0.05581,0.9553,1.186,6.487,124.4,0.006804,0.03169,0.03446,0.01712,0.01897,0.004045,25.73,28.64,170.3,2009,0.1353,0.3235,0.3617,0.182,0.307,0.08255
-88119002,M,19.53,32.47,128,1223,0.0842,0.113,0.1145,0.06637,0.1428,0.05313,0.7392,1.321,4.722,109.9,0.005539,0.02644,0.02664,0.01078,0.01332,0.002256,27.9,45.41,180.2,2477,0.1408,0.4097,0.3995,0.1625,0.2713,0.07568
-8812816,B,13.65,13.16,87.88,568.9,0.09646,0.08711,0.03888,0.02563,0.136,0.06344,0.2102,0.4336,1.391,17.4,0.004133,0.01695,0.01652,0.006659,0.01371,0.002735,15.34,16.35,99.71,706.2,0.1311,0.2474,0.1759,0.08056,0.238,0.08718
-8812818,B,13.56,13.9,88.59,561.3,0.1051,0.1192,0.0786,0.04451,0.1962,0.06303,0.2569,0.4981,2.011,21.03,0.005851,0.02314,0.02544,0.00836,0.01842,0.002918,14.98,17.13,101.1,686.6,0.1376,0.2698,0.2577,0.0909,0.3065,0.08177
-8812844,B,10.18,17.53,65.12,313.1,0.1061,0.08502,0.01768,0.01915,0.191,0.06908,0.2467,1.217,1.641,15.05,0.007899,0.014,0.008534,0.007624,0.02637,0.003761,11.17,22.84,71.94,375.6,0.1406,0.144,0.06572,0.05575,0.3055,0.08797
-8812877,M,15.75,20.25,102.6,761.3,0.1025,0.1204,0.1147,0.06462,0.1935,0.06303,0.3473,0.9209,2.244,32.19,0.004766,0.02374,0.02384,0.008637,0.01772,0.003131,19.56,30.29,125.9,1088,0.1552,0.448,0.3976,0.1479,0.3993,0.1064
-8813129,B,13.27,17.02,84.55,546.4,0.08445,0.04994,0.03554,0.02456,0.1496,0.05674,0.2927,0.8907,2.044,24.68,0.006032,0.01104,0.02259,0.009057,0.01482,0.002496,15.14,23.6,98.84,708.8,0.1276,0.1311,0.1786,0.09678,0.2506,0.07623
-88143502,B,14.34,13.47,92.51,641.2,0.09906,0.07624,0.05724,0.04603,0.2075,0.05448,0.522,0.8121,3.763,48.29,0.007089,0.01428,0.0236,0.01286,0.02266,0.001463,16.77,16.9,110.4,873.2,0.1297,0.1525,0.1632,0.1087,0.3062,0.06072
-88147101,B,10.44,15.46,66.62,329.6,0.1053,0.07722,0.006643,0.01216,0.1788,0.0645,0.1913,0.9027,1.208,11.86,0.006513,0.008061,0.002817,0.004972,0.01502,0.002821,11.52,19.8,73.47,395.4,0.1341,0.1153,0.02639,0.04464,0.2615,0.08269
-88147102,B,15,15.51,97.45,684.5,0.08371,0.1096,0.06505,0.0378,0.1881,0.05907,0.2318,0.4966,2.276,19.88,0.004119,0.03207,0.03644,0.01155,0.01391,0.003204,16.41,19.31,114.2,808.2,0.1136,0.3627,0.3402,0.1379,0.2954,0.08362
-88147202,B,12.62,23.97,81.35,496.4,0.07903,0.07529,0.05438,0.02036,0.1514,0.06019,0.2449,1.066,1.445,18.51,0.005169,0.02294,0.03016,0.008691,0.01365,0.003407,14.2,31.31,90.67,624,0.1227,0.3454,0.3911,0.118,0.2826,0.09585
-881861,M,12.83,22.33,85.26,503.2,0.1088,0.1799,0.1695,0.06861,0.2123,0.07254,0.3061,1.069,2.257,25.13,0.006983,0.03858,0.04683,0.01499,0.0168,0.005617,15.2,30.15,105.3,706,0.1777,0.5343,0.6282,0.1977,0.3407,0.1243
-881972,M,17.05,19.08,113.4,895,0.1141,0.1572,0.191,0.109,0.2131,0.06325,0.2959,0.679,2.153,31.98,0.005532,0.02008,0.03055,0.01384,0.01177,0.002336,19.59,24.89,133.5,1189,0.1703,0.3934,0.5018,0.2543,0.3109,0.09061
-88199202,B,11.32,27.08,71.76,395.7,0.06883,0.03813,0.01633,0.003125,0.1869,0.05628,0.121,0.8927,1.059,8.605,0.003653,0.01647,0.01633,0.003125,0.01537,0.002052,12.08,33.75,79.82,452.3,0.09203,0.1432,0.1089,0.02083,0.2849,0.07087
-88203002,B,11.22,33.81,70.79,386.8,0.0778,0.03574,0.004967,0.006434,0.1845,0.05828,0.2239,1.647,1.489,15.46,0.004359,0.006813,0.003223,0.003419,0.01916,0.002534,12.36,41.78,78.44,470.9,0.09994,0.06885,0.02318,0.03002,0.2911,0.07307
-88206102,M,20.51,27.81,134.4,1319,0.09159,0.1074,0.1554,0.0834,0.1448,0.05592,0.524,1.189,3.767,70.01,0.00502,0.02062,0.03457,0.01091,0.01298,0.002887,24.47,37.38,162.7,1872,0.1223,0.2761,0.4146,0.1563,0.2437,0.08328
-882488,B,9.567,15.91,60.21,279.6,0.08464,0.04087,0.01652,0.01667,0.1551,0.06403,0.2152,0.8301,1.215,12.64,0.01164,0.0104,0.01186,0.009623,0.02383,0.00354,10.51,19.16,65.74,335.9,0.1504,0.09515,0.07161,0.07222,0.2757,0.08178
-88249602,B,14.03,21.25,89.79,603.4,0.0907,0.06945,0.01462,0.01896,0.1517,0.05835,0.2589,1.503,1.667,22.07,0.007389,0.01383,0.007302,0.01004,0.01263,0.002925,15.33,30.28,98.27,715.5,0.1287,0.1513,0.06231,0.07963,0.2226,0.07617
-88299702,M,23.21,26.97,153.5,1670,0.09509,0.1682,0.195,0.1237,0.1909,0.06309,1.058,0.9635,7.247,155.8,0.006428,0.02863,0.04497,0.01716,0.0159,0.003053,31.01,34.51,206,2944,0.1481,0.4126,0.582,0.2593,0.3103,0.08677
-883263,M,20.48,21.46,132.5,1306,0.08355,0.08348,0.09042,0.06022,0.1467,0.05177,0.6874,1.041,5.144,83.5,0.007959,0.03133,0.04257,0.01671,0.01341,0.003933,24.22,26.17,161.7,1750,0.1228,0.2311,0.3158,0.1445,0.2238,0.07127
-883270,B,14.22,27.85,92.55,623.9,0.08223,0.1039,0.1103,0.04408,0.1342,0.06129,0.3354,2.324,2.105,29.96,0.006307,0.02845,0.0385,0.01011,0.01185,0.003589,15.75,40.54,102.5,764,0.1081,0.2426,0.3064,0.08219,0.189,0.07796
-88330202,M,17.46,39.28,113.4,920.6,0.09812,0.1298,0.1417,0.08811,0.1809,0.05966,0.5366,0.8561,3.002,49,0.00486,0.02785,0.02602,0.01374,0.01226,0.002759,22.51,44.87,141.2,1408,0.1365,0.3735,0.3241,0.2066,0.2853,0.08496
-88350402,B,13.64,15.6,87.38,575.3,0.09423,0.0663,0.04705,0.03731,0.1717,0.0566,0.3242,0.6612,1.996,27.19,0.00647,0.01248,0.0181,0.01103,0.01898,0.001794,14.85,19.05,94.11,683.4,0.1278,0.1291,0.1533,0.09222,0.253,0.0651
-883539,B,12.42,15.04,78.61,476.5,0.07926,0.03393,0.01053,0.01108,0.1546,0.05754,0.1153,0.6745,0.757,9.006,0.003265,0.00493,0.006493,0.003762,0.0172,0.00136,13.2,20.37,83.85,543.4,0.1037,0.07776,0.06243,0.04052,0.2901,0.06783
-883852,B,11.3,18.19,73.93,389.4,0.09592,0.1325,0.1548,0.02854,0.2054,0.07669,0.2428,1.642,2.369,16.39,0.006663,0.05914,0.0888,0.01314,0.01995,0.008675,12.58,27.96,87.16,472.9,0.1347,0.4848,0.7436,0.1218,0.3308,0.1297
-88411702,B,13.75,23.77,88.54,590,0.08043,0.06807,0.04697,0.02344,0.1773,0.05429,0.4347,1.057,2.829,39.93,0.004351,0.02667,0.03371,0.01007,0.02598,0.003087,15.01,26.34,98,706,0.09368,0.1442,0.1359,0.06106,0.2663,0.06321
-884180,M,19.4,23.5,129.1,1155,0.1027,0.1558,0.2049,0.08886,0.1978,0.06,0.5243,1.802,4.037,60.41,0.01061,0.03252,0.03915,0.01559,0.02186,0.003949,21.65,30.53,144.9,1417,0.1463,0.2968,0.3458,0.1564,0.292,0.07614
-884437,B,10.48,19.86,66.72,337.7,0.107,0.05971,0.04831,0.0307,0.1737,0.0644,0.3719,2.612,2.517,23.22,0.01604,0.01386,0.01865,0.01133,0.03476,0.00356,11.48,29.46,73.68,402.8,0.1515,0.1026,0.1181,0.06736,0.2883,0.07748
-884448,B,13.2,17.43,84.13,541.6,0.07215,0.04524,0.04336,0.01105,0.1487,0.05635,0.163,1.601,0.873,13.56,0.006261,0.01569,0.03079,0.005383,0.01962,0.00225,13.94,27.82,88.28,602,0.1101,0.1508,0.2298,0.0497,0.2767,0.07198
-884626,B,12.89,14.11,84.95,512.2,0.0876,0.1346,0.1374,0.0398,0.1596,0.06409,0.2025,0.4402,2.393,16.35,0.005501,0.05592,0.08158,0.0137,0.01266,0.007555,14.39,17.7,105,639.1,0.1254,0.5849,0.7727,0.1561,0.2639,0.1178
-88466802,B,10.65,25.22,68.01,347,0.09657,0.07234,0.02379,0.01615,0.1897,0.06329,0.2497,1.493,1.497,16.64,0.007189,0.01035,0.01081,0.006245,0.02158,0.002619,12.25,35.19,77.98,455.7,0.1499,0.1398,0.1125,0.06136,0.3409,0.08147
-884689,B,11.52,14.93,73.87,406.3,0.1013,0.07808,0.04328,0.02929,0.1883,0.06168,0.2562,1.038,1.686,18.62,0.006662,0.01228,0.02105,0.01006,0.01677,0.002784,12.65,21.19,80.88,491.8,0.1389,0.1582,0.1804,0.09608,0.2664,0.07809
-884948,M,20.94,23.56,138.9,1364,0.1007,0.1606,0.2712,0.131,0.2205,0.05898,1.004,0.8208,6.372,137.9,0.005283,0.03908,0.09518,0.01864,0.02401,0.005002,25.58,27,165.3,2010,0.1211,0.3172,0.6991,0.2105,0.3126,0.07849
-88518501,B,11.5,18.45,73.28,407.4,0.09345,0.05991,0.02638,0.02069,0.1834,0.05934,0.3927,0.8429,2.684,26.99,0.00638,0.01065,0.01245,0.009175,0.02292,0.001461,12.97,22.46,83.12,508.9,0.1183,0.1049,0.08105,0.06544,0.274,0.06487
-885429,M,19.73,19.82,130.7,1206,0.1062,0.1849,0.2417,0.0974,0.1733,0.06697,0.7661,0.78,4.115,92.81,0.008482,0.05057,0.068,0.01971,0.01467,0.007259,25.28,25.59,159.8,1933,0.171,0.5955,0.8489,0.2507,0.2749,0.1297
-8860702,M,17.3,17.08,113,928.2,0.1008,0.1041,0.1266,0.08353,0.1813,0.05613,0.3093,0.8568,2.193,33.63,0.004757,0.01503,0.02332,0.01262,0.01394,0.002362,19.85,25.09,130.9,1222,0.1416,0.2405,0.3378,0.1857,0.3138,0.08113
-886226,M,19.45,19.33,126.5,1169,0.1035,0.1188,0.1379,0.08591,0.1776,0.05647,0.5959,0.6342,3.797,71,0.004649,0.018,0.02749,0.01267,0.01365,0.00255,25.7,24.57,163.1,1972,0.1497,0.3161,0.4317,0.1999,0.3379,0.0895
-886452,M,13.96,17.05,91.43,602.4,0.1096,0.1279,0.09789,0.05246,0.1908,0.0613,0.425,0.8098,2.563,35.74,0.006351,0.02679,0.03119,0.01342,0.02062,0.002695,16.39,22.07,108.1,826,0.1512,0.3262,0.3209,0.1374,0.3068,0.07957
-88649001,M,19.55,28.77,133.6,1207,0.0926,0.2063,0.1784,0.1144,0.1893,0.06232,0.8426,1.199,7.158,106.4,0.006356,0.04765,0.03863,0.01519,0.01936,0.005252,25.05,36.27,178.6,1926,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005
-886776,M,15.32,17.27,103.2,713.3,0.1335,0.2284,0.2448,0.1242,0.2398,0.07596,0.6592,1.059,4.061,59.46,0.01015,0.04588,0.04983,0.02127,0.01884,0.00866,17.73,22.66,119.8,928.8,0.1765,0.4503,0.4429,0.2229,0.3258,0.1191
-887181,M,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,0.2495,0.08104,1.292,2.454,10.12,138.5,0.01236,0.05995,0.08232,0.03024,0.02337,0.006042,19.85,31.64,143.7,1226,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019
-88725602,M,15.53,33.56,103.7,744.9,0.1063,0.1639,0.1751,0.08399,0.2091,0.0665,0.2419,1.278,1.903,23.02,0.005345,0.02556,0.02889,0.01022,0.009947,0.003359,18.49,49.54,126.3,1035,0.1883,0.5564,0.5703,0.2014,0.3512,0.1204
-887549,M,20.31,27.06,132.9,1288,0.1,0.1088,0.1519,0.09333,0.1814,0.05572,0.3977,1.033,2.587,52.34,0.005043,0.01578,0.02117,0.008185,0.01282,0.001892,24.33,39.16,162.3,1844,0.1522,0.2945,0.3788,0.1697,0.3151,0.07999
-888264,M,17.35,23.06,111,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,0.4007,1.317,2.577,44.41,0.005726,0.01106,0.01246,0.007671,0.01411,0.001578,19.85,31.47,128.2,1218,0.124,0.1486,0.1211,0.08235,0.2452,0.06515
-888570,M,17.29,22.13,114.4,947.8,0.08999,0.1273,0.09697,0.07507,0.2108,0.05464,0.8348,1.633,6.146,90.94,0.006717,0.05981,0.04638,0.02149,0.02747,0.005838,20.39,27.24,137.9,1295,0.1134,0.2867,0.2298,0.1528,0.3067,0.07484
-889403,M,15.61,19.38,100,758.6,0.0784,0.05616,0.04209,0.02847,0.1547,0.05443,0.2298,0.9988,1.534,22.18,0.002826,0.009105,0.01311,0.005174,0.01013,0.001345,17.91,31.67,115.9,988.6,0.1084,0.1807,0.226,0.08568,0.2683,0.06829
-889719,M,17.19,22.07,111.6,928.3,0.09726,0.08995,0.09061,0.06527,0.1867,0.0558,0.4203,0.7383,2.819,45.42,0.004493,0.01206,0.02048,0.009875,0.01144,0.001575,21.58,29.33,140.5,1436,0.1558,0.2567,0.3889,0.1984,0.3216,0.0757
-88995002,M,20.73,31.12,135.7,1419,0.09469,0.1143,0.1367,0.08646,0.1769,0.05674,1.172,1.617,7.749,199.7,0.004551,0.01478,0.02143,0.00928,0.01367,0.002299,32.49,47.16,214,3432,0.1401,0.2644,0.3442,0.1659,0.2868,0.08218
-8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,0.06491,0.4505,1.197,3.43,27.1,0.00747,0.03581,0.03354,0.01365,0.03504,0.003318,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
-8910499,B,13.59,21.84,87.16,561,0.07956,0.08259,0.04072,0.02142,0.1635,0.05859,0.338,1.916,2.591,26.76,0.005436,0.02406,0.03099,0.009919,0.0203,0.003009,14.8,30.04,97.66,661.5,0.1005,0.173,0.1453,0.06189,0.2446,0.07024
-8910506,B,12.87,16.21,82.38,512.2,0.09425,0.06219,0.039,0.01615,0.201,0.05769,0.2345,1.219,1.546,18.24,0.005518,0.02178,0.02589,0.00633,0.02593,0.002157,13.9,23.64,89.27,597.5,0.1256,0.1808,0.1992,0.0578,0.3604,0.07062
-8910720,B,10.71,20.39,69.5,344.9,0.1082,0.1289,0.08448,0.02867,0.1668,0.06862,0.3198,1.489,2.23,20.74,0.008902,0.04785,0.07339,0.01745,0.02728,0.00761,11.69,25.21,76.51,410.4,0.1335,0.255,0.2534,0.086,0.2605,0.08701
-8910721,B,14.29,16.82,90.3,632.6,0.06429,0.02675,0.00725,0.00625,0.1508,0.05376,0.1302,0.7198,0.8439,10.77,0.003492,0.00371,0.004826,0.003608,0.01536,0.001381,14.91,20.65,94.44,684.6,0.08567,0.05036,0.03866,0.03333,0.2458,0.0612
-8910748,B,11.29,13.04,72.23,388,0.09834,0.07608,0.03265,0.02755,0.1769,0.0627,0.1904,0.5293,1.164,13.17,0.006472,0.01122,0.01282,0.008849,0.01692,0.002817,12.32,16.18,78.27,457.5,0.1358,0.1507,0.1275,0.0875,0.2733,0.08022
-8910988,M,21.75,20.99,147.3,1491,0.09401,0.1961,0.2195,0.1088,0.1721,0.06194,1.167,1.352,8.867,156.8,0.005687,0.0496,0.06329,0.01561,0.01924,0.004614,28.19,28.18,195.9,2384,0.1272,0.4725,0.5807,0.1841,0.2833,0.08858
-8910996,B,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,0.06312,0.2684,1.409,1.75,16.39,0.0138,0.01067,0.008347,0.009472,0.01798,0.004261,10.75,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175
-8911163,M,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,0.4212,1.433,2.765,45.81,0.005444,0.01169,0.01622,0.008522,0.01419,0.002751,20.92,34.69,135.1,1320,0.1315,0.1806,0.208,0.1136,0.2504,0.07948
-8911164,B,11.89,17.36,76.2,435.6,0.1225,0.0721,0.05929,0.07404,0.2015,0.05875,0.6412,2.293,4.021,48.84,0.01418,0.01489,0.01267,0.0191,0.02678,0.003002,12.4,18.99,79.46,472.4,0.1359,0.08368,0.07153,0.08946,0.222,0.06033
-8911230,B,11.33,14.16,71.79,396.6,0.09379,0.03872,0.001487,0.003333,0.1954,0.05821,0.2375,1.28,1.565,17.09,0.008426,0.008998,0.001487,0.003333,0.02358,0.001627,12.2,18.99,77.37,458,0.1259,0.07348,0.004955,0.01111,0.2758,0.06386
-8911670,M,18.81,19.98,120.9,1102,0.08923,0.05884,0.0802,0.05843,0.155,0.04996,0.3283,0.828,2.363,36.74,0.007571,0.01114,0.02623,0.01463,0.0193,0.001676,19.96,24.3,129,1236,0.1243,0.116,0.221,0.1294,0.2567,0.05737
-8911800,B,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,0.258,1.166,1.683,22.22,0.003741,0.005274,0.01065,0.005044,0.01344,0.001126,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
-8911834,B,13.85,15.18,88.99,587.4,0.09516,0.07688,0.04479,0.03711,0.211,0.05853,0.2479,0.9195,1.83,19.41,0.004235,0.01541,0.01457,0.01043,0.01528,0.001593,14.98,21.74,98.37,670,0.1185,0.1724,0.1456,0.09993,0.2955,0.06912
-8912049,M,19.16,26.6,126.2,1138,0.102,0.1453,0.1921,0.09664,0.1902,0.0622,0.6361,1.001,4.321,69.65,0.007392,0.02449,0.03988,0.01293,0.01435,0.003446,23.72,35.9,159.8,1724,0.1782,0.3841,0.5754,0.1872,0.3258,0.0972
-8912055,B,11.74,14.02,74.24,427.3,0.07813,0.0434,0.02245,0.02763,0.2101,0.06113,0.5619,1.268,3.717,37.83,0.008034,0.01442,0.01514,0.01846,0.02921,0.002005,13.31,18.26,84.7,533.7,0.1036,0.085,0.06735,0.0829,0.3101,0.06688
-89122,M,19.4,18.18,127.2,1145,0.1037,0.1442,0.1626,0.09464,0.1893,0.05892,0.4709,0.9951,2.903,53.16,0.005654,0.02199,0.03059,0.01499,0.01623,0.001965,23.79,28.65,152.4,1628,0.1518,0.3749,0.4316,0.2252,0.359,0.07787
-8912280,M,16.24,18.77,108.8,805.1,0.1066,0.1802,0.1948,0.09052,0.1876,0.06684,0.2873,0.9173,2.464,28.09,0.004563,0.03481,0.03872,0.01209,0.01388,0.004081,18.55,25.09,126.9,1031,0.1365,0.4706,0.5026,0.1732,0.277,0.1063
-8912284,B,12.89,15.7,84.08,516.6,0.07818,0.0958,0.1115,0.0339,0.1432,0.05935,0.2913,1.389,2.347,23.29,0.006418,0.03961,0.07927,0.01774,0.01878,0.003696,13.9,19.69,92.12,595.6,0.09926,0.2317,0.3344,0.1017,0.1999,0.07127
-8912521,B,12.58,18.4,79.83,489,0.08393,0.04216,0.00186,0.002924,0.1697,0.05855,0.2719,1.35,1.721,22.45,0.006383,0.008008,0.00186,0.002924,0.02571,0.002015,13.5,23.08,85.56,564.1,0.1038,0.06624,0.005579,0.008772,0.2505,0.06431
-8912909,B,11.94,20.76,77.87,441,0.08605,0.1011,0.06574,0.03791,0.1588,0.06766,0.2742,1.39,3.198,21.91,0.006719,0.05156,0.04387,0.01633,0.01872,0.008015,13.24,27.29,92.2,546.1,0.1116,0.2813,0.2365,0.1155,0.2465,0.09981
-8913,B,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,0.1532,0.469,1.115,12.68,0.004731,0.01345,0.01652,0.005905,0.01619,0.002081,13.62,15.54,87.4,577,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915
-8913049,B,11.26,19.96,73.72,394.1,0.0802,0.1181,0.09274,0.05588,0.2595,0.06233,0.4866,1.905,2.877,34.68,0.01574,0.08262,0.08099,0.03487,0.03418,0.006517,11.86,22.33,78.27,437.6,0.1028,0.1843,0.1546,0.09314,0.2955,0.07009
-89143601,B,11.37,18.89,72.17,396,0.08713,0.05008,0.02399,0.02173,0.2013,0.05955,0.2656,1.974,1.954,17.49,0.006538,0.01395,0.01376,0.009924,0.03416,0.002928,12.36,26.14,79.29,459.3,0.1118,0.09708,0.07529,0.06203,0.3267,0.06994
-89143602,B,14.41,19.73,96.03,651,0.08757,0.1676,0.1362,0.06602,0.1714,0.07192,0.8811,1.77,4.36,77.11,0.007762,0.1064,0.0996,0.02771,0.04077,0.02286,15.77,22.13,101.7,767.3,0.09983,0.2472,0.222,0.1021,0.2272,0.08799
-8915,B,14.96,19.1,97.03,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,0.2877,0.948,2.171,24.87,0.005332,0.02115,0.01536,0.01187,0.01522,0.002815,16.25,26.19,109.1,809.8,0.1313,0.303,0.1804,0.1489,0.2962,0.08472
-891670,B,12.95,16.02,83.14,513.7,0.1005,0.07943,0.06155,0.0337,0.173,0.0647,0.2094,0.7636,1.231,17.67,0.008725,0.02003,0.02335,0.01132,0.02625,0.004726,13.74,19.93,88.81,585.4,0.1483,0.2068,0.2241,0.1056,0.338,0.09584
-891703,B,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,0.207,1.238,1.234,13.88,0.007595,0.015,0.01412,0.008578,0.01792,0.001784,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
-891716,B,12.72,13.78,81.78,492.1,0.09667,0.08393,0.01288,0.01924,0.1638,0.061,0.1807,0.6931,1.34,13.38,0.006064,0.0118,0.006564,0.007978,0.01374,0.001392,13.5,17.48,88.54,553.7,0.1298,0.1472,0.05233,0.06343,0.2369,0.06922
-891923,B,13.77,13.27,88.06,582.7,0.09198,0.06221,0.01063,0.01917,0.1592,0.05912,0.2191,0.6946,1.479,17.74,0.004348,0.008153,0.004272,0.006829,0.02154,0.001802,14.67,16.93,94.17,661.1,0.117,0.1072,0.03732,0.05802,0.2823,0.06794
-891936,B,10.91,12.35,69.14,363.7,0.08518,0.04721,0.01236,0.01369,0.1449,0.06031,0.1753,1.027,1.267,11.09,0.003478,0.01221,0.01072,0.009393,0.02941,0.003428,11.37,14.82,72.42,392.2,0.09312,0.07506,0.02884,0.03194,0.2143,0.06643
-892189,M,11.76,18.14,75,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,0.06287,0.645,2.105,4.138,49.11,0.005596,0.01005,0.01272,0.01432,0.01575,0.002758,13.36,23.39,85.1,553.6,0.1137,0.07974,0.0612,0.0716,0.1978,0.06915
-892214,B,14.26,18.17,91.22,633.1,0.06576,0.0522,0.02475,0.01374,0.1635,0.05586,0.23,0.669,1.661,20.56,0.003169,0.01377,0.01079,0.005243,0.01103,0.001957,16.22,25.26,105.8,819.7,0.09445,0.2167,0.1565,0.0753,0.2636,0.07676
-892399,B,10.51,23.09,66.85,334.2,0.1015,0.06797,0.02495,0.01875,0.1695,0.06556,0.2868,1.143,2.289,20.56,0.01017,0.01443,0.01861,0.0125,0.03464,0.001971,10.93,24.22,70.1,362.7,0.1143,0.08614,0.04158,0.03125,0.2227,0.06777
-892438,M,19.53,18.9,129.5,1217,0.115,0.1642,0.2197,0.1062,0.1792,0.06552,1.111,1.161,7.237,133,0.006056,0.03203,0.05638,0.01733,0.01884,0.004787,25.93,26.24,171.1,2053,0.1495,0.4116,0.6121,0.198,0.2968,0.09929
-892604,B,12.46,19.89,80.43,471.3,0.08451,0.1014,0.0683,0.03099,0.1781,0.06249,0.3642,1.04,2.579,28.32,0.00653,0.03369,0.04712,0.01403,0.0274,0.004651,13.46,23.07,88.13,551.3,0.105,0.2158,0.1904,0.07625,0.2685,0.07764
-89263202,M,20.09,23.86,134.7,1247,0.108,0.1838,0.2283,0.128,0.2249,0.07469,1.072,1.743,7.804,130.8,0.007964,0.04732,0.07649,0.01936,0.02736,0.005928,23.68,29.43,158.8,1696,0.1347,0.3391,0.4932,0.1923,0.3294,0.09469
-892657,B,10.49,18.61,66.86,334.3,0.1068,0.06678,0.02297,0.0178,0.1482,0.066,0.1485,1.563,1.035,10.08,0.008875,0.009362,0.01808,0.009199,0.01791,0.003317,11.06,24.54,70.76,375.4,0.1413,0.1044,0.08423,0.06528,0.2213,0.07842
-89296,B,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,0.06243,0.3278,1.059,2.475,22.93,0.006652,0.02652,0.02221,0.007807,0.01894,0.003411,12.68,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638
-893061,B,11.6,24.49,74.23,417.2,0.07474,0.05688,0.01974,0.01313,0.1935,0.05878,0.2512,1.786,1.961,18.21,0.006122,0.02337,0.01596,0.006998,0.03194,0.002211,12.44,31.62,81.39,476.5,0.09545,0.1361,0.07239,0.04815,0.3244,0.06745
-89344,B,13.2,15.82,84.07,537.3,0.08511,0.05251,0.001461,0.003261,0.1632,0.05894,0.1903,0.5735,1.204,15.5,0.003632,0.007861,0.001128,0.002386,0.01344,0.002585,14.41,20.45,92,636.9,0.1128,0.1346,0.0112,0.025,0.2651,0.08385
-89346,B,9,14.4,56.36,246.3,0.07005,0.03116,0.003681,0.003472,0.1788,0.06833,0.1746,1.305,1.144,9.789,0.007389,0.004883,0.003681,0.003472,0.02701,0.002153,9.699,20.07,60.9,285.5,0.09861,0.05232,0.01472,0.01389,0.2991,0.07804
-893526,B,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,0.2244,0.6864,1.509,20.39,0.003338,0.003746,0.00203,0.003242,0.0148,0.001566,14.97,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192
-893548,B,13.05,13.84,82.71,530.6,0.08352,0.03735,0.004559,0.008829,0.1453,0.05518,0.3975,0.8285,2.567,33.01,0.004148,0.004711,0.002831,0.004821,0.01422,0.002273,14.73,17.4,93.96,672.4,0.1016,0.05847,0.01824,0.03532,0.2107,0.0658
-893783,B,11.7,19.11,74.33,418.7,0.08814,0.05253,0.01583,0.01148,0.1936,0.06128,0.1601,1.43,1.109,11.28,0.006064,0.00911,0.01042,0.007638,0.02349,0.001661,12.61,26.55,80.92,483.1,0.1223,0.1087,0.07915,0.05741,0.3487,0.06958
-89382601,B,14.61,15.69,92.68,664.9,0.07618,0.03515,0.01447,0.01877,0.1632,0.05255,0.316,0.9115,1.954,28.9,0.005031,0.006021,0.005325,0.006324,0.01494,0.0008948,16.46,21.75,103.7,840.8,0.1011,0.07087,0.04746,0.05813,0.253,0.05695
-89382602,B,12.76,13.37,82.29,504.1,0.08794,0.07948,0.04052,0.02548,0.1601,0.0614,0.3265,0.6594,2.346,25.18,0.006494,0.02768,0.03137,0.01069,0.01731,0.004392,14.19,16.4,92.04,618.8,0.1194,0.2208,0.1769,0.08411,0.2564,0.08253
-893988,B,11.54,10.72,73.73,409.1,0.08597,0.05969,0.01367,0.008907,0.1833,0.061,0.1312,0.3602,1.107,9.438,0.004124,0.0134,0.01003,0.004667,0.02032,0.001952,12.34,12.87,81.23,467.8,0.1092,0.1626,0.08324,0.04715,0.339,0.07434
-894047,B,8.597,18.6,54.09,221.2,0.1074,0.05847,0,0,0.2163,0.07359,0.3368,2.777,2.222,17.81,0.02075,0.01403,0,0,0.06146,0.00682,8.952,22.44,56.65,240.1,0.1347,0.07767,0,0,0.3142,0.08116
-894089,B,12.49,16.85,79.19,481.6,0.08511,0.03834,0.004473,0.006423,0.1215,0.05673,0.1716,0.7151,1.047,12.69,0.004928,0.003012,0.00262,0.00339,0.01393,0.001344,13.34,19.71,84.48,544.2,0.1104,0.04953,0.01938,0.02784,0.1917,0.06174
-894090,B,12.18,14.08,77.25,461.4,0.07734,0.03212,0.01123,0.005051,0.1673,0.05649,0.2113,0.5996,1.438,15.82,0.005343,0.005767,0.01123,0.005051,0.01977,0.0009502,12.85,16.47,81.6,513.1,0.1001,0.05332,0.04116,0.01852,0.2293,0.06037
-894326,M,18.22,18.87,118.7,1027,0.09746,0.1117,0.113,0.0795,0.1807,0.05664,0.4041,0.5503,2.547,48.9,0.004821,0.01659,0.02408,0.01143,0.01275,0.002451,21.84,25,140.9,1485,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
-894329,B,9.042,18.9,60.07,244.5,0.09968,0.1972,0.1975,0.04908,0.233,0.08743,0.4653,1.911,3.769,24.2,0.009845,0.0659,0.1027,0.02527,0.03491,0.007877,10.06,23.4,68.62,297.1,0.1221,0.3748,0.4609,0.1145,0.3135,0.1055
-894335,B,12.43,17,78.6,477.3,0.07557,0.03454,0.01342,0.01699,0.1472,0.05561,0.3778,2.2,2.487,31.16,0.007357,0.01079,0.009959,0.0112,0.03433,0.002961,12.9,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932
-894604,B,10.25,16.18,66.52,324.2,0.1061,0.1111,0.06726,0.03965,0.1743,0.07279,0.3677,1.471,1.597,22.68,0.01049,0.04265,0.04004,0.01544,0.02719,0.007596,11.28,20.61,71.53,390.4,0.1402,0.236,0.1898,0.09744,0.2608,0.09702
-894618,M,20.16,19.66,131.1,1274,0.0802,0.08564,0.1155,0.07726,0.1928,0.05096,0.5925,0.6863,3.868,74.85,0.004536,0.01376,0.02645,0.01247,0.02193,0.001589,23.06,23.03,150.2,1657,0.1054,0.1537,0.2606,0.1425,0.3055,0.05933
-894855,B,12.86,13.32,82.82,504.8,0.1134,0.08834,0.038,0.034,0.1543,0.06476,0.2212,1.042,1.614,16.57,0.00591,0.02016,0.01902,0.01011,0.01202,0.003107,14.04,21.08,92.8,599.5,0.1547,0.2231,0.1791,0.1155,0.2382,0.08553
-895100,M,20.34,21.51,135.9,1264,0.117,0.1875,0.2565,0.1504,0.2569,0.0667,0.5702,1.023,4.012,69.06,0.005485,0.02431,0.0319,0.01369,0.02768,0.003345,25.3,31.86,171.1,1938,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024
-89511501,B,12.2,15.21,78.01,457.9,0.08673,0.06545,0.01994,0.01692,0.1638,0.06129,0.2575,0.8073,1.959,19.01,0.005403,0.01418,0.01051,0.005142,0.01333,0.002065,13.75,21.38,91.11,583.1,0.1256,0.1928,0.1167,0.05556,0.2661,0.07961
-89511502,B,12.67,17.3,81.25,489.9,0.1028,0.07664,0.03193,0.02107,0.1707,0.05984,0.21,0.9505,1.566,17.61,0.006809,0.009514,0.01329,0.006474,0.02057,0.001784,13.71,21.1,88.7,574.4,0.1384,0.1212,0.102,0.05602,0.2688,0.06888
-89524,B,14.11,12.88,90.03,616.5,0.09309,0.05306,0.01765,0.02733,0.1373,0.057,0.2571,1.081,1.558,23.92,0.006692,0.01132,0.005717,0.006627,0.01416,0.002476,15.53,18,98.4,749.9,0.1281,0.1109,0.05307,0.0589,0.21,0.07083
-895299,B,12.03,17.93,76.09,446,0.07683,0.03892,0.001546,0.005592,0.1382,0.0607,0.2335,0.9097,1.466,16.97,0.004729,0.006887,0.001184,0.003951,0.01466,0.001755,13.07,22.25,82.74,523.4,0.1013,0.0739,0.007732,0.02796,0.2171,0.07037
-8953902,M,16.27,20.71,106.9,813.7,0.1169,0.1319,0.1478,0.08488,0.1948,0.06277,0.4375,1.232,3.27,44.41,0.006697,0.02083,0.03248,0.01392,0.01536,0.002789,19.28,30.38,129.8,1121,0.159,0.2947,0.3597,0.1583,0.3103,0.082
-895633,M,16.26,21.88,107.5,826.8,0.1165,0.1283,0.1799,0.07981,0.1869,0.06532,0.5706,1.457,2.961,57.72,0.01056,0.03756,0.05839,0.01186,0.04022,0.006187,17.73,25.21,113.7,975.2,0.1426,0.2116,0.3344,0.1047,0.2736,0.07953
-896839,M,16.03,15.51,105.8,793.2,0.09491,0.1371,0.1204,0.07041,0.1782,0.05976,0.3371,0.7476,2.629,33.27,0.005839,0.03245,0.03715,0.01459,0.01467,0.003121,18.76,21.98,124.3,1070,0.1435,0.4478,0.4956,0.1981,0.3019,0.09124
-896864,B,12.98,19.35,84.52,514,0.09579,0.1125,0.07107,0.0295,0.1761,0.0654,0.2684,0.5664,2.465,20.65,0.005727,0.03255,0.04393,0.009811,0.02751,0.004572,14.42,21.95,99.21,634.3,0.1288,0.3253,0.3439,0.09858,0.3596,0.09166
-897132,B,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,0.2976,1.966,1.959,19.62,0.01289,0.01104,0.003297,0.004967,0.04243,0.001963,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
-897137,B,11.25,14.78,71.38,390,0.08306,0.04458,0.0009737,0.002941,0.1773,0.06081,0.2144,0.9961,1.529,15.07,0.005617,0.007124,0.0009737,0.002941,0.017,0.00203,12.76,22.06,82.08,492.7,0.1166,0.09794,0.005518,0.01667,0.2815,0.07418
-897374,B,12.3,19.02,77.88,464.4,0.08313,0.04202,0.007756,0.008535,0.1539,0.05945,0.184,1.532,1.199,13.24,0.007881,0.008432,0.007004,0.006522,0.01939,0.002222,13.35,28.46,84.53,544.3,0.1222,0.09052,0.03619,0.03983,0.2554,0.07207
-89742801,M,17.06,21,111.8,918.6,0.1119,0.1056,0.1508,0.09934,0.1727,0.06071,0.8161,2.129,6.076,87.17,0.006455,0.01797,0.04502,0.01744,0.01829,0.003733,20.99,33.15,143.2,1362,0.1449,0.2053,0.392,0.1827,0.2623,0.07599
-897604,B,12.99,14.23,84.08,514.3,0.09462,0.09965,0.03738,0.02098,0.1652,0.07238,0.1814,0.6412,0.9219,14.41,0.005231,0.02305,0.03113,0.007315,0.01639,0.005701,13.72,16.91,87.38,576,0.1142,0.1975,0.145,0.0585,0.2432,0.1009
-897630,M,18.77,21.43,122.9,1092,0.09116,0.1402,0.106,0.0609,0.1953,0.06083,0.6422,1.53,4.369,88.25,0.007548,0.03897,0.03914,0.01816,0.02168,0.004445,24.54,34.37,161.1,1873,0.1498,0.4827,0.4634,0.2048,0.3679,0.0987
-897880,B,10.05,17.53,64.41,310.8,0.1007,0.07326,0.02511,0.01775,0.189,0.06331,0.2619,2.015,1.778,16.85,0.007803,0.01449,0.0169,0.008043,0.021,0.002778,11.16,26.84,71.98,384,0.1402,0.1402,0.1055,0.06499,0.2894,0.07664
-89812,M,23.51,24.27,155.1,1747,0.1069,0.1283,0.2308,0.141,0.1797,0.05506,1.009,0.9245,6.462,164.1,0.006292,0.01971,0.03582,0.01301,0.01479,0.003118,30.67,30.73,202.4,2906,0.1515,0.2678,0.4819,0.2089,0.2593,0.07738
-89813,B,14.42,16.54,94.15,641.2,0.09751,0.1139,0.08007,0.04223,0.1912,0.06412,0.3491,0.7706,2.677,32.14,0.004577,0.03053,0.0384,0.01243,0.01873,0.003373,16.67,21.51,111.4,862.1,0.1294,0.3371,0.3755,0.1414,0.3053,0.08764
-898143,B,9.606,16.84,61.64,280.5,0.08481,0.09228,0.08422,0.02292,0.2036,0.07125,0.1844,0.9429,1.429,12.07,0.005954,0.03471,0.05028,0.00851,0.0175,0.004031,10.75,23.07,71.25,353.6,0.1233,0.3416,0.4341,0.0812,0.2982,0.09825
-89827,B,11.06,14.96,71.49,373.9,0.1033,0.09097,0.05397,0.03341,0.1776,0.06907,0.1601,0.8225,1.355,10.8,0.007416,0.01877,0.02758,0.0101,0.02348,0.002917,11.92,19.9,79.76,440,0.1418,0.221,0.2299,0.1075,0.3301,0.0908
-898431,M,19.68,21.68,129.9,1194,0.09797,0.1339,0.1863,0.1103,0.2082,0.05715,0.6226,2.284,5.173,67.66,0.004756,0.03368,0.04345,0.01806,0.03756,0.003288,22.75,34.66,157.6,1540,0.1218,0.3458,0.4734,0.2255,0.4045,0.07918
-89864002,B,11.71,15.45,75.03,420.3,0.115,0.07281,0.04006,0.0325,0.2009,0.06506,0.3446,0.7395,2.355,24.53,0.009536,0.01097,0.01651,0.01121,0.01953,0.0031,13.06,18.16,84.16,516.4,0.146,0.1115,0.1087,0.07864,0.2765,0.07806
-898677,B,10.26,14.71,66.2,321.6,0.09882,0.09159,0.03581,0.02037,0.1633,0.07005,0.338,2.509,2.394,19.33,0.01736,0.04671,0.02611,0.01296,0.03675,0.006758,10.88,19.48,70.89,357.1,0.136,0.1636,0.07162,0.04074,0.2434,0.08488
-898678,B,12.06,18.9,76.66,445.3,0.08386,0.05794,0.00751,0.008488,0.1555,0.06048,0.243,1.152,1.559,18.02,0.00718,0.01096,0.005832,0.005495,0.01982,0.002754,13.64,27.06,86.54,562.6,0.1289,0.1352,0.04506,0.05093,0.288,0.08083
-89869,B,14.76,14.74,94.87,668.7,0.08875,0.0778,0.04608,0.03528,0.1521,0.05912,0.3428,0.3981,2.537,29.06,0.004732,0.01506,0.01855,0.01067,0.02163,0.002783,17.27,17.93,114.2,880.8,0.122,0.2009,0.2151,0.1251,0.3109,0.08187
-898690,B,11.47,16.03,73.02,402.7,0.09076,0.05886,0.02587,0.02322,0.1634,0.06372,0.1707,0.7615,1.09,12.25,0.009191,0.008548,0.0094,0.006315,0.01755,0.003009,12.51,20.79,79.67,475.8,0.1531,0.112,0.09823,0.06548,0.2851,0.08763
-899147,B,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,0.361,1.05,2.455,26.65,0.0058,0.02417,0.007816,0.01052,0.02734,0.003114,12.81,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759
-899187,B,11.66,17.07,73.7,421,0.07561,0.0363,0.008306,0.01162,0.1671,0.05731,0.3534,0.6724,2.225,26.03,0.006583,0.006991,0.005949,0.006296,0.02216,0.002668,13.28,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
-899667,M,15.75,19.22,107.1,758.6,0.1243,0.2364,0.2914,0.1242,0.2375,0.07603,0.5204,1.324,3.477,51.22,0.009329,0.06559,0.09953,0.02283,0.05543,0.00733,17.36,24.17,119.4,915.3,0.155,0.5046,0.6872,0.2135,0.4245,0.105
-899987,M,25.73,17.46,174.2,2010,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,0.9948,0.8509,7.222,153.1,0.006369,0.04243,0.04266,0.01508,0.02335,0.003385,33.13,23.58,229.3,3234,0.153,0.5937,0.6451,0.2756,0.369,0.08815
-9010018,M,15.08,25.74,98,716.6,0.1024,0.09769,0.1235,0.06553,0.1647,0.06464,0.6534,1.506,4.174,63.37,0.01052,0.02431,0.04912,0.01746,0.0212,0.004867,18.51,33.22,121.2,1050,0.166,0.2356,0.4029,0.1526,0.2654,0.09438
-901011,B,11.14,14.07,71.24,384.6,0.07274,0.06064,0.04505,0.01471,0.169,0.06083,0.4222,0.8092,3.33,28.84,0.005541,0.03387,0.04505,0.01471,0.03102,0.004831,12.12,15.82,79.62,453.5,0.08864,0.1256,0.1201,0.03922,0.2576,0.07018
-9010258,B,12.56,19.07,81.92,485.8,0.0876,0.1038,0.103,0.04391,0.1533,0.06184,0.3602,1.478,3.212,27.49,0.009853,0.04235,0.06271,0.01966,0.02639,0.004205,13.37,22.43,89.02,547.4,0.1096,0.2002,0.2388,0.09265,0.2121,0.07188
-9010259,B,13.05,18.59,85.09,512,0.1082,0.1304,0.09603,0.05603,0.2035,0.06501,0.3106,1.51,2.59,21.57,0.007807,0.03932,0.05112,0.01876,0.0286,0.005715,14.19,24.85,94.22,591.2,0.1343,0.2658,0.2573,0.1258,0.3113,0.08317
-901028,B,13.87,16.21,88.52,593.7,0.08743,0.05492,0.01502,0.02088,0.1424,0.05883,0.2543,1.363,1.737,20.74,0.005638,0.007939,0.005254,0.006042,0.01544,0.002087,15.11,25.58,96.74,694.4,0.1153,0.1008,0.05285,0.05556,0.2362,0.07113
-9010333,B,8.878,15.49,56.74,241,0.08293,0.07698,0.04721,0.02381,0.193,0.06621,0.5381,1.2,4.277,30.18,0.01093,0.02899,0.03214,0.01506,0.02837,0.004174,9.981,17.7,65.27,302,0.1015,0.1248,0.09441,0.04762,0.2434,0.07431
-901034301,B,9.436,18.32,59.82,278.6,0.1009,0.05956,0.0271,0.01406,0.1506,0.06959,0.5079,1.247,3.267,30.48,0.006836,0.008982,0.02348,0.006565,0.01942,0.002713,12.02,25.02,75.79,439.6,0.1333,0.1049,0.1144,0.05052,0.2454,0.08136
-901034302,B,12.54,18.07,79.42,491.9,0.07436,0.0265,0.001194,0.005449,0.1528,0.05185,0.3511,0.9527,2.329,28.3,0.005783,0.004693,0.0007929,0.003617,0.02043,0.001058,13.72,20.98,86.82,585.7,0.09293,0.04327,0.003581,0.01635,0.2233,0.05521
-901041,B,13.3,21.57,85.24,546.1,0.08582,0.06373,0.03344,0.02424,0.1815,0.05696,0.2621,1.539,2.028,20.98,0.005498,0.02045,0.01795,0.006399,0.01829,0.001956,14.2,29.2,92.94,621.2,0.114,0.1667,0.1212,0.05614,0.2637,0.06658
-9010598,B,12.76,18.84,81.87,496.6,0.09676,0.07952,0.02688,0.01781,0.1759,0.06183,0.2213,1.285,1.535,17.26,0.005608,0.01646,0.01529,0.009997,0.01909,0.002133,13.75,25.99,87.82,579.7,0.1298,0.1839,0.1255,0.08312,0.2744,0.07238
-9010872,B,16.5,18.29,106.6,838.1,0.09686,0.08468,0.05862,0.04835,0.1495,0.05593,0.3389,1.439,2.344,33.58,0.007257,0.01805,0.01832,0.01033,0.01694,0.002001,18.13,25.45,117.2,1009,0.1338,0.1679,0.1663,0.09123,0.2394,0.06469
-9010877,B,13.4,16.95,85.48,552.4,0.07937,0.05696,0.02181,0.01473,0.165,0.05701,0.1584,0.6124,1.036,13.22,0.004394,0.0125,0.01451,0.005484,0.01291,0.002074,14.73,21.7,93.76,663.5,0.1213,0.1676,0.1364,0.06987,0.2741,0.07582
-901088,M,20.44,21.78,133.8,1293,0.0915,0.1131,0.09799,0.07785,0.1618,0.05557,0.5781,0.9168,4.218,72.44,0.006208,0.01906,0.02375,0.01461,0.01445,0.001906,24.31,26.37,161.2,1780,0.1327,0.2376,0.2702,0.1765,0.2609,0.06735
-9011494,M,20.2,26.83,133.7,1234,0.09905,0.1669,0.1641,0.1265,0.1875,0.0602,0.9761,1.892,7.128,103.6,0.008439,0.04674,0.05904,0.02536,0.0371,0.004286,24.19,33.81,160,1671,0.1278,0.3416,0.3703,0.2152,0.3271,0.07632
-9011495,B,12.21,18.02,78.31,458.4,0.09231,0.07175,0.04392,0.02027,0.1695,0.05916,0.2527,0.7786,1.874,18.57,0.005833,0.01388,0.02,0.007087,0.01938,0.00196,14.29,24.04,93.85,624.6,0.1368,0.217,0.2413,0.08829,0.3218,0.0747
-9011971,M,21.71,17.25,140.9,1546,0.09384,0.08562,0.1168,0.08465,0.1717,0.05054,1.207,1.051,7.733,224.1,0.005568,0.01112,0.02096,0.01197,0.01263,0.001803,30.75,26.44,199.5,3143,0.1363,0.1628,0.2861,0.182,0.251,0.06494
-9012000,M,22.01,21.9,147.2,1482,0.1063,0.1954,0.2448,0.1501,0.1824,0.0614,1.008,0.6999,7.561,130.2,0.003978,0.02821,0.03576,0.01471,0.01518,0.003796,27.66,25.8,195,2227,0.1294,0.3885,0.4756,0.2432,0.2741,0.08574
-9012315,M,16.35,23.29,109,840.4,0.09742,0.1497,0.1811,0.08773,0.2175,0.06218,0.4312,1.022,2.972,45.5,0.005635,0.03917,0.06072,0.01656,0.03197,0.004085,19.38,31.03,129.3,1165,0.1415,0.4665,0.7087,0.2248,0.4824,0.09614
-9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,0.05544,0.1783,0.4125,1.338,17.72,0.005012,0.01485,0.01551,0.009155,0.01647,0.001767,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766
-9012795,M,21.37,15.1,141.3,1386,0.1001,0.1515,0.1932,0.1255,0.1973,0.06183,0.3414,1.309,2.407,39.06,0.004426,0.02675,0.03437,0.01343,0.01675,0.004367,22.69,21.84,152.1,1535,0.1192,0.284,0.4024,0.1966,0.273,0.08666
-901288,M,20.64,17.35,134.8,1335,0.09446,0.1076,0.1527,0.08941,0.1571,0.05478,0.6137,0.6575,4.119,77.02,0.006211,0.01895,0.02681,0.01232,0.01276,0.001711,25.37,23.17,166.8,1946,0.1562,0.3055,0.4159,0.2112,0.2689,0.07055
-9013005,B,13.69,16.07,87.84,579.1,0.08302,0.06374,0.02556,0.02031,0.1872,0.05669,0.1705,0.5066,1.372,14,0.00423,0.01587,0.01169,0.006335,0.01943,0.002177,14.84,20.21,99.16,670.6,0.1105,0.2096,0.1346,0.06987,0.3323,0.07701
-901303,B,16.17,16.07,106.3,788.5,0.0988,0.1438,0.06651,0.05397,0.199,0.06572,0.1745,0.489,1.349,14.91,0.00451,0.01812,0.01951,0.01196,0.01934,0.003696,16.97,19.14,113.1,861.5,0.1235,0.255,0.2114,0.1251,0.3153,0.0896
-901315,B,10.57,20.22,70.15,338.3,0.09073,0.166,0.228,0.05941,0.2188,0.0845,0.1115,1.231,2.363,7.228,0.008499,0.07643,0.1535,0.02919,0.01617,0.0122,10.85,22.82,76.51,351.9,0.1143,0.3619,0.603,0.1465,0.2597,0.12
-9013579,B,13.46,28.21,85.89,562.1,0.07517,0.04726,0.01271,0.01117,0.1421,0.05763,0.1689,1.15,1.4,14.91,0.004942,0.01203,0.007508,0.005179,0.01442,0.001684,14.69,35.63,97.11,680.6,0.1108,0.1457,0.07934,0.05781,0.2694,0.07061
-9013594,B,13.66,15.15,88.27,580.6,0.08268,0.07548,0.04249,0.02471,0.1792,0.05897,0.1402,0.5417,1.101,11.35,0.005212,0.02984,0.02443,0.008356,0.01818,0.004868,14.54,19.64,97.96,657,0.1275,0.3104,0.2569,0.1054,0.3387,0.09638
-9013838,M,11.08,18.83,73.3,361.6,0.1216,0.2154,0.1689,0.06367,0.2196,0.0795,0.2114,1.027,1.719,13.99,0.007405,0.04549,0.04588,0.01339,0.01738,0.004435,13.24,32.82,91.76,508.1,0.2184,0.9379,0.8402,0.2524,0.4154,0.1403
-901549,B,11.27,12.96,73.16,386.3,0.1237,0.1111,0.079,0.0555,0.2018,0.06914,0.2562,0.9858,1.809,16.04,0.006635,0.01777,0.02101,0.01164,0.02108,0.003721,12.84,20.53,84.93,476.1,0.161,0.2429,0.2247,0.1318,0.3343,0.09215
-901836,B,11.04,14.93,70.67,372.7,0.07987,0.07079,0.03546,0.02074,0.2003,0.06246,0.1642,1.031,1.281,11.68,0.005296,0.01903,0.01723,0.00696,0.0188,0.001941,12.09,20.83,79.73,447.1,0.1095,0.1982,0.1553,0.06754,0.3202,0.07287
-90250,B,12.05,22.72,78.75,447.8,0.06935,0.1073,0.07943,0.02978,0.1203,0.06659,0.1194,1.434,1.778,9.549,0.005042,0.0456,0.04305,0.01667,0.0247,0.007358,12.57,28.71,87.36,488.4,0.08799,0.3214,0.2912,0.1092,0.2191,0.09349
-90251,B,12.39,17.48,80.64,462.9,0.1042,0.1297,0.05892,0.0288,0.1779,0.06588,0.2608,0.873,2.117,19.2,0.006715,0.03705,0.04757,0.01051,0.01838,0.006884,14.18,23.13,95.23,600.5,0.1427,0.3593,0.3206,0.09804,0.2819,0.1118
-902727,B,13.28,13.72,85.79,541.8,0.08363,0.08575,0.05077,0.02864,0.1617,0.05594,0.1833,0.5308,1.592,15.26,0.004271,0.02073,0.02828,0.008468,0.01461,0.002613,14.24,17.37,96.59,623.7,0.1166,0.2685,0.2866,0.09173,0.2736,0.0732
-90291,M,14.6,23.29,93.97,664.7,0.08682,0.06636,0.0839,0.05271,0.1627,0.05416,0.4157,1.627,2.914,33.01,0.008312,0.01742,0.03389,0.01576,0.0174,0.002871,15.79,31.71,102.2,758.2,0.1312,0.1581,0.2675,0.1359,0.2477,0.06836
-902975,B,12.21,14.09,78.78,462,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,0.2666,0.8309,2.097,19.96,0.004405,0.03026,0.04344,0.01087,0.01921,0.004622,13.13,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824
-902976,B,13.88,16.16,88.37,596.6,0.07026,0.04831,0.02045,0.008507,0.1607,0.05474,0.2541,0.6218,1.709,23.12,0.003728,0.01415,0.01988,0.007016,0.01647,0.00197,15.51,19.97,99.66,745.3,0.08484,0.1233,0.1091,0.04537,0.2542,0.06623
-903011,B,11.27,15.5,73.38,392,0.08365,0.1114,0.1007,0.02757,0.181,0.07252,0.3305,1.067,2.569,22.97,0.01038,0.06669,0.09472,0.02047,0.01219,0.01233,12.04,18.93,79.73,450,0.1102,0.2809,0.3021,0.08272,0.2157,0.1043
-90312,M,19.55,23.21,128.9,1174,0.101,0.1318,0.1856,0.1021,0.1989,0.05884,0.6107,2.836,5.383,70.1,0.01124,0.04097,0.07469,0.03441,0.02768,0.00624,20.82,30.44,142,1313,0.1251,0.2414,0.3829,0.1825,0.2576,0.07602
-90317302,B,10.26,12.22,65.75,321.6,0.09996,0.07542,0.01923,0.01968,0.18,0.06569,0.1911,0.5477,1.348,11.88,0.005682,0.01365,0.008496,0.006929,0.01938,0.002371,11.38,15.65,73.23,394.5,0.1343,0.165,0.08615,0.06696,0.2937,0.07722
-903483,B,8.734,16.84,55.27,234.3,0.1039,0.07428,0,0,0.1985,0.07098,0.5169,2.079,3.167,28.85,0.01582,0.01966,0,0,0.01865,0.006736,10.17,22.8,64.01,317,0.146,0.131,0,0,0.2445,0.08865
-903507,M,15.49,19.97,102.4,744.7,0.116,0.1562,0.1891,0.09113,0.1929,0.06744,0.647,1.331,4.675,66.91,0.007269,0.02928,0.04972,0.01639,0.01852,0.004232,21.2,29.41,142.1,1359,0.1681,0.3913,0.5553,0.2121,0.3187,0.1019
-903516,M,21.61,22.28,144.4,1407,0.1167,0.2087,0.281,0.1562,0.2162,0.06606,0.6242,0.9209,4.158,80.99,0.005215,0.03726,0.04718,0.01288,0.02045,0.004028,26.23,28.74,172,2081,0.1502,0.5717,0.7053,0.2422,0.3828,0.1007
-903554,B,12.1,17.72,78.07,446.2,0.1029,0.09758,0.04783,0.03326,0.1937,0.06161,0.2841,1.652,1.869,22.22,0.008146,0.01631,0.01843,0.007513,0.02015,0.001798,13.56,25.8,88.33,559.5,0.1432,0.1773,0.1603,0.06266,0.3049,0.07081
-903811,B,14.06,17.18,89.75,609.1,0.08045,0.05361,0.02681,0.03251,0.1641,0.05764,0.1504,1.685,1.237,12.67,0.005371,0.01273,0.01132,0.009155,0.01719,0.001444,14.92,25.34,96.42,684.5,0.1066,0.1231,0.0846,0.07911,0.2523,0.06609
-90401601,B,13.51,18.89,88.1,558.1,0.1059,0.1147,0.0858,0.05381,0.1806,0.06079,0.2136,1.332,1.513,19.29,0.005442,0.01957,0.03304,0.01367,0.01315,0.002464,14.8,27.2,97.33,675.2,0.1428,0.257,0.3438,0.1453,0.2666,0.07686
-90401602,B,12.8,17.46,83.05,508.3,0.08044,0.08895,0.0739,0.04083,0.1574,0.0575,0.3639,1.265,2.668,30.57,0.005421,0.03477,0.04545,0.01384,0.01869,0.004067,13.74,21.06,90.72,591,0.09534,0.1812,0.1901,0.08296,0.1988,0.07053
-904302,B,11.06,14.83,70.31,378.2,0.07741,0.04768,0.02712,0.007246,0.1535,0.06214,0.1855,0.6881,1.263,12.98,0.004259,0.01469,0.0194,0.004168,0.01191,0.003537,12.68,20.35,80.79,496.7,0.112,0.1879,0.2079,0.05556,0.259,0.09158
-904357,B,11.8,17.26,75.26,431.9,0.09087,0.06232,0.02853,0.01638,0.1847,0.06019,0.3438,1.14,2.225,25.06,0.005463,0.01964,0.02079,0.005398,0.01477,0.003071,13.45,24.49,86,562,0.1244,0.1726,0.1449,0.05356,0.2779,0.08121
-90439701,M,17.91,21.02,124.4,994,0.123,0.2576,0.3189,0.1198,0.2113,0.07115,0.403,0.7747,3.123,41.51,0.007159,0.03718,0.06165,0.01051,0.01591,0.005099,20.8,27.78,149.6,1304,0.1873,0.5917,0.9034,0.1964,0.3245,0.1198
-904647,B,11.93,10.91,76.14,442.7,0.08872,0.05242,0.02606,0.01796,0.1601,0.05541,0.2522,1.045,1.649,18.95,0.006175,0.01204,0.01376,0.005832,0.01096,0.001857,13.8,20.14,87.64,589.5,0.1374,0.1575,0.1514,0.06876,0.246,0.07262
-904689,B,12.96,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,0.1874,0.05899,0.2357,1.299,2.397,20.21,0.003629,0.03713,0.03452,0.01065,0.02632,0.003705,14.13,24.61,96.31,621.9,0.09329,0.2318,0.1604,0.06608,0.3207,0.07247
-9047,B,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,0.1458,0.905,0.9975,11.36,0.002887,0.01285,0.01613,0.007308,0.0187,0.001972,13.86,23.02,89.69,580.9,0.1172,0.1958,0.181,0.08388,0.3297,0.07834
-904969,B,12.34,14.95,78.29,469.1,0.08682,0.04571,0.02109,0.02054,0.1571,0.05708,0.3833,0.9078,2.602,30.15,0.007702,0.008491,0.01307,0.0103,0.0297,0.001432,13.18,16.85,84.11,533.1,0.1048,0.06744,0.04921,0.04793,0.2298,0.05974
-904971,B,10.94,18.59,70.39,370,0.1004,0.0746,0.04944,0.02932,0.1486,0.06615,0.3796,1.743,3.018,25.78,0.009519,0.02134,0.0199,0.01155,0.02079,0.002701,12.4,25.58,82.76,472.4,0.1363,0.1644,0.1412,0.07887,0.2251,0.07732
-905189,B,16.14,14.86,104.3,800,0.0949

<TRUNCATED>

[42/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/bank-full.csv b/community/mahout-mr/examples/src/main/resources/bank-full.csv
deleted file mode 100644
index d7a2ede..0000000
--- a/community/mahout-mr/examples/src/main/resources/bank-full.csv
+++ /dev/null
@@ -1,45212 +0,0 @@
-"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
-58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
-44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
-33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
-35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
-28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
-58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
-45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
-57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
-54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
-58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
-36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
-44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
-32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
-24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
-38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
-40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
-46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
-41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
-46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
-57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
-39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
-27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
-59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
-29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
-56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
-57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
-43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
-31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
-55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
-55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
-32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
-28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
-53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
-34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
-57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
-43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
-26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
-39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
-48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
-52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
-54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
-54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
-50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
-44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
-35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
-51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
-31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
-35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
-36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
-40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
-51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
-50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
-61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
-35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
-39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
-42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
-59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
-40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
-47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
-53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
-46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
-53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
-57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
-49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
-42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
-22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
-51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
-50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
-59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
-39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
-42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
-40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
-56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
-37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
-39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
-38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
-54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
-58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
-40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
-56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
-42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
-51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
-36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
-54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
-37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
-33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
-46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
-51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
-40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
-48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
-32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
-55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
-40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
-58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
-45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
-51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
-43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
-44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
-46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
-59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
-44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
-33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
-46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
-43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
-23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
-25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
-40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
-58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
-32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
-58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
-37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
-27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
-42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
-29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
-58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
-46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
-34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
-49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
-32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
-43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
-58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
-24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
-51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
-50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
-40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
-33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
-36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
-57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
-36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
-44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
-39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
-40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
-54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
-50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
-37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
-46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
-32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
-48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
-41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
-44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
-38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
-48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
-42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
-34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
-56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
-39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
-46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
-38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
-56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
-37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
-37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
-48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
-30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
-48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
-31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
-37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
-49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
-43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
-32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
-55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
-31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
-35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
-34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
-32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
-33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
-52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
-55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
-38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
-31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
-28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
-45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
-35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
-60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
-49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
-38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
-40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
-36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
-44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
-40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
-30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
-57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
-24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
-33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
-43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
-43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
-35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
-56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
-40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
-44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
-28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
-47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
-56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
-31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
-30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
-38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
-55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
-59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
-33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
-30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
-42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
-55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
-51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
-32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
-29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
-46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
-56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
-29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
-47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
-56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
-45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
-31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
-37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
-30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
-58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
-36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
-40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
-42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
-35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
-44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
-31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
-36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
-47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
-37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
-26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
-52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
-55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
-32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
-37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";

<TRUNCATED>

[39/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/integration/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/pom.xml b/community/mahout-mr/integration/pom.xml
index cb0c19a..8dbe599 100644
--- a/community/mahout-mr/integration/pom.xml
+++ b/community/mahout-mr/integration/pom.xml
@@ -25,7 +25,7 @@
     <groupId>org.apache.mahout</groupId>
     <artifactId>mahout</artifactId>
     <version>0.13.1-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
+    <relativePath>../mr/pom.xml</relativePath>
   </parent>
 
   <artifactId>mahout-integration</artifactId>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/README.txt b/community/mahout-mr/mr-examples/bin/README.txt
new file mode 100644
index 0000000..7ad3a38
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/README.txt
@@ -0,0 +1,13 @@
+This directory contains helpful shell scripts for working with some of Mahout's examples.  
+
+To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir`
+  Note that this requires the same path to be writable both on the local file system as well as on HDFS.
+
+Here's a description of what each does:
+
+classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups.  Downloads the data set automatically.
+cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms.  Downloads the data set automatically.
+cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set.  Downloads the data set automatically.
+factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
+factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
+spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh
new file mode 100755
index 0000000..f47d5c5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh
@@ -0,0 +1,197 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the 20newsgroups dataset, trains and tests a classifier.
+#
+# To run:  change into the mahout directory and type:
+# examples/bin/classify-20newsgroups.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
+  exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding task to run"
+  echo "1. ${algorithm[0]}"
+  echo "2. ${algorithm[1]}"
+  echo "3. ${algorithm[2]}"
+  echo "4. ${algorithm[3]}"
+  echo "5. ${algorithm[4]}"
+  echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"
+  read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+# Spark specific check and work 
+if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+  if [ "$MASTER" == "" ] ; then
+    echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
+    exit 1
+  fi
+  if [ "$MAHOUT_LOCAL" != "" ] ; then
+    echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
+    exit 1
+  fi
+fi
+
+if [ "x$alg" != "xclean" ]; then
+  echo "creating work directory at ${WORK_DIR}"
+
+  mkdir -p ${WORK_DIR}
+  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
+    if [ ! -e ${WORK_DIR}/20news-bydate ]; then
+      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
+        echo "Downloading 20news-bydate"
+        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
+      fi
+      mkdir -p ${WORK_DIR}/20news-bydate
+      echo "Extracting..."
+      cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
+    fi
+  fi
+fi
+#echo $START_PATH
+cd $START_PATH
+cd ../..
+
+set -e
+
+if  ( [ "x$alg" == "xnaivebayes-MapReduce" ] ||  [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark"  ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then
+  c=""
+
+  if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then
+    c=" -c"
+  fi
+
+  set -x
+  echo "Preparing 20newsgroups data"
+  rm -rf ${WORK_DIR}/20news-all
+  mkdir ${WORK_DIR}/20news-all
+  cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
+
+  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+    echo "Copying 20newsgroups data to HDFS"
+    set +e
+    $DFSRM ${WORK_DIR}/20news-all
+    $DFS -mkdir -p ${WORK_DIR}
+    $DFS -mkdir ${WORK_DIR}/20news-all
+    set -e
+    if [ $HVERSION -eq "1" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 1 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+    elif [ $HVERSION -eq "2" ] ; then
+      echo "Copying 20newsgroups data to Hadoop 2 HDFS"
+      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
+    fi
+  fi
+
+  echo "Creating sequence files from 20newsgroups data"
+  ./bin/mahout seqdirectory \
+    -i ${WORK_DIR}/20news-all \
+    -o ${WORK_DIR}/20news-seq -ow
+
+  echo "Converting sequence files to vectors"
+  ./bin/mahout seq2sparse \
+    -i ${WORK_DIR}/20news-seq \
+    -o ${WORK_DIR}/20news-vectors  -lnorm -nv  -wt tfidf
+
+  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+  ./bin/mahout split \
+    -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
+    --trainingOutput ${WORK_DIR}/20news-train-vectors \
+    --testOutput ${WORK_DIR}/20news-test-vectors  \
+    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
+
+    if [ "x$alg" == "xnaivebayes-MapReduce"  -o  "x$alg" == "xcnaivebayes-MapReduce" ]; then
+
+      echo "Training Naive Bayes model"
+      ./bin/mahout trainnb \
+        -i ${WORK_DIR}/20news-train-vectors \
+        -o ${WORK_DIR}/model \
+        -li ${WORK_DIR}/labelindex \
+        -ow $c
+
+      echo "Self testing on training set"
+
+      ./bin/mahout testnb \
+        -i ${WORK_DIR}/20news-train-vectors\
+        -m ${WORK_DIR}/model \
+        -l ${WORK_DIR}/labelindex \
+        -ow -o ${WORK_DIR}/20news-testing $c
+
+      echo "Testing on holdout set"
+
+      ./bin/mahout testnb \
+        -i ${WORK_DIR}/20news-test-vectors\
+        -m ${WORK_DIR}/model \
+        -l ${WORK_DIR}/labelindex \
+        -ow -o ${WORK_DIR}/20news-testing $c
+
+    elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+
+      echo "Training Naive Bayes model"
+      ./bin/mahout spark-trainnb \
+        -i ${WORK_DIR}/20news-train-vectors \
+        -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
+
+      echo "Self testing on training set"
+      ./bin/mahout spark-testnb \
+        -i ${WORK_DIR}/20news-train-vectors\
+        -m ${WORK_DIR}/spark-model $c -ma $MASTER
+
+      echo "Testing on holdout set"
+      ./bin/mahout spark-testnb \
+        -i ${WORK_DIR}/20news-test-vectors\
+        -m ${WORK_DIR}/spark-model $c -ma $MASTER
+        
+    fi
+elif [ "x$alg" == "xsgd" ]; then
+  if [ ! -e "/tmp/news-group.model" ]; then
+    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
+    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
+  fi
+  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
+  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
+elif [ "x$alg" == "xclean" ]; then
+  rm -rf $WORK_DIR
+  rm -rf /tmp/news-group.model
+  $DFSRM $WORK_DIR
+fi
+# Remove the work directory
+#

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh
new file mode 100755
index 0000000..41dc0c9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads a (partial) wikipedia dump, trains and tests a classifier.
+#
+# To run:  change into the mahout directory and type:
+# examples/bin/classify-wikipedia.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
+  exit
+fi
+
+# ensure that MAHOUT_HOME is set
+if [[ -z "$MAHOUT_HOME" ]]; then
+  echo "Please set MAHOUT_HOME."
+  exit
+fi
+
+# cd into the script's own directory so relative paths (set-dfs-commands.sh,
+# ../../bin/mahout) resolve regardless of where the user launched from.
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs ($DFS and $DFSRM are defined by this helper)
+source ${START_PATH}/set-dfs-commands.sh
+
+# Work area: honor MAHOUT_WORK_DIR when set, otherwise a fixed /tmp path.
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-wiki
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+# Task menu; a task can also be selected non-interactively via $1 (1-based).
+algorithm=( CBayes BinaryCBayes clean)
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding task to run"
+  echo "1. ${algorithm[0]} (may require increased heap space on yarn)"
+  echo "2. ${algorithm[1]}"
+  echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
+  read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+# Any task other than "clean" needs the wikipedia XML dump staged locally.
+# NOTE: this `if` is continued by the matching `elif ... xclean` near the
+# bottom of the file — the whole training/testing block below is nested in it.
+if [ "x$alg" != "xclean" ]; then
+  echo "creating work directory at ${WORK_DIR}"
+
+  mkdir -p ${WORK_DIR}
+    if [ ! -e ${WORK_DIR}/wikixml ]; then
+        mkdir -p ${WORK_DIR}/wikixml
+    fi
+    # Download the dump only once; re-run "clean" to switch datasets.
+    if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then
+        echo "Downloading wikipedia XML dump"
+        ########################################################
+        #  Datasets: uncomment and run "clean" to change dataset
+        ########################################################
+        ########## partial small 42.5M zipped
+        # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+        ########## partial larger 256M zipped
+        curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+        ######### full wikipedia dump: 10G zipped
+        # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+        ########################################################
+    fi
+    if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
+        echo "Extracting..."
+
+        cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+    fi
+
+echo $START_PATH
+
+# Fail fast from here on: any non-zero exit aborts the script.
+set -e
+
+if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
+
+  set -x
+  echo "Preparing wikipedia data"
+  rm -rf ${WORK_DIR}/wiki
+  mkdir ${WORK_DIR}/wiki
+
+  if [ "x$alg" == "xCBayes" ] ; then
+    # use a list of 10 countries as categories
+    # NOTE(review): path assumes resources live under $MAHOUT_HOME/examples —
+    # confirm this still holds after the mr-examples refactor.
+    cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt
+    chmod 666 ${WORK_DIR}/country.txt
+  fi
+
+  if [ "x$alg" == "xBinaryCBayes" ] ; then
+    # use United States and United Kingdom as categories
+    cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt
+    chmod 666 ${WORK_DIR}/country.txt
+  fi
+
+  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+    echo "Copying wikipedia data to HDFS"
+    # set +e: the rm may fail on first run (nothing to delete); that's fine.
+    set +e
+    $DFSRM ${WORK_DIR}/wikixml
+    $DFS -mkdir -p ${WORK_DIR}
+    set -e
+    $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+  fi
+
+  echo "Creating sequence files from wikiXML"
+  $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
+                                  -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
+                                  -o ${WORK_DIR}/wikipediainput
+
+  # if using the 10 class problem use bigrams
+  if [ "x$alg" == "xCBayes" ] ; then
+    echo "Converting sequence files to vectors using bigrams"
+    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+                                       -o ${WORK_DIR}/wikipediaVecs \
+                                       -wt tfidf \
+                                       -lnorm -nv \
+                                       -ow -ng 2
+  fi
+
+  # if using the 2 class problem try different options
+  if [ "x$alg" == "xBinaryCBayes" ] ; then
+    echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
+    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+                                       -o ${WORK_DIR}/wikipediaVecs \
+                                       -wt tfidf \
+                                       -lnorm \
+                                       -nv \
+                                       -ow \
+                                       -ng 1 \
+                                       -x 30
+  fi
+
+  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+  $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
+                                --trainingOutput ${WORK_DIR}/training \
+                                --testOutput ${WORK_DIR}/testing \
+                                -rp 20 \
+                                -ow \
+                                -seq \
+                                -xm sequential
+
+  echo "Training Naive Bayes model"
+  $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
+                                  -o ${WORK_DIR}/model \
+                                  -li ${WORK_DIR}/labelindex \
+                                  -ow \
+                                  -c
+
+  echo "Self testing on training set"
+  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
+                                 -m ${WORK_DIR}/model \
+                                 -l ${WORK_DIR}/labelindex \
+                                 -ow \
+                                 -o ${WORK_DIR}/output \
+                                 -c
+
+  echo "Testing on holdout set: Bayes"
+  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+                                 -m ${WORK_DIR}/model \
+                                 -l ${WORK_DIR}/labelindex \
+                                 -ow \
+                                 -o ${WORK_DIR}/output \
+                                 -seq
+
+ echo "Testing on holdout set: CBayes"
+  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+                                 -m ${WORK_DIR}/model -l \
+                                 ${WORK_DIR}/labelindex \
+                                 -ow \
+                                 -o ${WORK_DIR}/output  \
+                                 -c \
+                                 -seq
+fi
+
+# "clean" task: continuation of the top-level `if` above.
+elif [ "x$alg" == "xclean" ]; then
+  rm -rf $WORK_DIR
+  $DFSRM $WORK_DIR
+fi
+# The work directory is only removed by the "clean" task above.

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh
new file mode 100755
index 0000000..49f6c94
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh
@@ -0,0 +1,203 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Reuters dataset and prepares it for clustering
+#
+# To run:  change into the mahout directory and type:
+#  examples/bin/cluster-reuters.sh
+
+# Print usage and exit when invoked with --help / --?.
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script clusters the Reuters data set using a variety of algorithms.  The data set is downloaded automatically."
+  exit
+fi
+
+# cd into the script's own directory so relative paths resolve.
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then 
+  cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs ($DFS and $DFSRM are defined by this helper)
+source ${START_PATH}/set-dfs-commands.sh
+
+# Mahout driver, relative to this script's directory.
+MAHOUT="../../bin/mahout"
+
+if [ ! -e $MAHOUT ]; then
+  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
+  exit 1
+fi
+
+# Work area: honor MAHOUT_WORK_DIR when set, otherwise a per-user /tmp path.
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+# Algorithm menu; can also be selected non-interactively via $1 (1-based).
+algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" 
+  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
+  echo "3. ${algorithm[2]} clustering"
+  echo "4. ${algorithm[3]} clustering"
+  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
+  read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
+# "clean": wipe the local and DFS work areas and stop.
+# Fixed: a successful clean previously did `exit 1`, signalling failure to
+# callers; a completed cleanup should exit 0.
+if [ "x$clustertype" == "xclean" ]; then
+  rm -rf $WORK_DIR
+  $DFSRM $WORK_DIR
+  exit 0
+else
+  # Every clustering algorithm needs the work directory on both DFS and local disk.
+  $DFS -mkdir -p $WORK_DIR
+  mkdir -p $WORK_DIR
+  echo "Creating work directory at ${WORK_DIR}"
+fi
+# Stage the Reuters-21578 corpus: download (or copy from $2), extract,
+# convert to text with Lucene's ExtractReuters, then to Mahout sequence
+# files. Each nested check makes a re-run resume where the last one stopped.
+if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
+  if [ ! -e ${WORK_DIR}/reuters-out ]; then
+    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
+      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
+	  if [ -n "$2" ]; then
+	      echo "Copying Reuters from local download"
+	      cp $2 ${WORK_DIR}/reuters21578.tar.gz
+	  else
+              echo "Downloading Reuters-21578"
+              curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
+	  fi
+      fi
+      #make sure it was actually downloaded
+      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
+	  echo "Failed to download reuters"
+	  exit 1
+      fi
+      mkdir -p ${WORK_DIR}/reuters-sgm
+      echo "Extracting..."
+      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
+    fi
+    echo "Extracting Reuters"
+    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
+    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+        echo "Copying Reuters data to Hadoop"
+        # set +e: removals/mkdirs may fail harmlessly on a fresh cluster.
+        set +e
+        $DFSRM ${WORK_DIR}/reuters-sgm
+        $DFSRM ${WORK_DIR}/reuters-out
+        $DFS -mkdir -p ${WORK_DIR}/
+        $DFS -mkdir ${WORK_DIR}/reuters-sgm
+        $DFS -mkdir ${WORK_DIR}/reuters-out
+        $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
+        $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
+        set -e
+    fi
+  fi
+  echo "Converting to Sequence Files from Directory"
+  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
+fi
+
+# Dispatch on the chosen algorithm. Each branch is one &&-chained pipeline:
+# vectorize (seq2sparse) -> cluster -> dump/inspect results.
+if [ "x$clustertype" == "xkmeans" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
+  && \
+  $MAHOUT kmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
+    -c ${WORK_DIR}/reuters-kmeans-clusters \
+    -o ${WORK_DIR}/reuters-kmeans \
+    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
+    -x 10 -k 20 -ow --clustering \
+  && \
+  $MAHOUT clusterdump \
+    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
+    -o ${WORK_DIR}/reuters-kmeans/clusterdump \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
+    -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
+    --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
+    && \
+  cat ${WORK_DIR}/reuters-kmeans/clusterdump
+elif [ "x$clustertype" == "xfuzzykmeans" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
+  && \
+  $MAHOUT fkmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
+    -c ${WORK_DIR}/reuters-fkmeans-clusters \
+    -o ${WORK_DIR}/reuters-fkmeans \
+    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
+    -x 10 -k 20 -ow -m 1.1 \
+  && \
+  $MAHOUT clusterdump \
+    -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
+    -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
+    -dt sequencefile -b 100 -n 20 -sp 0 \
+    && \
+  cat ${WORK_DIR}/reuters-fkmeans/clusterdump
+elif [ "x$clustertype" == "xlda" ]; then
+  # LDA path: vectorize, convert to a row-id matrix, run CVB, dump topics.
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
+  && \
+  $MAHOUT rowid \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
+    -o ${WORK_DIR}/reuters-out-matrix \
+  && \
+  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
+  && \
+  $MAHOUT cvb \
+    -i ${WORK_DIR}/reuters-out-matrix/matrix \
+    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
+    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+    -dt ${WORK_DIR}/reuters-lda-topics \
+    -mt ${WORK_DIR}/reuters-lda-model \
+  && \
+  $MAHOUT vectordump \
+    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+    -o ${WORK_DIR}/reuters-lda/vectordump \
+    -vs 10 -p true \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+    -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+    && \
+  cat ${WORK_DIR}/reuters-lda/vectordump
+elif [ "x$clustertype" == "xstreamingkmeans" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
+  && \
+  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
+  && \
+  $MAHOUT streamingkmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
+    --tempDir ${WORK_DIR}/tmp \
+    -o ${WORK_DIR}/reuters-streamingkmeans \
+    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
+    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
+    -k 10 -km 100 -ow \
+  && \
+  $MAHOUT qualcluster \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
+    -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000   \
+    -o ${WORK_DIR}/reuters-cluster-distance.csv \
+    && \
+  cat ${WORK_DIR}/reuters-cluster-distance.csv
+fi

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh
new file mode 100755
index 0000000..796da33
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Synthetic control dataset and prepares it for clustering
+#
+# To run:  change into the mahout directory and type:
+#  examples/bin/cluster-syntheticcontrol.sh
+
+# Print usage and exit when invoked with --help / --?.
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script clusters the Synthetic Control data set.  The data set is downloaded automatically."
+  exit
+fi
+
+# Algorithm menu; can also be selected non-interactively via $1 (1-based).
+algorithm=( kmeans fuzzykmeans )
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding clustering algorithm"
+  echo "1. ${algorithm[0]} clustering"
+  echo "2. ${algorithm[1]} clustering"
+  read -p "Enter your choice : " choice
+fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
+# cd into the script's own directory so relative paths resolve.
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs ($DFS and $DFSRM are defined by this helper)
+source ${START_PATH}/set-dfs-commands.sh
+
+# Work area: honor MAHOUT_WORK_DIR when set, otherwise a per-user /tmp path.
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}
+# Fetch the dataset once; $2 may point at a pre-downloaded local copy.
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+  if [ -n "$2" ]; then
+    cp $2 ${WORK_DIR}/.
+  else
+    echo "Downloading Synthetic control data"
+    curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data  -o ${WORK_DIR}/synthetic_control.data
+  fi
+fi
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+  echo "Couldn't download synthetic control"
+  exit 1
+fi
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
+  echo "Checking the health of DFS..."
+  $DFS -ls /
+  if [ $? -eq 0 ];then 
+    echo "DFS is healthy... "
+    echo "Uploading Synthetic control data to HDFS"
+    $DFSRM ${WORK_DIR}/testdata
+    $DFS -mkdir -p ${WORK_DIR}/testdata
+    $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata
+    echo "Successfully Uploaded Synthetic control data to HDFS "
+
+    options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5"
+
+    if [ "${clustertype}" == "kmeans" ]; then
+      options="${options} --numClusters 6"
+      # t1 & t2 not used if --numClusters specified, but parser requires input
+      options="${options} --t1 1 --t2 2"
+      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+    else
+      options="${options} --m 2.0f --t1 80 --t2 55"
+      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+    fi
+  else
+    echo " HADOOP is not running. Please make sure you hadoop is running. "
+  fi
+elif [ "$MAHOUT_LOCAL" != "" ]; then
+  echo "running MAHOUT_LOCAL"
+  cp ${WORK_DIR}/synthetic_control.data testdata
+  ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
+  rm testdata
+else
+  echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
+fi
+# Remove the work directory
+rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh
new file mode 100755
index 0000000..29730e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Instructions:
+#
+# Before using this script, you have to download and extract the Movielens 1M dataset
+# from http://www.grouplens.org/node/73
+#
+# To run:  change into the mahout directory and type:
+#  export MAHOUT_LOCAL=true
+# Then:
+#  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
+
+# Print usage and exit when invoked with --help / --?.
+# Fixed: the syntax line used plain `echo` with a `\n` escape, printing a
+# literal backslash-n; use `echo -e` like the surrounding messages.
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)."
+  echo -e "Syntax: $0 /path/to/ratings.dat\n"
+  exit
+fi
+
+# Exactly one argument (path to ratings.dat) is required.
+# Fixed: `exit -1` is out of the 0-255 range and wraps to 255; use exit 1.
+if [ $# -ne 1 ]
+then
+  echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before"
+  echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n"
+  echo -e "Syntax: $0 /path/to/ratings.dat\n"
+  exit 1
+fi
+
+# Run everything locally (no Hadoop cluster required).
+export MAHOUT_LOCAL=true
+MAHOUT="$MAHOUT_HOME/bin/mahout"
+
+# Work area: honor MAHOUT_WORK_DIR when set, otherwise a per-user /tmp path.
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}/movielens
+
+# Convert the "user::item::rating::ts" format to "user,item,rating" CSV.
+echo "Converting ratings..."
+cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
+
+# create a 90% percent training set and a 10% probe set
+$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
+    --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
+
+# run distributed ALS-WR to factorize the rating matrix defined by the training set
+$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
+    --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
+
+# compute predictions against the probe set, measure the error
+$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
+    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
+
+# compute recommendations
+$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
+    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
+    --numRecommendations 6 --maxRating 5 --numThreads 2
+
+# print the error
+echo -e "\nRMSE is:\n"
+cat ${WORK_DIR}/als/rmse/rmse.txt
+echo -e "\n"
+
+echo -e "\nSample recommendations:\n"
+shuf ${WORK_DIR}/recommendations/part-m-00000 |head
+echo -e "\n\n"
+
+echo "removing work directory"
+rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh
new file mode 100755
index 0000000..26faf66
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Instructions:
+#
+# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the
+# following:
+#
+#   1) the path to the folder 'training_set' that contains all the movie rating files
+#   2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict
+#   3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for
+#
+# To run:
+#  ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt
+
+# Deprecation guard: the Netflix data set is no longer publicly available.
+# NOTE: everything below this `exit 1` is unreachable and kept for reference.
+echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
+exit 1
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script runs the ALS Recommender on the Netflix data set."
+  echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
+  exit
+fi
+
+if [ $# -ne 3 ]
+then
+  echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
+  exit -1
+fi
+
+MAHOUT="../../bin/mahout"
+
+# Work area: honor MAHOUT_WORK_DIR when set, otherwise a per-user /tmp path.
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+  WORK_DIR=/tmp/mahout-work-${USER}
+else
+  WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+START_PATH=`pwd`
+
+# Set commands for dfs ($DFS and $DFSRM are defined by this helper)
+source ${START_PATH}/set-dfs-commands.sh
+
+echo "Preparing data..."
+$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}
+
+# run distributed ALS-WR to factorize the rating matrix defined by the training set
+$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \
+    --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4
+
+# compute predictions against the probe set, measure the error
+$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \
+    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
+
+# Report RMSE from HDFS when running on a cluster, else from local disk.
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+
+  # print the error, should be around 0.923
+  echo -e "\nRMSE is:\n"
+  $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
+  echo -e "\n"
+  echo "removing work directory"
+  set +e
+  $DFSRM ${WORK_DIR}
+
+else
+
+  # print the error, should be around 0.923
+  echo -e "\nRMSE is:\n"
+  cat ${WORK_DIR}/als/rmse/rmse.txt
+  echo -e "\n"
+  echo "removing work directory"
+  rm -rf ${WORK_DIR}
+
+fi
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/get-all-examples.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/get-all-examples.sh b/community/mahout-mr/mr-examples/bin/get-all-examples.sh
new file mode 100755
index 0000000..4128e47
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/get-all-examples.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Clones Mahout example code from remote repositories with their own 
+# build process.  Follow the README for each example for instructions.
+#
+# Usage:  change into the mahout directory and type:
+#  examples/bin/get-all-examples.sh
+
+# Solr-recommender: describe the example, then clone its repository.
+# Fixed typos in the user-facing text: "distionaries" -> "dictionaries",
+# "and and item" -> "and an item", normalized "IS SET" casing.
+echo " Solr-recommender example: "
+echo " 1) imports text 'log files' of some delimited form for user preferences"
+echo " 2) creates the correct Mahout files and stores dictionaries to translate external Id to and from Mahout Ids"
+echo " 3) it implements a prototype two actions 'cross-recommender', which takes two actions made by the same user and creates recommendations"
+echo " 4) it creates output for user->preference history CSV and an item->similar items 'similarity' matrix for use in a Solr-recommender."
+echo "    To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result"
+echo "    from Solr will be an ordered list of recommendations returning the same item Ids as were input."
+echo " For further description see the README.md here https://github.com/pferrel/solr-recommender"
+echo " To build run 'cd solr-recommender; mvn install'"
+echo " To process the example after building make sure MAHOUT_LOCAL is set and hadoop is in local mode then "
+echo " run 'cd scripts; ./solr-recommender-example'"
+git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/lda.algorithm b/community/mahout-mr/mr-examples/bin/lda.algorithm
new file mode 100644
index 0000000..fb84ea0
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/lda.algorithm
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+merge.policy=org.apache.lucene.index.LogDocMergePolicy
+merge.factor=mrg:10:20
+max.buffered=buf:100:1000
+compound=true
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.term.vector=true
+doc.tokenized=true
+log.step=600
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source.forever=false
+doc.maker.forever=false
+query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
+
+# tasks at this depth or less will print when they start
+task.max.depth.log=2
+
+log.queries=false
+# --------- alg
+{ "BuildReuters"
+  CreateIndex 
+  { "AddDocs" AddDoc > : *
+#  Optimize
+  CloseIndex
+}
+


[34/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
new file mode 100644
index 0000000..b2ce8b1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.math.stats.GlobalOnlineAuc;
+import org.apache.mahout.math.stats.GroupedOnlineAuc;
+import org.apache.mahout.math.stats.OnlineAuc;
+
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+public class AdaptiveLogisticModelParameters extends LogisticModelParameters {
+
+  private AdaptiveLogisticRegression alr;
+  private int interval = 800;
+  private int averageWindow = 500;
+  private int threads = 4;
+  private String prior = "L1";
+  private double priorOption = Double.NaN;
+  private String auc = null;
+
+  public AdaptiveLogisticRegression createAdaptiveLogisticRegression() {
+
+    if (alr == null) {
+      alr = new AdaptiveLogisticRegression(getMaxTargetCategories(),
+                                           getNumFeatures(), createPrior(prior, priorOption));
+      alr.setInterval(interval);
+      alr.setAveragingWindow(averageWindow);
+      alr.setThreadCount(threads);
+      alr.setAucEvaluator(createAUC(auc));
+    }
+    return alr;
+  }
+
+  public void checkParameters() {
+    if (prior != null) {
+      String priorUppercase = prior.toUpperCase(Locale.ENGLISH).trim();
+      if (("TP".equals(priorUppercase) || "EBP".equals(priorUppercase)) && Double.isNaN(priorOption)) {
+        throw new IllegalArgumentException("You must specify a double value for TPrior and ElasticBandPrior.");
+      }
+    }
+  }
+
+  private static PriorFunction createPrior(String cmd, double priorOption) {
+    if (cmd == null) {
+      return null;
+    }
+    if ("L1".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
+      return new L1();
+    }
+    if ("L2".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
+      return new L2();
+    }
+    if ("UP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
+      return new UniformPrior();
+    }
+    if ("TP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
+      return new TPrior(priorOption);
+    }
+    if ("EBP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
+      return new ElasticBandPrior(priorOption);
+    }
+
+    return null;
+  }
+
+  private static OnlineAuc createAUC(String cmd) {
+    if (cmd == null) {
+      return null;
+    }
+    if ("GLOBAL".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
+      return new GlobalOnlineAuc();
+    }
+    if ("GROUPED".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
+      return new GroupedOnlineAuc();
+    }
+    return null;
+  }
+
+  @Override
+  public void saveTo(OutputStream out) throws IOException {
+    if (alr != null) {
+      alr.close();
+    }
+    setTargetCategories(getCsvRecordFactory().getTargetCategories());
+    write(new DataOutputStream(out));
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeUTF(getTargetVariable());
+    out.writeInt(getTypeMap().size());
+    for (Map.Entry<String, String> entry : getTypeMap().entrySet()) {
+      out.writeUTF(entry.getKey());
+      out.writeUTF(entry.getValue());
+    }
+    out.writeInt(getNumFeatures());
+    out.writeInt(getMaxTargetCategories());
+    out.writeInt(getTargetCategories().size());
+    for (String category : getTargetCategories()) {
+      out.writeUTF(category);
+    }
+
+    out.writeInt(interval);
+    out.writeInt(averageWindow);
+    out.writeInt(threads);
+    out.writeUTF(prior);
+    out.writeDouble(priorOption);
+    out.writeUTF(auc);
+
+    // skip csv
+    alr.write(out);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    setTargetVariable(in.readUTF());
+    int typeMapSize = in.readInt();
+    Map<String, String> typeMap = new HashMap<>(typeMapSize);
+    for (int i = 0; i < typeMapSize; i++) {
+      String key = in.readUTF();
+      String value = in.readUTF();
+      typeMap.put(key, value);
+    }
+    setTypeMap(typeMap);
+
+    setNumFeatures(in.readInt());
+    setMaxTargetCategories(in.readInt());
+    int targetCategoriesSize = in.readInt();
+    List<String> targetCategories = new ArrayList<>(targetCategoriesSize);
+    for (int i = 0; i < targetCategoriesSize; i++) {
+      targetCategories.add(in.readUTF());
+    }
+    setTargetCategories(targetCategories);
+
+    interval = in.readInt();
+    averageWindow = in.readInt();
+    threads = in.readInt();
+    prior = in.readUTF();
+    priorOption = in.readDouble();
+    auc = in.readUTF();
+
+    alr = new AdaptiveLogisticRegression();
+    alr.readFields(in);
+  }
+
+
+  private static AdaptiveLogisticModelParameters loadFromStream(InputStream in) throws IOException {
+    AdaptiveLogisticModelParameters result = new AdaptiveLogisticModelParameters();
+    result.readFields(new DataInputStream(in));
+    return result;
+  }
+
+  public static AdaptiveLogisticModelParameters loadFromFile(File in) throws IOException {
+    try (InputStream input = new FileInputStream(in)) {
+      return loadFromStream(input);
+    }
+  }
+
+  public int getInterval() {
+    return interval;
+  }
+
+  public void setInterval(int interval) {
+    this.interval = interval;
+  }
+
+  public int getAverageWindow() {
+    return averageWindow;
+  }
+
+  public void setAverageWindow(int averageWindow) {
+    this.averageWindow = averageWindow;
+  }
+
+  public int getThreads() {
+    return threads;
+  }
+
+  public void setThreads(int threads) {
+    this.threads = threads;
+  }
+
+  public String getPrior() {
+    return prior;
+  }
+
+  public void setPrior(String prior) {
+    this.prior = prior;
+  }
+
+  public String getAuc() {
+    return auc;
+  }
+
+  public void setAuc(String auc) {
+    this.auc = auc;
+  }
+
+  public double getPriorOption() {
+    return priorOption;
+  }
+
+  public void setPriorOption(double priorOption) {
+    this.priorOption = priorOption;
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
new file mode 100644
index 0000000..e762924
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Encapsulates everything we need to know about a model and how it reads and vectorizes its input.
+ * This encapsulation allows us to coherently save and restore a model from a file.  This also
+ * allows us to keep command line arguments that affect learning in a coherent way.
+ */
public class LogisticModelParameters implements Writable {
  // Name of the target (label) column in the CSV input.
  private String targetVariable;
  // Maps predictor variable name -> type specifier understood by CsvRecordFactory.
  private Map<String, String> typeMap;
  private int numFeatures;
  // Whether the model includes a constant bias term.
  private boolean useBias;
  private int maxTargetCategories;
  private List<String> targetCategories;
  // Regularization strength and SGD learning rate for createRegression().
  private double lambda;
  private double learningRate;
  // Both caches are built lazily and intentionally excluded from serialization.
  private CsvRecordFactory csv;
  private OnlineLogisticRegression lr;

  /**
   * Returns a CsvRecordFactory compatible with this logistic model.  The reason that this is tied
   * in here is so that we have access to the list of target categories when it comes time to save
   * the model.  If the input isn't CSV, then calling setTargetCategories before calling saveTo will
   * suffice.
   *
   * @return The CsvRecordFactory.
   */
  public CsvRecordFactory getCsvRecordFactory() {
    if (csv == null) {
      csv = new CsvRecordFactory(getTargetVariable(), getTypeMap())
              .maxTargetValue(getMaxTargetCategories())
              .includeBiasTerm(useBias());
      if (targetCategories != null) {
        csv.defineTargetCategories(targetCategories);
      }
    }
    return csv;
  }

  /**
   * Creates a logistic regression trainer using the parameters collected here.
   * The learner is cached, so repeated calls return the same instance.
   *
   * @return The newly allocated OnlineLogisticRegression object
   */
  public OnlineLogisticRegression createRegression() {
    if (lr == null) {
      lr = new OnlineLogisticRegression(getMaxTargetCategories(), getNumFeatures(), new L1())
              .lambda(getLambda())
              .learningRate(getLearningRate())
              .alpha(1 - 1.0e-3);
    }
    return lr;
  }

  /**
   * Saves a model to an output stream.
   * NOTE(review): write() dereferences lr, so this appears to require that
   * createRegression() was called beforehand — confirm with callers.
   */
  public void saveTo(OutputStream out) throws IOException {
    // Flush any in-progress learner state before serializing; null lr is tolerated.
    Closeables.close(lr, false);
    targetCategories = getCsvRecordFactory().getTargetCategories();
    write(new DataOutputStream(out));
  }

  /**
   * Reads a model from a stream.
   */
  public static LogisticModelParameters loadFrom(InputStream in) throws IOException {
    LogisticModelParameters result = new LogisticModelParameters();
    result.readFields(new DataInputStream(in));
    return result;
  }

  /**
   * Reads a model from a file.
   * @throws IOException If there is an error opening or closing the file.
   */
  public static LogisticModelParameters loadFrom(File in) throws IOException {
    try (InputStream input = new FileInputStream(in)) {
      return loadFrom(input);
    }
  }


  // Serialization: field order here must stay in lock step with readFields().
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeUTF(targetVariable);
    out.writeInt(typeMap.size());
    for (Map.Entry<String,String> entry : typeMap.entrySet()) {
      out.writeUTF(entry.getKey());
      out.writeUTF(entry.getValue());
    }
    out.writeInt(numFeatures);
    out.writeBoolean(useBias);
    out.writeInt(maxTargetCategories);

    // A missing category list is encoded as a zero-length list.
    if (targetCategories == null) {
      out.writeInt(0);
    } else {
      out.writeInt(targetCategories.size());
      for (String category : targetCategories) {
        out.writeUTF(category);
      }
    }
    out.writeDouble(lambda);
    out.writeDouble(learningRate);
    // skip csv
    lr.write(out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    targetVariable = in.readUTF();
    int typeMapSize = in.readInt();
    typeMap = new HashMap<>(typeMapSize);
    for (int i = 0; i < typeMapSize; i++) {
      String key = in.readUTF();
      String value = in.readUTF();
      typeMap.put(key, value);
    }
    numFeatures = in.readInt();
    useBias = in.readBoolean();
    maxTargetCategories = in.readInt();
    int targetCategoriesSize = in.readInt();
    targetCategories = new ArrayList<>(targetCategoriesSize);
    for (int i = 0; i < targetCategoriesSize; i++) {
      targetCategories.add(in.readUTF());
    }
    lambda = in.readDouble();
    learningRate = in.readDouble();
    // Invalidate the CSV cache; it is rebuilt on demand from the fields just read.
    csv = null;
    lr = new OnlineLogisticRegression();
    lr.readFields(in);
  }

  /**
   * Sets the types of the predictors.  This will later be used when reading CSV data.  If you don't
   * use the CSV data and convert to vectors on your own, you don't need to call this.
   *
   * @param predictorList The list of variable names.
   * @param typeList      The list of types in the format preferred by CsvRecordFactory.
   *                      May be shorter than predictorList; the last type is repeated.
   */
  public void setTypeMap(Iterable<String> predictorList, List<String> typeList) {
    Preconditions.checkArgument(!typeList.isEmpty(), "Must have at least one type specifier");
    typeMap = new HashMap<>();
    Iterator<String> iTypes = typeList.iterator();
    String lastType = null;
    for (Object x : predictorList) {
      // type list can be short .. we just repeat last spec
      if (iTypes.hasNext()) {
        lastType = iTypes.next();
      }
      typeMap.put(x.toString(), lastType);
    }
  }

  /**
   * Sets the target variable.  If you don't use the CSV record factory, then this is irrelevant.
   *
   * @param targetVariable The name of the target variable.
   */
  public void setTargetVariable(String targetVariable) {
    this.targetVariable = targetVariable;
  }

  /**
   * Sets the number of target categories to be considered.
   *
   * @param maxTargetCategories The number of target categories.
   */
  public void setMaxTargetCategories(int maxTargetCategories) {
    this.maxTargetCategories = maxTargetCategories;
  }

  public void setNumFeatures(int numFeatures) {
    this.numFeatures = numFeatures;
  }

  // Also resets maxTargetCategories to match the supplied list's size.
  public void setTargetCategories(List<String> targetCategories) {
    this.targetCategories = targetCategories;
    maxTargetCategories = targetCategories.size();
  }

  public List<String> getTargetCategories() {
    return this.targetCategories;
  }

  public void setUseBias(boolean useBias) {
    this.useBias = useBias;
  }

  public boolean useBias() {
    return useBias;
  }

  public String getTargetVariable() {
    return targetVariable;
  }

  public Map<String, String> getTypeMap() {
    return typeMap;
  }

  public void setTypeMap(Map<String, String> map) {
    this.typeMap = map;
  }

  public int getNumFeatures() {
    return numFeatures;
  }

  public int getMaxTargetCategories() {
    return maxTargetCategories;
  }

  public double getLambda() {
    return lambda;
  }

  public void setLambda(double lambda) {
    this.lambda = lambda;
  }

  public double getLearningRate() {
    return learningRate;
  }

  public void setLearningRate(double learningRate) {
    this.learningRate = learningRate;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
new file mode 100644
index 0000000..3ec6a06
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Preconditions;
+
+import java.io.BufferedReader;
+
+/**
+ * Uses the same logic as TrainLogistic and RunLogistic for finding an input, but instead
+ * of processing the input, this class just prints the input to standard out.
+ */
+public final class PrintResourceOrFile {
+
+  private PrintResourceOrFile() {
+  }
+
+  public static void main(String[] args) throws Exception {
+    Preconditions.checkArgument(args.length == 1, "Must have a single argument that names a file or resource.");
+    try (BufferedReader in = TrainLogistic.open(args[0])){
+      String line;
+      while ((line = in.readLine()) != null) {
+        System.out.println(line);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
new file mode 100644
index 0000000..678a8f5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.HashMap;
+import java.util.Map;
+
+public final class RunAdaptiveLogistic {
+
+  private static String inputFile;
+  private static String modelFile;
+  private static String outputFile;
+  private static String idColumn;
+  private static boolean maxScoreOnly;
+
+  private RunAdaptiveLogistic() {
+  }
+
+  public static void main(String[] args) throws Exception {
+    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+  }
+
+  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+    if (!parseArgs(args)) {
+      return;
+    }
+    AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
+        .loadFromFile(new File(modelFile));
+
+    CsvRecordFactory csv = lmp.getCsvRecordFactory();
+    csv.setIdName(idColumn);
+
+    AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
+
+    State<Wrapper, CrossFoldLearner> best = lr.getBest();
+    if (best == null) {
+      output.println("AdaptiveLogisticRegression has not be trained probably.");
+      return;
+    }
+    CrossFoldLearner learner = best.getPayload().getLearner();
+
+    BufferedReader in = TrainAdaptiveLogistic.open(inputFile);
+    int k = 0;
+
+    try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile),
+        Charsets.UTF_8))) {
+      out.write(idColumn + ",target,score");
+      out.newLine();
+
+      String line = in.readLine();
+      csv.firstLine(line);
+      line = in.readLine();
+      Map<String, Double> results = new HashMap<>();
+      while (line != null) {
+        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
+        csv.processLine(line, v, false);
+        Vector scores = learner.classifyFull(v);
+        results.clear();
+        if (maxScoreOnly) {
+          results.put(csv.getTargetLabel(scores.maxValueIndex()),
+              scores.maxValue());
+        } else {
+          for (int i = 0; i < scores.size(); i++) {
+            results.put(csv.getTargetLabel(i), scores.get(i));
+          }
+        }
+
+        for (Map.Entry<String, Double> entry : results.entrySet()) {
+          out.write(csv.getIdString(line) + ',' + entry.getKey() + ',' + entry.getValue());
+          out.newLine();
+        }
+        k++;
+        if (k % 100 == 0) {
+          output.println(k + " records processed");
+        }
+        line = in.readLine();
+      }
+      out.flush();
+    }
+    output.println(k + " records processed totally.");
+  }
+
+  private static boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help")
+      .withDescription("print this list").create();
+
+    Option quiet = builder.withLongName("quiet")
+      .withDescription("be extra quiet").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder
+      .withLongName("input")
+      .withRequired(true)
+      .withArgument(
+          argumentBuilder.withName("input").withMaximum(1)
+            .create())
+      .withDescription("where to get training data").create();
+
+    Option modelFileOption = builder
+      .withLongName("model")
+      .withRequired(true)
+      .withArgument(
+          argumentBuilder.withName("model").withMaximum(1)
+            .create())
+      .withDescription("where to get the trained model").create();
+    
+    Option outputFileOption = builder
+      .withLongName("output")
+      .withRequired(true)
+      .withDescription("the file path to output scores")
+      .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+      .create();
+    
+    Option idColumnOption = builder
+      .withLongName("idcolumn")
+      .withRequired(true)
+      .withDescription("the name of the id column for each record")
+      .withArgument(argumentBuilder.withName("idcolumn").withMaximum(1).create())
+      .create();
+    
+    Option maxScoreOnlyOption = builder
+      .withLongName("maxscoreonly")
+      .withDescription("only output the target label with max scores")
+      .create();
+
+    Group normalArgs = new GroupBuilder()
+      .withOption(help).withOption(quiet)
+      .withOption(inputFileOption).withOption(modelFileOption)
+      .withOption(outputFileOption).withOption(idColumnOption)
+      .withOption(maxScoreOnlyOption)
+      .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = getStringArgument(cmdLine, inputFileOption);
+    modelFile = getStringArgument(cmdLine, modelFileOption);
+    outputFile = getStringArgument(cmdLine, outputFileOption);
+    idColumn = getStringArgument(cmdLine, idColumnOption);
+    maxScoreOnly = getBooleanArgument(cmdLine, maxScoreOnlyOption);    
+    return true;
+  }
+
+  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+    return cmdLine.hasOption(option);
+  }
+
+  private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
+    return (String) cmdLine.getValue(inputFile);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
new file mode 100644
index 0000000..2d57016
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.evaluation.Auc;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Locale;
+
+public final class RunLogistic {
+
+  private static String inputFile;
+  private static String modelFile;
+  private static boolean showAuc;
+  private static boolean showScores;
+  private static boolean showConfusion;
+
+  private RunLogistic() {
+  }
+
+  public static void main(String[] args) throws Exception {
+    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+  }
+
+  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+    if (parseArgs(args)) {
+      if (!showAuc && !showConfusion && !showScores) {
+        showAuc = true;
+        showConfusion = true;
+      }
+
+      Auc collector = new Auc();
+      LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile));
+
+      CsvRecordFactory csv = lmp.getCsvRecordFactory();
+      OnlineLogisticRegression lr = lmp.createRegression();
+      BufferedReader in = TrainLogistic.open(inputFile);
+      String line = in.readLine();
+      csv.firstLine(line);
+      line = in.readLine();
+      if (showScores) {
+        output.println("\"target\",\"model-output\",\"log-likelihood\"");
+      }
+      while (line != null) {
+        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
+        int target = csv.processLine(line, v);
+
+        double score = lr.classifyScalar(v);
+        if (showScores) {
+          output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v));
+        }
+        collector.add(target, score);
+        line = in.readLine();
+      }
+
+      if (showAuc) {
+        output.printf(Locale.ENGLISH, "AUC = %.2f%n", collector.auc());
+      }
+      if (showConfusion) {
+        Matrix m = collector.confusion();
+        output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n",
+          m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
+        m = collector.entropy();
+        output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n",
+          m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
+      }
+    }
+  }
+
+  private static boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
+
+    Option auc = builder.withLongName("auc").withDescription("print AUC").create();
+    Option confusion = builder.withLongName("confusion").withDescription("print confusion matrix").create();
+
+    Option scores = builder.withLongName("scores").withDescription("print scores").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+            .withDescription("where to get training data")
+            .create();
+
+    Option modelFileOption = builder.withLongName("model")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
+            .withDescription("where to get a model")
+            .create();
+
+    Group normalArgs = new GroupBuilder()
+            .withOption(help)
+            .withOption(quiet)
+            .withOption(auc)
+            .withOption(scores)
+            .withOption(confusion)
+            .withOption(inputFileOption)
+            .withOption(modelFileOption)
+            .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = getStringArgument(cmdLine, inputFileOption);
+    modelFile = getStringArgument(cmdLine, modelFileOption);
+    showAuc = getBooleanArgument(cmdLine, auc);
+    showScores = getBooleanArgument(cmdLine, scores);
+    showConfusion = getBooleanArgument(cmdLine, confusion);
+
+    return true;
+  }
+
+  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+    return cmdLine.hasOption(option);
+  }
+
+  private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
+    return (String) cmdLine.getValue(inputFile);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
new file mode 100644
index 0000000..c657803
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.Multiset;
+import org.apache.mahout.classifier.NewsgroupHelper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeMap;
+
/**
 * Shared helper routines for the SGD newsgroup example programs: model
 * dissection, random permutation of training files, and periodic reporting
 * of training state.
 */
public final class SGDHelper {

  // Printable labels for the three "leak" regimes; indexed by leakType % 3.
  private static final String[] LEAK_LABELS = {"none", "month-year", "day-month-year"};

  private SGDHelper() {
  }

  /**
   * Prints the most heavily weighted features of the best learner found so far,
   * using feature traces accumulated from a 500-file random sample.
   *
   * @param leakType          forwarded to the feature encoder; selects leaked date features
   * @param dictionary        maps newsgroup names to label indexes
   * @param learningAlgorithm source of the best learner to dissect
   * @param files             candidate files; a random 500-element subset is used
   * @param overallCounts     feature counts accumulated during encoding
   */
  public static void dissect(int leakType,
                             Dictionary dictionary,
                             AdaptiveLogisticRegression learningAlgorithm,
                             Iterable<File> files, Multiset<String> overallCounts) throws IOException {
    CrossFoldLearner model = learningAlgorithm.getBest().getPayload().getLearner();
    // close() flushes pending updates so the model is stable for inspection
    model.close();

    Map<String, Set<Integer>> traceDictionary = new TreeMap<>();
    ModelDissector md = new ModelDissector();

    NewsgroupHelper helper = new NewsgroupHelper();
    helper.getEncoder().setTraceDictionary(traceDictionary);
    helper.getBias().setTraceDictionary(traceDictionary);

    // Encode a 500-file sample, recording which vector slots each feature touched.
    for (File file : permute(files, helper.getRandom()).subList(0, 500)) {
      String ng = file.getParentFile().getName();   // parent directory names the newsgroup
      int actual = dictionary.intern(ng);

      traceDictionary.clear();   // traces are per-document
      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
      md.update(v, traceDictionary, model);
    }

    List<String> ngNames = new ArrayList<>(dictionary.values());
    List<ModelDissector.Weight> weights = md.summary(100);   // top-100 features
    System.out.println("============");
    System.out.println("Model Dissection");
    // NOTE(review): the specifier/argument pairing below looks suspicious --
    // "%.1f" receives getCategory(n) while "%s" receives getWeight(n).  Confirm
    // the return types of ModelDissector.Weight's accessors before relying on it.
    for (ModelDissector.Weight w : weights) {
      System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s%n",
                        w.getFeature(), w.getWeight(), ngNames.get(w.getMaxImpact() + 1),
                        w.getCategory(1), w.getWeight(1), w.getCategory(2), w.getWeight(2));
    }
  }

  /**
   * Returns a random permutation of {@code files}, built by inserting each
   * element at a uniformly random position of the partial result
   * (inside-out Fisher-Yates style).
   */
  public static List<File> permute(Iterable<File> files, Random rand) {
    List<File> r = new ArrayList<>();
    for (File file : files) {
      int i = rand.nextInt(r.size() + 1);
      if (i == r.size()) {
        r.add(file);
      } else {
        // displace the element at slot i to the end, then claim slot i
        r.add(r.get(i));
        r.set(i, file);
      }
    }
    return r;
  }

  /**
   * Periodically reports training progress and checkpoints the current best
   * model.  Reporting frequency follows a 1-2-5 decade schedule driven by
   * {@code info}'s step counter and bump table.
   *
   * @param info     mutable bookkeeping state (step counter, running averages)
   * @param leakType selects which LEAK_LABELS entry is printed
   * @param k        number of training examples seen so far
   * @param best     best learner found so far; may be null early in training
   */
  static void analyzeState(SGDInfo info, int leakType, int k, State<AdaptiveLogisticRegression.Wrapper,
      CrossFoldLearner> best) throws IOException {
    // 1-2-5 cadence: e.g. report at 1, 2, 5, 10, 20, 50, 100, ...
    int bump = info.getBumps()[(int) Math.floor(info.getStep()) % info.getBumps().length];
    int scale = (int) Math.pow(10, Math.floor(info.getStep() / info.getBumps().length));
    double maxBeta;
    double nonZeros;
    double positive;
    double norm;

    double lambda = 0;
    double mu = 0;

    if (best != null) {
      CrossFoldLearner state = best.getPayload().getLearner();
      info.setAverageCorrect(state.percentCorrect());
      info.setAverageLL(state.logLikelihood());

      OnlineLogisticRegression model = state.getModels().get(0);
      // finish off pending regularization
      model.close();

      Matrix beta = model.getBeta();
      maxBeta = beta.aggregate(Functions.MAX, Functions.ABS);
      // count coefficients that are effectively non-zero
      nonZeros = beta.aggregate(Functions.PLUS, new DoubleFunction() {
        @Override
        public double apply(double v) {
          return Math.abs(v) > 1.0e-6 ? 1 : 0;
        }
      });
      // count strictly positive coefficients
      positive = beta.aggregate(Functions.PLUS, new DoubleFunction() {
        @Override
        public double apply(double v) {
          return v > 0 ? 1 : 0;
        }
      });
      // L1 norm of the coefficient matrix
      norm = beta.aggregate(Functions.PLUS, Functions.ABS);

      lambda = best.getMappedParams()[0];
      mu = best.getMappedParams()[1];
    } else {
      maxBeta = 0;
      nonZeros = 0;
      positive = 0;
      norm = 0;
    }
    if (k % (bump * scale) == 0) {
      if (best != null) {
        // checkpoint the current best model into the system temp directory
        File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group-" + k + ".model");
        ModelSerializer.writeBinary(modelFile.getAbsolutePath(), best.getPayload().getLearner().getModels().get(0));
      }

      info.setStep(info.getStep() + 0.25);
      System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\t%.8g\t%.8g\t", maxBeta, nonZeros, positive, norm, lambda, mu);
      System.out.printf("%d\t%.3f\t%.2f\t%s%n",
        k, info.getAverageLL(), info.getAverageCorrect() * 100, LEAK_LABELS[leakType % 3]);
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
new file mode 100644
index 0000000..be55d43
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
/**
 * Mutable holder for the bookkeeping values used while reporting SGD
 * training progress: running quality estimates plus the step counter and
 * bump table that control how often progress lines are emitted.
 *
 * <p>Not thread-safe; intended for use by a single training loop.</p>
 */
final class SGDInfo {

  private double averageLL;        // running average log-likelihood
  private double averageCorrect;   // running fraction classified correctly
  private double step;             // reporting step counter
  private int[] bumps = {1, 2, 5}; // 1-2-5 reporting cadence

  /** Returns the running average log-likelihood. */
  double getAverageLL() {
    return this.averageLL;
  }

  /** Records a new running average log-likelihood. */
  void setAverageLL(double value) {
    this.averageLL = value;
  }

  /** Returns the running fraction of correct classifications. */
  double getAverageCorrect() {
    return this.averageCorrect;
  }

  /** Records a new running fraction of correct classifications. */
  void setAverageCorrect(double value) {
    this.averageCorrect = value;
  }

  /** Returns the current reporting step counter. */
  double getStep() {
    return this.step;
  }

  /** Updates the reporting step counter. */
  void setStep(double value) {
    this.step = value;
  }

  /** Returns the bump table (the internal array, not a copy). */
  int[] getBumps() {
    return this.bumps;
  }

  /** Replaces the bump table. */
  void setBumps(int[] value) {
    this.bumps = value;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
new file mode 100644
index 0000000..b3da452
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.list.IntArrayList;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Shows how different encoding choices can make big speed differences.
+ * <p/>
+ * Run with command line options --generate 1000000 test.csv to generate a million data lines in
+ * test.csv.
+ * <p/>
+ * Run with command line options --parser test.csv to time how long it takes to parse and encode
+ * those million data points
+ * <p/>
+ * Run with command line options --fast test.csv to time how long it takes to parse and encode those
+ * million data points using byte-level parsing and direct value encoding.
+ * <p/>
+ * This doesn't demonstrate text encoding which is subject to somewhat different tricks.  The basic
+ * idea of caching hash locations and byte level parsing still very much applies to text, however.
+ */
+public final class SimpleCsvExamples {
+
+  public static final char SEPARATOR_CHAR = '\t';
+  private static final int FIELDS = 100;
+
+  private static final Logger log = LoggerFactory.getLogger(SimpleCsvExamples.class);
+
+  private SimpleCsvExamples() {}
+
+  public static void main(String[] args) throws IOException {
+    FeatureVectorEncoder[] encoder = new FeatureVectorEncoder[FIELDS];
+    for (int i = 0; i < FIELDS; i++) {
+      encoder[i] = new ConstantValueEncoder("v" + 1);
+    }
+
+    OnlineSummarizer[] s = new OnlineSummarizer[FIELDS];
+    for (int i = 0; i < FIELDS; i++) {
+      s[i] = new OnlineSummarizer();
+    }
+    long t0 = System.currentTimeMillis();
+    Vector v = new DenseVector(1000);
+    if ("--generate".equals(args[0])) {
+      try (PrintWriter out =
+               new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File(args[2])), Charsets.UTF_8))) {
+        int n = Integer.parseInt(args[1]);
+        for (int i = 0; i < n; i++) {
+          Line x = Line.generate();
+          out.println(x);
+        }
+      }
+    } else if ("--parse".equals(args[0])) {
+      try (BufferedReader in = Files.newReader(new File(args[1]), Charsets.UTF_8)){
+        String line = in.readLine();
+        while (line != null) {
+          v.assign(0);
+          Line x = new Line(line);
+          for (int i = 0; i < FIELDS; i++) {
+            s[i].add(x.getDouble(i));
+            encoder[i].addToVector(x.get(i), v);
+          }
+          line = in.readLine();
+        }
+      }
+      String separator = "";
+      for (int i = 0; i < FIELDS; i++) {
+        System.out.printf("%s%.3f", separator, s[i].getMean());
+        separator = ",";
+      }
+    } else if ("--fast".equals(args[0])) {
+      try (FastLineReader in = new FastLineReader(new FileInputStream(args[1]))){
+        FastLine line = in.read();
+        while (line != null) {
+          v.assign(0);
+          for (int i = 0; i < FIELDS; i++) {
+            double z = line.getDouble(i);
+            s[i].add(z);
+            encoder[i].addToVector((byte[]) null, z, v);
+          }
+          line = in.read();
+        }
+      }
+
+      String separator = "";
+      for (int i = 0; i < FIELDS; i++) {
+        System.out.printf("%s%.3f", separator, s[i].getMean());
+        separator = ",";
+      }
+    }
+    System.out.printf("\nElapsed time = %.3f%n", (System.currentTimeMillis() - t0) / 1000.0);
+  }
+
+
+  private static final class Line {
+    private static final Splitter ON_TABS = Splitter.on(SEPARATOR_CHAR).trimResults();
+    public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR_CHAR);
+
+    public static final Random RAND = RandomUtils.getRandom();
+
+    private final List<String> data;
+
+    private Line(CharSequence line) {
+      data = Lists.newArrayList(ON_TABS.split(line));
+    }
+
+    private Line() {
+      data = new ArrayList<>();
+    }
+
+    public double getDouble(int field) {
+      return Double.parseDouble(data.get(field));
+    }
+
+    /**
+     * Generate a random line with 20 fields each with integer values.
+     *
+     * @return A new line with data.
+     */
+    public static Line generate() {
+      Line r = new Line();
+      for (int i = 0; i < FIELDS; i++) {
+        double mean = ((i + 1) * 257) % 50 + 1;
+        r.data.add(Integer.toString(randomValue(mean)));
+      }
+      return r;
+    }
+
+    /**
+     * Returns a random exponentially distributed integer with a particular mean value.  This is
+     * just a way to create more small numbers than big numbers.
+     *
+     * @param mean mean of the distribution
+     * @return random exponentially distributed integer with the specific mean
+     */
+    private static int randomValue(double mean) {
+      return (int) (-mean * Math.log1p(-RAND.nextDouble()));
+    }
+
+    @Override
+    public String toString() {
+      return WITH_COMMAS.join(data);
+    }
+
+    public String get(int field) {
+      return data.get(field);
+    }
+  }
+
+  private static final class FastLine {
+
+    private final ByteBuffer base;
+    private final IntArrayList start = new IntArrayList();
+    private final IntArrayList length = new IntArrayList();
+
+    private FastLine(ByteBuffer base) {
+      this.base = base;
+    }
+
+    public static FastLine read(ByteBuffer buf) {
+      FastLine r = new FastLine(buf);
+      r.start.add(buf.position());
+      int offset = buf.position();
+      while (offset < buf.limit()) {
+        int ch = buf.get();
+        offset = buf.position();
+        switch (ch) {
+          case '\n':
+            r.length.add(offset - r.start.get(r.length.size()) - 1);
+            return r;
+          case SEPARATOR_CHAR:
+            r.length.add(offset - r.start.get(r.length.size()) - 1);
+            r.start.add(offset);
+            break;
+          default:
+            // nothing to do for now
+        }
+      }
+      throw new IllegalArgumentException("Not enough bytes in buffer");
+    }
+
+    public double getDouble(int field) {
+      int offset = start.get(field);
+      int size = length.get(field);
+      switch (size) {
+        case 1:
+          return base.get(offset) - '0';
+        case 2:
+          return (base.get(offset) - '0') * 10 + base.get(offset + 1) - '0';
+        default:
+          double r = 0;
+          for (int i = 0; i < size; i++) {
+            r = 10 * r + base.get(offset + i) - '0';
+          }
+          return r;
+      }
+    }
+  }
+
+  private static final class FastLineReader implements Closeable {
+    private final InputStream in;
+    private final ByteBuffer buf = ByteBuffer.allocate(100000);
+
+    private FastLineReader(InputStream in) throws IOException {
+      this.in = in;
+      buf.limit(0);
+      fillBuffer();
+    }
+
+    public FastLine read() throws IOException {
+      fillBuffer();
+      if (buf.remaining() > 0) {
+        return FastLine.read(buf);
+      } else {
+        return null;
+      }
+    }
+
+    private void fillBuffer() throws IOException {
+      if (buf.remaining() < 10000) {
+        buf.compact();
+        int n = in.read(buf.array(), buf.position(), buf.remaining());
+        if (n == -1) {
+          buf.flip();
+        } else {
+          buf.limit(buf.position() + n);
+          buf.position(0);
+        }
+      }
+    }
+
+    @Override
+    public void close() {
+      try {
+        Closeables.close(in, true);
+      } catch (IOException e) {
+        log.error(e.getMessage(), e);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
new file mode 100644
index 0000000..074f774
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
@@ -0,0 +1,152 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+
+/**
+ * Run the ASF email, as trained by TrainASFEmail
+ */
+public final class TestASFEmail {
+
+  private String inputFile;
+  private String modelFile;
+
+  private TestASFEmail() {}
+
+  public static void main(String[] args) throws IOException {
+    TestASFEmail runner = new TestASFEmail();
+    if (runner.parseArgs(args)) {
+      runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+    }
+  }
+
+  public void run(PrintWriter output) throws IOException {
+
+    File base = new File(inputFile);
+    //contains the best model
+    OnlineLogisticRegression classifier =
+        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
+
+
+    Dictionary asfDictionary = new Dictionary();
+    Configuration conf = new Configuration();
+    PathFilter testFilter = new PathFilter() {
+      @Override
+      public boolean accept(Path path) {
+        return path.getName().contains("test");
+      }
+    };
+    SequenceFileDirIterator<Text, VectorWritable> iter =
+        new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
+        null, true, conf);
+
+    long numItems = 0;
+    while (iter.hasNext()) {
+      Pair<Text, VectorWritable> next = iter.next();
+      asfDictionary.intern(next.getFirst().toString());
+      numItems++;
+    }
+
+    System.out.println(numItems + " test files");
+    ResultAnalyzer ra = new ResultAnalyzer(asfDictionary.values(), "DEFAULT");
+    iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
+            null, true, conf);
+    while (iter.hasNext()) {
+      Pair<Text, VectorWritable> next = iter.next();
+      String ng = next.getFirst().toString();
+
+      int actual = asfDictionary.intern(ng);
+      Vector result = classifier.classifyFull(next.getSecond().get());
+      int cat = result.maxValueIndex();
+      double score = result.maxValue();
+      double ll = classifier.logLikelihood(actual, next.getSecond().get());
+      ClassifierResult cr = new ClassifierResult(asfDictionary.values().get(cat), score, ll);
+      ra.addInstance(asfDictionary.values().get(actual), cr);
+
+    }
+    output.println(ra);
+  }
+
+  boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+            .withDescription("where to get training data")
+            .create();
+
+    Option modelFileOption = builder.withLongName("model")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
+            .withDescription("where to get a model")
+            .create();
+
+    Group normalArgs = new GroupBuilder()
+            .withOption(help)
+            .withOption(inputFileOption)
+            .withOption(modelFileOption)
+            .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = (String) cmdLine.getValue(inputFileOption);
+    modelFile = (String) cmdLine.getValue(modelFileOption);
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
new file mode 100644
index 0000000..f0316e9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.NewsgroupHelper;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Run the 20 news groups test data through SGD, as trained by {@link org.apache.mahout.classifier.sgd.TrainNewsGroups}.
+ */
+public final class TestNewsGroups {
+
+  private String inputFile;
+  private String modelFile;
+
+  private TestNewsGroups() {
+  }
+
+  public static void main(String[] args) throws IOException {
+    TestNewsGroups runner = new TestNewsGroups();
+    if (runner.parseArgs(args)) {
+      runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+    }
+  }
+
+  public void run(PrintWriter output) throws IOException {
+
+    File base = new File(inputFile);
+    //contains the best model
+    OnlineLogisticRegression classifier =
+        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
+
+    Dictionary newsGroups = new Dictionary();
+    Multiset<String> overallCounts = HashMultiset.create();
+
+    List<File> files = new ArrayList<>();
+    for (File newsgroup : base.listFiles()) {
+      if (newsgroup.isDirectory()) {
+        newsGroups.intern(newsgroup.getName());
+        files.addAll(Arrays.asList(newsgroup.listFiles()));
+      }
+    }
+    System.out.println(files.size() + " test files");
+    ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
+    for (File file : files) {
+      String ng = file.getParentFile().getName();
+
+      int actual = newsGroups.intern(ng);
+      NewsgroupHelper helper = new NewsgroupHelper();
+      //no leak type ensures this is a normal vector
+      Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);
+      Vector result = classifier.classifyFull(input);
+      int cat = result.maxValueIndex();
+      double score = result.maxValue();
+      double ll = classifier.logLikelihood(actual, input);
+      ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
+      ra.addInstance(newsGroups.values().get(actual), cr);
+
+    }
+    output.println(ra);
+  }
+
+  boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+            .withDescription("where to get training data")
+            .create();
+
+    Option modelFileOption = builder.withLongName("model")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
+            .withDescription("where to get a model")
+            .create();
+
+    Group normalArgs = new GroupBuilder()
+            .withOption(help)
+            .withOption(inputFileOption)
+            .withOption(modelFileOption)
+            .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = (String) cmdLine.getValue(inputFileOption);
+    modelFile = (String) cmdLine.getValue(modelFileOption);
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
new file mode 100644
index 0000000..e681f92
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public final class TrainASFEmail extends AbstractJob {
+
+  private TrainASFEmail() {
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption("categories", "nc", "The number of categories to train on", true);
+    addOption("cardinality", "c", "The size of the vectors to use", "100000");
+    addOption("threads", "t", "The number of threads to use in the learner", "20");
+    addOption("poolSize", "p", "The number of CrossFoldLearners to use in the AdaptiveLogisticRegression. "
+                               + "Higher values require more memory.", "5");
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    File base = new File(getInputPath().toString());
+
+    Multiset<String> overallCounts = HashMultiset.create();
+    File output = new File(getOutputPath().toString());
+    output.mkdirs();
+    int numCats = Integer.parseInt(getOption("categories"));
+    int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
+    int threadCount = Integer.parseInt(getOption("threads", "20"));
+    int poolSize = Integer.parseInt(getOption("poolSize", "5"));
+    Dictionary asfDictionary = new Dictionary();
+    AdaptiveLogisticRegression learningAlgorithm =
+        new AdaptiveLogisticRegression(numCats, cardinality, new L1(), threadCount, poolSize);
+    learningAlgorithm.setInterval(800);
+    learningAlgorithm.setAveragingWindow(500);
+
+    //We ran seq2encoded and split input already, so let's just build up the dictionary
+    Configuration conf = new Configuration();
+    PathFilter trainFilter = new PathFilter() {
+      @Override
+      public boolean accept(Path path) {
+        return path.getName().contains("training");
+      }
+    };
+    SequenceFileDirIterator<Text, VectorWritable> iter =
+        new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
+    long numItems = 0;
+    while (iter.hasNext()) {
+      Pair<Text, VectorWritable> next = iter.next();
+      asfDictionary.intern(next.getFirst().toString());
+      numItems++;
+    }
+
+    System.out.println(numItems + " training files");
+
+    SGDInfo info = new SGDInfo();
+
+    iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter,
+            null, true, conf);
+    int k = 0;
+    while (iter.hasNext()) {
+      Pair<Text, VectorWritable> next = iter.next();
+      String ng = next.getFirst().toString();
+      int actual = asfDictionary.intern(ng);
+      //we already have encoded
+      learningAlgorithm.train(actual, next.getSecond().get());
+      k++;
+      State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
+
+      SGDHelper.analyzeState(info, 0, k, best);
+    }
+    learningAlgorithm.close();
+    //TODO: how to dissection since we aren't processing the files here
+    //SGDHelper.dissect(leakType, asfDictionary, learningAlgorithm, files, overallCounts);
+    System.out.println("exiting main, writing model to " + output);
+
+    ModelSerializer.writeBinary(output + "/asf.model",
+            learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
+
+    List<Integer> counts = new ArrayList<>();
+    System.out.println("Word counts");
+    for (String count : overallCounts.elementSet()) {
+      counts.add(overallCounts.count(count));
+    }
+    Collections.sort(counts, Ordering.natural().reverse());
+    k = 0;
+    for (Integer count : counts) {
+      System.out.println(k + "\t" + count);
+      k++;
+      if (k > 1000) {
+        break;
+      }
+    }
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    TrainASFEmail trainer = new TrainASFEmail();
+    trainer.run(args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
new file mode 100644
index 0000000..defb5b9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.io.Resources;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+public final class TrainAdaptiveLogistic {
+
+  private static String inputFile;
+  private static String outputFile;
+  private static AdaptiveLogisticModelParameters lmp;
+  private static int passes;
+  private static boolean showperf;
+  private static int skipperfnum = 99;
+  private static AdaptiveLogisticRegression model;
+
+  private TrainAdaptiveLogistic() {
+  }
+
+  public static void main(String[] args) throws Exception {
+    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+  }
+
+  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+    if (parseArgs(args)) {
+
+      CsvRecordFactory csv = lmp.getCsvRecordFactory();
+      model = lmp.createAdaptiveLogisticRegression();
+      State<Wrapper, CrossFoldLearner> best;
+      CrossFoldLearner learner = null;
+
+      int k = 0;
+      for (int pass = 0; pass < passes; pass++) {
+        BufferedReader in = open(inputFile);
+
+        // read variable names
+        csv.firstLine(in.readLine());
+
+        String line = in.readLine();
+        while (line != null) {
+          // for each new line, get target and predictors
+          Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
+          int targetValue = csv.processLine(line, input);
+
+          // update model
+          model.train(targetValue, input);
+          k++;
+
+          if (showperf && (k % (skipperfnum + 1) == 0)) {
+
+            best = model.getBest();
+            if (best != null) {
+              learner = best.getPayload().getLearner();
+            }
+            if (learner != null) {
+              double averageCorrect = learner.percentCorrect();
+              double averageLL = learner.logLikelihood();
+              output.printf("%d\t%.3f\t%.2f%n",
+                            k, averageLL, averageCorrect * 100);
+            } else {
+              output.printf(Locale.ENGLISH,
+                            "%10d %2d %s%n", k, targetValue,
+                            "AdaptiveLogisticRegression has not found a good model ......");
+            }
+          }
+          line = in.readLine();
+        }
+        in.close();
+      }
+
+      best = model.getBest();
+      if (best != null) {
+        learner = best.getPayload().getLearner();
+      }
+      if (learner == null) {
+        output.println("AdaptiveLogisticRegression has failed to train a model.");
+        return;
+      }
+
+      try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
+        lmp.saveTo(modelOutput);
+      }
+
+      OnlineLogisticRegression lr = learner.getModels().get(0);
+      output.println(lmp.getNumFeatures());
+      output.println(lmp.getTargetVariable() + " ~ ");
+      String sep = "";
+      for (String v : csv.getTraceDictionary().keySet()) {
+        double weight = predictorWeight(lr, 0, csv, v);
+        if (weight != 0) {
+          output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
+          sep = " + ";
+        }
+      }
+      output.printf("%n");
+
+      for (int row = 0; row < lr.getBeta().numRows(); row++) {
+        for (String key : csv.getTraceDictionary().keySet()) {
+          double weight = predictorWeight(lr, row, csv, key);
+          if (weight != 0) {
+            output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
+          }
+        }
+        for (int column = 0; column < lr.getBeta().numCols(); column++) {
+          output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
+        }
+        output.println();
+      }
+    }
+
+  }
+
+  private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
+    double weight = 0;
+    for (Integer column : csv.getTraceDictionary().get(predictor)) {
+      weight += lr.getBeta().get(row, column);
+    }
+    return weight;
+  }
+
+  private static boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help")
+        .withDescription("print this list").create();
+
+    Option quiet = builder.withLongName("quiet")
+        .withDescription("be extra quiet").create();
+    
+   
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option showperf = builder
+      .withLongName("showperf")
+      .withDescription("output performance measures during training")
+      .create();
+
+    Option inputFile = builder
+        .withLongName("input")
+        .withRequired(true)
+        .withArgument(
+            argumentBuilder.withName("input").withMaximum(1)
+                .create())
+        .withDescription("where to get training data").create();
+
+    Option outputFile = builder
+        .withLongName("output")
+        .withRequired(true)
+        .withArgument(
+            argumentBuilder.withName("output").withMaximum(1)
+                .create())
+        .withDescription("where to write the model content").create();
+
+    Option threads = builder.withLongName("threads")
+        .withArgument(
+            argumentBuilder.withName("threads").withDefault("4").create())
+        .withDescription("the number of threads AdaptiveLogisticRegression uses")
+        .create();
+
+
+    Option predictors = builder.withLongName("predictors")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("predictors").create())
+        .withDescription("a list of predictor variables").create();
+
+    Option types = builder
+        .withLongName("types")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("types").create())
+        .withDescription(
+            "a list of predictor variable types (numeric, word, or text)")
+        .create();
+
+    Option target = builder
+        .withLongName("target")
+        .withDescription("the name of the target variable")    
+        .withRequired(true)    
+        .withArgument(
+            argumentBuilder.withName("target").withMaximum(1)
+                .create())
+         .create();
+    
+    Option targetCategories = builder
+      .withLongName("categories")
+      .withDescription("the number of target categories to be considered")
+      .withRequired(true)
+      .withArgument(argumentBuilder.withName("categories").withMaximum(1).create())
+      .create();
+    
+
+    Option features = builder
+        .withLongName("features")
+        .withDescription("the number of internal hashed features to use")
+        .withArgument(
+            argumentBuilder.withName("numFeatures")
+                .withDefault("1000").withMaximum(1).create())        
+        .create();
+
+    Option passes = builder
+        .withLongName("passes")
+        .withDescription("the number of times to pass over the input data")
+        .withArgument(
+            argumentBuilder.withName("passes").withDefault("2")
+                .withMaximum(1).create())        
+        .create();
+
+    Option interval = builder.withLongName("interval")
+        .withArgument(
+            argumentBuilder.withName("interval").withDefault("500").create())
+        .withDescription("the interval property of AdaptiveLogisticRegression")
+        .create();
+
+    Option window = builder.withLongName("window")
+        .withArgument(
+            argumentBuilder.withName("window").withDefault("800").create())
+        .withDescription("the average propery of AdaptiveLogisticRegression")
+        .create();
+
+    Option skipperfnum = builder.withLongName("skipperfnum")
+        .withArgument(
+            argumentBuilder.withName("skipperfnum").withDefault("99").create())
+        .withDescription("show performance measures every (skipperfnum + 1) rows")
+        .create();
+
+    Option prior = builder.withLongName("prior")
+        .withArgument(
+            argumentBuilder.withName("prior").withDefault("L1").create())
+        .withDescription("the prior algorithm to use: L1, L2, ebp, tp, up")
+        .create();
+
+    Option priorOption = builder.withLongName("prioroption")
+        .withArgument(
+            argumentBuilder.withName("prioroption").create())
+        .withDescription("constructor parameter for ElasticBandPrior and TPrior")
+        .create();
+
+    Option auc = builder.withLongName("auc")
+        .withArgument(
+            argumentBuilder.withName("auc").withDefault("global").create())
+        .withDescription("the auc to use: global or grouped")
+        .create();
+
+    
+
+    Group normalArgs = new GroupBuilder().withOption(help)
+        .withOption(quiet).withOption(inputFile).withOption(outputFile)
+        .withOption(target).withOption(targetCategories)
+        .withOption(predictors).withOption(types).withOption(passes)
+        .withOption(interval).withOption(window).withOption(threads)
+        .withOption(prior).withOption(features).withOption(showperf)
+        .withOption(skipperfnum).withOption(priorOption).withOption(auc)
+        .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    TrainAdaptiveLogistic.inputFile = getStringArgument(cmdLine, inputFile);
+    TrainAdaptiveLogistic.outputFile = getStringArgument(cmdLine,
+                                                         outputFile);
+
+    List<String> typeList = new ArrayList<>();
+    for (Object x : cmdLine.getValues(types)) {
+      typeList.add(x.toString());
+    }
+
+    List<String> predictorList = new ArrayList<>();
+    for (Object x : cmdLine.getValues(predictors)) {
+      predictorList.add(x.toString());
+    }
+
+    lmp = new AdaptiveLogisticModelParameters();
+    lmp.setTargetVariable(getStringArgument(cmdLine, target));
+    lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
+    lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
+    lmp.setInterval(getIntegerArgument(cmdLine, interval));
+    lmp.setAverageWindow(getIntegerArgument(cmdLine, window));
+    lmp.setThreads(getIntegerArgument(cmdLine, threads));
+    lmp.setAuc(getStringArgument(cmdLine, auc));
+    lmp.setPrior(getStringArgument(cmdLine, prior));
+    if (cmdLine.getValue(priorOption) != null) {
+      lmp.setPriorOption(getDoubleArgument(cmdLine, priorOption));
+    }
+    lmp.setTypeMap(predictorList, typeList);
+    TrainAdaptiveLogistic.showperf = getBooleanArgument(cmdLine, showperf);
+    TrainAdaptiveLogistic.skipperfnum = getIntegerArgument(cmdLine, skipperfnum);
+    TrainAdaptiveLogistic.passes = getIntegerArgument(cmdLine, passes);
+
+    lmp.checkParameters();
+
+    return true;
+  }
+
+  private static String getStringArgument(CommandLine cmdLine,
+                                          Option inputFile) {
+    return (String) cmdLine.getValue(inputFile);
+  }
+
+  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+    return cmdLine.hasOption(option);
+  }
+
+  private static int getIntegerArgument(CommandLine cmdLine, Option features) {
+    return Integer.parseInt((String) cmdLine.getValue(features));
+  }
+
+  private static double getDoubleArgument(CommandLine cmdLine, Option op) {
+    return Double.parseDouble((String) cmdLine.getValue(op));
+  }
+
+  public static AdaptiveLogisticRegression getModel() {
+    return model;
+  }
+
+  public static LogisticModelParameters getParameters() {
+    return lmp;
+  }
+
+  static BufferedReader open(String inputFile) throws IOException {
+    InputStream in;
+    try {
+      in = Resources.getResource(inputFile).openStream();
+    } catch (IllegalArgumentException e) {
+      in = new FileInputStream(new File(inputFile));
+    }
+    return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
+  }
+   
+}


[27/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo.svg
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo.svg b/community/mahout-mr/mr/src/images/logos/mahout-logo.svg
new file mode 100644
index 0000000..374c89d
--- /dev/null
+++ b/community/mahout-mr/mr/src/images/logos/mahout-logo.svg
@@ -0,0 +1,627 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+	 width="956px" height="400px" viewBox="0 0 956 400" enable-background="new 0 0 956 400" xml:space="preserve">
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M709.799,389.6c-21.38,0-37.761-6.839-48.688-20.322
+		c-0.377-0.467-0.747-0.935-1.11-1.408V376c0,5.523-4.478,10.001-10.001,10.001h-28.6c-5.522,0-10-4.478-10-10.001v-64.87
+		c0-4.989-0.908-7.693-1.669-9.083c-0.053-0.096-0.104-0.194-0.154-0.292c-0.32-0.634-0.987-1.954-5.366-1.954
+		c-5.29,0-7.384,1.85-8.617,3.464c-2.353,3.07-3.593,8.255-3.593,15.005V376c0,5.523-4.477,10.001-10,10.001h-27.8
+		c-0.756,0-1.492-0.085-2.201-0.244c-0.708,0.159-1.444,0.244-2.2,0.244h-30.271c-3.453,0-6.61-1.776-8.425-4.61
+		c-0.791,0.505-1.595,0.995-2.412,1.471c-7.595,4.351-16.133,6.54-25.442,6.54c-11.384,0-21.145-3.183-29.042-9.469
+		c-1.529,3.569-5.072,6.068-9.198,6.068h-28.408c-5.523,0-10-4.478-10-10.001v-67.812c0-3.194-0.564-4.789-0.9-5.458
+		c-0.392-0.777-0.97-1.93-4.821-1.93c-4.724,0-5.983,1.728-6.896,3.676c-0.919,2.061-1.383,4.79-1.383,8.113V376
+		c0,5.523-4.477,10.001-10,10.001h-27.8c-5.523,0-10-4.478-10-10.001v-63.33c0-6.95-0.88-9.239-1.055-9.627
+		c-0.351-0.763-0.845-1.844-4.675-1.844c-5.691,0-6.793,1.673-7.148,2.329c-0.298,0.616-1.122,2.832-1.122,8.451V376
+		c0,5.523-4.477,10.001-10,10.001h-28.199c-5.523,0-10-4.478-10-10.001V269.8c0-5.522,4.477-10,10-10h26.999
+		c2.902,0,5.514,1.235,7.34,3.209c6.486-3.852,14.321-5.809,23.34-5.809c10.216,0,18.796,2.437,25.504,7.242
+		c0.185,0.133,0.368,0.272,0.545,0.419c1.322,1.091,2.566,2.261,3.73,3.505c2.438-2.188,5.07-4.048,7.884-5.57
+		c0.07-0.037,0.14-0.074,0.211-0.111c7.126-3.639,15.103-5.484,23.707-5.484c5.958,0,11.882,1.164,17.608,3.456
+		c6.131,2.448,11.667,6.673,16.449,12.554c1.573,1.945,2.946,4.052,4.116,6.312c0.939-1.602,1.974-3.131,3.1-4.586
+		C462.511,263.016,477.94,257,499.041,257c13.235,0,25.249,2.715,35.706,8.067c3.12,1.598,6.458,3.872,9.454,7.101v-39.569
+		c0-5.522,4.477-10,10-10h27.8c5.523,0,10,4.478,10,10v28.484c6.504-2.974,13.447-4.483,20.639-4.483
+		c7.865,0,15.192,1.418,21.774,4.218c7.009,3,12.832,7.628,17.329,13.761c2.014,2.758,3.63,5.599,4.846,8.499
+		c1.368-2.145,2.862-4.229,4.481-6.253c10.92-13.683,27.316-20.624,48.729-20.624c21.414,0,37.812,6.941,48.737,20.633
+		c0.225,0.278,0.444,0.562,0.665,0.843v-8.274c0-5.523,4.477-10,10-10h28.6c5.523,0,10,4.477,10,10v64.358
+		c0,6.407,0.92,8.881,1.203,9.484c0.409,0.88,1.098,2.354,5.816,2.354c6.393,0,8.763-2.237,10.312-5.607
+		c0.86-2.016,1.867-5.809,1.867-12.502v-58.088c0-5.523,4.477-10,10-10h28.201c1.719,0,3.338,0.434,4.749,1.198h2.85v-20.001
+		c0-5.522,4.478-10,10.001-10h27.6c5.522,0,10,4.478,10,10V260.6h7.198c5.523,0,10,4.477,10,10v19.602c0,5.523-4.477,10-10,10H920.4
+		v46.178c0.521,0.013,1.106,0.021,1.76,0.021c0.63,0,1.279-0.023,1.929-0.071c0.704-0.053,1.405-0.129,2.085-0.227
+		c0.475-0.067,0.952-0.103,1.427-0.103c2.388,0,4.717,0.856,6.547,2.442c2.192,1.899,3.451,4.658,3.451,7.558v20.8
+		c0,5.347-4.205,9.745-9.545,9.989l-13.179,0.602c-0.037,0.002-0.076,0.004-0.113,0.004c-1.198,0.042-2.364,0.062-3.501,0.062
+		c-14.403,0-24.539-3.26-30.987-9.963c-2.15-2.205-3.846-4.837-5.072-7.872V376c0,5.523-4.478,10.001-10,10.001H838.2
+		c-3.148,0-5.959-1.456-7.791-3.732c-2.405,1.436-4.804,2.577-7.188,3.416c-5.142,1.804-11.065,2.717-17.621,2.717
+		c-24.711,0-35.835-12.303-40.818-22.626c-0.51-1.045-0.984-2.142-1.422-3.292c-1.476,2.343-3.101,4.608-4.874,6.796
+		C747.562,382.761,731.181,389.6,709.799,389.6L709.799,389.6z M487.944,348.278c0.598,0.447,1.538,0.922,3.414,0.922
+		c4.033,0,7.665-1.15,11.099-3.517c1.935-1.333,2.882-4.174,3.318-7.126c-0.231,0.043-0.465,0.089-0.702,0.133l-6.347,1.172
+		c-6.723,1.191-9.018,2.316-9.562,2.634c-0.961,0.561-1.564,1.024-1.564,3.194C487.601,347.181,487.822,347.995,487.944,348.278
+		L487.944,348.278z M709.751,299.801c-6.414,0-9.15,2.51-10.819,4.697c-3.009,3.937-4.531,10.177-4.531,18.552
+		c0,8.386,1.529,14.651,4.544,18.623c1.671,2.205,4.405,4.728,10.807,4.728c6.375,0,9.085-2.51,10.732-4.697
+		c2.995-3.98,4.517-10.259,4.517-18.653c0-8.384-1.515-14.637-4.504-18.585C718.854,302.298,716.139,299.801,709.751,299.801
+		L709.751,299.801z M491.611,300.711c-0.264,0.336-0.564,0.824-0.854,1.53l7.135-0.876c3.8-0.479,5.996-0.97,7.181-1.303
+		c-1.357-0.336-3.556-0.663-6.974-0.663C493.944,299.399,492.062,300.24,491.611,300.711L491.611,300.711z"/>
+	<path fill="#1F1F1F" d="M582,232.6v50.641c4.02-6.2,8.67-10.52,13.96-12.971c5.28-2.449,10.851-3.67,16.681-3.67
+		c6.549,0,12.5,1.141,17.859,3.42c5.35,2.291,9.74,5.78,13.18,10.471c2.91,3.99,4.7,8.08,5.35,12.289
+		c0.65,4.201,0.971,11.07,0.971,20.601V376h-28.6v-64.87c0-5.739-0.971-10.37-2.9-13.89c-2.51-4.961-7.27-7.44-14.29-7.44
+		c-7.271,0-12.79,2.46-16.56,7.39c-3.771,4.92-5.65,11.951-5.65,21.08V376h-27.8V232.6H582 M910.4,240.6v30H927.6V290.2H910.4
+		v56.409c0,4.371,0.55,7.101,1.649,8.17c1.101,1.08,4.47,1.621,10.11,1.621c0.84,0,1.73-0.03,2.67-0.101
+		c0.939-0.069,1.859-0.17,2.77-0.3v20.8l-13.18,0.601c-1.082,0.037-2.135,0.056-3.16,0.056c-11.43,0-19.356-2.298-23.779-6.896
+		c-3.121-3.201-4.681-8.121-4.681-14.761v-65.6H868V270.6h14.8v-30H910.4 M709.8,266.2c18.3,0,31.94,5.62,40.92,16.87
+		c8.99,11.24,13.48,24.539,13.48,39.88c0,15.6-4.49,28.94-13.48,40.03c-8.979,11.08-22.62,16.619-40.92,16.619
+		s-31.94-5.539-40.92-16.619c-8.989-11.09-13.479-24.431-13.479-40.03c0-15.341,4.49-28.64,13.479-39.88
+		C677.859,271.82,691.5,266.2,709.8,266.2 M709.75,356.4c8.12,0,14.359-2.891,18.72-8.68c4.351-5.781,6.53-14.011,6.53-24.671
+		c0-10.659-2.18-18.87-6.53-24.62c-4.36-5.75-10.6-8.63-18.72-8.63c-8.13,0-14.38,2.88-18.77,8.63
+		c-4.391,5.75-6.58,13.961-6.58,24.62c0,10.66,2.189,18.89,6.58,24.671C695.37,353.51,701.62,356.4,709.75,356.4 M499.04,267
+		c11.69,0,22.069,2.32,31.149,6.971c9.07,4.639,13.61,13.369,13.61,26.18v48.76c0,3.38,0.07,7.48,0.2,12.29
+		c0.2,3.63,0.75,6.09,1.67,7.39c0.92,1.301,2.29,2.37,4.13,3.21v4.2h-30.271c-0.84-2.141-1.43-4.141-1.75-6.02
+		c-0.329-1.881-0.59-4.021-0.779-6.41c-3.859,4.17-8.311,7.72-13.34,10.65c-6.02,3.449-12.82,5.18-20.41,5.18
+		c-9.68,0-17.67-2.75-23.98-8.26c-6.31-5.5-9.47-13.301-9.47-23.4c0-13.1,5.08-22.57,15.23-28.44c5.56-3.19,13.75-5.47,24.55-6.84
+		l9.529-1.17c5.17-0.649,8.871-1.47,11.101-2.44c3.99-1.699,5.99-4.34,5.99-7.92c0-4.359-1.53-7.38-4.601-9.039
+		c-3.06-1.66-7.56-2.49-13.5-2.49c-6.66,0-11.379,1.619-14.14,4.869c-1.979,2.4-3.3,5.641-3.96,9.73h-26.8
+		c0.59-9.311,3.2-16.95,7.84-22.939C468.41,271.689,481.08,267,499.04,267 M491.359,359.2c6.07,0,11.66-1.761,16.771-5.28
+		c5.12-3.529,7.771-9.949,7.97-19.279V324.26c-1.779,1.11-3.58,2.01-5.39,2.69c-1.81,0.69-4.3,1.319-7.47,1.909l-6.33,1.17
+		c-5.93,1.051-10.189,2.32-12.77,3.82c-4.361,2.551-6.541,6.49-6.541,11.84c0,4.771,1.339,8.211,4.009,10.33
+		C484.279,358.141,487.529,359.2,491.359,359.2 M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58
+		c3,3.711,5.02,8.271,6.06,13.67c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959
+		c-2.49-4.961-7.07-7.431-13.75-7.431c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33
+		c0-6.311-0.65-10.9-1.95-13.76c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07
+		V376h-28.2V269.8h27v15.46c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37
+		c4.02,3.32,7.08,7.58,9.15,12.779c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M865.2,269.4V376h-27
+		v-14.96c-0.261,0.33-0.91,1.3-1.95,2.931c-1.04,1.619-2.28,3.049-3.71,4.289c-4.36,3.9-8.57,6.561-12.64,7.99
+		c-4.07,1.43-8.83,2.15-14.301,2.15c-15.74,0-26.35-5.66-31.81-16.971c-3.06-6.27-4.59-15.5-4.59-27.699V269.4h28.6v64.359
+		c0,6.07,0.71,10.641,2.14,13.711c2.53,5.42,7.49,8.129,14.881,8.129c9.47,0,15.959-3.85,19.459-11.56
+		c1.811-4.181,2.721-9.7,2.721-16.55V269.4H865.2 M582,212.6h-27.8c-11.046,0-20,8.954-20,20v21.182
+		C523.599,249.28,511.796,247,499.04,247c-20.979,0-37.309,5.431-48.668,16.161c-5.107-5.312-10.877-9.27-17.208-11.796
+		c-6.893-2.761-14.068-4.165-21.305-4.165c-10.198,0-19.703,2.213-28.252,6.576c-0.145,0.074-0.289,0.149-0.431,0.227
+		c-0.904,0.49-1.792,1.006-2.664,1.55c-8.252-5.543-18.415-8.353-30.233-8.353c-8.355,0-15.932,1.435-22.647,4.278
+		c-2.458-1.08-5.175-1.679-8.032-1.679h-27c-11.045,0-20,8.954-20,20V376c0,11.046,8.955,20,20,20h28.2
+		c7.177,0,13.472-3.781,17-9.459c3.528,5.678,9.823,9.459,17,9.459h27.8c7.177,0,13.471-3.781,17-9.459
+		c3.528,5.678,9.823,9.459,17,9.459h28.41c3.945,0,7.625-1.143,10.724-3.115c8.044,4.328,17.258,6.516,27.516,6.516
+		c9.591,0,18.534-1.975,26.644-5.875c2.891,1.591,6.19,2.475,9.636,2.475H549.8c0.743,0,1.478-0.04,2.2-0.119
+		c0.723,0.079,1.457,0.119,2.2,0.119H582c9.862,0,18.058-7.139,19.7-16.531c1.643,9.393,9.838,16.531,19.7,16.531H650
+		c6.725,0,12.675-3.318,16.3-8.408c11.611,7.979,26.173,12.008,43.5,12.008c22.084,0,39.678-6.547,52.395-19.475
+		c7.525,9.087,20.741,18.275,43.405,18.275c7.69,0,14.732-1.104,20.93-3.281c0.97-0.341,1.939-0.72,2.908-1.136
+		c2.646,1.292,5.62,2.017,8.763,2.017h27c5.679,0,10.805-2.367,14.445-6.168c7.948,5.119,18.378,7.624,31.614,7.624
+		c1.246,0,2.539-0.022,3.843-0.067c0.076-0.003,0.151-0.006,0.228-0.009l13.18-0.601c10.681-0.487,19.09-9.288,19.09-19.979V356
+		c0-5.798-2.516-11.311-6.896-15.108c-2.94-2.551-6.527-4.16-10.304-4.694v-26.191c9.72-1.362,17.199-9.711,17.199-19.806V270.6
+		c0-10.095-7.479-18.443-17.199-19.806V240.6c0-11.046-8.954-20-20-20H882.8c-11.046,0-20,8.954-20,20v8.801H837
+		c-9.677,0-17.747,6.871-19.601,16.001c-1.852-9.13-9.923-16.001-19.6-16.001h-28.6c-6.813,0-12.833,3.408-16.443,8.612
+		c-3.523-2.381-7.322-4.414-11.38-6.087c-9.217-3.799-19.841-5.726-31.577-5.726s-22.36,1.927-31.577,5.726
+		c-7.925,3.267-14.862,7.909-20.695,13.84c-5.208-6.167-11.636-10.911-19.153-14.131c-0.016-0.007-0.031-0.014-0.047-0.021
+		c-7.824-3.327-16.467-5.015-25.687-5.015c-3.604,0-7.156,0.315-10.641,0.943V232.6C602,221.554,593.046,212.6,582,212.6L582,212.6z
+		 M709.75,336.4c-2.254,0-2.562-0.406-2.833-0.764c-0.598-0.787-2.517-3.982-2.517-12.587c0-8.573,1.895-11.722,2.476-12.482
+		c0.263-0.343,0.587-0.768,2.874-0.768c2.241,0,2.542,0.396,2.783,0.715c0.569,0.752,2.467,3.929,2.467,12.535
+		c0,8.638-1.922,11.862-2.511,12.645C712.255,336.006,711.958,336.4,709.75,336.4L709.75,336.4z"/>
+</g>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.499,388c-14.734,0-16.194-10.602-16.491-15.158
+		c-2.282,0.969-5.548,2.491-8.354,3.799C254.849,383.077,243.715,388,236.501,388c-25.962,0-44.167-21.608-49.721-41.42
+		c-0.496,1.273-1.104,2.537-1.848,3.777l-0.259,0.435l-0.316,0.395c-8.148,10.178-36.573,10.815-36.855,10.815
+		c-13.224,0-22.923-3.371-28.833-10.016c-3.175-3.571-6.704-9.897-5.67-19.862c-0.078-13.16,4.078-39.976,7.317-50.777l1.603-5.348
+		h5.582h11h3.107l2.196,2.198c2.883,2.884,2.607,6.303,2.405,8.801c-0.188,2.295-0.534,6.566-0.213,15.226
+		c0.097,2.288,2.599,9.209,5.632,13.571c2.909-2.997,8.484-10.194,18.782-27.42c1.031-1.728,1.504-2.515,1.852-3.035l4.313-6.47
+		c-2.459-5.739-5.026-12.353-5.562-21.952L171,256.709V256.5c0-1.622,0.274-3.164,0.536-4.655c0.063-0.361,0.141-0.804,0.208-1.224
+		c-1.643-1.129-3.839-2.151-6.13-3.219c-2.105-0.981-4.286-1.998-6.391-3.253c-0.369-0.209-0.732-0.424-1.084-0.646
+		c0.54,1.213,0.863,2.522,0.863,3.995c0,3.938-4.782,14.329-8.794,22.355l-1.475,2.951l-3.172,0.907
+		c-4.74,1.354-14.825,1.835-22.685,1.835c-3.458,0-7.982-0.087-12.876-0.411v1.362c0,1.262,0.243,3.584,0.437,5.449
+		c0.245,2.333,0.395,3.824,0.395,5.052c0,9.625-4.9,16.854-13.795,20.354c-5.909,2.326-12.401,2.647-18.535,2.647
+		c-14.37,0-22.193-2.224-27.005-7.674c-4.932-5.586-4.944-12.661-4.959-20.85c-0.002-1.473-0.004-3.027-0.036-4.666
+		c-0.019-0.987,0.051-4.084,0.19-9.929c0.137-5.841,0.308-13.11,0.308-16.382v-21.006c-4.691-11.945-6.906-23.596-7.927-30.968
+		c-1.042-7.547,0.479-14.028,4.519-19.263c2.712-3.514,6.315-6.115,10.41-8.083V167.5c0-4.225,0-8.547,0.348-12.964
+		c-0.274-0.088-0.551-0.179-0.829-0.27c-7.124-2.318-15.989-5.206-21.714-11.884c-9.206-10.842-14.806-37.737-14.806-40.882
+		c0-9.415,5.693-15.5,14.502-15.5c9.336,0,14.5,8.575,14.5,14.5c0,2.35-0.814,5.752-2.542,12.427
+		c-0.538,2.071-1.259,4.855-1.454,5.917c0.127,5.01,3.023,8.396,5.461,10.37c3.111,2.514,7.279,4.155,11.751,4.676
+		c17.654-45.552,69.792-61.89,110.282-61.89c50.339,0,81.613,26.563,86.226,73.025c15.616-5.543,33.031-11.026,46.774-11.026
+		c10.264,0,22.501,4.947,22.501,28.502c0,26.979-14.823,65.564-47.938,90.951l-5.499,4.217l-4.639-5.151
+		c-6.05-6.721-13.757-10.396-24.254-11.563l-1.745-0.194c0.874,3.85,2.272,7.381,3.797,11.229c1.422,3.59,2.945,7.434,4.069,11.783
+		l0.006-0.038l10.701,14.268c6.913,9.214,14.502,33.55,14.502,46.5c0,0.402-0.011,0.822-0.036,1.257
+		c3.445-4.229,8.915-6.759,15.534-6.759c13.399,0,19.501,8.554,19.501,16.5c0,3.264-1.628,6.606-4.312,11.725
+		c-0.299,0.573-0.668,1.275-1.004,1.937c0.4,0.484,0.85,1.01,1.234,1.457c3.217,3.753,8.081,9.421,8.081,16.884
+		C313,379.379,304.799,388,293.499,388L293.499,388z M246.438,356.085c-0.279,0.348-0.393,0.734-0.435,1.228
+		C246.151,356.929,246.297,356.518,246.438,356.085L246.438,356.085z M270.053,335.944c-1.209,1.354-2.772,2.58-4.778,3.571
+		c1.533-0.104,3.139-0.207,4.788-0.296c-0.04-0.548-0.065-1.122-0.065-1.719C269.998,336.974,270.017,336.455,270.053,335.944
+		L270.053,335.944z M219.022,317.98c0.091,0.007,0.192,0.013,0.299,0.017c0.586-0.088,1.987-0.419,2.938-0.646
+		c0.477-0.113,0.958-0.226,1.438-0.337c-1.721,0.031-3.757,0.146-4.62,0.546C219.061,317.656,219.037,317.793,219.022,317.98
+		L219.022,317.98z M172.535,125.259c8.01,5.611,15.055,13.589,20.567,20.67c2.555-14.029,4.93-23.667,8.843-29.008
+		c-5.7,1.628-9.896,5.062-12.694,7.354c-2.441,2-4.55,3.727-7.75,3.727c-2.044,0-3.801-0.7-6.71-1.858
+		C174.113,125.873,173.356,125.571,172.535,125.259L172.535,125.259z"/>
+	<path fill="#1F1F1F" d="M169.5,79.5c36,0,75,15,79,69h-3c-5-28-16-40-37-40c-16,0-25,12-27,12s-12.5-6-23-6c-21,0-43,12-42,42
+		l-55,11c0-6,0-12,1-18c-7-3-19-5-25-12c-7.5-8.83-13-34-13-36c0-6,3-8,7-8c5,0,7,5,7,7c0,3-4,16-4,18
+		c0,13.355,12.737,23.069,27.8,23.069c0.728,0,1.463-0.023,2.2-0.069C79.5,93.5,134.5,79.5,169.5,79.5 M213.537,119.277
+		c18.366,0.001,22.214,25.926,26.963,39.223c17-6,44-17,62-17c13,0,15,11,15,21c0,26-15,62-45,85c-9-10-20-13-29-14
+		c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1
+		c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7c2,0,18.01-9.73,21-10
+		c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8c10,0,11-19,11-20
+		c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13c-16,0-3-16-15-16
+		c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10c-4-4-10.5-5.83-15.5-8.83
+		c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19c-3.04,0.868-11.171,1.549-20.627,1.549
+		c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5
+		c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39c3.5-20.17,6.83-43.83,13-45
+		C211.555,119.349,212.566,119.277,213.537,119.277 M54.5,250.5c10.601,13.491,30.487,26.055,46.237,26.055
+		c0.593,0,1.182-0.019,1.763-0.055c0,3,0.83,8.5,0.83,10.5c0,15-15.83,15.5-24.83,15.5c-27,0-24.17-8.17-24.5-25.83
+		C53.96,274.67,54.5,256.5,54.5,250.5 M253.5,282.5c6,8,13,31,13,42c0,8-6,10-14,10c-7,0-7-9-7-13
+		C245.5,318.5,251.5,295.5,253.5,282.5 M138.5,283.5c1,1-0.59,3.01,0,19c0.17,4.5,4.83,17.17,11,22
+		c0.394,0.309,0.843,0.454,1.342,0.454c7.473,0,25.783-32.642,27.658-35.454l3,41c0,5,0,11-3,16c-4,5-22,8-31,8c-15,0-29-5-27-22
+		c-0.17-12.17,4-39,7-49H138.5 M169.5,64.5c-22.887,0-47.102,5.267-66.436,14.451c-22.318,10.602-38.762,26.385-48.174,46.081
+		c-2.892-1.323-4.917-3.379-5.317-5.69c0.286-1.215,0.786-3.146,1.146-4.538c1.934-7.468,2.781-11.078,2.781-14.303
+		c0-10.625-8.84-22-22-22c-12.953,0-22,9.458-22,23c0,5.403,4.153,19.196,4.33,19.781c3.642,12.04,7.645,20.521,12.238,25.929
+		l0.022,0.026l0.021,0.025c5.737,6.693,13.633,10.188,20.458,12.587c-0.062,2.329-0.068,4.619-0.069,6.88
+		c-3.329,2.099-6.335,4.7-8.847,7.953c-3.655,4.735-7.666,12.894-6.012,24.87c1.152,8.331,3.418,19.827,7.859,31.553V250.5
+		c0,3.185-0.17,10.406-0.308,16.209c-0.158,6.708-0.211,9.153-0.189,10.261c0.029,1.536,0.031,3.052,0.034,4.518
+		c0.016,8.896,0.031,18.095,6.835,25.802C53.794,316.263,66.235,317.5,78.5,317.5c6.544,0,14.191-0.376,21.283-3.167
+		c2.781-1.094,5.281-2.484,7.479-4.137c-1.056,8.09-1.759,15.937-1.766,21.561c-1.177,12.446,3.429,20.561,7.567,25.214
+		c7.394,8.313,18.98,12.529,34.438,12.529c5.904,0,13.821-0.954,20.661-2.489c6.875-1.544,12.2-3.518,16.228-6.052
+		c2.301,4.51,5.13,8.851,8.412,12.832C204.34,387.79,219.86,395.5,236.5,395.5c8.772,0,20.174-4.999,35.324-12.061
+		c0.02-0.01,0.04-0.019,0.06-0.028c0.447,0.926,0.981,1.858,1.621,2.783c2.932,4.245,8.782,9.306,19.996,9.306
+		c7.6,0,14.536-2.912,19.53-8.2c4.817-5.101,7.47-12.132,7.47-19.8c0-8.514-4.28-14.937-7.848-19.338
+		c2.113-4.158,3.848-8.218,3.848-12.662c0-11.927-9.274-24-27-24c-3.298,0-6.405,0.485-9.255,1.394
+		c-2.485-13.582-8.349-30.865-14.745-39.394l-9.87-13.159c-0.968-3.414-2.118-6.49-3.218-9.3c3.468,1.514,6.374,3.645,8.938,6.493
+		l9.274,10.305l11.002-8.435C316.77,232.461,332.5,191.32,332.5,162.5c0-5.601-0.454-13.9-4.378-21.287
+		c-5.04-9.488-14.14-14.713-25.622-14.713c-12.294,0-26.813,3.88-40.602,8.463c-1.801-9.966-4.853-19.031-9.12-27.063
+		c-5.635-10.608-13.4-19.48-23.079-26.371C214.048,70.389,193.232,64.5,169.5,64.5L169.5,64.5z M153.054,279.371l0.912-0.261
+		l2.951-5.902c1.771-3.542,3.868-8.042,5.472-11.744c0.449-1.035,0.853-1.989,1.216-2.874c0.6,8.092,2.501,14.302,4.513,19.442
+		l-2.098,3.147c-0.447,0.67-0.922,1.462-2.049,3.348c-4.393,7.349-7.832,12.72-10.507,16.643c-0.255-7.689,0.052-11.492,0.22-13.565
+		C153.833,285.754,154.081,282.688,153.054,279.371L153.054,279.371z"/>
+</g>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M445.01,377.502H416.6c-0.828,0-1.501-0.673-1.501-1.501v-67.812
+		c0-3.775-0.607-6.899-1.808-9.283c-2.233-4.446-6.292-6.605-12.412-6.605c-7.158,0-11.952,2.849-14.657,8.708
+		c-1.406,3.146-2.121,7.051-2.121,11.583v63.41c0,0.828-0.673,1.501-1.501,1.501h-27.8c-0.828,0-1.501-0.673-1.501-1.501v-63.33
+		c0-6.069-0.609-10.49-1.816-13.142c-2.1-4.593-6.162-6.828-12.414-6.828c-7.419,0-12.225,2.26-14.695,6.912
+		c-1.373,2.681-2.073,6.848-2.073,12.368v64.02c0,0.828-0.673,1.501-1.501,1.501h-28.202c-0.828,0-1.501-0.673-1.501-1.501V269.8
+		c0-0.828,0.673-1.501,1.501-1.501h27.001c0.828,0,1.501,0.673,1.501,1.501v10.492c2.533-3.545,4.988-6.237,7.326-8.03
+		c5.624-4.353,12.977-6.562,21.853-6.562c8.402,0,15.317,1.902,20.551,5.65c0.03,0.02,0.057,0.04,0.082,0.063
+		c3.509,2.895,6.334,6.504,8.422,10.749c3.508-5.25,7.753-9.242,12.649-11.891c5.95-3.04,12.626-4.572,19.875-4.572
+		c4.873,0,9.735,0.959,14.446,2.849c4.774,1.902,9.153,5.276,13.018,10.025c3.147,3.89,5.287,8.71,6.37,14.331
+		c0.668,3.688,1.007,9.069,1.007,16.015l-0.189,67.085C446.507,376.831,445.836,377.502,445.01,377.502L445.01,377.502z"/>
+	<path fill="#1F1F1F" d="M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58c3,3.711,5.02,8.271,6.06,13.67
+		c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959c-2.49-4.961-7.07-7.431-13.75-7.431
+		c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33c0-6.311-0.65-10.9-1.95-13.76
+		c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07V376h-28.2V269.8h27v15.46
+		c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37c4.02,3.32,7.08,7.58,9.15,12.779
+		c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M411.86,264.2c-7.485,0-14.391,1.587-20.523,4.718
+		c-0.022,0.011-0.043,0.022-0.065,0.034c-4.465,2.418-8.405,5.893-11.758,10.363c-2.029-3.501-4.587-6.534-7.643-9.058
+		c-0.053-0.045-0.108-0.087-0.164-0.127c-5.497-3.936-12.706-5.931-21.427-5.931c-9.215,0-16.878,2.313-22.776,6.877
+		c-1.614,1.238-3.242,2.832-4.904,4.808V269.8c0-1.657-1.343-3-3-3h-27c-1.657,0-3,1.343-3,3V376c0,1.657,1.343,3,3,3h28.2
+		c1.657,0,3-1.343,3-3v-64.02c0-5.276,0.646-9.214,1.92-11.703c2.165-4.076,6.539-6.077,13.35-6.077
+		c5.682,0,9.194,1.893,11.052,5.957c0.764,1.682,1.678,5.222,1.678,12.513V376c0,1.657,1.343,3,3,3h27.8c1.657,0,3-1.343,3-3v-63.41
+		c0-4.321,0.672-8.018,1.999-10.986c2.453-5.313,6.678-7.804,13.281-7.804c5.574,0,9.091,1.835,11.069,5.776
+		c1.097,2.176,1.651,5.072,1.651,8.613V376c0,1.657,1.343,3,3,3h28.41c1.653,0,2.996-1.338,3-2.991l0.19-67.08
+		c0-7.044-0.346-12.517-1.028-16.275c-1.136-5.897-3.381-10.94-6.679-15.02c-4.031-4.955-8.615-8.479-13.631-10.48
+		C421.97,265.194,416.922,264.2,411.86,264.2L411.86,264.2z"/>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M170,62c10.33,0,14-3.67,28.67-13
+			c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+			c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+			c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+		<path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+			c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+			c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+			c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+			/>
+	</g>
+	<defs>
+		<filter id="MyOpacityMaskFilter" filterUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774" id="SVGID_1_">
+		<g filter="url(#MyOpacityMaskFilter)">
+			
+				<image overflow="visible" width="128" height="91" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAItAAADjQAABP//2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAFsAgAMBIgACEQEDEQH/
+xACNAAEAAgMBAQAAAAAAAAAAAAAABQcBBAYCAwEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQQCAwEB
+AAAAAAAAAwQBAgUGABAgERMwElAxFEAWEQABAwIEBAUEAwAAAAAAAAABABECIQMgMUESEFFhIjBx
+gTIEQJGhQlJiFBIBAAAAAAAAAAAAAAAAAAAAUP/aAAwDAQACEQMRAAAAr8GZad70qyHvKHKfdZzp
+qvewam91PYlQa1oVofICXiLCOv38ZGMj56MkITakR49hqVDclRECD6XBVlxm4AAAA8/M91ZavGlZ
+M4J+26rtU9cl0VaFjyNMWmSrGQDU4GxqyO7ia/1Dai/WCc7ist024jWHrrOR2y8fpEypljyZr7qq
+1IIAD15AAHV9PVosuF44b+gAAH//2gAIAQIAAQUA/If/2gAIAQMAAQUA/If/2gAIAQEAAQUA6Vra
+8p646zB9UdHVhRha3apiGmYcQOpbsiJmdX1z7wrjABpdIF4yWtLM1yulmFLGNdXn0m4tjHWbYXTJ
+mVsCAQ9hwI7hZBZc/XXcf/a5i0qLg6kCMkHwqpuf80n5BhVQ8oKlI5kBQRfZQ1Fkeuk42KirERHw
+sR5Dt8eMl0WH7T60rAVfiJHmm8LTRnpgQ+7JYwfrW+C1orA2wFn983LGwwC1ZpbmoBm761fqEl4H
+RzeFV3sdmAOVifPbkq2sshkzY3Jr5gVxZnJAJTKgHcn65pcxDILR6n2xUFsaYTFw+aYxjGGyg3Qd
+haxYe5qSIwNgbENjItsW9pOTMzzVmKhZYz1FlsptbbNyZBonLEtfml5a4yhJBB9bT4ru9qyLsRPI
+D5R+5R9cWzKzuEdqZfpctKRk80EI9izH9pe215t2RMxOC2iFqj3FX6s7utTju72vDuYccn/L/9oA
+CAECAgY/AEP/2gAIAQMCBj8AQ//aAAgBAQEGPwDgIxBJOQCEiNoK3Rr5hbb0DHrpi3CJjHRNcHbz
+wgDM5KN67F5SqgNoTGIR7AXRn8an9dE1y1KmoDr2S+xQFu0WOpDKNz5A3S6oR2gKXbop2pfqfxgB
+IeMD+VFg1MDSDqsQvYFSITRDcJPyUm/bP0wRuSFZVKAGnhS8l6Hjbt/ykAoUZh4ch0UbrasTxthn
+EaqI6eDukWATQkCeE2FRUIxkGILHgZaBgojojM6I/FJ7oljyHqgYyBfFIRzZXPjXpkwlIygZF8zU
+VKBJGSkDII3LWevCXmFGuilEkKV22wm+aEZyJtPXookF3GGQ6IfIt0lAu4Ww16omdwsdAm3FVUnN
+XBW4yZgpRslov7iu+bruX+acssn5ISGuAkqbYRJ2BoULYNDngt3HYOx9VGunF5FSAkEbcC4epxVw
+OMwo27p2kc1W4PumFwP5oi05KO+TROg+m//Z" transform="matrix(1 0 0 1 103 45)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_1_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M170,62c10.33,0,14-3.67,28.67-13
+			c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+			c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+			c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+		<path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+			c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+			c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+			c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+			/>
+	</g>
+</g>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,382c-9.998,0-10.315-5.942-10.546-10.279
+		c-0.217-4.07-0.465-5.721-4.453-5.721c-1.218,0-7.149,2.766-12.382,5.203C255.8,376.014,242.957,382,236.5,382
+		c-12.534,0-24.353-5.965-33.282-16.796C195.682,356.062,191,344.297,191,334.499v-21.89c-0.17-1.201-0.341-2.459-0.518-3.752
+		c-0.845-6.225-1.805-13.276-3.424-18.945c-1.138-4.55-2.757-8.294-4.324-11.914c-2.56-5.912-5.206-12.029-5.732-21.414
+		c-0.002-1.18,0.212-2.402,0.442-3.695c0.355-2.016,0.799-4.522-0.004-5.328c-2.376-2.377-5.892-4.014-9.292-5.598
+		c-1.994-0.93-4.056-1.889-5.919-3.005c-8.018-4.455-11.089-13.294-13.123-19.146c-0.37-1.066-0.69-1.987-0.997-2.755l-0.038-0.095
+		l-0.025-0.1c-0.816-3.267-2.352-5.857-5.008-9.474c-4.247,2.344-4.152,6.092-4.06,9.727c0.013,0.481,0.023,0.944,0.023,1.384
+		c0,11.657,6.152,18.462,10.225,22.965c2.191,2.423,3.775,4.175,3.775,6.034c0,3.166-8.077,19.509-8.159,19.671l-0.296,0.592
+		l-0.633,0.181c-3.363,0.961-11.819,1.606-21.042,1.606c-7.303,0-25.421-0.454-35.926-4.656
+		c-30.922-10.66-39.625-50.538-41.929-67.187c-0.814-5.892,0.305-10.864,3.325-14.776c6.96-9.015,22.775-10.902,35.482-12.418
+		c8.487-1.01,19.755-2.69,30.65-4.316c5.071-0.757,10.019-1.493,14.48-2.133c0.025-0.116,0.048-0.296,0.048-0.562
+		c0-1.51-0.598-4.632-1.125-7.385c-0.542-2.835-0.875-4.625-0.875-5.616v-6.001c0-11.356,13.95-20.5,25.5-20.5
+		c17.761,0,34.676,23.646,42.804,35.009c0.467,0.654,0.904,1.262,1.304,1.819c0.164-0.953,0.326-1.91,0.488-2.869
+		c4.085-24.071,7.006-38.771,13.125-39.933c1.174-0.168,2.268-0.248,3.317-0.248c16.308,0,21.873,18.76,25.937,32.459
+		c0.671,2.254,1.311,4.413,1.952,6.341c2.131-0.759,4.403-1.588,6.779-2.457C264.544,148.163,286.92,140,302.5,140
+		c16.501,0,16.501,16.934,16.501,22.5c0,25.503-14.097,62.045-45.589,86.19l-1.1,0.843l-0.928-1.03
+		c-6.994-7.771-16.168-12.191-28.05-13.513l-1.984-0.221l0.764-1.845c7.093-17.106,9.554-38.674,5.162-45.25
+		c-0.763-1.145-1.647-1.677-2.776-1.677c-0.789,0-1.146,0.278-1.346,0.486c-1.222,1.269-1.085,4.924-0.984,7.593
+		c0.074,1.938,0.139,3.62-0.208,4.779c-1.132,6.178-3.464,15.332-5.345,22.691c-1.271,4.979-2.585,10.13-2.617,10.963
+		c0,8.704,2.499,15.01,5.145,21.688c2.633,6.646,5.355,13.515,5.355,22.801c0,3.303-4.705,23.461-7.551,33.896l-0.417,1.529
+		l-1.504-0.501C232.255,311,227.348,311,225.499,311c-7.319,0-12.5,0.539-12.5,7.499c0,4.545,3.536,5.5,6.501,5.5
+		c0.724,0,2.461-0.41,4.142-0.808c2.474-0.585,5.031-1.19,6.857-1.19c3.014,0,7.5,1.731,7.5,6.5c0,5.946-5.555,7.321-10.456,8.535
+		c-5.938,1.47-9.543,2.707-9.543,7.465c0,5.075,2.224,5.5,4.5,5.5c0.845-0.146,5.368-2.56,8.67-4.322
+		c6.417-3.424,10.441-5.515,12.195-5.673c0.25-0.022,0.488-0.033,0.711-0.033c2.091,0,3.172,0.936,3.71,1.721
+		c1.59,2.315,0.269,5.939,0.114,6.346l-0.238,0.614l-0.61,0.241c-7.2,2.854-7.12,6.903-7.063,9.859
+		c0.006,0.263,0.011,0.511,0.011,0.746c0,4.068,2.289,6.5,4.499,6.5c8.643,0,9.501-18.314,9.501-18.5v-1.499h1.5
+		c2.734,0,5.946-0.217,9.348-0.444c3.719-0.248,7.553-0.507,11.48-0.551c0.231-1.382,0.072-2.827-0.097-4.339
+		c-0.113-1.024-0.231-2.083-0.231-3.166c0-9.228,7.274-12.5,13.502-12.5c9.963,0,13.5,5.655,13.5,10.5
+		c0,1.88-1.435,4.758-3.625,8.935c-0.976,1.864-2.313,4.413-2.376,5.091c0,1.074,1.71,3.068,3.363,4.997
+		c2.957,3.445,6.636,7.734,6.636,12.976C306.999,376.174,301.574,382,293.5,382L293.5,382z"/>
+	<g>
+		<path fill="#1F1F1F" d="M213.538,119.277c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21
+			c0,26-15,62-45,85c-9-10-20-13-29-14c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5
+			c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7
+			c2,0,18.01-9.73,21-10c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8
+			c10,0,11-19,11-20c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13
+			c-16,0-3-16-15-16c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10
+			c-4-4-10.5-5.83-15.5-8.83c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19
+			c-3.04,0.868-11.171,1.549-20.627,1.549c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66
+			C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39
+			c3.5-20.17,6.83-43.83,13-45C211.555,119.349,212.566,119.277,213.538,119.277 M213.538,116.277L213.538,116.277
+			c-1.121,0-2.285,0.085-3.462,0.253l-0.067,0.009l-0.067,0.013c-7.154,1.356-10.092,16.252-14.208,40.478
+			c-8.547-11.923-25.273-34.53-43.232-34.53c-6.25,0-12.861,2.322-18.139,6.37c-5.631,4.32-8.861,10.017-8.861,15.63v6
+			c0,1.128,0.326,2.887,0.902,5.898c0.415,2.168,0.916,4.785,1.058,6.364c-4.108,0.593-8.54,1.254-13.201,1.949
+			c-10.889,1.624-22.148,3.302-30.614,4.31c-12.988,1.551-29.15,3.481-36.493,12.993c-3.275,4.243-4.495,9.591-3.625,15.896
+			c1.349,9.753,4.34,24.19,10.932,37.593c7.76,15.777,18.523,26.143,31.994,30.81c10.756,4.273,29.043,4.736,36.418,4.736
+			c9.348,0,17.968-0.669,21.452-1.664l1.269-0.362l0.59-1.181c0.34-0.68,8.317-16.676,8.317-20.342c0-2.437-1.747-4.369-4.165-7.043
+			c-3.916-4.332-9.835-10.879-9.835-21.957c0-0.452-0.012-0.929-0.024-1.423c-0.087-3.454,0.041-5.904,2.188-7.644
+			c2.064,2.912,3.25,5.088,3.926,7.794l0.05,0.197l0.075,0.189c0.294,0.734,0.609,1.641,0.973,2.689
+			c1.976,5.687,5.281,15.197,13.81,19.963c1.919,1.147,4.002,2.118,6.018,3.057c3.399,1.584,6.611,3.08,8.799,5.234
+			c0.252,0.677-0.136,2.876-0.347,4.069c-0.23,1.3-0.467,2.645-0.467,3.873v0.084l0.005,0.084c0.54,9.651,3.24,15.891,5.851,21.924
+			c1.614,3.729,3.138,7.252,4.234,11.636l0.012,0.049l0.014,0.048c1.589,5.56,2.54,12.55,3.378,18.716
+			c0.172,1.267,0.34,2.497,0.507,3.673V334.5c0,10.129,4.813,22.26,12.56,31.658c9.218,11.183,21.45,17.342,34.44,17.342
+			c6.791,0,19.8-6.064,30.254-10.938c4.641-2.163,10.408-4.851,11.819-5.062c2.478,0.006,2.669,0.32,2.882,4.301
+			c0.219,4.089,0.626,11.699,12.044,11.699c8.832,0,15-6.579,15-16c0-5.797-3.88-10.319-6.997-13.953
+			c-1.082-1.262-2.686-3.131-2.97-3.964c0.292-0.864,1.411-2.999,2.171-4.449c2.362-4.507,3.796-7.404,3.796-9.634
+			c0-5.973-4.638-12-15-12c-9.112,0-15,5.495-15,14c0,1.166,0.123,2.267,0.241,3.331c0.107,0.968,0.207,1.864,0.204,2.7
+			c-3.537,0.083-7.038,0.317-10.199,0.529c-3.374,0.226-6.562,0.439-9.246,0.439h-2.961l-0.039,2.989
+			c-0.035,2.644-1.656,17.011-8,17.011c-1.21,0-3-1.589-3-5c0-0.244-0.005-0.503-0.01-0.775c-0.057-2.933-0.117-5.966,6.116-8.436
+			l1.223-0.484l0.472-1.228c0.302-0.785,1.707-4.846-0.276-7.733c-0.608-0.886-2.06-2.371-4.945-2.371
+			c-0.274,0-0.561,0.014-0.851,0.04c-1.974,0.178-5.405,1.917-12.763,5.842c-2.98,1.59-7.018,3.744-8.235,4.145
+			c-1.546-0.011-2.731-0.216-2.731-3.999c0-3.57,2.432-4.528,8.404-6.008c4.894-1.212,11.596-2.872,11.596-9.992
+			c0-5.252-4.527-8-9-8c-2.002,0-4.647,0.626-7.205,1.231c-1.293,0.307-3.246,0.769-3.795,0.769c-5,0-5-2.906-5-4
+			c0-5.094,2.882-6,11-6c1.611,0,6.513,0,9.051,0.846l3.009,1.003l0.834-3.06C240.998,301.743,246,280.698,246,277
+			c0-9.572-2.776-16.579-5.461-23.355c-2.583-6.521-5.024-12.68-5.039-21.068c0.119-1.052,1.42-6.151,2.57-10.657
+			c1.876-7.352,4.206-16.483,5.351-22.711c0.392-1.379,0.328-3.073,0.248-5.188c-0.054-1.437-0.219-5.81,0.57-6.5c0,0,0,0,0.001,0
+			c0.011,0,0.1-0.021,0.261-0.021c0.299,0,0.854,0,1.528,1.008c3.675,5.502,2.161,25.852-5.299,43.842l-1.53,3.69l3.97,0.44
+			c11.498,1.277,20.363,5.538,27.101,13.025l1.855,2.061l2.2-1.687c14.329-10.985,26.298-25.655,34.612-42.423
+			c7.457-15.037,11.562-31.003,11.562-44.958c0-5.936,0-24-18-24c-15.847,0-37.457,7.883-54.821,14.218
+			c-1.838,0.67-3.611,1.317-5.304,1.927c-0.479-1.517-0.963-3.148-1.464-4.836C236.714,135.658,230.964,116.277,213.538,116.277
+			L213.538,116.277z"/>
+	</g>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+			c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+			c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+			c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+			c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+			c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+			c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+			c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+			C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+	</g>
+	<defs>
+		<filter id="My_OpacityMaskFilter_1_" filterUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223" id="SVGID_2_">
+		<g filter="url(#My_OpacityMaskFilter_1_)">
+			
+				<image overflow="visible" width="278" height="268" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAARTAAAJlwAADlr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAQwBFgMBIgACEQEDEQH/
+xACaAAEAAgMBAQAAAAAAAAAAAAAABgcDBAUBAgEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQMEAgEE
+AwEAAAAAAgMBBAUGACARExAwQBIxFBWAITM0IjI1FhEAAgIBAQYFAgUEAwEAAAAAAQIAESEDIDFB
+URIiEDBAYXGRE4GxMlIjocFCYuFyMwQSAQAAAAAAAAAAAAAAAAAAAID/2gAMAwEAAhEDEQAAAK/A
+AAAAPs+Hf7BCEqjprgAzdPrTsp7WtOtjVAAAAAAAAAAB7N4nbRubf16YI/J/kpblXDWJzPr52iy5
+VyeuYa5suOlRMuIAPreOekfSIUm8eOSAAAAADcuCmLhO0AD5i8qxlGb8v5pYG3jyDT3Pkprj27rF
+ed+fbpGOz0fTBk+xjjUp5RTzeHHMhjd7tEH+rK3yrNi19oqres3KQSbbHoAAB8fOUeegB4D0AADl
+dXglatIY7DidrDZ+x49AAAAAAAADz35OBwNWGl65+F3QADyGS2ryLvB3bZpi3zpAAAAeOEdfNT1j
+nbeegAADFl0yt4r1eYWzI+B3wB57iORU0qhQB92vUs4LH9+PsAAA8gU9hJW0yhvQLsycnqnoAAHD
+7cMK6y6fcLQ6mlug8Ee6FYHK1QAdLmi7OnXc/MwAAHG7OMo7Un0DJfP6Q7RcnsQlRlAB81xZFekC
+6vKFmyaju0XFqRThn3EffkAAA2LIq/aLxywKVnSYsh689Hjw5VU2PVZhBktyobWJQ89APIxKNApD
+563JAPv4AAAAAD66fKEw6tdC0c1Uelq6la+EhjwALKrWUlre4cwA+PvwraE2ZWYAAAAAAAAAAAAA
+2tUXP2YNOD0Dz34IdWc2hIAAAAAAAAAAAAABK7Rp23DaeaxtamnxiG8HZ1gAAAAAAAAAAAAADoXD
+TtwGSrrGp0+vnD6eAAAAAAAAAAAAAAA37gp63jfiMy4RCND65Bh8ABlxSYxa9p8Qq/zPgAAAAAAA
+AAAMtsVFNiya9n3GKd+5Z0iFa3Y4g++hPitpvKugZIHPa6IMAAAAAAAAAABt6gtuR0tY5IdfL9lP
+8KyYodGw4VjJxrVZoF687hSMqXky2JAAAAAAAAAAADb1BM+3WP0T+O8L5NrVADu9+B/Rv84AP//a
+AAgBAgABBQD+jL//2gAIAQMAAQUA/oy//9oACAEBAAEFAPiVqrLJ/wDzlmRtULFWfjqUxx0dWsP4
+GmB9bunmuLdGxULo1TF+QVYlfjzWBWasjSOnY+KAyZa1r49quOUoIUuONqKZGY15Tgy2EfRZ6LH7
+HqtSAREdosKhq9wxfaPi4oYO9gkCKfUhgozOHW9eZxTaL+YxXlu4JP0r+my0oaiyrw2PUFsZKMJf
+fyvp9lnE6SMcdpixHJ4N1L3MSUDfwhRNfoMYMdiwgWFX6TKT9ZT5chjl/RHpkUeVGz05rXhAjmrg
+r1maGlSXKOqIVCMPXXAVEhyFBHDSso2HHBKf14/kPaqlIWNdkpq9LlC0Nn1ybAahhLiXpD6L9CGC
+jL6xXyBVNQrJmviEJgErDqzYxKCGP5/phbJ4NG2fF4LIslWq3jlGlOKcfo6QZSqDWV1GsGQuupc+
+7my7VyKP5/ia7nlS1W0/lbSA7I02uMK1auPF6/WHgYmuPBooHgoUPIEY97v25BDPsbG6Ar+aP5Kn
+VK0/A68sARj0qGFhHO0fE2HPDjk4fdP2rFWwL1dMz2jb7sAj7T9tVUJ2scoQT8U57DvbJkaxkuxr
+b5ZW6bTIWrcL3kZzVGwFygX2R7JFAx+2n7RMFHsvL6q3V4kxX+TV/wDW6c9eFKcnZmzb5hH+G/h3
+Qyv7Ow5T9NC9rvxcwWVG2n2ck3xo2Sz5r6Bk360uRrdFhsKXt+W/t6JOVt1e3DEexP43k5/X5peR
+IeJODX7Gw2IXXut81rEpl1/CK+lf1mYiNgyoIVkbhW7PrpeQ/wCCjgw65/G61SOvzC3Jq3cNdFye
+ufxuVvx15mZnV0fa3jfrCfXKZAK6tkzJWndGDvTUuYe6L0+xnqUWK+TqFUtxMxOs7DAcpZNTwgoK
+Ok/+u9sKB5iMkunOJ2ZBRWySXRBhMXb60hs+fI5mZKeiJmJ1PN9xruFodblwwNswXkgwJZCZAWN2
+W1UnC7SmzCXC4Ogv7jvNeSV6Aw1ljdmtVSr7OJqzWzkcMYbD6qVtlR+vZ8HLS4Gj15pYSrOisbfo
+h7a7NXtm+r07VT8tdgStnqDmBEzMz7FDIOpMwm1LZFXLJbAvWfIKJ6CKBjYsgIJuPl9j0X/k1WYi
+v05WvDUbFTmtd94DMCp7BdrTU3SR5X3RBcHca3A22sUM22uPH7fXkc7nf2o9YntOn24NET3joaP2
+XulKIH4cEQ8kiLr06/421WQxXRP43Bcfr/LxtqatvA3IfX6J/G4tiK/zNLvSxET3j1YX1Dd7UyPz
+NKsyLUF9let90LTtVry2/mas2V36B/ZH44++hPGZ6vHMrnFmvIv89v5mDKRyOJnvXyVr9dGc2S06
+zN+5PJt2S5M95+Zhf/Qw/wDr7Aozq21GqzztPzsL/wChh/8AXekXBmdarNJmDrom3WSIlEQXRXrs
+sMRq7DC7r7a8EMjPxMPPa/hSia/M/fVWXkdg8putub1alUFxV8cEKzyFrXckZs/ErM8VjWrcMRP4
+302Qri1MZMUCGGiIl2meCppTFC4XNIxtha+31XueQ8ITMzPxdPyv9kMhi8/hAyCo0ZgtXra6q86f
+gZ+eYOn+zYx+upIVYGsPEVVIg47ju+Naz4+NulTs4DMLeoSEx8YcuVxJO2IJd/mp0pCKrVLW7K11
+cDYKpGl4OHMUQerP4/8AUs/GwuZOgzD59TwVYWyD+shs2GVchWBhTatlVQLm1Aobuw3LMjcsizVs
+wTq9myBK2wgkfj0sjZpljdwiIXtaTG9sKCG3nQmX5Cw7kzM+uCysVodsQeLLZGbjPkj5OF5OqO/e
+fJ29f//aAAgBAgIGPwAZf//aAAgBAwIGPwAZf//aAAgBAQEGPwD0nQg+TOoE/SfyLjn6gJpi2MB1
+Lo8BMpmE6dgzp1Vxz2RqMMtmCxG7Y2mR232+mCLvJoRXZbY5JMGJulERqUG4zAE6d/TxVeZAiY4C
+VCCI2qq5XPptMGKa4bFGN23cY1/GT9PDSX3uL8eL43iPp/tONikUsfYQUnSDzgLk+4EtgT8w0kLL
+ZUbx5mmTzqL8bJBjdt3G0mBr/EwGr6azF+PFh7QtVB5SgseQgpOkHnAdW2+YOwfSDtEws3SiIxrh
+PsVjrqvL02G8MIhPLaKkRm017t4qM/8A9Gn0d2PwgXxIPGXqIGo2IKQCvaDtEwNpviIP9v7HawhP
+4GDp0mz7QD7dA8Z3YHsJ3kmKzr1UQRed0CDgNumFy1WvOb4iHh1f2Ph06SljAdSwOQnepPzAPtjH
+tB2D6T9In6RP0iYWYHn4PkN8T7vD7n/EXSXjvikrBgTA9Kz3u4T7epaEnAPGBhtEx88DOrjdw3zE
+FDh6Yyv9h+c03XeGES+W0TPtA7znwKnjRi/HlWTQnT1C5Yz5TGBOJMT/ALD84nwNps1iO92AaHgh
+ug2Ivx5TMDVCfcZv4i27kIpu7HlN8Qi7CzTUbywiXy2SxjaaNlsDxRx/iQYmeA8kxxw8Bosf0moD
+5LZ4TUe7tjU0l5G4vxsWY3dVCNqE2t9uwumxyuICPJ1K5HwVrpWwYueHkvngZZ3mfcO4YEAHLYOa
+jaKHHE7K5pWOfmLnh5LCrsR9MigSSssbxF0tRqYc4O4Swb2jKB3nPgOrHvAvWPrBTCXcOYdLSbuM
+JJsnedmxvG6Lps3cuDAQfIKmNqIveMgwo4phvEDIaYbiIBqEso4iKOsXygZTsmM37Tf08epGKnmI
+q6p6l5wHq4RtPSa2MLubY7ztrqIaF9wijqgIPkNfKHp35vxGppMVYHhxiF95A2nxwMZDvUkbBCsQ
+DwlnJ8kOhPTxWBWajxBg7hMGYOxZMbPCPqHiceK/I/OIByG02OELcH/Pz+pCVPMTJ6hANQlT7yi4
++s/9B9Zhx9Zlx9YQNQfWFNNrvYsbxEzeBAdkiM4GVN+kwSPiZJPzt/ZY7jj4gO059j6xNQbrAMXO
+8bTj2PrUBOaowHYJhQcTXrTp8AfzinYOeECXus+tq8Govx4dzCYYRgrR3969bp1F+Ize0fT0WpVN
+EzOs07tQmWfW6cX4jheU1EcUwY/1Phu9dpxfiFWhcoLhpRCMQgbtkJpizxMtruFlvHAwqcEb/S6Z
+i/HgzMaqEaORz4TuOOW11EWbgxwjYj9O6/S6b8iImeHgQDQJAP18KQXL1Me0oTEpUJJ9pjRY/hOr
+WQoSTgz4EZQe44Es7z6ZdNjlcGAiMpF3MsxS90wtVPtJgnwyLAxASggtRKQVCJ91QT0G69OuoD23
+3Re67EsZE3RqHCAkdpsX4DUcUWNwXMsJ0dYuWpuNYuxCyilY59OFY/x3v5Re4G5YMIuHnvBEvUPU
+BwMAsCoQrWeQhCsUX+sGqNVuoG95iFzmsw54Rq3+oB02PT+2BdRuk+8/WPrCeoQ/byfaV1dI9pZy
+fEIxqp+rhKBtR6rsv8Lndde97WN8zde97H//2Q==" transform="matrix(1 0 0 1 43 116)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_2_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#CEBC01" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+			c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+			c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+			c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+			c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+			c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+			c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+			c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+			C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+	</g>
+</g>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+	c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M193.772,206.837c-5.358,0-10.236-2.729-13.736-7.683l-0.198-0.28
+		l-0.093-0.33c-8.547-30.246-25.982-48.151-39.992-62.539c-2.949-3.03-5.736-5.89-8.24-8.667l-0.94-1.043l0.662-1.238
+		c3.588-6.719,10.431-10.272,19.783-10.272c5.169,0,10.029,1.066,13.196,1.96c2.665,0.75,5.5,1.129,8.429,1.129
+		c0.004,0,0.006,0,0.01,0c7.256,0,14.981-2.283,22.334-6.601c2.978-1.746,6.236-2.632,9.686-2.632
+		c6.564,0,11.543,3.219,11.753,3.357l1.181,0.775l-0.336,1.373c-4.887,19.923-7.7,46.495-8.604,81.235l-0.006,0.27l-0.078,0.255
+		C206.643,202.342,200.553,206.835,193.772,206.837L193.772,206.837z"/>
+	<path fill="#917013" d="M204.676,110.643c6.042,0,10.654,3.027,10.654,3.027c-4.33,17.66-7.66,43.26-8.66,81.66
+		c-1.729,5.729-7.115,9.506-12.899,9.506c-4.249,0-8.713-2.037-12.101-6.836c-10.51-37.2-34.41-56.19-48.67-72
+		c3.897-7.297,11.292-9.214,18.019-9.214c5.322,0,10.226,1.199,12.651,1.884c2.928,0.824,5.941,1.206,8.975,1.206
+		c8.011,0,16.174-2.662,23.355-6.876C198.988,111.248,201.975,110.643,204.676,110.643 M204.677,106.643
+		C204.677,106.643,204.677,106.643,204.677,106.643c-3.812,0-7.412,0.979-10.701,2.907c-7.053,4.139-14.428,6.327-21.332,6.327
+		c-2.745,0-5.4-0.355-7.892-1.057c-3.285-0.927-8.337-2.033-13.734-2.033c-10.138,0-17.589,3.917-21.547,11.33l-1.323,2.478
+		l1.881,2.086c2.528,2.803,5.326,5.676,8.289,8.718c13.853,14.225,31.094,31.929,39.502,61.69l0.187,0.659l0.396,0.561
+		c3.883,5.5,9.342,8.528,15.369,8.528c7.655,0,14.534-5.078,16.729-12.35l0.155-0.515l0.014-0.537
+		c0.889-34.117,3.764-61.306,8.546-80.812l0.673-2.746l-2.363-1.551C217.296,110.176,211.832,106.643,204.677,106.643
+		L204.677,106.643z"/>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+			c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+			C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+	</g>
+	<defs>
+		<filter id="My_OpacityMaskFilter_2_" filterUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193" id="SVGID_3_">
+		<g filter="url(#My_OpacityMaskFilter_2_)">
+			
+				<image overflow="visible" width="87" height="99" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAIPAAADBQAAA/v/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAGMAVwMBIgACEQEDEQH/
+xACPAAEAAgMBAQAAAAAAAAAAAAAABgcCAwUBBAEBAAAAAAAAAAAAAAAAAAAAABAAAQQBAwMDBQEA
+AAAAAAAAAwECBAYFABAgETESUCETMDIjMxQ0EQACAQEGAwgDAQAAAAAAAAABAgARECAhMUEDcRIi
+MFFhgZGhMkJigrITEgEAAAAAAAAAAAAAAAAAAABQ/9oADAMBAAIRAxEAAACv2ySEXWJ8xBEowI1n
+MZGQLbaXOKmfaNVkVRIS3Ped0jW2jDL0OH24uVm+YYgk1lUhMSzffm+kA8hE2rwggAGeAsia0lbB
+2HnphWlk1YRcAACawr7i7tnJ6xpqi1anI+AAACxJvS0zJXU0ihhpAAAA2BjiAH//2gAIAQIAAQUA
+9K//2gAIAQMAAQUA9K//2gAIAQEAAQUA5iCUzolalGSTWXiaSK8ZwAed+Oq7TIyoBVkmkjVCUuQj
+kpkpVh0j3gVUAdCxYRtzEQYxS3IuZxUhgj4MgSNY1nirGLpY4l1/MLSDY3exERkd5PLJ6r+efGLi
+8kOSPlbDeEfz/JtWs+QBMdPZIHwXtdJHhH3RVatWsDmrEktOPd/23cifFwCV4SVTOIcY3o9uxPZl
+4d15YbIOhSsJkGyA7SF6CuhXKflTcu7QSIQepX6bj/q5YeUsWbhJaGBqYvQFtIjpnJFVFqOU8gjM
+x7clIY0Nkej5/PEZR0EsWzj+PKWZijlSHSDfQH2J32//2gAIAQICBj8AK//aAAgBAwIGPwAr/9oA
+CAEBAQY/AL/LtqWPhAz1A7hKioMXZObMFHmaQInmYC45ie+U5B6Q8q0PhDysaT5H0gO6C3GDoA8p
+QARjTSbQ0G4n9CAPqc4tKQUExE+M+MwFrcINyuH+qmvAixdrdbDQwY1rffgZz/lze9bRs7rYaEwY
+1umPwNwMpoRkYuzut1CAg3DGBOeF1dxDRlNYqserIiBhraZT8heU16GIBi41qLWgXQm+Nl26lwgY
+WNF4m+jaMaGLjpY0C61JvgjMZRAxxgNYwrpCR49gAT0EwdfvCA2cbcbXLsfv+s+37W//2Q==" transform="matrix(1 0 0 1 131 108)">
+			</image>
+		</g>
+	</mask>
+	<g opacity="0.6" mask="url(#SVGID_3_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#7F3E03" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+			c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+			C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+	</g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M210.936,113.796
+	c-11.983,64.227-22.738,60.791-73.726,11.721c0.148-11.045,22.734-5.193,27.431-4c9.14,2.331,19.844,0.864,27.954-4.462
+	C202.85,110.315,210.936,113.796,210.936,113.796z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+	c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+	c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+	c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M166.77,188.01c5.25,0.61,8.37,11.49,9.67,19.44c1.33,8.17,1.33,16.76-4.05,17.47
+	c-8.06,1.08-11.67-21.93-11.67-21.93C158.28,187.29,166.77,188.01,166.77,188.01z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M229.86,192.56c0.99,10.209-3.431,23.959-6.57,24.39
+	c-6.29,0.85-7.51-9.05-7.72-10.7c-0.41-3.3-3.061-24.76,7.939-26.25C228.33,182,229.45,189.26,229.86,192.56z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M216.51,195.85c0.93-8.26,11.79-5.08,11.79,2.86
+	c0,7.95-2.1,14.261-4.34,16.21C217.75,220.32,215.58,204.12,216.51,195.85z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M163.09,206.33c-1.19-8.13,9.59-8.43,11.57-0.891
+	c1.97,7.551,1.6,14.181,0.02,16.721C170.3,229.18,164.28,214.45,163.09,206.33z"/>
+<rect x="701" y="306" fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" stroke="#1F1F1F" stroke-width="20" stroke-linecap="round" stroke-linejoin="round" width="14" height="34"/>
+<circle fill-rule="evenodd" clip-rule="evenodd" fill="#FFFF33" cx="182.5" cy="139.5" r="11.5"/>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+			c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+		<path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+			c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+			C139,114.67,149.33,119.26,149.33,127.79z"/>
+	</g>
+	<defs>
+		<filter id="My_OpacityMaskFilter_3_" filterUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33" id="SVGID_4_">
+		<g filter="url(#My_OpacityMaskFilter_3_)">
+			
+				<image overflow="visible" width="39" height="35" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGnAAAB+QAAAmr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIACMAJwMBIgACEQEDEQH/
+xAB9AAEAAgMBAAAAAAAAAAAAAAAABgcBBAUDAQEAAAAAAAAAAAAAAAAAAAAAEAACAwEAAwEBAAAA
+AAAAAAADBAECBQYQMBEAExEBAAIBAwMDBQAAAAAAAAAAAQACETFBAxBxEiGBkcEiMhMEEgEAAAAA
+AAAAAAAAAAAAAAAw/9oADAMBAAIRAxEAAACAdvxtYgHEurklMuyNm1aPm5YOlHo4aqPjzBnAAf/a
+AAgBAgABBQD0/wD/2gAIAQMAAQUA9P8A/9oACAEBAAEFAIibTncyy3BOKvFH8NxOfk/edThlzMzx
+CDIRzGvlhIJ7PgO1yJKUZSJW4f2kwMYdRql91Nu6h8rrhQMnYLRXY67+1bHJY/ifP//aAAgBAgIG
+PwAf/9oACAEDAgY/AB//2gAIAQEBBj8AAMroQtfIOxM1yMVq2qb7zG8GxkrKvjtMeJLPiaTg4g+3
+l5aVx3sER1zK4elhdp/JjSvPxq9rkOWm2pAvfCajPzPmWpwvks/eubli3uevU+vX/9k=" transform="matrix(1 0 0 1 114 111)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_4_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+			c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+		<path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+			c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+			C139,114.67,149.33,119.26,149.33,127.79z"/>
+	</g>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+			c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+		<path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+			c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+	</g>
+	<defs>
+		<filter id="My_OpacityMaskFilter_4_" filterUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239" id="SVGID_5_">
+		<g filter="url(#My_OpacityMaskFilter_4_)">
+			
+				<image overflow="visible" width="34" height="31" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGWAAAB3QAAAkb/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAB8AIgMBIgACEQEDEQH/
+xAB4AAADAQEAAAAAAAAAAAAAAAAABQcGAwEBAAAAAAAAAAAAAAAAAAAAABAAAgIDAQEAAAAAAAAA
+AAAAAgMEBQABBiASEQACAQMDAwUAAAAAAAAAAAABAgAREgMQITFRsQRBcdEiYhIBAAAAAAAAAAAA
+AAAAAAAAIP/aAAwDAQACEQMRAAAAwTkqRLU1vnZkQBrUoy5KrPV6Y5gH/9oACAECAAEFAPX/2gAI
+AQMAAQUA9f/aAAgBAQABBQBSjccbl5Tgk8tMSLksSecugGya+CnSpUBJr6ysBesoJuosystUkmVa
+IBfU2i2awfr6iTrxYSLC/MH7cR5//9oACAECAgY/AF//2gAIAQMCBj8AX//aAAgBAQEGPwAJjFWM
+DEkE9BLlNfcQpkFrDQ3DgiA0h2EbIg+y76C40Dd4tWHENGEZFNSdhoLa3elOYBi8fK46hGPYSj+P
+mQdTjf4hOe6/9Cmn/9k=" transform="matrix(1 0 0 1 202 101)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_5_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+			c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+		<path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+			c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+	</g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M116,85c4-22.67,16.33-29.33,23.67-27.67
+	c7.33,1.67,20,11,30,11c12.33,0,16.66-3,23.66-8.66c7-5.67,10.31,2.33,10,12.33C203,83,207,91.67,204,92s-10.67-18-19-11
+	c-5.33,10.67-2,25.67-12.33,27c-6.7,0.86-21.67-3.67-35-19c-3.07-3.52-12-6-15,1c-3.33,7.75-3.34,4.67-5,8
+	C116.61,100.11,114.86,91.45,116,85z"/>
+<g>
+	<g>
+		<circle fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" cx="169" cy="29" r="26"/>
+		<circle fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+	</g>
+	<defs>
+		<filter id="My_OpacityMaskFilter_5_" filterUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55" id="SVGID_6_">
+		<g filter="url(#My_OpacityMaskFilter_5_)">
+			
+				<image overflow="visible" width="60" height="60" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAHLAAACZwAAAyD/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIADwAPAMBIgACEQEDEQH/
+xACFAAACAwEBAQAAAAAAAAAAAAAABwIFBgQBAwEBAAAAAAAAAAAAAAAAAAAAABAAAQQBBAMBAAAA
+AAAAAAAAAgEDBAYFABARFCBAExIRAAEDAgQFBAMAAAAAAAAAAAEAEQJBEiAhMQMQUXGRImGhwWKx
+MhMSAQAAAAAAAAAAAAAAAAAAAED/2gAMAwEAAhEDEQAAAF/6bAorJk9gpKZ5Z8UxYV5aNtbNU+no
+BGQYVdN9TFy2Ua0TUEZB4cpQqvS5cO7hBi3ag+w0chmYEogf/9oACAECAAEFAPQ//9oACAEDAAEF
+APQ//9oACAEBAAEFANIiksKvzpWhpcpUkVGY0MmFIilsiKS1qtfXUPFMMAjDSaciMuJmq4xIby+M
+PHyNV+F2p2KhgwxuYoQ3HFibPC80sUWUwnDXhZwRY34XuVGQLUyI4jjPha5YhH/afaFJKLIrmbbf
+ZAxNNps1thu15rsObY3KyIDmKuDJiNnjKMq2RwHM2w5GnDNw9055HucH9uN//9oACAECAgY/AAf/
+2gAIAQMCBj8AB//aAAgBAQEGPwBAAOToEDbbE909x7ImJJPqFbvQI9acQAHJ0Cjvb0Xkc86IC0L9
+QmMQpeALoxY2HQ8uEXDxj+VFhTAQaqcgMxmFbXRlJ+YUemGfRW/f5RiTmSCokcsMw9Cr6XXe7qG9
+Ghz6KHlqE8S/EknNS2ISd9enEGBeD5hASmx5FPeESJjujDYLvWiM5l5HU4PHWjI2/wBGrqvO5vs/
+zg//2Q==" transform="matrix(1 0 0 1 139 -1)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_6_)">
+		<circle fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" cx="169" cy="29" r="26"/>
+		<circle fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+	</g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M149,22.33c13.33-26.66,39.67-9,40.67,3.34
+	C190.67,38,141.58,37.17,149,22.33z"/>
+</svg>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/assembly/job.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/assembly/job.xml b/community/mahout-mr/mr/src/main/assembly/job.xml
new file mode 100644
index 0000000..2bdb3ce
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/assembly/job.xml
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly
+  xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
+    http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+  <id>job</id>
+  <formats>
+   <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <dependencySets>
+    <dependencySet>
+      <unpack>true</unpack>
+      <unpackOptions>
+        <!-- MAHOUT-1126 -->
+        <excludes>
+          <exclude>META-INF/LICENSE</exclude>
+        </excludes>
+      </unpackOptions>
+      <scope>runtime</scope>
+      <outputDirectory>/</outputDirectory>
+      <useTransitiveFiltering>true</useTransitiveFiltering>
+      <excludes>
+        <exclude>org.apache.hadoop:hadoop-core</exclude>
+      </excludes>
+    </dependencySet>
+  </dependencySets>
+  <fileSets>
+    <fileSet>
+      <directory>${basedir}/target/classes</directory>
+      <outputDirectory>/</outputDirectory>
+      <excludes>
+        <exclude>*.jar</exclude>
+      </excludes>
+    </fileSet>
+    <fileSet>
+      <directory>${basedir}/target/classes</directory>
+      <outputDirectory>/</outputDirectory>
+      <includes>
+        <include>driver.classes.default.props</include>
+      </includes>
+    </fileSet>
+  </fileSets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/assembly/src.xml b/community/mahout-mr/mr/src/main/assembly/src.xml
new file mode 100644
index 0000000..0bb8e8b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/assembly/src.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+  <id>src</id>
+  <formats>
+    <format>dir</format>
+    <format>tar.gz</format>
+  </formats>
+  <fileSets>
+    <fileSet>
+      <directory>${project.basedir}/..</directory>
+      <outputDirectory/>
+      <useDefaultExcludes>true</useDefaultExcludes>
+      <includes>
+        <include>**/README*</include>
+        <include>**/LICENSE*</include>
+        <include>**/NOTICE*</include>
+        <include>**/pom.xml</include>
+        <include>**/src/**</include>
+        <include>src/conf/**</include>
+        <include>**/build.xml</include>
+        <include>**/*.properties</include>
+      </includes>
+      <excludes>
+        <exclude>**/target/**</exclude>
+      </excludes>
+    </fileSet>
+    <fileSet>
+      <directory>${project.basedir}/../bin</directory>
+      <outputDirectory>bin</outputDirectory>
+      <useDefaultExcludes>true</useDefaultExcludes>
+      <fileMode>0755</fileMode>
+      <directoryMode>0755</directoryMode>
+    </fileSet>
+    <fileSet>
+      <directory>${project.basedir}/../examples/bin</directory>
+      <outputDirectory>examples/bin</outputDirectory>
+      <useDefaultExcludes>true</useDefaultExcludes>
+      <fileMode>0755</fileMode>
+      <directoryMode>0755</directoryMode>
+      <excludes>
+        <exclude>work</exclude>
+        <exclude>work/**</exclude>
+      </excludes>
+    </fileSet>
+  </fileSets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java
new file mode 100644
index 0000000..5f3c879
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Resources;
+
+import java.io.IOException;
+
+/**
+ * Utility for reporting the Mahout build version at runtime.
+ *
+ * <p>Two sources are consulted: the jar manifest's
+ * {@code Implementation-Version} attribute, and a classpath resource
+ * literally named {@code version} (presumably generated at build time —
+ * confirm against the module's build configuration).</p>
+ */
+public final class Version {
+
+  // Utility class: no instances.
+  private Version() {
+  }
+
+  /**
+   * Returns the {@code Implementation-Version} recorded in this class's
+   * package metadata (jar manifest). May be {@code null} when the class is
+   * not loaded from a packaged jar (e.g. running from an IDE's classes dir).
+   */
+  public static String version() {
+    return Version.class.getPackage().getImplementationVersion();
+  }
+
+  /**
+   * Reads the classpath resource named {@code version} and returns its
+   * contents decoded as UTF-8.
+   *
+   * @throws IOException if the resource cannot be read
+   */
+  public static String versionFromResource() throws IOException {
+    return Resources.toString(Resources.getResource("version"), Charsets.UTF_8);
+  }
+
+  /** Prints both version representations, space-separated, to stdout. */
+  public static void main(String[] args) throws IOException {
+    System.out.println(version() + ' ' + versionFromResource());
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
new file mode 100644
index 0000000..1ac5b72
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * <p>
+ * Exception indicating that a requested item ID was not found.
+ * </p>
+ */
+public final class NoSuchItemException extends TasteException {
+  
+  public NoSuchItemException() { }
+
+  /**
+   * @param itemID unknown item ID, recorded as the exception message
+   */
+  public NoSuchItemException(long itemID) {
+    this(String.valueOf(itemID));
+  }
+  
+  /**
+   * @param message detail message, typically the unknown item ID
+   */
+  public NoSuchItemException(String message) {
+    super(message);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
new file mode 100644
index 0000000..cbb60fa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * <p>
+ * Exception indicating that a requested user ID was not found.
+ * </p>
+ */
+public final class NoSuchUserException extends TasteException {
+  
+  public NoSuchUserException() { }
+
+  /**
+   * @param userID unknown user ID, recorded as the exception message
+   */
+  public NoSuchUserException(long userID) {
+    this(String.valueOf(userID));
+  }
+  
+  /**
+   * @param message detail message, typically the unknown user ID
+   */
+  public NoSuchUserException(String message) {
+    super(message);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
new file mode 100644
index 0000000..9b26bee
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+import java.util.Collection;
+
+/**
+ * <p>
+ * Implementations of this interface have state that can be periodically refreshed. For example, an
+ * implementation instance might contain some pre-computed information that should be periodically refreshed.
+ * The {@link #refresh(Collection)} method triggers such a refresh.
+ * </p>
+ * 
+ * <p>
+ * All Taste components implement this. In particular,
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}s do. Callers may want to call
+ * {@link #refresh(Collection)} periodically to re-compute information throughout the system and bring it up
+ * to date, though this operation may be expensive.
+ * </p>
+ */
+public interface Refreshable {
+  
+  /**
+   * <p>
+   * Triggers "refresh" -- whatever that means -- of the implementation. The general contract is that any
+   * {@link Refreshable} should always leave itself in a consistent, operational state, and that the refresh
+   * atomically updates internal state from old to new.
+   * </p>
+   * 
+   * @param alreadyRefreshed
+   *          {@link org.apache.mahout.cf.taste.common.Refreshable}s that are known to have already been
+   *          refreshed as a result of an initial call to a {@link #refresh(Collection)} method on some
+   *          object. This ensures that objects in a refresh dependency graph aren't refreshed twice
+   *          needlessly.
+   */
+  void refresh(Collection<Refreshable> alreadyRefreshed);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
new file mode 100644
index 0000000..1792eff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * <p>
+ * An exception thrown when an error occurs inside the Taste engine.
+ * </p>
+ */
+public class TasteException extends Exception {
+  
+  public TasteException() { }
+  
+  /**
+   * @param message detail message describing the error
+   */
+  public TasteException(String message) {
+    super(message);
+  }
+  
+  /**
+   * @param cause underlying exception that triggered this one
+   */
+  public TasteException(Throwable cause) {
+    super(cause);
+  }
+  
+  /**
+   * @param message detail message describing the error
+   * @param cause underlying exception that triggered this one
+   */
+  public TasteException(String message, Throwable cause) {
+    super(message, cause);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
new file mode 100644
index 0000000..4e39617
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * <p>
+ * A simple enum which gives symbolic names to the ideas of "weighted" and "unweighted", to make various API
+ * calls which take a weighting parameter more readable.
+ * </p>
+ */
+public enum Weighting {
+  
+  /** Symbolic name for "weighted". */
+  WEIGHTED,
+  /** Symbolic name for "unweighted". */
+  UNWEIGHTED
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
new file mode 100644
index 0000000..875c65e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+/**
+ * <p>
+ * Implementations of this inner interface are simple helper classes which create a {@link DataModel} to be
+ * used while evaluating a {@link org.apache.mahout.cf.taste.recommender.Recommender}.
+ * </p>
+ * 
+ * @see RecommenderBuilder
+ * @see RecommenderEvaluator
+ */
+public interface DataModelBuilder {
+  
+  /**
+   * <p>
+   * Builds a {@link DataModel} implementation to be used in an evaluation, given training data.
+   * </p>
+   * 
+   * @param trainingData
+   *          data to be used in the {@link DataModel}
+   * @return {@link DataModel} based upon the given data
+   */
+  DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
new file mode 100644
index 0000000..9c442ff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+/**
+ * <p>
+ * Implementations encapsulate information retrieval-related statistics about a
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations.
+ * </p>
+ * 
+ * <p>
+ * See <a href="http://en.wikipedia.org/wiki/Information_retrieval">Information retrieval</a>.
+ * </p>
+ */
+public interface IRStatistics {
+  
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Precision">Precision</a>.
+   * </p>
+   *
+   * @return precision of the evaluated recommendations
+   */
+  double getPrecision();
+  
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Recall">Recall</a>.
+   * </p>
+   *
+   * @return recall of the evaluated recommendations
+   */
+  double getRecall();
+  
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Fall-Out">Fall-Out</a>.
+   * </p>
+   *
+   * @return fall-out of the evaluated recommendations
+   */
+  double getFallOut();
+  
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#F-measure">F-measure</a>.
+   * </p>
+   *
+   * @return F1 measure (harmonic mean of precision and recall)
+   */
+  double getF1Measure();
+  
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#F-measure">F-measure</a>.
+   * </p>
+   *
+   * @param n weighting of recall relative to precision in the F-measure
+   * @return F(n) measure
+   */
+  double getFNMeasure(double n);
+
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG">
+   * Normalized Discounted Cumulative Gain</a>.
+   * </p>
+   *
+   * @return normalized discounted cumulative gain of the evaluated recommendations
+   */
+  double getNormalizedDiscountedCumulativeGain();
+  
+  /**
+   * @return the fraction of all users for whom recommendations could be produced
+   */
+  double getReach();
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java
new file mode 100644
index 0000000..1805092
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+/**
+ * <p>
+ * Implementations of this inner interface are simple helper classes which create a {@link Recommender} to be
+ * evaluated based on the given {@link DataModel}.
+ * </p>
+ * 
+ * @see DataModelBuilder
+ * @see RecommenderEvaluator
+ */
+public interface RecommenderBuilder {
+  
+  /**
+   * <p>
+   * Builds a {@link Recommender} implementation to be evaluated, using the given {@link DataModel}.
+   * </p>
+   * 
+   * @param dataModel
+   *          {@link DataModel} to build the {@link Recommender} on
+   * @return {@link Recommender} based upon the given {@link DataModel}
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  Recommender buildRecommender(DataModel dataModel) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java
new file mode 100644
index 0000000..dcbbcf8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * <p>
+ * Implementations of this interface evaluate the quality of a
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations.
+ * </p>
+ */
+public interface RecommenderEvaluator {
+  
+  /**
+   * <p>
+   * Evaluates the quality of a {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations.
+   * The range of values that may be returned depends on the implementation, but <em>lower</em> values must
+   * mean better recommendations, with 0 being the lowest / best possible evaluation, meaning a perfect match.
+   * This method does not accept a {@link org.apache.mahout.cf.taste.recommender.Recommender} directly, but
+   * rather a {@link RecommenderBuilder} which can build the
+   * {@link org.apache.mahout.cf.taste.recommender.Recommender} to test on top of a given {@link DataModel}.
+   * </p>
+   *
+   * <p>
+   * Implementations will take a certain percentage of the preferences supplied by the given {@link DataModel}
+   * as "training data". This is typically most of the data, like 90%. This data is used to produce
+   * recommendations, and the rest of the data is compared against estimated preference values to see how much
+   * the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s predicted preferences match the user's
+   * real preferences. Specifically, for each user, this percentage of the user's ratings are used to produce
+   * recommendations, and for each user, the remaining preferences are compared against the user's real
+   * preferences.
+   * </p>
+   *
+   * <p>
+   * For large datasets, it may be desirable to only evaluate based on a small percentage of the data.
+   * {@code evaluationPercentage} controls how many of the {@link DataModel}'s users are used in
+   * evaluation.
+   * </p>
+   *
+   * <p>
+   * To be clear, {@code trainingPercentage} and {@code evaluationPercentage} are not related. They
+   * do not need to add up to 1.0, for example.
+   * </p>
+   *
+   * @param recommenderBuilder
+   *          object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test
+   * @param dataModelBuilder
+   *          {@link DataModelBuilder} to use, or if null, a default {@link DataModel}
+   *          implementation will be used
+   * @param dataModel
+   *          dataset to test on
+   * @param trainingPercentage
+   *          percentage of each user's preferences to use to produce recommendations; the rest are compared
+   *          to estimated preference values to evaluate
+   *          {@link org.apache.mahout.cf.taste.recommender.Recommender} performance
+   * @param evaluationPercentage
+   *          percentage of users to use in evaluation
+   * @return a "score" representing how well the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s
+   *         estimated preferences match real values; <em>lower</em> scores mean a better match and 0 is a
+   *         perfect match
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  double evaluate(RecommenderBuilder recommenderBuilder,
+                  DataModelBuilder dataModelBuilder,
+                  DataModel dataModel,
+                  double trainingPercentage,
+                  double evaluationPercentage) throws TasteException;
+
+  /**
+   * @deprecated see {@link DataModel#getMaxPreference()}
+   */
+  @Deprecated
+  float getMaxPreference();
+
+  /**
+   * @deprecated see {@link DataModel#getMaxPreference()}
+   */
+  @Deprecated
+  void setMaxPreference(float maxPreference);
+
+  /**
+   * @deprecated see {@link DataModel#getMinPreference()}
+   */
+  @Deprecated
+  float getMinPreference();
+
+  /**
+   * @deprecated see {@link DataModel#getMinPreference()}
+   */
+  @Deprecated
+  void setMinPreference(float minPreference);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
new file mode 100644
index 0000000..6e4e9c7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+
+/**
+ * <p>
+ * Implementations collect information retrieval-related statistics on a
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s performance, including precision, recall and
+ * f-measure.
+ * </p>
+ * 
+ * <p>
+ * See <a href="http://en.wikipedia.org/wiki/Information_retrieval">Information retrieval</a>.
+ * </p>
+ */
+public interface RecommenderIRStatsEvaluator {
+  
+  /**
+   * @param recommenderBuilder
+   *          object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test
+   * @param dataModelBuilder
+   *          {@link DataModelBuilder} to use, or if null, a default {@link DataModel} implementation will be
+   *          used
+   * @param dataModel
+   *          dataset to test on
+   * @param rescorer
+   *          if any, to use when computing recommendations
+   * @param at
+   *          as in, "precision at 5". The number of recommendations to consider when evaluating precision,
+   *          etc.
+   * @param relevanceThreshold
+   *          items whose preference value is at least this value are considered "relevant" for the purposes
+   *          of computations
+   * @param evaluationPercentage
+   *          percentage of users to use in evaluation
+   * @return {@link IRStatistics} with resulting precision, recall, etc.
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  IRStatistics evaluate(RecommenderBuilder recommenderBuilder,
+                        DataModelBuilder dataModelBuilder,
+                        DataModel dataModel,
+                        IDRescorer rescorer,
+                        int at,
+                        double relevanceThreshold,
+                        double evaluationPercentage) throws TasteException;
+  
+}


[28/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/pom.xml b/community/mahout-mr/mr/pom.xml
new file mode 100644
index 0000000..0f28588
--- /dev/null
+++ b/community/mahout-mr/mr/pom.xml
@@ -0,0 +1,295 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.mahout</groupId>
+    <artifactId>mahout-mr</artifactId>
+    <version>0.14.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <!-- modules inherit parent's group id and version. -->
+  <artifactId>mr</artifactId>
+  <name>-- Mahout Classic: Algorithms and Code</name>
+  <description>Scalable machine learning libraries</description>
+
+  <packaging>jar</packaging>
+
+  <properties>
+    <hadoop.version>2.4.1</hadoop.version>
+    <lucene.version>5.5.2</lucene.version>
+  </properties>
+  <build>
+    <resources>
+      <resource>
+        <directory>mr/src/main/resources</directory>
+      </resource>
+      <resource>
+        <directory>../src/conf</directory>
+        <includes>
+          <include>driver.classes.default.props</include>
+        </includes>
+      </resource>
+    </resources>
+    <plugins>
+      <!-- ensure licenses -->
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+      </plugin>
+      
+      <!-- copy jars to lib/ -->
+      <plugin>
+        <artifactId>maven-antrun-plugin</artifactId>
+        <version>1.4</version>
+        <executions>
+          <execution>
+            <id>copy</id>
+            <phase>package</phase>
+            <configuration>
+              <tasks>
+                <copy file="target/mahout-mr-${project.version}.jar" tofile="../../lib/mahout-mr-${project.version}.jar" />
+              </tasks>
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+      <!-- delete files on mvn clean -->
+      <plugin>
+        <artifactId>maven-clean-plugin</artifactId>
+        <version>3.0.0</version>
+        <configuration>
+          <filesets>
+            <fileset>
+              <directory>../../lib/</directory>
+              <includes>
+                <include>mahout-mr_*.jar</include>
+              </includes>
+              <followSymlinks>false</followSymlinks>
+            </fileset>
+          </filesets>
+        </configuration>
+      </plugin>
+      <!-- create test jar so other modules can reuse the core test utility classes. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+      <!-- create core hadoop job jar -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>job</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <descriptors>
+                <descriptor>src/main/assembly/job.xml</descriptor>
+              </descriptors>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
+      <plugin>
+        <artifactId>maven-javadoc-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <artifactId>maven-source-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-remote-resources-plugin</artifactId>
+        <configuration>
+          <appendedResourcesDirectory>src/main/resources</appendedResourcesDirectory>
+          <resourceBundles>
+            <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
+          </resourceBundles>
+          <supplementalModels>
+            <supplementalModel>supplemental-models.xml</supplementalModel>
+          </supplementalModels>
+        </configuration>
+      </plugin>
+
+      <!-- remove jars from top directory on clean. NOTE(review): this duplicates the maven-clean-plugin
+           declaration earlier in this <plugins> section; Maven warns on duplicate plugin declarations and
+           the later one wins — the two filesets should be consolidated into a single declaration. -->
+      <plugin>
+        <artifactId>maven-clean-plugin</artifactId>
+        <version>3.0.0</version>
+        <configuration>
+          <filesets>
+            <fileset>
+              <directory>../../lib</directory>
+              <includes>
+                <include>mahout-mr*.jar</include>
+              </includes>
+              <followSymlinks>false</followSymlinks>
+            </fileset>
+          </filesets>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+
+
+
+    <!-- Third Party -->
+
+    <dependency>
+      <groupId>com.tdunning</groupId>
+      <artifactId>t-digest</artifactId>
+      <version>3.1</version>
+    </dependency>
+
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <version>11.0.2</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <version>${hadoop.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+      <version>2.7.4</version>
+    </dependency>
+
+
+
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+      <version>3.1</version>
+    </dependency>
+
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>1.2</version>
+    </dependency>
+
+    <dependency>
+      <groupId>com.thoughtworks.xstream</groupId>
+      <artifactId>xstream</artifactId>
+      <version>1.4.4</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-core</artifactId>
+      <version>${lucene.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-analyzers-common</artifactId>
+      <version>${lucene.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.mahout.commons</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>2.0-mahout</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-math3</artifactId>
+      <version>3.2</version>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.12</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-all</artifactId>
+      <version>1.3</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>com.carrotsearch.randomizedtesting</groupId>
+      <artifactId>randomizedtesting-runner</artifactId>
+      <version>2.0.15</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.easymock</groupId>
+      <artifactId>easymock</artifactId>
+      <version>3.2</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.mrunit</groupId>
+      <artifactId>mrunit</artifactId>
+      <version>1.0.0</version>
+      <classifier>hadoop2</classifier>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>commons-httpclient</groupId>
+      <artifactId>commons-httpclient</artifactId>
+      <version>3.0.1</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.solr</groupId>
+      <artifactId>solr-commons-csv</artifactId>
+      <version>3.5.0</version>
+    </dependency>
+
+  </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml b/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml
new file mode 100644
index 0000000..971c72b
--- /dev/null
+++ b/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml
@@ -0,0 +1,279 @@
+<supplementalDataModels>
+  <!-- missing: Maven Profile Model -->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-profile</artifactId>
+    <name>Maven Profile Model</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://maven.apache.org/ref/2.1.0/maven-profile/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- missing: Maven Project Builder -->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-project</artifactId>
+    <name>Maven Project Builder</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://maven.apache.org/ref/2.1.0/maven-project/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- missing: Maven Local Settings  -->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-settings</artifactId>
+    <name>Maven Local Settings</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://maven.apache.org/ref/2.1.0/maven-settings/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Maven Repository Metadata Model -->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-repository-metadata</artifactId>
+    <name>Maven Repository Metadata Model</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://maven.apache.org/ref/2.1.0/maven-repository-metadata/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Maven Model -->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-model</artifactId>
+    <name>Maven Model</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://maven.apache.org/ref/2.0.8/maven-model/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Maven Artifact -->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-artifact</artifactId>
+    <name>Maven Artifact</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Maven Artifact Manager-->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-artifact-manager</artifactId>
+    <name>Maven Artifact Manager</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Maven Plugin API -->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>maven-plugin-api</artifactId>
+    <name>Maven Plugin API</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Maven Wagon API-->
+  <supplement>
+    <project>
+    <groupId>org.apache.maven</groupId>
+    <artifactId>wagon-provider-api</artifactId>
+    <name>Maven Wagon API</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Shade Maven Plugin -->
+  <supplement>
+    <project>
+    <groupId>org.codehaus.mojo</groupId>
+    <artifactId>shade-maven-plugin</artifactId>
+    <name>Shade Maven Plugin</name>
+    <licenses>
+      <license>
+        <name>UNKNOWN</name>
+        <url>UNKNOWN</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+ <!-- junit -->
+  <supplement>
+    <project>
+    <groupId>junit</groupId>
+    <artifactId>junit</artifactId>
+    <name>Junit Unit testing library</name>
+    <licenses>
+      <license>
+        <name>Common Public License - v 1.0</name>
+        <url>http://junit.sourceforge.net/cpl-v10.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- jdom -->
+  <supplement>
+    <project>
+    <groupId>jdom</groupId>
+    <artifactId>jdom</artifactId>
+    <name>JDom</name>
+    <licenses>
+      <license>
+        <name>UNKNOWN</name>
+        <url>UNKNOWN</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- asm -->
+  <supplement>
+    <project>
+    <groupId>asm</groupId>
+    <artifactId>asm-all</artifactId>
+    <name>ASM ALL</name>
+    <licenses>
+      <license>
+        <name>UNKNOWN</name>
+        <url>http://asm.ow2.org/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Default Plexus Container -->
+  <supplement>
+    <project>
+    <groupId>org.codehaus.plexus</groupId>
+    <artifactId>plexus-container-default</artifactId>
+    <name>Default Plexus Container</name>
+    <licenses>
+      <license>
+        <name>UNKNOWN</name>
+        <url>UNKNOWN</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Classworlds -->
+  <supplement>
+    <project>
+    <groupId>org.codehaus.classworlds</groupId>
+    <artifactId>classworlds</artifactId>
+    <name>Classworlds</name>
+    <licenses>
+      <license>
+        <name>Classworlds License</name>
+        <url>http://classworlds.codehaus.org/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Plexus Common Utilities -->
+  <supplement>
+    <project>
+    <groupId>org.codehaus.plexus</groupId>
+    <artifactId>plexus-utils</artifactId>
+    <name>Plexus Common Utilities</name>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://plexus.codehaus.org/plexus-utils/license.html</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Commons Codec  -->
+  <supplement>
+    <project>
+    <groupId>commons-codec</groupId>
+    <artifactId>commons-codec</artifactId>
+    <name>Commons Codec</name>
+    <url>http://commons.apache.org/codec/</url>
+    <organization>
+          <name>Apache Software Foundation</name>
+          <url>http://www.apache.org/</url>
+    </organization>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://www.apache.org/licenses/LICENSE-2.0</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Commons CLI  -->
+  <supplement>
+    <project>
+    <groupId>org.apache.mahout.commons</groupId>
+    <artifactId>commons-cli</artifactId>
+    <name>Commons CLI</name>
+    <url>http://commons.apache.org/cli/</url>
+    <organization>
+          <name>Apache Software Foundation</name>
+          <url>http://www.apache.org/</url>
+    </organization>
+    <licenses>
+      <license>
+        <name>The Apache Software License, Version 2.0</name>
+        <url>http://www.apache.org/licenses/LICENSE-2.0</url>
+      </license>
+    </licenses>
+    </project>
+  </supplement>
+  <!-- Xpp3  -->
+  <supplement>
+    <project>
+    <name>Xml Pull Parser 3rd Edition</name>
+    <groupId>xpp3</groupId>
+    <artifactId>xpp3_min</artifactId>
+    <url>http://www.extreme.indiana.edu/xgws/xsoap/xpp/mxp1/</url>
+    <licenses>
+      <license>
+        <name>Public Domain</name>
+        <url>http://www.xmlpull.org/</url>
+      </license>
+    </licenses>
+    </project>
+ </supplement>
+</supplementalDataModels>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg b/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg
new file mode 100644
index 0000000..ce3ea9f
--- /dev/null
+++ b/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg	
@@ -0,0 +1,630 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 13.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 14948)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+	 width="956px" height="400px" viewBox="0 0 956 400" enable-background="new 0 0 956 400" xml:space="preserve">
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M709.799,389.6c-21.38,0-37.761-6.839-48.688-20.322
+		c-0.377-0.467-0.747-0.936-1.11-1.408V376c0,5.523-4.478,10.001-10.001,10.001h-28.6c-5.522,0-10-4.478-10-10.001v-64.87
+		c0-4.989-0.908-7.693-1.669-9.083c-0.053-0.096-0.104-0.194-0.154-0.292c-0.32-0.634-0.987-1.954-5.366-1.954
+		c-5.29,0-7.384,1.85-8.617,3.464c-2.353,3.069-3.593,8.255-3.593,15.005V376c0,5.523-4.477,10.001-10,10.001h-27.8
+		c-0.756,0-1.492-0.085-2.201-0.244c-0.708,0.159-1.444,0.244-2.2,0.244h-30.271c-3.453,0-6.61-1.776-8.425-4.61
+		c-0.791,0.505-1.595,0.995-2.412,1.471c-7.595,4.351-16.133,6.54-25.442,6.54c-11.384,0-21.145-3.183-29.042-9.469
+		c-1.529,3.569-5.072,6.068-9.198,6.068h-28.408c-5.523,0-10-4.478-10-10.001v-67.812c0-3.194-0.564-4.789-0.9-5.458
+		c-0.392-0.777-0.97-1.93-4.821-1.93c-4.724,0-5.983,1.728-6.896,3.675c-0.919,2.062-1.383,4.791-1.383,8.114V376
+		c0,5.523-4.477,10.001-10,10.001h-27.8c-5.523,0-10-4.478-10-10.001v-63.33c0-6.95-0.88-9.239-1.055-9.628
+		c-0.349-0.762-0.843-1.841-4.675-1.841c-5.697,0-6.798,1.676-7.151,2.329c-0.298,0.621-1.12,2.837-1.12,8.449V376
+		c0,5.523-4.477,10.001-10,10.001h-28.199c-5.523,0-10-4.478-10-10.001V269.8c0-5.522,4.477-10,10-10h26.999
+		c2.902,0,5.514,1.235,7.34,3.209c6.486-3.852,14.321-5.809,23.34-5.809c10.216,0,18.796,2.437,25.504,7.242
+		c0.185,0.133,0.368,0.272,0.545,0.418c1.322,1.092,2.566,2.262,3.73,3.506c2.438-2.188,5.07-4.048,7.884-5.571
+		c0.07-0.036,0.14-0.073,0.211-0.11c7.126-3.639,15.103-5.484,23.707-5.484c5.958,0,11.882,1.164,17.608,3.456
+		c6.131,2.448,11.667,6.673,16.449,12.554c1.573,1.945,2.946,4.052,4.116,6.313c0.941-1.602,1.974-3.131,3.103-4.586
+		C462.508,263.016,477.94,257,499.041,257c13.235,0,25.249,2.715,35.706,8.067c3.12,1.598,6.458,3.872,9.454,7.101v-39.569
+		c0-5.522,4.477-10,10-10h27.8c5.523,0,10,4.478,10,10v28.484c6.504-2.974,13.447-4.483,20.639-4.483
+		c7.865,0,15.192,1.418,21.774,4.218c7.009,3,12.832,7.627,17.329,13.761c2.014,2.758,3.63,5.599,4.846,8.499
+		c1.368-2.145,2.862-4.229,4.481-6.253c10.92-13.683,27.316-20.624,48.729-20.624c21.414,0,37.812,6.941,48.737,20.633
+		c0.225,0.278,0.444,0.562,0.665,0.843v-8.274c0-5.523,4.477-10,10-10h28.6c5.523,0,10,4.477,10,10v64.358
+		c0,6.407,0.92,8.881,1.203,9.484c0.409,0.88,1.098,2.354,5.816,2.354c6.371,0,8.746-2.222,10.299-5.57
+		c0.86-2.012,1.881-5.809,1.881-12.539v-58.088c0-5.523,4.477-10,10-10h28.201c1.719,0,3.338,0.434,4.749,1.198h2.85v-20.001
+		c0-5.522,4.478-10,10.001-10h27.6c5.522,0,10,4.478,10,10V260.6h7.198c5.523,0,10,4.477,10,10v19.602c0,5.523-4.477,10-10,10H920.4
+		v46.178c0.521,0.013,1.106,0.021,1.76,0.021c0.63,0,1.279-0.023,1.929-0.071c0.704-0.053,1.405-0.129,2.085-0.227
+		c0.475-0.068,0.952-0.103,1.427-0.103c2.388,0,4.717,0.856,6.547,2.442c2.192,1.899,3.451,4.658,3.451,7.558v20.8
+		c0,5.347-4.205,9.745-9.545,9.989l-13.179,0.602c-0.037,0.002-0.076,0.004-0.113,0.004c-1.198,0.042-2.364,0.062-3.501,0.062
+		c-14.403,0-24.539-3.26-30.987-9.963c-2.15-2.205-3.846-4.837-5.072-7.872V376c0,5.523-4.478,10.001-10,10.001H838.2
+		c-3.148,0-5.959-1.456-7.791-3.732c-2.405,1.436-4.804,2.577-7.188,3.416c-5.142,1.804-11.065,2.717-17.621,2.717
+		c-24.711,0-35.835-12.303-40.818-22.626c-0.51-1.045-0.984-2.142-1.422-3.292c-1.476,2.343-3.101,4.608-4.874,6.796
+		C747.562,382.761,731.181,389.6,709.799,389.6L709.799,389.6z M487.944,348.278c0.598,0.447,1.538,0.922,3.414,0.922
+		c4.033,0,7.665-1.15,11.099-3.517c1.935-1.333,2.882-4.174,3.318-7.126c-0.231,0.043-0.465,0.089-0.702,0.133l-6.347,1.172
+		c-6.723,1.191-9.018,2.316-9.562,2.634c-0.961,0.561-1.564,1.024-1.564,3.194C487.601,347.181,487.822,347.995,487.944,348.278
+		L487.944,348.278z M709.751,299.801c-6.414,0-9.15,2.51-10.819,4.697c-3.009,3.937-4.531,10.177-4.531,18.552
+		c0,8.386,1.529,14.651,4.544,18.623c1.671,2.205,4.405,4.728,10.807,4.728c6.375,0,9.085-2.51,10.732-4.697
+		c2.995-3.98,4.517-10.259,4.517-18.653c0-8.384-1.515-14.637-4.504-18.585C718.854,302.297,716.139,299.801,709.751,299.801
+		L709.751,299.801z M491.611,300.711c-0.264,0.336-0.562,0.826-0.854,1.529l7.135-0.875c3.8-0.479,5.996-0.97,7.181-1.304
+		c-1.357-0.335-3.556-0.662-6.974-0.662C493.944,299.399,492.062,300.24,491.611,300.711L491.611,300.711z"/>
+	<path fill="#1F1F1F" d="M582,232.6v50.641c4.02-6.2,8.67-10.52,13.96-12.971c5.28-2.449,10.851-3.67,16.681-3.67
+		c6.549,0,12.5,1.141,17.859,3.42c5.35,2.291,9.74,5.78,13.18,10.471c2.91,3.99,4.7,8.08,5.35,12.289
+		c0.65,4.201,0.971,11.07,0.971,20.601V376h-28.6v-64.87c0-5.739-0.971-10.37-2.9-13.89c-2.51-4.961-7.27-7.44-14.29-7.44
+		c-7.271,0-12.79,2.46-16.56,7.39c-3.771,4.92-5.65,11.951-5.65,21.08V376h-27.8V232.6H582 M910.4,240.6v30H927.6V290.2H910.4
+		v56.409c0,4.371,0.55,7.101,1.649,8.17c1.101,1.08,4.47,1.621,10.11,1.621c0.84,0,1.73-0.03,2.67-0.101
+		c0.939-0.069,1.859-0.17,2.77-0.3v20.8l-13.18,0.601c-1.083,0.037-2.135,0.056-3.161,0.056c-11.429,0-19.356-2.299-23.778-6.896
+		c-3.121-3.201-4.681-8.121-4.681-14.761v-65.6H868V270.6h14.8v-30H910.4 M709.8,266.2c18.3,0,31.94,5.62,40.92,16.87
+		c8.99,11.24,13.48,24.539,13.48,39.88c0,15.6-4.49,28.94-13.48,40.03c-8.979,11.08-22.62,16.619-40.92,16.619
+		s-31.94-5.539-40.92-16.619c-8.989-11.09-13.479-24.431-13.479-40.03c0-15.341,4.49-28.64,13.479-39.88
+		C677.859,271.82,691.5,266.2,709.8,266.2 M709.75,356.4c8.12,0,14.359-2.891,18.72-8.68c4.351-5.781,6.53-14.011,6.53-24.671
+		c0-10.659-2.18-18.87-6.53-24.62c-4.36-5.75-10.6-8.63-18.72-8.63c-8.13,0-14.38,2.88-18.77,8.63
+		c-4.391,5.75-6.58,13.961-6.58,24.62c0,10.66,2.189,18.89,6.58,24.671C695.37,353.51,701.62,356.4,709.75,356.4 M499.04,267
+		c11.69,0,22.069,2.32,31.149,6.971c9.07,4.639,13.61,13.369,13.61,26.18v48.76c0,3.38,0.07,7.48,0.2,12.29
+		c0.2,3.63,0.75,6.09,1.67,7.39c0.92,1.301,2.29,2.37,4.13,3.21v4.2h-30.271c-0.84-2.141-1.43-4.141-1.75-6.02
+		c-0.329-1.881-0.59-4.021-0.779-6.41c-3.859,4.17-8.311,7.72-13.34,10.65c-6.02,3.449-12.82,5.18-20.41,5.18
+		c-9.68,0-17.67-2.75-23.98-8.26c-6.31-5.5-9.47-13.301-9.47-23.4c0-13.1,5.08-22.57,15.23-28.44c5.56-3.19,13.75-5.47,24.55-6.84
+		l9.529-1.17c5.17-0.649,8.871-1.47,11.101-2.44c3.99-1.699,5.99-4.34,5.99-7.92c0-4.359-1.53-7.38-4.601-9.039
+		c-3.06-1.66-7.56-2.49-13.5-2.49c-6.66,0-11.379,1.619-14.14,4.869c-1.979,2.4-3.3,5.641-3.96,9.73h-26.8
+		c0.59-9.311,3.2-16.95,7.84-22.939C468.41,271.689,481.08,267,499.04,267 M491.359,359.2c6.07,0,11.66-1.761,16.771-5.28
+		c5.12-3.529,7.771-9.949,7.97-19.279V324.26c-1.779,1.11-3.58,2.01-5.39,2.69c-1.81,0.69-4.3,1.319-7.47,1.909l-6.33,1.17
+		c-5.93,1.051-10.189,2.32-12.77,3.82c-4.361,2.551-6.541,6.49-6.541,11.84c0,4.771,1.339,8.211,4.009,10.33
+		C484.279,358.141,487.529,359.2,491.359,359.2 M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58
+		c3,3.711,5.02,8.271,6.06,13.67c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959
+		c-2.49-4.961-7.07-7.431-13.75-7.431c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33
+		c0-6.311-0.65-10.9-1.95-13.76c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07
+		V376h-28.2V269.8h27v15.46c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37
+		c4.02,3.32,7.08,7.58,9.15,12.779c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M865.2,269.4V376h-27
+		v-14.96c-0.261,0.33-0.91,1.3-1.95,2.931c-1.04,1.619-2.28,3.049-3.71,4.289c-4.36,3.9-8.57,6.561-12.64,7.99
+		c-4.07,1.43-8.83,2.15-14.301,2.15c-15.74,0-26.35-5.66-31.81-16.971c-3.06-6.27-4.59-15.5-4.59-27.699V269.4h28.6v64.359
+		c0,6.07,0.71,10.641,2.14,13.711c2.53,5.42,7.49,8.129,14.881,8.129c9.47,0,15.959-3.85,19.459-11.56
+		c1.811-4.181,2.721-9.7,2.721-16.55V269.4H865.2 M582,212.6h-27.8c-11.046,0-20,8.954-20,20v21.182
+		C523.599,249.28,511.796,247,499.04,247c-20.979,0-37.309,5.431-48.668,16.161c-5.107-5.312-10.877-9.27-17.208-11.796
+		c-6.893-2.761-14.068-4.165-21.305-4.165c-10.198,0-19.703,2.213-28.252,6.576c-0.145,0.074-0.289,0.149-0.431,0.227
+		c-0.904,0.49-1.792,1.006-2.664,1.55c-8.252-5.543-18.415-8.353-30.233-8.353c-8.355,0-15.932,1.435-22.647,4.278
+		c-2.458-1.08-5.175-1.679-8.032-1.679h-27c-11.045,0-20,8.954-20,20V376c0,11.046,8.955,20,20,20h28.2
+		c7.177,0,13.472-3.781,17-9.459c3.528,5.678,9.823,9.459,17,9.459h27.8c7.177,0,13.471-3.781,17-9.459
+		c3.528,5.678,9.823,9.459,17,9.459h28.41c3.945,0,7.625-1.143,10.724-3.115c8.044,4.328,17.258,6.516,27.516,6.516
+		c9.591,0,18.534-1.975,26.644-5.875c2.891,1.591,6.19,2.475,9.636,2.475H549.8c0.743,0,1.478-0.04,2.2-0.119
+		c0.723,0.079,1.457,0.119,2.2,0.119H582c9.862,0,18.058-7.139,19.7-16.531c1.643,9.393,9.838,16.531,19.7,16.531H650
+		c6.725,0,12.675-3.318,16.3-8.408c11.611,7.979,26.173,12.008,43.5,12.008c22.084,0,39.678-6.547,52.395-19.475
+		c7.525,9.087,20.741,18.275,43.405,18.275c7.69,0,14.732-1.104,20.93-3.281c0.97-0.341,1.939-0.72,2.908-1.136
+		c2.646,1.292,5.62,2.017,8.763,2.017h27c5.679,0,10.805-2.367,14.445-6.168c7.947,5.119,18.379,7.624,31.613,7.624
+		c1.246,0,2.539-0.022,3.843-0.067c0.076-0.003,0.152-0.006,0.229-0.009l13.18-0.601c10.681-0.486,19.09-9.287,19.09-19.979V356
+		c0-5.798-2.516-11.311-6.896-15.108c-2.94-2.551-6.527-4.16-10.304-4.694v-26.191c9.72-1.362,17.199-9.711,17.199-19.806V270.6
+		c0-10.095-7.479-18.443-17.199-19.806V240.6c0-11.046-8.954-20-20-20H882.8c-11.046,0-20,8.954-20,20v8.801H837
+		c-9.677,0-17.747,6.871-19.601,16.001c-1.852-9.13-9.923-16.001-19.6-16.001h-28.6c-6.813,0-12.833,3.408-16.443,8.612
+		c-3.523-2.381-7.322-4.414-11.38-6.087c-9.217-3.799-19.841-5.726-31.577-5.726s-22.36,1.927-31.577,5.726
+		c-7.925,3.267-14.862,7.909-20.695,13.84c-5.208-6.167-11.636-10.911-19.153-14.131c-0.016-0.007-0.031-0.014-0.047-0.021
+		c-7.824-3.327-16.467-5.015-25.687-5.015c-3.604,0-7.156,0.315-10.641,0.943V232.6C602,221.554,593.046,212.6,582,212.6L582,212.6z
+		 M709.75,336.4c-2.254,0-2.562-0.406-2.833-0.764c-0.598-0.787-2.517-3.982-2.517-12.587c0-8.573,1.895-11.722,2.476-12.482
+		c0.263-0.343,0.587-0.768,2.874-0.768c2.241,0,2.542,0.396,2.783,0.715c0.569,0.752,2.467,3.929,2.467,12.535
+		c0,8.638-1.922,11.862-2.511,12.645C712.255,336.006,711.958,336.4,709.75,336.4L709.75,336.4z"/>
+</g>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,388c-14.735,0-16.195-10.601-16.492-15.157
+		c-2.281,0.968-5.548,2.49-8.354,3.8C254.849,383.076,243.715,388,236.499,388c-25.961,0-44.166-21.61-49.72-41.423
+		c-0.496,1.275-1.103,2.539-1.847,3.778l-0.259,0.435l-0.314,0.393C176.217,361.363,147.782,362,147.5,362
+		c-13.223,0-22.925-3.37-28.833-10.014c-3.174-3.572-6.704-9.898-5.668-19.864c-0.076-13.164,4.078-39.976,7.319-50.778l1.604-5.345
+		h5.58H138.5h3.11l2.2,2.203c2.876,2.883,2.6,6.301,2.397,8.795c-0.186,2.297-0.532,6.568-0.213,15.227
+		c0.099,2.286,2.6,9.209,5.635,13.571c2.905-2.996,8.481-10.19,18.777-27.414c1.035-1.731,1.508-2.521,1.855-3.041l4.312-6.47
+		c-2.459-5.737-5.025-12.35-5.561-21.953L171,256.709V256.5c0-1.624,0.272-3.165,0.536-4.656c0.063-0.36,0.141-0.801,0.208-1.223
+		c-1.643-1.128-3.838-2.151-6.127-3.218c-2.111-0.98-4.292-1.997-6.398-3.256c-0.369-0.209-0.729-0.422-1.082-0.644
+		c0.54,1.213,0.862,2.522,0.862,3.996c0,3.947-4.782,14.335-8.793,22.354l-1.476,2.949l-3.169,0.907
+		c-4.74,1.354-14.83,1.837-22.691,1.837c-3.454,0-7.977-0.087-12.869-0.412v1.364c0,1.262,0.242,3.583,0.437,5.449
+		c0.242,2.332,0.392,3.825,0.392,5.05c0,9.626-4.898,16.854-13.795,20.355c-5.908,2.325-12.401,2.646-18.535,2.646
+		c-14.368,0-22.193-2.225-27.005-7.674c-4.93-5.588-4.942-12.66-4.958-20.851c-0.002-1.472-0.006-3.027-0.036-4.666
+		c-0.021-0.987,0.051-4.085,0.19-9.928c0.137-5.841,0.308-13.109,0.308-16.382v-21.002c-4.692-11.946-6.908-23.599-7.928-30.97
+		c-1.042-7.549,0.479-14.029,4.519-19.265c2.714-3.515,6.315-6.117,10.411-8.084v-3.68c0-4.226,0-8.548,0.348-12.964
+		c-0.274-0.091-0.552-0.181-0.833-0.272c-7.121-2.319-15.983-5.204-21.708-11.882C22.598,131.542,17,104.646,17,101.5
+		c0-9.415,5.693-15.501,14.501-15.501C40.835,85.999,46,94.573,46,100.5c0,2.351-0.814,5.752-2.543,12.424
+		c-0.538,2.081-1.261,4.873-1.453,5.927c0.13,5.004,3.026,8.388,5.463,10.36c3.112,2.516,7.279,4.158,11.751,4.679
+		C76.873,88.335,129.009,72,169.499,72c50.34,0,81.615,26.567,86.227,73.024C271.345,139.479,288.758,134,302.5,134
+		c10.265,0,22.501,4.945,22.501,28.5c0,26.976-14.824,65.562-47.938,90.953l-5.501,4.217l-4.637-5.153
+		c-6.05-6.723-13.757-10.396-24.253-11.562l-1.746-0.194c0.875,3.851,2.273,7.381,3.798,11.227
+		c1.421,3.591,2.943,7.431,4.067,11.781l0.006-0.036L259.498,278c6.913,9.213,14.501,33.549,14.501,46.5
+		c0,0.404-0.011,0.826-0.036,1.263c3.446-4.232,8.916-6.763,15.537-6.763c13.398,0,19.501,8.553,19.501,16.501
+		c0,3.262-1.63,6.604-4.312,11.722c-0.3,0.573-0.668,1.277-1.004,1.936c0.398,0.487,0.848,1.01,1.231,1.457
+		c3.22,3.751,8.084,9.422,8.084,16.884C313.001,379.377,304.8,388,293.5,388L293.5,388z M246.439,356.083
+		c-0.28,0.348-0.395,0.733-0.437,1.229C246.153,356.929,246.298,356.518,246.439,356.083L246.439,356.083z M270.056,335.941
+		c-1.21,1.355-2.773,2.583-4.78,3.574c1.535-0.104,3.14-0.207,4.789-0.296c-0.04-0.548-0.065-1.123-0.065-1.721
+		C270,336.973,270.019,336.451,270.056,335.941L270.056,335.941z M219.021,317.979c0.093,0.007,0.194,0.013,0.302,0.018
+		c0.586-0.089,1.986-0.42,2.938-0.646c0.477-0.114,0.957-0.226,1.438-0.338c-1.721,0.032-3.758,0.146-4.62,0.547
+		C219.059,317.655,219.036,317.792,219.021,317.979L219.021,317.979z M172.531,125.258c8.011,5.611,15.058,13.592,20.572,20.675
+		c2.554-14.033,4.928-23.67,8.842-29.011c-5.7,1.628-9.894,5.061-12.692,7.353c-2.444,1.999-4.553,3.726-7.753,3.726
+		c-2.045,0-3.8-0.7-6.71-1.858C174.111,125.874,173.352,125.572,172.531,125.258L172.531,125.258z"/>
+	<path fill="#1F1F1F" d="M169.5,79.5c36,0,75,15,79,69h-3c-5-28-16-40-37-40c-16,0-25,12-27,12s-12.5-6-23-6c-21,0-43,12-42,42
+		l-55,11c0-6,0-12,1-18c-7-3-19-5-25-12c-7.5-8.83-13-34-13-36c0-6,3-8,7-8c5,0,7,5,7,7c0,3-4,16-4,18
+		c0,13.355,12.737,23.069,27.8,23.069c0.728,0,1.463-0.023,2.2-0.069C79.5,93.5,134.5,79.5,169.5,79.5 M213.538,119.277
+		c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21c0,26-15,62-45,85c-9-10-20-13-29-14
+		c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1
+		c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7c2,0,18.01-9.73,21-10
+		c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8c10,0,11-19,11-20
+		c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13c-16,0-3-16-15-16
+		c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10c-4-4-10.5-5.83-15.5-8.83
+		c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19c-3.04,0.868-11.171,1.549-20.627,1.549
+		c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5
+		c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39c3.5-20.17,6.83-43.83,13-45
+		C211.555,119.349,212.566,119.277,213.538,119.277 M54.5,250.5c10.601,13.491,30.487,26.054,46.237,26.054
+		c0.594,0,1.182-0.018,1.763-0.054c0,3,0.83,8.5,0.83,10.5c0,15-15.83,15.5-24.83,15.5c-27,0-24.17-8.17-24.5-25.83
+		C53.96,274.67,54.5,256.5,54.5,250.5 M253.5,282.5c6,8,13,31,13,42c0,8-6,10-14,10c-7,0-7-9-7-13
+		C245.5,318.5,251.5,295.5,253.5,282.5 M138.5,283.5c1,1-0.59,3.01,0,19c0.17,4.5,4.83,17.17,11,22
+		c0.394,0.31,0.843,0.454,1.342,0.454c7.473,0,25.783-32.642,27.658-35.454l3,41c0,5,0,11-3,16c-4,5-22,8-31,8c-15,0-29-5-27-22
+		c-0.17-12.17,4-39,7-49H138.5 M169.5,64.5c-22.887,0-47.102,5.267-66.436,14.451c-22.318,10.602-38.762,26.385-48.174,46.081
+		c-2.892-1.323-4.917-3.379-5.317-5.69c0.286-1.215,0.786-3.146,1.146-4.539c1.934-7.468,2.781-11.077,2.781-14.302
+		c0-10.625-8.84-22-22-22c-12.953,0-22,9.458-22,23c0,5.403,4.153,19.196,4.33,19.781c3.642,12.041,7.645,20.522,12.238,25.93
+		l0.022,0.026l0.022,0.025c5.736,6.693,13.632,10.188,20.458,12.587c-0.062,2.329-0.068,4.619-0.069,6.88
+		c-3.33,2.099-6.335,4.699-8.847,7.953c-3.655,4.736-7.666,12.895-6.012,24.87c1.152,8.332,3.418,19.828,7.859,31.554V250.5
+		c0,3.184-0.17,10.403-0.307,16.204c-0.159,6.711-0.212,9.158-0.19,10.267c0.029,1.535,0.031,3.051,0.034,4.517
+		c0.015,8.896,0.031,18.094,6.835,25.802C53.794,316.263,66.235,317.5,78.5,317.5c6.544,0,14.191-0.376,21.283-3.167
+		c2.781-1.094,5.281-2.484,7.479-4.137c-1.056,8.09-1.759,15.938-1.766,21.561c-1.177,12.445,3.43,20.561,7.567,25.214
+		c7.394,8.313,18.98,12.529,34.438,12.529c5.904,0,13.821-0.954,20.661-2.489c6.875-1.543,12.2-3.518,16.228-6.052
+		c2.301,4.51,5.13,8.851,8.412,12.832C204.34,387.79,219.86,395.5,236.5,395.5c8.772,0,20.174-4.999,35.323-12.061
+		c0.02-0.009,0.04-0.019,0.06-0.028c0.447,0.926,0.981,1.858,1.621,2.783c2.932,4.245,8.782,9.306,19.996,9.306
+		c7.6,0,14.536-2.912,19.53-8.201c4.817-5.1,7.47-12.132,7.47-19.799c0-8.513-4.28-14.937-7.848-19.338
+		c2.113-4.158,3.848-8.218,3.848-12.662c0-11.927-9.274-24-27-24c-3.298,0-6.405,0.485-9.255,1.394
+		c-2.486-13.581-8.349-30.866-14.745-39.394l-9.87-13.16c-0.968-3.413-2.118-6.49-3.218-9.299c3.468,1.514,6.374,3.645,8.938,6.493
+		l9.274,10.305l11.002-8.435C316.77,232.461,332.5,191.32,332.5,162.5c0-5.601-0.454-13.9-4.378-21.287
+		c-5.04-9.488-14.14-14.713-25.622-14.713c-12.295,0-26.812,3.88-40.602,8.463c-1.801-9.966-4.853-19.031-9.12-27.063
+		c-5.635-10.608-13.4-19.48-23.079-26.371C214.048,70.389,193.232,64.5,169.5,64.5L169.5,64.5z M153.054,279.371l0.912-0.261
+		l2.951-5.902c1.771-3.542,3.868-8.042,5.472-11.744c0.449-1.035,0.853-1.989,1.216-2.875c0.6,8.093,2.501,14.303,4.513,19.443
+		l-2.098,3.147c-0.447,0.67-0.922,1.462-2.05,3.349c-4.393,7.349-7.831,12.719-10.507,16.642c-0.255-7.688,0.052-11.492,0.22-13.565
+		C153.833,285.754,154.081,282.688,153.054,279.371L153.054,279.371z"/>
+</g>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M445.01,377.502H416.6c-0.828,0-1.501-0.673-1.501-1.501v-67.812
+		c0-3.775-0.607-6.899-1.808-9.283c-2.233-4.446-6.292-6.605-12.412-6.605c-7.158,0-11.952,2.849-14.657,8.708
+		c-1.406,3.146-2.121,7.051-2.121,11.583v63.41c0,0.828-0.673,1.501-1.501,1.501h-27.8c-0.828,0-1.501-0.673-1.501-1.501v-63.33
+		c0-6.069-0.609-10.49-1.816-13.142c-2.1-4.593-6.162-6.828-12.414-6.828c-7.419,0-12.225,2.26-14.695,6.912
+		c-1.373,2.681-2.073,6.848-2.073,12.368v64.02c0,0.828-0.673,1.501-1.501,1.501h-28.202c-0.828,0-1.501-0.673-1.501-1.501V269.8
+		c0-0.828,0.673-1.501,1.501-1.501h27.001c0.828,0,1.501,0.673,1.501,1.501v10.492c2.533-3.545,4.988-6.237,7.326-8.03
+		c5.624-4.353,12.977-6.562,21.853-6.562c8.402,0,15.317,1.902,20.551,5.65c0.03,0.02,0.057,0.04,0.082,0.063
+		c3.509,2.895,6.334,6.504,8.422,10.749c3.508-5.25,7.753-9.242,12.649-11.891c5.95-3.04,12.626-4.572,19.875-4.572
+		c4.873,0,9.735,0.959,14.446,2.849c4.774,1.902,9.153,5.276,13.018,10.025c3.147,3.89,5.287,8.71,6.37,14.331
+		c0.668,3.688,1.007,9.069,1.007,16.015l-0.189,67.085C446.507,376.831,445.836,377.502,445.01,377.502L445.01,377.502z"/>
+	<path fill="#1F1F1F" d="M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58c3,3.711,5.02,8.271,6.06,13.67
+		c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959c-2.49-4.961-7.07-7.431-13.75-7.431
+		c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33c0-6.311-0.65-10.9-1.95-13.76
+		c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07V376h-28.2V269.8h27v15.46
+		c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37c4.02,3.32,7.08,7.58,9.15,12.779
+		c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M411.86,264.2c-7.485,0-14.391,1.587-20.523,4.718
+		c-0.022,0.011-0.043,0.022-0.065,0.034c-4.465,2.418-8.405,5.893-11.758,10.363c-2.029-3.501-4.587-6.534-7.643-9.058
+		c-0.053-0.045-0.108-0.087-0.164-0.127c-5.497-3.936-12.706-5.931-21.427-5.931c-9.215,0-16.878,2.313-22.776,6.877
+		c-1.614,1.238-3.242,2.832-4.904,4.808V269.8c0-1.657-1.343-3-3-3h-27c-1.657,0-3,1.343-3,3V376c0,1.657,1.343,3,3,3h28.2
+		c1.657,0,3-1.343,3-3v-64.02c0-5.276,0.646-9.214,1.92-11.703c2.165-4.076,6.539-6.077,13.35-6.077
+		c5.682,0,9.194,1.893,11.052,5.957c0.764,1.682,1.678,5.222,1.678,12.513V376c0,1.657,1.343,3,3,3h27.8c1.657,0,3-1.343,3-3v-63.41
+		c0-4.321,0.672-8.018,1.999-10.986c2.453-5.313,6.678-7.804,13.281-7.804c5.574,0,9.091,1.835,11.069,5.776
+		c1.097,2.176,1.651,5.072,1.651,8.613V376c0,1.657,1.343,3,3,3h28.41c1.653,0,2.996-1.338,3-2.991l0.19-67.08
+		c0-7.044-0.346-12.517-1.028-16.275c-1.136-5.897-3.381-10.94-6.679-15.02c-4.031-4.955-8.615-8.479-13.631-10.48
+		C421.97,265.194,416.922,264.2,411.86,264.2L411.86,264.2z"/>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M170,62c10.33,0,14-3.67,28.67-13
+			c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+			c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+			c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+		<path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+			c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+			c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+			c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+			/>
+	</g>
+	<defs>
+		<filter id="Adobe_OpacityMaskFilter" filterUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774" id="SVGID_1_">
+		<g filter="url(#Adobe_OpacityMaskFilter)">
+			
+				<image overflow="visible" width="128" height="91" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAItAAADjQAABP//2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAFsAgAMBIgACEQEDEQH/
+xACNAAEAAgMBAQAAAAAAAAAAAAAABQcBBAYCAwEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQQCAwEB
+AAAAAAAAAwQBAgUGABAgERMwElAxFEAWEQABAwIEBAUEAwAAAAAAAAABABECIQMgMUESEFFhIjBx
+gTIEQJGhQlJiFBIBAAAAAAAAAAAAAAAAAAAAUP/aAAwDAQACEQMRAAAAr8GZad70qyHvKHKfdZzp
+qvewam91PYlQa1oVofICXiLCOv38ZGMj56MkITakR49hqVDclRECD6XBVlxm4AAAA8/M91ZavGlZ
+M4J+26rtU9cl0VaFjyNMWmSrGQDU4GxqyO7ia/1Dai/WCc7ist024jWHrrOR2y8fpEypljyZr7qq
+1IIAD15AAHV9PVosuF44b+gAAH//2gAIAQIAAQUA/If/2gAIAQMAAQUA/If/2gAIAQEAAQUA6Vra
+8p646zB9UdHVhRha3apiGmYcQOpbsiJmdX1z7wrjABpdIF4yWtLM1yulmFLGNdXn0m4tjHWbYXTJ
+mVsCAQ9hwI7hZBZc/XXcf/a5i0qLg6kCMkHwqpuf80n5BhVQ8oKlI5kBQRfZQ1Fkeuk42KirERHw
+sR5Dt8eMl0WH7T60rAVfiJHmm8LTRnpgQ+7JYwfrW+C1orA2wFn983LGwwC1ZpbmoBm761fqEl4H
+RzeFV3sdmAOVifPbkq2sshkzY3Jr5gVxZnJAJTKgHcn65pcxDILR6n2xUFsaYTFw+aYxjGGyg3Qd
+haxYe5qSIwNgbENjItsW9pOTMzzVmKhZYz1FlsptbbNyZBonLEtfml5a4yhJBB9bT4ru9qyLsRPI
+D5R+5R9cWzKzuEdqZfpctKRk80EI9izH9pe215t2RMxOC2iFqj3FX6s7utTju72vDuYccn/L/9oA
+CAECAgY/AEP/2gAIAQMCBj8AQ//aAAgBAQEGPwDgIxBJOQCEiNoK3Rr5hbb0DHrpi3CJjHRNcHbz
+wgDM5KN67F5SqgNoTGIR7AXRn8an9dE1y1KmoDr2S+xQFu0WOpDKNz5A3S6oR2gKXbop2pfqfxgB
+IeMD+VFg1MDSDqsQvYFSITRDcJPyUm/bP0wRuSFZVKAGnhS8l6Hjbt/ykAoUZh4ch0UbrasTxthn
+EaqI6eDukWATQkCeE2FRUIxkGILHgZaBgojojM6I/FJ7oljyHqgYyBfFIRzZXPjXpkwlIygZF8zU
+VKBJGSkDII3LWevCXmFGuilEkKV22wm+aEZyJtPXookF3GGQ6IfIt0lAu4Ww16omdwsdAm3FVUnN
+XBW4yZgpRslov7iu+bruX+acssn5ISGuAkqbYRJ2BoULYNDngt3HYOx9VGunF5FSAkEbcC4epxVw
+OMwo27p2kc1W4PumFwP5oi05KO+TROg+m//Z" transform="matrix(1 0 0 1 103 45)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_1_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M170,62c10.33,0,14-3.67,28.67-13
+			c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+			c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+			c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+		<path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+			c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+			c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+			c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+			/>
+	</g>
+</g>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,382c-9.998,0-10.315-5.942-10.546-10.279
+		c-0.217-4.07-0.465-5.721-4.453-5.721c-1.218,0-7.149,2.766-12.382,5.203C255.8,376.014,242.957,382,236.5,382
+		c-12.534,0-24.353-5.965-33.282-16.796C195.682,356.062,191,344.297,191,334.499v-21.89c-0.17-1.201-0.341-2.459-0.518-3.752
+		c-0.845-6.225-1.805-13.276-3.424-18.945c-1.138-4.55-2.757-8.294-4.324-11.914c-2.56-5.912-5.206-12.029-5.732-21.414
+		c-0.002-1.18,0.212-2.402,0.442-3.695c0.355-2.016,0.799-4.522-0.004-5.328c-2.376-2.377-5.892-4.014-9.292-5.598
+		c-1.994-0.93-4.056-1.889-5.919-3.005c-8.018-4.455-11.089-13.294-13.123-19.146c-0.37-1.066-0.69-1.987-0.997-2.755l-0.038-0.095
+		l-0.025-0.1c-0.816-3.267-2.352-5.857-5.008-9.474c-4.247,2.344-4.152,6.092-4.06,9.727c0.013,0.481,0.023,0.944,0.023,1.384
+		c0,11.657,6.152,18.462,10.225,22.965c2.191,2.423,3.775,4.175,3.775,6.034c0,3.166-8.077,19.509-8.159,19.671l-0.296,0.592
+		l-0.633,0.181c-3.363,0.961-11.819,1.606-21.042,1.606c-7.303,0-25.421-0.454-35.926-4.656
+		c-30.922-10.66-39.625-50.538-41.929-67.187c-0.814-5.892,0.305-10.864,3.325-14.776c6.96-9.015,22.775-10.902,35.482-12.418
+		c8.487-1.01,19.755-2.69,30.65-4.316c5.071-0.757,10.019-1.493,14.48-2.133c0.025-0.116,0.048-0.296,0.048-0.562
+		c0-1.51-0.598-4.632-1.125-7.385c-0.542-2.835-0.875-4.625-0.875-5.616v-6.001c0-11.356,13.95-20.5,25.5-20.5
+		c17.761,0,34.676,23.646,42.804,35.009c0.467,0.654,0.904,1.262,1.304,1.819c0.164-0.953,0.326-1.91,0.488-2.869
+		c4.085-24.071,7.006-38.771,13.125-39.933c1.174-0.168,2.268-0.248,3.317-0.248c16.308,0,21.873,18.76,25.937,32.459
+		c0.671,2.254,1.311,4.413,1.952,6.341c2.131-0.759,4.403-1.588,6.779-2.457C264.544,148.163,286.92,140,302.5,140
+		c16.501,0,16.501,16.934,16.501,22.5c0,25.503-14.097,62.045-45.589,86.19l-1.1,0.843l-0.928-1.03
+		c-6.994-7.771-16.168-12.191-28.05-13.513l-1.984-0.221l0.764-1.845c7.093-17.106,9.554-38.674,5.162-45.25
+		c-0.763-1.145-1.647-1.677-2.776-1.677c-0.789,0-1.146,0.278-1.346,0.486c-1.222,1.269-1.085,4.924-0.984,7.593
+		c0.074,1.938,0.139,3.62-0.208,4.779c-1.132,6.178-3.464,15.332-5.345,22.691c-1.271,4.979-2.585,10.13-2.617,10.963
+		c0,8.704,2.499,15.01,5.145,21.688c2.633,6.646,5.355,13.515,5.355,22.801c0,3.303-4.705,23.461-7.551,33.896l-0.417,1.529
+		l-1.504-0.501C232.255,311,227.348,311,225.499,311c-7.319,0-12.5,0.539-12.5,7.499c0,4.545,3.536,5.5,6.501,5.5
+		c0.724,0,2.461-0.41,4.142-0.808c2.474-0.585,5.031-1.19,6.857-1.19c3.014,0,7.5,1.731,7.5,6.5c0,5.946-5.555,7.321-10.456,8.535
+		c-5.938,1.47-9.543,2.707-9.543,7.465c0,5.075,2.224,5.5,4.5,5.5c0.845-0.146,5.368-2.56,8.67-4.322
+		c6.417-3.424,10.441-5.515,12.195-5.673c0.25-0.022,0.488-0.033,0.711-0.033c2.091,0,3.172,0.936,3.71,1.721
+		c1.59,2.315,0.269,5.939,0.114,6.346l-0.238,0.614l-0.61,0.241c-7.2,2.854-7.12,6.903-7.063,9.859
+		c0.006,0.263,0.011,0.511,0.011,0.746c0,4.068,2.289,6.5,4.499,6.5c8.643,0,9.501-18.314,9.501-18.5v-1.499h1.5
+		c2.734,0,5.946-0.217,9.348-0.444c3.719-0.248,7.553-0.507,11.48-0.551c0.231-1.382,0.072-2.827-0.097-4.339
+		c-0.113-1.024-0.231-2.083-0.231-3.166c0-9.228,7.274-12.5,13.502-12.5c9.963,0,13.5,5.655,13.5,10.5
+		c0,1.88-1.435,4.758-3.625,8.935c-0.976,1.864-2.313,4.413-2.376,5.091c0,1.074,1.71,3.068,3.363,4.997
+		c2.957,3.445,6.636,7.734,6.636,12.976C306.999,376.174,301.574,382,293.5,382L293.5,382z"/>
+	<g>
+		<path fill="#1F1F1F" d="M213.538,119.277c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21
+			c0,26-15,62-45,85c-9-10-20-13-29-14c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5
+			c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7
+			c2,0,18.01-9.73,21-10c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8
+			c10,0,11-19,11-20c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13
+			c-16,0-3-16-15-16c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10
+			c-4-4-10.5-5.83-15.5-8.83c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19
+			c-3.04,0.868-11.171,1.549-20.627,1.549c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66
+			C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39
+			c3.5-20.17,6.83-43.83,13-45C211.555,119.349,212.566,119.277,213.538,119.277 M213.538,116.277L213.538,116.277
+			c-1.121,0-2.285,0.085-3.462,0.253l-0.067,0.009l-0.067,0.013c-7.154,1.356-10.092,16.252-14.208,40.478
+			c-8.547-11.923-25.273-34.53-43.232-34.53c-6.25,0-12.861,2.322-18.139,6.37c-5.631,4.32-8.861,10.017-8.861,15.63v6
+			c0,1.128,0.326,2.887,0.902,5.898c0.415,2.168,0.916,4.785,1.058,6.364c-4.108,0.593-8.54,1.254-13.201,1.949
+			c-10.889,1.624-22.148,3.302-30.614,4.31c-12.988,1.551-29.15,3.481-36.493,12.993c-3.275,4.243-4.495,9.591-3.625,15.896
+			c1.349,9.753,4.34,24.19,10.932,37.593c7.76,15.777,18.523,26.143,31.994,30.81c10.756,4.273,29.043,4.736,36.418,4.736
+			c9.348,0,17.968-0.669,21.452-1.664l1.269-0.362l0.59-1.181c0.34-0.68,8.317-16.676,8.317-20.342c0-2.437-1.747-4.369-4.165-7.043
+			c-3.916-4.332-9.835-10.879-9.835-21.957c0-0.452-0.012-0.929-0.024-1.423c-0.087-3.454,0.041-5.904,2.188-7.644
+			c2.064,2.912,3.25,5.088,3.926,7.794l0.05,0.197l0.075,0.189c0.294,0.734,0.609,1.641,0.973,2.689
+			c1.976,5.687,5.281,15.197,13.81,19.963c1.919,1.147,4.002,2.118,6.018,3.057c3.399,1.584,6.611,3.08,8.799,5.234
+			c0.252,0.677-0.136,2.876-0.347,4.069c-0.23,1.3-0.467,2.645-0.467,3.873v0.084l0.005,0.084c0.54,9.651,3.24,15.891,5.851,21.924
+			c1.614,3.729,3.138,7.252,4.234,11.636l0.012,0.049l0.014,0.048c1.589,5.56,2.54,12.55,3.378,18.716
+			c0.172,1.267,0.34,2.497,0.507,3.673V334.5c0,10.129,4.813,22.26,12.56,31.658c9.218,11.183,21.45,17.342,34.44,17.342
+			c6.791,0,19.8-6.064,30.254-10.938c4.641-2.163,10.408-4.851,11.819-5.062c2.478,0.006,2.669,0.32,2.882,4.301
+			c0.219,4.089,0.626,11.699,12.044,11.699c8.832,0,15-6.579,15-16c0-5.797-3.88-10.319-6.997-13.953
+			c-1.082-1.262-2.686-3.131-2.97-3.964c0.292-0.864,1.411-2.999,2.171-4.449c2.362-4.507,3.796-7.404,3.796-9.634
+			c0-5.973-4.638-12-15-12c-9.112,0-15,5.495-15,14c0,1.166,0.123,2.267,0.241,3.331c0.107,0.968,0.207,1.864,0.204,2.7
+			c-3.537,0.083-7.038,0.317-10.199,0.529c-3.374,0.226-6.562,0.439-9.246,0.439h-2.961l-0.039,2.989
+			c-0.035,2.644-1.656,17.011-8,17.011c-1.21,0-3-1.589-3-5c0-0.244-0.005-0.503-0.01-0.775c-0.057-2.933-0.117-5.966,6.116-8.436
+			l1.223-0.484l0.472-1.228c0.302-0.785,1.707-4.846-0.276-7.733c-0.608-0.886-2.06-2.371-4.945-2.371
+			c-0.274,0-0.561,0.014-0.851,0.04c-1.974,0.178-5.405,1.917-12.763,5.842c-2.98,1.59-7.018,3.744-8.235,4.145
+			c-1.546-0.011-2.731-0.216-2.731-3.999c0-3.57,2.432-4.528,8.404-6.008c4.894-1.212,11.596-2.872,11.596-9.992
+			c0-5.252-4.527-8-9-8c-2.002,0-4.647,0.626-7.205,1.231c-1.293,0.307-3.246,0.769-3.795,0.769c-5,0-5-2.906-5-4
+			c0-5.094,2.882-6,11-6c1.611,0,6.513,0,9.051,0.846l3.009,1.003l0.834-3.06C240.998,301.743,246,280.698,246,277
+			c0-9.572-2.776-16.579-5.461-23.355c-2.583-6.521-5.024-12.68-5.039-21.068c0.119-1.052,1.42-6.151,2.57-10.657
+			c1.876-7.352,4.206-16.483,5.351-22.711c0.392-1.379,0.328-3.073,0.248-5.188c-0.054-1.437-0.219-5.81,0.57-6.5c0,0,0,0,0.001,0
+			c0.011,0,0.1-0.021,0.261-0.021c0.299,0,0.854,0,1.528,1.008c3.675,5.502,2.161,25.852-5.299,43.842l-1.53,3.69l3.97,0.44
+			c11.498,1.277,20.363,5.538,27.101,13.025l1.855,2.061l2.2-1.687c14.329-10.985,26.298-25.655,34.612-42.423
+			c7.457-15.037,11.562-31.003,11.562-44.958c0-5.936,0-24-18-24c-15.847,0-37.457,7.883-54.821,14.218
+			c-1.838,0.67-3.611,1.317-5.304,1.927c-0.479-1.517-0.963-3.148-1.464-4.836C236.714,135.658,230.964,116.277,213.538,116.277
+			L213.538,116.277z"/>
+	</g>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+			c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+			c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+			c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+			c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+			c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+			c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+			c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+			C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+	</g>
+	<defs>
+		<filter id="Adobe_OpacityMaskFilter_1_" filterUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223" id="SVGID_2_">
+		<g filter="url(#Adobe_OpacityMaskFilter_1_)">
+			
+				<image overflow="visible" width="278" height="268" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAARTAAAJlwAADlr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAQwBFgMBIgACEQEDEQH/
+xACaAAEAAgMBAQAAAAAAAAAAAAAABgcDBAUBAgEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQMEAgEE
+AwEAAAAAAgMBBAUGACARExAwQBIxFBWAITM0IjI1FhEAAgIBAQYFAgUEAwEAAAAAAQIAESEDIDFB
+URIiEDBAYXGRE4GxMlIjocFCYuFyMwQSAQAAAAAAAAAAAAAAAAAAAID/2gAMAwEAAhEDEQAAAK/A
+AAAAPs+Hf7BCEqjprgAzdPrTsp7WtOtjVAAAAAAAAAAB7N4nbRubf16YI/J/kpblXDWJzPr52iy5
+VyeuYa5suOlRMuIAPreOekfSIUm8eOSAAAAADcuCmLhO0AD5i8qxlGb8v5pYG3jyDT3Pkprj27rF
+ed+fbpGOz0fTBk+xjjUp5RTzeHHMhjd7tEH+rK3yrNi19oqres3KQSbbHoAAB8fOUeegB4D0AADl
+dXglatIY7DidrDZ+x49AAAAAAAADz35OBwNWGl65+F3QADyGS2ryLvB3bZpi3zpAAAAeOEdfNT1j
+nbeegAADFl0yt4r1eYWzI+B3wB57iORU0qhQB92vUs4LH9+PsAAA8gU9hJW0yhvQLsycnqnoAAHD
+7cMK6y6fcLQ6mlug8Ee6FYHK1QAdLmi7OnXc/MwAAHG7OMo7Un0DJfP6Q7RcnsQlRlAB81xZFekC
+6vKFmyaju0XFqRThn3EffkAAA2LIq/aLxywKVnSYsh689Hjw5VU2PVZhBktyobWJQ89APIxKNApD
+563JAPv4AAAAAD66fKEw6tdC0c1Uelq6la+EhjwALKrWUlre4cwA+PvwraE2ZWYAAAAAAAAAAAAA
+2tUXP2YNOD0Dz34IdWc2hIAAAAAAAAAAAAABK7Rp23DaeaxtamnxiG8HZ1gAAAAAAAAAAAAADoXD
+TtwGSrrGp0+vnD6eAAAAAAAAAAAAAAA37gp63jfiMy4RCND65Bh8ABlxSYxa9p8Qq/zPgAAAAAAA
+AAAMtsVFNiya9n3GKd+5Z0iFa3Y4g++hPitpvKugZIHPa6IMAAAAAAAAAABt6gtuR0tY5IdfL9lP
+8KyYodGw4VjJxrVZoF687hSMqXky2JAAAAAAAAAAADb1BM+3WP0T+O8L5NrVADu9+B/Rv84AP//a
+AAgBAgABBQD+jL//2gAIAQMAAQUA/oy//9oACAEBAAEFAPiVqrLJ/wDzlmRtULFWfjqUxx0dWsP4
+GmB9bunmuLdGxULo1TF+QVYlfjzWBWasjSOnY+KAyZa1r49quOUoIUuONqKZGY15Tgy2EfRZ6LH7
+HqtSAREdosKhq9wxfaPi4oYO9gkCKfUhgozOHW9eZxTaL+YxXlu4JP0r+my0oaiyrw2PUFsZKMJf
+fyvp9lnE6SMcdpixHJ4N1L3MSUDfwhRNfoMYMdiwgWFX6TKT9ZT5chjl/RHpkUeVGz05rXhAjmrg
+r1maGlSXKOqIVCMPXXAVEhyFBHDSso2HHBKf14/kPaqlIWNdkpq9LlC0Nn1ybAahhLiXpD6L9CGC
+jL6xXyBVNQrJmviEJgErDqzYxKCGP5/phbJ4NG2fF4LIslWq3jlGlOKcfo6QZSqDWV1GsGQuupc+
+7my7VyKP5/ia7nlS1W0/lbSA7I02uMK1auPF6/WHgYmuPBooHgoUPIEY97v25BDPsbG6Ar+aP5Kn
+VK0/A68sARj0qGFhHO0fE2HPDjk4fdP2rFWwL1dMz2jb7sAj7T9tVUJ2scoQT8U57DvbJkaxkuxr
+b5ZW6bTIWrcL3kZzVGwFygX2R7JFAx+2n7RMFHsvL6q3V4kxX+TV/wDW6c9eFKcnZmzb5hH+G/h3
+Qyv7Ow5T9NC9rvxcwWVG2n2ck3xo2Sz5r6Bk360uRrdFhsKXt+W/t6JOVt1e3DEexP43k5/X5peR
+IeJODX7Gw2IXXut81rEpl1/CK+lf1mYiNgyoIVkbhW7PrpeQ/wCCjgw65/G61SOvzC3Jq3cNdFye
+ufxuVvx15mZnV0fa3jfrCfXKZAK6tkzJWndGDvTUuYe6L0+xnqUWK+TqFUtxMxOs7DAcpZNTwgoK
+Ok/+u9sKB5iMkunOJ2ZBRWySXRBhMXb60hs+fI5mZKeiJmJ1PN9xruFodblwwNswXkgwJZCZAWN2
+W1UnC7SmzCXC4Ogv7jvNeSV6Aw1ljdmtVSr7OJqzWzkcMYbD6qVtlR+vZ8HLS4Gj15pYSrOisbfo
+h7a7NXtm+r07VT8tdgStnqDmBEzMz7FDIOpMwm1LZFXLJbAvWfIKJ6CKBjYsgIJuPl9j0X/k1WYi
+v05WvDUbFTmtd94DMCp7BdrTU3SR5X3RBcHca3A22sUM22uPH7fXkc7nf2o9YntOn24NET3joaP2
+XulKIH4cEQ8kiLr06/421WQxXRP43Bcfr/LxtqatvA3IfX6J/G4tiK/zNLvSxET3j1YX1Dd7UyPz
+NKsyLUF9let90LTtVry2/mas2V36B/ZH44++hPGZ6vHMrnFmvIv89v5mDKRyOJnvXyVr9dGc2S06
+zN+5PJt2S5M95+Zhf/Qw/wDr7Aozq21GqzztPzsL/wChh/8AXekXBmdarNJmDrom3WSIlEQXRXrs
+sMRq7DC7r7a8EMjPxMPPa/hSia/M/fVWXkdg8putub1alUFxV8cEKzyFrXckZs/ErM8VjWrcMRP4
+302Qri1MZMUCGGiIl2meCppTFC4XNIxtha+31XueQ8ITMzPxdPyv9kMhi8/hAyCo0ZgtXra6q86f
+gZ+eYOn+zYx+upIVYGsPEVVIg47ju+Naz4+NulTs4DMLeoSEx8YcuVxJO2IJd/mp0pCKrVLW7K11
+cDYKpGl4OHMUQerP4/8AUs/GwuZOgzD59TwVYWyD+shs2GVchWBhTatlVQLm1Aobuw3LMjcsizVs
+wTq9myBK2wgkfj0sjZpljdwiIXtaTG9sKCG3nQmX5Cw7kzM+uCysVodsQeLLZGbjPkj5OF5OqO/e
+fJ29f//aAAgBAgIGPwAZf//aAAgBAwIGPwAZf//aAAgBAQEGPwD0nQg+TOoE/SfyLjn6gJpi2MB1
+Lo8BMpmE6dgzp1Vxz2RqMMtmCxG7Y2mR232+mCLvJoRXZbY5JMGJulERqUG4zAE6d/TxVeZAiY4C
+VCCI2qq5XPptMGKa4bFGN23cY1/GT9PDSX3uL8eL43iPp/tONikUsfYQUnSDzgLk+4EtgT8w0kLL
+ZUbx5mmTzqL8bJBjdt3G0mBr/EwGr6azF+PFh7QtVB5SgseQgpOkHnAdW2+YOwfSDtEws3SiIxrh
+PsVjrqvL02G8MIhPLaKkRm017t4qM/8A9Gn0d2PwgXxIPGXqIGo2IKQCvaDtEwNpviIP9v7HawhP
+4GDp0mz7QD7dA8Z3YHsJ3kmKzr1UQRed0CDgNumFy1WvOb4iHh1f2Ph06SljAdSwOQnepPzAPtjH
+tB2D6T9In6RP0iYWYHn4PkN8T7vD7n/EXSXjvikrBgTA9Kz3u4T7epaEnAPGBhtEx88DOrjdw3zE
+FDh6Yyv9h+c03XeGES+W0TPtA7znwKnjRi/HlWTQnT1C5Yz5TGBOJMT/ALD84nwNps1iO92AaHgh
+ug2Ivx5TMDVCfcZv4i27kIpu7HlN8Qi7CzTUbywiXy2SxjaaNlsDxRx/iQYmeA8kxxw8Bosf0moD
+5LZ4TUe7tjU0l5G4vxsWY3dVCNqE2t9uwumxyuICPJ1K5HwVrpWwYueHkvngZZ3mfcO4YEAHLYOa
+jaKHHE7K5pWOfmLnh5LCrsR9MigSSssbxF0tRqYc4O4Swb2jKB3nPgOrHvAvWPrBTCXcOYdLSbuM
+JJsnedmxvG6Lps3cuDAQfIKmNqIveMgwo4phvEDIaYbiIBqEso4iKOsXygZTsmM37Tf08epGKnmI
+q6p6l5wHq4RtPSa2MLubY7ztrqIaF9wijqgIPkNfKHp35vxGppMVYHhxiF95A2nxwMZDvUkbBCsQ
+DwlnJ8kOhPTxWBWajxBg7hMGYOxZMbPCPqHiceK/I/OIByG02OELcH/Pz+pCVPMTJ6hANQlT7yi4
++s/9B9Zhx9Zlx9YQNQfWFNNrvYsbxEzeBAdkiM4GVN+kwSPiZJPzt/ZY7jj4gO059j6xNQbrAMXO
+8bTj2PrUBOaowHYJhQcTXrTp8AfzinYOeECXus+tq8Govx4dzCYYRgrR3969bp1F+Ize0fT0WpVN
+EzOs07tQmWfW6cX4jheU1EcUwY/1Phu9dpxfiFWhcoLhpRCMQgbtkJpizxMtruFlvHAwqcEb/S6Z
+i/HgzMaqEaORz4TuOOW11EWbgxwjYj9O6/S6b8iImeHgQDQJAP18KQXL1Me0oTEpUJJ9pjRY/hOr
+WQoSTgz4EZQe44Es7z6ZdNjlcGAiMpF3MsxS90wtVPtJgnwyLAxASggtRKQVCJ91QT0G69OuoD23
+3Re67EsZE3RqHCAkdpsX4DUcUWNwXMsJ0dYuWpuNYuxCyilY59OFY/x3v5Re4G5YMIuHnvBEvUPU
+BwMAsCoQrWeQhCsUX+sGqNVuoG95iFzmsw54Rq3+oB02PT+2BdRuk+8/WPrCeoQ/byfaV1dI9pZy
+fEIxqp+rhKBtR6rsv8Lndde97WN8zde97H//2Q==" transform="matrix(1 0 0 1 43 116)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_2_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#CEBC01" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+			c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+			c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+			c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+			c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+			c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+			c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+			c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+			C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+	</g>
+</g>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+	c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<g>
+	<path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M193.772,206.837c-5.358,0-10.236-2.729-13.736-7.683l-0.198-0.28
+		l-0.093-0.33c-8.547-30.246-25.982-48.151-39.992-62.539c-2.949-3.03-5.736-5.89-8.24-8.667l-0.94-1.043l0.662-1.238
+		c3.588-6.719,10.431-10.272,19.783-10.272c5.169,0,10.029,1.066,13.196,1.96c2.665,0.75,5.5,1.129,8.429,1.129
+		c0.004,0,0.006,0,0.01,0c7.256,0,14.981-2.283,22.334-6.601c2.978-1.746,6.236-2.632,9.686-2.632
+		c6.564,0,11.543,3.219,11.753,3.357l1.181,0.775l-0.336,1.373c-4.887,19.923-7.7,46.495-8.604,81.235l-0.006,0.27l-0.078,0.255
+		C206.643,202.342,200.553,206.835,193.772,206.837L193.772,206.837z"/>
+	<path fill="#917013" d="M204.676,110.643c6.042,0,10.654,3.027,10.654,3.027c-4.33,17.66-7.66,43.26-8.66,81.66
+		c-1.729,5.729-7.115,9.506-12.899,9.506c-4.249,0-8.713-2.037-12.101-6.836c-10.51-37.2-34.41-56.19-48.67-72
+		c3.897-7.297,11.292-9.214,18.019-9.214c5.322,0,10.226,1.199,12.651,1.884c2.928,0.824,5.941,1.206,8.975,1.206
+		c8.011,0,16.174-2.662,23.355-6.876C198.988,111.248,201.975,110.643,204.676,110.643 M204.677,106.643L204.677,106.643
+		c-3.812,0-7.412,0.979-10.701,2.907c-7.053,4.139-14.428,6.327-21.332,6.327c-2.745,0-5.4-0.355-7.892-1.057
+		c-3.285-0.927-8.337-2.033-13.734-2.033c-10.138,0-17.589,3.917-21.547,11.33l-1.323,2.478l1.881,2.086
+		c2.528,2.803,5.326,5.676,8.289,8.718c13.853,14.225,31.094,31.929,39.502,61.69l0.187,0.659l0.396,0.561
+		c3.883,5.5,9.342,8.528,15.369,8.528c7.655,0,14.534-5.078,16.729-12.35l0.155-0.515l0.014-0.537
+		c0.889-34.117,3.764-61.306,8.546-80.812l0.673-2.746l-2.363-1.551C217.296,110.176,211.832,106.643,204.677,106.643
+		L204.677,106.643z"/>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+			c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+			C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+	</g>
+	<defs>
+		<filter id="Adobe_OpacityMaskFilter_2_" filterUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193" id="SVGID_3_">
+		<g filter="url(#Adobe_OpacityMaskFilter_2_)">
+			
+				<image overflow="visible" width="87" height="99" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAIPAAADBQAAA/v/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAGMAVwMBIgACEQEDEQH/
+xACPAAEAAgMBAQAAAAAAAAAAAAAABgcCAwUBBAEBAAAAAAAAAAAAAAAAAAAAABAAAQQBAwMDBQEA
+AAAAAAAAAwECBAYFABAgETESUCETMDIjMxQ0EQACAQEGAwgDAQAAAAAAAAABAgARECAhMUEDcRIi
+MFFhgZGhMkJigrITEgEAAAAAAAAAAAAAAAAAAABQ/9oADAMBAAIRAxEAAACv2ySEXWJ8xBEowI1n
+MZGQLbaXOKmfaNVkVRIS3Ped0jW2jDL0OH24uVm+YYgk1lUhMSzffm+kA8hE2rwggAGeAsia0lbB
+2HnphWlk1YRcAACawr7i7tnJ6xpqi1anI+AAACxJvS0zJXU0ihhpAAAA2BjiAH//2gAIAQIAAQUA
+9K//2gAIAQMAAQUA9K//2gAIAQEAAQUA5iCUzolalGSTWXiaSK8ZwAed+Oq7TIyoBVkmkjVCUuQj
+kpkpVh0j3gVUAdCxYRtzEQYxS3IuZxUhgj4MgSNY1nirGLpY4l1/MLSDY3exERkd5PLJ6r+efGLi
+8kOSPlbDeEfz/JtWs+QBMdPZIHwXtdJHhH3RVatWsDmrEktOPd/23cifFwCV4SVTOIcY3o9uxPZl
+4d15YbIOhSsJkGyA7SF6CuhXKflTcu7QSIQepX6bj/q5YeUsWbhJaGBqYvQFtIjpnJFVFqOU8gjM
+x7clIY0Nkej5/PEZR0EsWzj+PKWZijlSHSDfQH2J32//2gAIAQICBj8AK//aAAgBAwIGPwAr/9oA
+CAEBAQY/AL/LtqWPhAz1A7hKioMXZObMFHmaQInmYC45ie+U5B6Q8q0PhDysaT5H0gO6C3GDoA8p
+QARjTSbQ0G4n9CAPqc4tKQUExE+M+MwFrcINyuH+qmvAixdrdbDQwY1rffgZz/lze9bRs7rYaEwY
+1umPwNwMpoRkYuzut1CAg3DGBOeF1dxDRlNYqserIiBhraZT8heU16GIBi41qLWgXQm+Nl26lwgY
+WNF4m+jaMaGLjpY0C61JvgjMZRAxxgNYwrpCR49gAT0EwdfvCA2cbcbXLsfv+s+37W//2Q==" transform="matrix(1 0 0 1 131 108)">
+			</image>
+		</g>
+	</mask>
+	<g opacity="0.6" mask="url(#SVGID_3_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#7F3E03" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+			c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+			C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+	</g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M210.936,113.796
+	c-11.983,64.227-22.738,60.791-73.726,11.721c0.148-11.045,22.734-5.193,27.431-4c9.14,2.331,19.844,0.864,27.954-4.462
+	C202.85,110.315,210.936,113.796,210.936,113.796z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+	c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+	c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+	c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M166.77,188.01c5.25,0.61,8.37,11.49,9.67,19.44c1.33,8.17,1.33,16.76-4.05,17.47
+	c-8.06,1.08-11.67-21.93-11.67-21.93C158.28,187.29,166.77,188.01,166.77,188.01z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M229.86,192.56c0.99,10.209-3.431,23.959-6.57,24.39
+	c-6.29,0.85-7.51-9.05-7.72-10.7c-0.41-3.3-3.061-24.76,7.939-26.25C228.33,182,229.45,189.26,229.86,192.56z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M216.51,195.85c0.93-8.26,11.79-5.08,11.79,2.86
+	c0,7.95-2.1,14.261-4.34,16.21C217.75,220.32,215.58,204.12,216.51,195.85z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M163.09,206.33c-1.19-8.13,9.59-8.43,11.57-0.891
+	c1.97,7.551,1.6,14.181,0.02,16.721C170.3,229.18,164.28,214.45,163.09,206.33z"/>
+<rect x="701" y="306" fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" stroke="#1F1F1F" stroke-width="20" stroke-linecap="round" stroke-linejoin="round" width="14" height="34"/>
+<circle fill-rule="evenodd" clip-rule="evenodd" fill="#FFFF33" cx="182.5" cy="139.5" r="11.5"/>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+			c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+		<path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+			c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+			C139,114.67,149.33,119.26,149.33,127.79z"/>
+	</g>
+	<defs>
+		<filter id="Adobe_OpacityMaskFilter_3_" filterUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33" id="SVGID_4_">
+		<g filter="url(#Adobe_OpacityMaskFilter_3_)">
+			
+				<image overflow="visible" width="39" height="35" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGnAAAB+QAAAmr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIACMAJwMBIgACEQEDEQH/
+xAB9AAEAAgMBAAAAAAAAAAAAAAAABgcBBAUDAQEAAAAAAAAAAAAAAAAAAAAAEAACAwEAAwEBAAAA
+AAAAAAADBAECBQYQMBEAExEBAAIBAwMDBQAAAAAAAAAAAQACETFBAxBxEiGBkcEiMhMEEgEAAAAA
+AAAAAAAAAAAAAAAw/9oADAMBAAIRAxEAAACAdvxtYgHEurklMuyNm1aPm5YOlHo4aqPjzBnAAf/a
+AAgBAgABBQD0/wD/2gAIAQMAAQUA9P8A/9oACAEBAAEFAIibTncyy3BOKvFH8NxOfk/edThlzMzx
+CDIRzGvlhIJ7PgO1yJKUZSJW4f2kwMYdRql91Nu6h8rrhQMnYLRXY67+1bHJY/ifP//aAAgBAgIG
+PwAf/9oACAEDAgY/AB//2gAIAQEBBj8AAMroQtfIOxM1yMVq2qb7zG8GxkrKvjtMeJLPiaTg4g+3
+l5aVx3sER1zK4elhdp/JjSvPxq9rkOWm2pAvfCajPzPmWpwvks/eubli3uevU+vX/9k=" transform="matrix(1 0 0 1 114 111)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_4_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+			c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+		<path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+			c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+			C139,114.67,149.33,119.26,149.33,127.79z"/>
+	</g>
+</g>
+<g>
+	<g>
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+			c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+		<path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+			c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+	</g>
+	<defs>
+		<filter id="Adobe_OpacityMaskFilter_4_" filterUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239" id="SVGID_5_">
+		<g filter="url(#Adobe_OpacityMaskFilter_4_)">
+			
+				<image overflow="visible" width="34" height="31" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGWAAAB3QAAAkb/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAB8AIgMBIgACEQEDEQH/
+xAB4AAADAQEAAAAAAAAAAAAAAAAABQcGAwEBAAAAAAAAAAAAAAAAAAAAABAAAgIDAQEAAAAAAAAA
+AAAAAgMEBQABBiASEQACAQMDAwUAAAAAAAAAAAABAgAREgMQITFRsQRBcdEiYhIBAAAAAAAAAAAA
+AAAAAAAAIP/aAAwDAQACEQMRAAAAwTkqRLU1vnZkQBrUoy5KrPV6Y5gH/9oACAECAAEFAPX/2gAI
+AQMAAQUA9f/aAAgBAQABBQBSjccbl5Tgk8tMSLksSecugGya+CnSpUBJr6ysBesoJuosystUkmVa
+IBfU2i2awfr6iTrxYSLC/MH7cR5//9oACAECAgY/AF//2gAIAQMCBj8AX//aAAgBAQEGPwAJjFWM
+DEkE9BLlNfcQpkFrDQ3DgiA0h2EbIg+y76C40Dd4tWHENGEZFNSdhoLa3elOYBi8fK46hGPYSj+P
+mQdTjf4hOe6/9Cmn/9k=" transform="matrix(1 0 0 1 202 101)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_5_)">
+		<path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+			c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+		<path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+			c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+	</g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M116,85c4-22.67,16.33-29.33,23.67-27.67
+	c7.33,1.67,20,11,30,11c12.33,0,16.66-3,23.66-8.66c7-5.67,10.31,2.33,10,12.33C203,83,207,91.67,204,92s-10.67-18-19-11
+	c-5.33,10.67-2,25.67-12.33,27c-6.7,0.86-21.67-3.67-35-19c-3.07-3.52-12-6-15,1c-3.33,7.75-3.34,4.67-5,8
+	C116.61,100.11,114.86,91.45,116,85z"/>
+<g>
+	<g>
+		<circle fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" cx="169" cy="29" r="26"/>
+		<circle fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+	</g>
+	<defs>
+		<filter id="Adobe_OpacityMaskFilter_5_" filterUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55">
+			
+				<feColorMatrix  type="matrix" values="-1 0 0 0 1  0 -1 0 0 1  0 0 -1 0 1  0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+		</filter>
+	</defs>
+	<mask maskUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55" id="SVGID_6_">
+		<g filter="url(#Adobe_OpacityMaskFilter_5_)">
+			
+				<image overflow="visible" width="60" height="60" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAHLAAACZwAAAyD/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIADwAPAMBIgACEQEDEQH/
+xACFAAACAwEBAQAAAAAAAAAAAAAABwIFBgQBAwEBAAAAAAAAAAAAAAAAAAAAABAAAQQBBAMBAAAA
+AAAAAAAAAgEDBAYFABARFCBAExIRAAEDAgQFBAMAAAAAAAAAAAEAEQJBEiAhMQMQUXGRImGhwWKx
+MhMSAQAAAAAAAAAAAAAAAAAAAED/2gAMAwEAAhEDEQAAAF/6bAorJk9gpKZ5Z8UxYV5aNtbNU+no
+BGQYVdN9TFy2Ua0TUEZB4cpQqvS5cO7hBi3ag+w0chmYEogf/9oACAECAAEFAPQ//9oACAEDAAEF
+APQ//9oACAEBAAEFANIiksKvzpWhpcpUkVGY0MmFIilsiKS1qtfXUPFMMAjDSaciMuJmq4xIby+M
+PHyNV+F2p2KhgwxuYoQ3HFibPC80sUWUwnDXhZwRY34XuVGQLUyI4jjPha5YhH/afaFJKLIrmbbf
+ZAxNNps1thu15rsObY3KyIDmKuDJiNnjKMq2RwHM2w5GnDNw9055HucH9uN//9oACAECAgY/AAf/
+2gAIAQMCBj8AB//aAAgBAQEGPwBAAOToEDbbE909x7ImJJPqFbvQI9acQAHJ0Cjvb0Xkc86IC0L9
+QmMQpeALoxY2HQ8uEXDxj+VFhTAQaqcgMxmFbXRlJ+YUemGfRW/f5RiTmSCokcsMw9Cr6XXe7qG9
+Ghz6KHlqE8S/EknNS2ISd9enEGBeD5hASmx5FPeESJjujDYLvWiM5l5HU4PHWjI2/wBGrqvO5vs/
+zg//2Q==" transform="matrix(1 0 0 1 139 -1)">
+			</image>
+		</g>
+	</mask>
+	<g mask="url(#SVGID_6_)">
+		<circle fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" cx="169" cy="29" r="26"/>
+		<circle fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+	</g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M149,22.33c13.33-26.66,39.67-9,40.67,3.34
+	C190.67,38,141.58,37.17,149,22.33z"/>
+<rect x="337.5" y="105.5" fill-rule="evenodd" clip-rule="evenodd" fill="none" width="764" height="167"/>
+<text transform="matrix(1 0 0 1 337.5 191.7793)" fill="#1F1F1F" font-family="'Helvetica-Bold'" font-size="120" letter-spacing="-6">Powered by</text>
+</svg>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon.ico
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon.ico b/community/mahout-mr/mr/src/images/logos/favicon.ico
new file mode 100644
index 0000000..4f5878d
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon.ico differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon128.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon128.png b/community/mahout-mr/mr/src/images/logos/favicon128.png
new file mode 100644
index 0000000..a477d15
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon128.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon16.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon16.png b/community/mahout-mr/mr/src/images/logos/favicon16.png
new file mode 100644
index 0000000..595b237
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon16.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon32.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon32.png b/community/mahout-mr/mr/src/images/logos/favicon32.png
new file mode 100644
index 0000000..39668fd
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon32.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon64.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon64.png b/community/mahout-mr/mr/src/images/logos/favicon64.png
new file mode 100644
index 0000000..5032b12
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon64.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png
new file mode 100644
index 0000000..9868200
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png
new file mode 100644
index 0000000..4ef5bdd
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png
new file mode 100644
index 0000000..2fbd589
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png
new file mode 100644
index 0000000..d9ac832
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png
new file mode 100644
index 0000000..8f8af00
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png
new file mode 100644
index 0000000..9814d31
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png
new file mode 100644
index 0000000..583436b
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png differ


[41/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt b/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt
deleted file mode 100644
index d87c031..0000000
--- a/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-u1,iphone
-u1,ipad
-u2,nexus
-u2,galaxy
-u3,surface
-u4,iphone
-u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/cf-data-view.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/cf-data-view.txt b/community/mahout-mr/examples/src/main/resources/cf-data-view.txt
deleted file mode 100644
index 09ad9b6..0000000
--- a/community/mahout-mr/examples/src/main/resources/cf-data-view.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-u1,ipad
-u1,nexus
-u1,galaxy
-u2,iphone
-u2,ipad
-u2,nexus
-u2,galaxy
-u3,surface
-u3,nexus
-u4,iphone
-u4,ipad
-u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/donut-test.csv b/community/mahout-mr/examples/src/main/resources/donut-test.csv
deleted file mode 100644
index 46ea564..0000000
--- a/community/mahout-mr/examples/src/main/resources/donut-test.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","xx","xy","yy","c","a","b"
-0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
-0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
-0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
-0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
-0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
-0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
-0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
-0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
-0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
-0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
-0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
-0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
-0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
-0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
-0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
-0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
-0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
-0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
-0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
-0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
-0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
-0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
-0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
-0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
-0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
-0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
-0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
-0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
-0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
-0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
-0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
-0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
-0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
-0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
-0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
-0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
-0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
-0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
-0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
-0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/donut.csv b/community/mahout-mr/examples/src/main/resources/donut.csv
deleted file mode 100644
index 33ba3b7..0000000
--- a/community/mahout-mr/examples/src/main/resources/donut.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
-0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
-0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
-0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
-0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
-0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
-0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
-0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
-0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
-0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
-0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
-0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
-0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
-0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
-0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
-0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
-0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
-0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
-0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
-0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
-0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
-0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
-0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
-0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
-0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
-0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
-0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
-0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
-0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
-0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
-0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
-0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
-0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
-0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
-0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
-0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
-0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
-0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
-0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
-0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
-0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/test-data.csv b/community/mahout-mr/examples/src/main/resources/test-data.csv
deleted file mode 100644
index ab683cd..0000000
--- a/community/mahout-mr/examples/src/main/resources/test-data.csv
+++ /dev/null
@@ -1,61 +0,0 @@
-"V1","V2","V3","V4","V5","V6","V7","V8","y"
-1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
-1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
-1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
-1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
-1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
-1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
-1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
-1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
-1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
-1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
-1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
-1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
-1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
-1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
-1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
-1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
-1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
-1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
-1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
-1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
-1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
-1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
-1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
-1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
-1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
-1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
-1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
-1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
-1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
-1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
-1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
-1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
-1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
-1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
-1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
-1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
-1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
-1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
-1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
-1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
-1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
-1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
-1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
-1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
-1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
-1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
-1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
-1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
-1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
-1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
-1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
-1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
-1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
-1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
-1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
-1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
-1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
-1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
-1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
-1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
deleted file mode 100644
index e849011..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collections;
-
-public class LogisticModelParametersTest extends MahoutTestCase {
-
-  @Test
-  public void serializationWithoutCsv() throws IOException {
-    LogisticModelParameters params = new LogisticModelParameters();
-    params.setTargetVariable("foo");
-    params.setTypeMap(Collections.<String, String>emptyMap());
-    params.setTargetCategories(Arrays.asList("foo", "bar"));
-    params.setNumFeatures(1);
-    params.createRegression();
-
-    //MAHOUT-1196 should work without "csv" being set
-    params.saveTo(new ByteArrayOutputStream());
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
deleted file mode 100644
index c8e4879..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.mahout.examples.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.junit.Test;
-
-public class ModelDissectorTest extends MahoutTestCase {
-  @Test
-  public void testCategoryOrdering() {
-    ModelDissector.Weight w = new ModelDissector.Weight("a", new DenseVector(new double[]{-2, -5, 5, 2, 4, 1, 0}), 4);
-    assertEquals(1, w.getCategory(0), 0);
-    assertEquals(-5, w.getWeight(0), 0);
-
-    assertEquals(2, w.getCategory(1), 0);
-    assertEquals(5, w.getWeight(1), 0);
-
-    assertEquals(4, w.getCategory(2), 0);
-    assertEquals(4, w.getWeight(2), 0);
-
-    assertEquals(0, w.getCategory(3), 0);
-    assertEquals(-2, w.getWeight(3), 0);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
deleted file mode 100644
index 4cde692..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Charsets;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Sets;
-import com.google.common.io.Resources;
-import org.apache.mahout.classifier.AbstractVectorClassifier;
-import org.apache.mahout.examples.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-public class TrainLogisticTest extends MahoutTestCase {
-
-  @Test
-  public void example131() throws Exception {
-    String outputFile = getTestTempFile("model").getAbsolutePath();
-
-    StringWriter sw = new StringWriter();
-    PrintWriter pw = new PrintWriter(sw, true);
-    TrainLogistic.mainToOutput(new String[]{
-        "--input", "donut.csv",
-        "--output", outputFile,
-        "--target", "color", "--categories", "2",
-        "--predictors", "x", "y",
-        "--types", "numeric",
-        "--features", "20",
-        "--passes", "100",
-        "--rate", "50"
-    }, pw);
-    String trainOut = sw.toString();
-    assertTrue(trainOut.contains("x -0.7"));
-    assertTrue(trainOut.contains("y -0.4"));
-
-    LogisticModelParameters lmp = TrainLogistic.getParameters();
-    assertEquals(1.0e-4, lmp.getLambda(), 1.0e-9);
-    assertEquals(20, lmp.getNumFeatures());
-    assertTrue(lmp.useBias());
-    assertEquals("color", lmp.getTargetVariable());
-    CsvRecordFactory csv = lmp.getCsvRecordFactory();
-    assertEquals("[1, 2]", new TreeSet<>(csv.getTargetCategories()).toString());
-    assertEquals("[Intercept Term, x, y]", Sets.newTreeSet(csv.getPredictors()).toString());
-
-    // verify model by building dissector
-    AbstractVectorClassifier model = TrainLogistic.getModel();
-    List<String> data = Resources.readLines(Resources.getResource("donut.csv"), Charsets.UTF_8);
-    Map<String, Double> expectedValues = ImmutableMap.of("x", -0.7, "y", -0.43, "Intercept Term", -0.15);
-    verifyModel(lmp, csv, data, model, expectedValues);
-
-    // test saved model
-    try (InputStream in = new FileInputStream(new File(outputFile))){
-      LogisticModelParameters lmpOut = LogisticModelParameters.loadFrom(in);
-      CsvRecordFactory csvOut = lmpOut.getCsvRecordFactory();
-      csvOut.firstLine(data.get(0));
-      OnlineLogisticRegression lrOut = lmpOut.createRegression();
-      verifyModel(lmpOut, csvOut, data, lrOut, expectedValues);
-    }
-
-    sw = new StringWriter();
-    pw = new PrintWriter(sw, true);
-    RunLogistic.mainToOutput(new String[]{
-        "--input", "donut.csv",
-        "--model", outputFile,
-        "--auc",
-        "--confusion"
-    }, pw);
-    trainOut = sw.toString();
-    assertTrue(trainOut.contains("AUC = 0.57"));
-    assertTrue(trainOut.contains("confusion: [[27.0, 13.0], [0.0, 0.0]]"));
-  }
-
-  @Test
-  public void example132() throws Exception {
-    String outputFile = getTestTempFile("model").getAbsolutePath();
-
-    StringWriter sw = new StringWriter();
-    PrintWriter pw = new PrintWriter(sw, true);
-    TrainLogistic.mainToOutput(new String[]{
-        "--input", "donut.csv",
-        "--output", outputFile,
-        "--target", "color",
-        "--categories", "2",
-        "--predictors", "x", "y", "a", "b", "c",
-        "--types", "numeric",
-        "--features", "20",
-        "--passes", "100",
-        "--rate", "50"
-    }, pw);
-
-    String trainOut = sw.toString();
-    assertTrue(trainOut.contains("a 0."));
-    assertTrue(trainOut.contains("b -1."));
-    assertTrue(trainOut.contains("c -25."));
-
-    sw = new StringWriter();
-    pw = new PrintWriter(sw, true);
-    RunLogistic.mainToOutput(new String[]{
-        "--input", "donut.csv",
-        "--model", outputFile,
-        "--auc",
-        "--confusion"
-    }, pw);
-    trainOut = sw.toString();
-    assertTrue(trainOut.contains("AUC = 1.00"));
-
-    sw = new StringWriter();
-    pw = new PrintWriter(sw, true);
-    RunLogistic.mainToOutput(new String[]{
-        "--input", "donut-test.csv",
-        "--model", outputFile,
-        "--auc",
-        "--confusion"
-    }, pw);
-    trainOut = sw.toString();
-    assertTrue(trainOut.contains("AUC = 0.9"));
-  }
-
-  private static void verifyModel(LogisticModelParameters lmp,
-                                  RecordFactory csv,
-                                  List<String> data,
-                                  AbstractVectorClassifier model,
-                                  Map<String, Double> expectedValues) {
-    ModelDissector md = new ModelDissector();
-    for (String line : data.subList(1, data.size())) {
-      Vector v = new DenseVector(lmp.getNumFeatures());
-      csv.getTraceDictionary().clear();
-      csv.processLine(line, v);
-      md.update(v, csv.getTraceDictionary(), model);
-    }
-
-    // check right variables are present
-    List<ModelDissector.Weight> weights = md.summary(10);
-    Set<String> expected = Sets.newHashSet(expectedValues.keySet());
-    for (ModelDissector.Weight weight : weights) {
-      assertTrue(expected.remove(weight.getFeature()));
-      assertEquals(expectedValues.get(weight.getFeature()), weight.getWeight(), 0.1);
-    }
-    assertEquals(0, expected.size());
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
deleted file mode 100644
index 6e43b97..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.IOException;
-
-public class ClustersFilterTest extends MahoutTestCase {
-
-  private Configuration configuration;
-  private Path output;
-
-  @Override
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    configuration = getConfiguration();
-    output = getTestTempDirPath();
-  }
-
-  @Test
-  public void testAcceptNotFinal() throws Exception {
-    Path path0 = new Path(output, "clusters-0");
-    Path path1 = new Path(output, "clusters-1");
-
-    path0.getFileSystem(configuration).createNewFile(path0);
-    path1.getFileSystem(configuration).createNewFile(path1);
-
-    PathFilter clustersFilter = new ClustersFilter();
-
-    assertTrue(clustersFilter.accept(path0));
-    assertTrue(clustersFilter.accept(path1));
-  }
-
-  @Test
-  public void testAcceptFinalPath() throws IOException {
-    Path path0 = new Path(output, "clusters-0");
-    Path path1 = new Path(output, "clusters-1");
-    Path path2 = new Path(output, "clusters-2");
-    Path path3Final = new Path(output, "clusters-3-final");
-
-    path0.getFileSystem(configuration).createNewFile(path0);
-    path1.getFileSystem(configuration).createNewFile(path1);
-    path2.getFileSystem(configuration).createNewFile(path2);
-    path3Final.getFileSystem(configuration).createNewFile(path3Final);
-
-    PathFilter clustersFilter = new ClustersFilter();
-
-    assertTrue(clustersFilter.accept(path0));
-    assertTrue(clustersFilter.accept(path1));
-    assertTrue(clustersFilter.accept(path2));
-    assertTrue(clustersFilter.accept(path3Final));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
deleted file mode 100644
index 4d81e3f..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.examples;
-
-/**
- * This class should not exist. It's here to work around some bizarre problem in Maven
- * dependency management wherein it can see methods in {@link org.apache.mahout.common.MahoutTestCase}
- * but not constants. Duplicated here to make it jive.
- */
-public abstract class MahoutTestCase extends org.apache.mahout.common.MahoutTestCase {
-
-  /** "Close enough" value for floating-point comparisons. */
-  public static final double EPSILON = 0.000001;
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/country.txt b/community/mahout-mr/examples/src/test/resources/country.txt
deleted file mode 100644
index 6a22091..0000000
--- a/community/mahout-mr/examples/src/test/resources/country.txt
+++ /dev/null
@@ -1,229 +0,0 @@
-Afghanistan
-Albania
-Algeria
-American Samoa
-Andorra
-Angola
-Anguilla
-Antigua and Barbuda
-Argentina
-Armenia
-Aruba
-Australia
-Austria
-Azerbaijan
-Bahamas
-Bangladesh
-Barbados
-Belarus
-Belgium
-Belize
-Benin
-Bermuda
-Bhutan
-Bolivia
-Bosnia and Herzegovina
-Botswana
-Bouvet Island
-Brazil
-British Indian Ocean Territory
-Brunei Darussalam
-Bulgaria
-Burkina Faso
-Burundi
-Cambodia
-Cameroon
-Canada
-Cape Verde
-Cayman Islands
-Central African Republic
-Chad
-Chile
-China
-Christmas Island
-Cocos  Islands
-Colombia
-Comoros
-Congo
-Cook Islands
-Costa Rica
-Croatia
-C�te d'Ivoire
-Cuba
-Cyprus
-Czech Republic
-Djibouti
-Dominica
-Dominican Republic
-Ecuador
-Egypt
-El Salvador
-Equatorial Guinea
-Eritrea
-Estonia
-Ethiopia
-Falkland Islands 
-Faroe Islands
-Fiji
-Finland
-France
-French Guiana
-French Polynesia
-French Southern Territories
-Gabon
-Georgia
-Germany
-Ghana
-Gibraltar
-Greece
-Greenland
-Grenada
-Guadeloupe
-Guam
-Guatemala
-Guernsey
-Guinea
-Guinea-Bissau
-Guyana
-Haiti
-Honduras
-Hong Kong
-Hungary
-Iceland
-India
-Indonesia
-Iran
-Iraq
-Ireland
-Isle of Man
-Israel
-Italy
-Japan
-Jersey
-Jordan
-Kazakhstan
-Kenya
-Kiribati
-Korea
-Kuwait
-Kyrgyzstan
-Latvia
-Lebanon
-Lesotho
-Liberia
-Liechtenstein
-Lithuania
-Luxembourg
-Macedonia
-Madagascar
-Malawi
-Malaysia
-Maldives
-Mali
-Malta
-Marshall Islands
-Martinique
-Mauritania
-Mauritius
-Mayotte
-Mexico
-Micronesia
-Moldova
-Monaco
-Mongolia
-Montenegro
-Montserrat
-Morocco
-Mozambique
-Myanmar
-Namibia
-Nauru
-Nepal
-Netherlands
-Netherlands Antilles
-New Caledonia
-New Zealand
-Nicaragua
-Niger
-Nigeria
-Niue
-Norfolk Island
-Northern Mariana Islands
-Norway
-Oman
-Pakistan
-Palau
-Palestinian Territory
-Panama
-Papua New Guinea
-Paraguay
-Peru
-Philippines
-Pitcairn
-Poland
-Portugal
-Puerto Rico
-Qatar
-R�union
-Russian Federation
-Rwanda
-Saint Barth�lemy
-Saint Helena
-Saint Kitts and Nevis
-Saint Lucia
-Saint Martin 
-Saint Pierre and Miquelon
-Saint Vincent and the Grenadines
-Samoa
-San Marino
-Sao Tome and Principe
-Saudi Arabia
-Senegal
-Serbia
-Seychelles
-Sierra Leone
-Singapore
-Slovakia
-Slovenia
-Solomon Islands
-Somalia
-South Africa
-South Georgia and the South Sandwich Islands
-Spain
-Sri Lanka
-Sudan
-Suriname
-Svalbard and Jan Mayen
-Swaziland
-Sweden
-Switzerland
-Syrian Arab Republic
-Taiwan
-Tanzania
-Thailand
-Timor-Leste
-Togo
-Tokelau
-Tonga
-Trinidad and Tobago
-Tunisia
-Turkey
-Turkmenistan
-Turks and Caicos Islands
-Tuvalu
-Ukraine
-United Arab Emirates
-United Kingdom
-United States
-United States Minor Outlying Islands
-Uruguay
-Uzbekistan
-Vanuatu
-Vatican 
-Venezuela
-Vietnam
-Virgin Islands
-Wallis and Futuna
-Yemen
-Zambia
-Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/country10.txt b/community/mahout-mr/examples/src/test/resources/country10.txt
deleted file mode 100644
index 97a63e1..0000000
--- a/community/mahout-mr/examples/src/test/resources/country10.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-Australia
-Austria
-Bahamas
-Canada
-Colombia
-Cuba
-Panama
-Pakistan
-United Kingdom
-Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/country2.txt b/community/mahout-mr/examples/src/test/resources/country2.txt
deleted file mode 100644
index f4b4f61..0000000
--- a/community/mahout-mr/examples/src/test/resources/country2.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-United States
-United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/subjects.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/subjects.txt b/community/mahout-mr/examples/src/test/resources/subjects.txt
deleted file mode 100644
index f52ae33..0000000
--- a/community/mahout-mr/examples/src/test/resources/subjects.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Science
-History

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/wdbc.infos
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/wdbc.infos b/community/mahout-mr/examples/src/test/resources/wdbc.infos
deleted file mode 100644
index 94a63d6..0000000
--- a/community/mahout-mr/examples/src/test/resources/wdbc.infos
+++ /dev/null
@@ -1,32 +0,0 @@
-IGNORED
-LABEL, B, M
-NUMERICAL, 6.9, 28.2
-NUMERICAL, 9.7, 39.3
-NUMERICAL, 43.7, 188.5
-NUMERICAL, 143.5, 2501.0
-NUMERICAL, 0.0, 0.2
-NUMERICAL, 0.0, 0.4
-NUMERICAL, 0.0, 0.5
-NUMERICAL, 0.0, 0.3
-NUMERICAL, 0.1, 0.4 
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.1, 2.9
-NUMERICAL, 0.3, 4.9
-NUMERICAL, 0.7, 22.0
-NUMERICAL, 6.8, 542.3
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.0, 0.2
-NUMERICAL, 0.0, 0.4
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 7.9, 36.1
-NUMERICAL, 12.0, 49.6
-NUMERICAL, 50.4, 251.2
-NUMERICAL, 185.2, 4254.0
-NUMERICAL, 0.0, 0.3
-NUMERICAL, 0.0, 1.1
-NUMERICAL, 0.0, 1.3
-NUMERICAL, 0.0, 0.3
-NUMERICAL, 0.1, 0.7
-NUMERICAL, 0.0, 0.3 


[47/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
deleted file mode 100644
index 752bb48..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
+++ /dev/null
@@ -1,274 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import com.google.common.io.Closeables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.filecache.DistributedCache;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.VarIntWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * Convert the Mail archives (see {@link org.apache.mahout.text.SequenceFilesFromMailArchives}) to a preference
- * file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.item.RecommenderJob}.
- * <p/>
- * This assumes the input is a Sequence File, that the key is: filename/message id and the value is a list
- * (separated by the user's choosing) containing the from email and any references
- * <p/>
- * The output is a matrix where either the from or to are the rows (represented as longs) and the columns are the
- * message ids that the user has interacted with (as a VectorWritable).  This class currently does not account for
- * thread hijacking.
- * <p/>
- * It also outputs a side table mapping the row ids to their original and the message ids to the message thread id
- */
-public final class MailToPrefsDriver extends AbstractJob {
-
-  private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);
-
-  private static final String OUTPUT_FILES_PATTERN = "part-*";
-  private static final int DICTIONARY_BYTE_OVERHEAD = 4;
-
-  public static void main(String[] args) throws Exception {
-    ToolRunner.run(new Configuration(), new MailToPrefsDriver(), args);
-  }
-
-  @Override
-  public int run(String[] args) throws Exception {
-    addInputOption();
-    addOutputOption();
-    addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption("chunkSize", "cs", "The size of chunks to write.  Default is 100 mb", "100");
-    addOption("separator", "sep", "The separator used in the input file to separate to, from, subject.  Default is \\n",
-        "\n");
-    addOption("from", "f", "The position in the input text (value) where the from email is located, starting from "
-        + "zero (0).", "0");
-    addOption("refs", "r", "The position in the input text (value) where the reference ids are located, "
-        + "starting from zero (0).", "1");
-    addOption(buildOption("useCounts", "u", "If set, then use the number of times the user has interacted with a "
-        + "thread as an indication of their preference.  Otherwise, use boolean preferences.", false, false,
-        String.valueOf(true)));
-    Map<String, List<String>> parsedArgs = parseArguments(args);
-
-    Path input = getInputPath();
-    Path output = getOutputPath();
-    int chunkSize = Integer.parseInt(getOption("chunkSize"));
-    String separator = getOption("separator");
-    Configuration conf = getConf();
-    boolean useCounts = hasOption("useCounts");
-    AtomicInteger currentPhase = new AtomicInteger();
-    int[] msgDim = new int[1];
-    //TODO: mod this to not do so many passes over the data.  Dictionary creation could probably be a chain mapper
-    List<Path> msgIdChunks = null;
-    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
-    // create the dictionary between message ids and longs
-    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
-      //TODO: there seems to be a pattern emerging for dictionary creation
-      // -- sparse vectors from seq files also has this.
-      Path msgIdsPath = new Path(output, "msgIds");
-      if (overwrite) {
-        HadoopUtil.delete(conf, msgIdsPath);
-      }
-      log.info("Creating Msg Id Dictionary");
-      Job createMsgIdDictionary = prepareJob(input,
-              msgIdsPath,
-              SequenceFileInputFormat.class,
-              MsgIdToDictionaryMapper.class,
-              Text.class,
-              VarIntWritable.class,
-              MailToDictionaryReducer.class,
-              Text.class,
-              VarIntWritable.class,
-              SequenceFileOutputFormat.class);
-
-      boolean succeeded = createMsgIdDictionary.waitForCompletion(true);
-      if (!succeeded) {
-        return -1;
-      }
-      //write out the dictionary at the top level
-      msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
-          createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
-    }
-    //create the dictionary between from email addresses and longs
-    List<Path> fromChunks = null;
-    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
-      Path fromIdsPath = new Path(output, "fromIds");
-      if (overwrite) {
-        HadoopUtil.delete(conf, fromIdsPath);
-      }
-      log.info("Creating From Id Dictionary");
-      Job createFromIdDictionary = prepareJob(input,
-              fromIdsPath,
-              SequenceFileInputFormat.class,
-              FromEmailToDictionaryMapper.class,
-              Text.class,
-              VarIntWritable.class,
-              MailToDictionaryReducer.class,
-              Text.class,
-              VarIntWritable.class,
-              SequenceFileOutputFormat.class);
-      createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
-      boolean succeeded = createFromIdDictionary.waitForCompletion(true);
-      if (!succeeded) {
-        return -1;
-      }
-      //write out the dictionary at the top level
-      int[] fromDim = new int[1];
-      fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
-          createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
-    }
-    //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
-    if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
-      //Job map
-      //may be a way to do this so that we can load the from ids in memory, if they are small enough so that
-      // we don't need the double loop
-      log.info("Creating recommendation matrix");
-      Path vecPath = new Path(output, "recInput");
-      if (overwrite) {
-        HadoopUtil.delete(conf, vecPath);
-      }
-      //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
-      conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
-      conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
-      conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
-      conf.set(EmailUtility.FROM_INDEX, getOption("from"));
-      conf.set(EmailUtility.REFS_INDEX, getOption("refs"));
-      conf.set(EmailUtility.SEPARATOR, separator);
-      conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
-      int j = 0;
-      int i = 0;
-      for (Path fromChunk : fromChunks) {
-        for (Path idChunk : msgIdChunks) {
-          Path out = new Path(vecPath, "tmp-" + i + '-' + j);
-          DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), idChunk.toUri()}, conf);
-          Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
-                  MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class, Text.class,
-                  NullWritable.class, TextOutputFormat.class);
-          createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
-          boolean succeeded = createRecMatrix.waitForCompletion(true);
-          if (!succeeded) {
-            return -1;
-          }
-          //copy the results up a level
-          //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true,
-          // conf, "");
-          FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null,
-              conf);
-          for (int k = 0; k < fs.length; k++) {
-            FileStatus f = fs[k];
-            Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
-            FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf), outPath, true,
-                overwrite, conf);
-          }
-          HadoopUtil.delete(conf, out);
-          j++;
-        }
-        i++;
-      }
-      //concat the files together
-      /*Path mergePath = new Path(output, "vectors.dat");
-      if (overwrite) {
-        HadoopUtil.delete(conf, mergePath);
-      }
-      log.info("Merging together output vectors to vectors.dat in {}", output);*/
-      //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath,
-      // false, conf, "\n");
-    }
-
-    return 0;
-  }
-
-  private static List<Path> createDictionaryChunks(Path inputPath,
-                                                   Path dictionaryPathBase,
-                                                   String name,
-                                                   Configuration baseConf,
-                                                   int chunkSizeInMegabytes, int[] maxTermDimension)
-    throws IOException {
-    List<Path> chunkPaths = new ArrayList<>();
-
-    Configuration conf = new Configuration(baseConf);
-
-    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
-
-    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
-    int chunkIndex = 0;
-    Path chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
-    chunkPaths.add(chunkPath);
-
-    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
-
-    try {
-      long currentChunkSize = 0;
-      Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
-      int i = 1; //start at 1, since a miss in the OpenObjectIntHashMap returns a 0
-      for (Pair<Writable, Writable> record
-              : new SequenceFileDirIterable<>(filesPattern, PathType.GLOB, null, null, true, conf)) {
-        if (currentChunkSize > chunkSizeLimit) {
-          Closeables.close(dictWriter, false);
-          chunkIndex++;
-
-          chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
-          chunkPaths.add(chunkPath);
-
-          dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
-          currentChunkSize = 0;
-        }
-
-        Writable key = record.getFirst();
-        int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
-        currentChunkSize += fieldSize;
-        dictWriter.append(key, new IntWritable(i++));
-      }
-      maxTermDimension[0] = i;
-    } finally {
-      Closeables.close(dictWriter, false);
-    }
-
-    return chunkPaths;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
deleted file mode 100644
index 91bbd17..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-public final class MailToRecMapper extends Mapper<Text, Text, Text, LongWritable> {
-
-  private static final Logger log = LoggerFactory.getLogger(MailToRecMapper.class);
-
-  private final OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<>();
-  private final OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<>();
-  private String separator = "\n";
-  private int fromIdx;
-  private int refsIdx;
-
-  public enum Counters {
-    REFERENCE, ORIGINAL
-  }
-
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    super.setup(context);
-    Configuration conf = context.getConfiguration();
-    String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
-    String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
-    fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
-    refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
-    EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix, msgIdDictionary);
-    log.info("From Dictionary size: {} Msg Id Dictionary size: {}", fromDictionary.size(), msgIdDictionary.size());
-    separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
-  }
-
-  @Override
-  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
-
-    int msgIdKey = Integer.MIN_VALUE;
-
-
-    int fromKey = Integer.MIN_VALUE;
-    String valStr = value.toString();
-    String[] splits = StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
-
-    if (splits != null && splits.length > 0) {
-      if (splits.length > refsIdx) {
-        String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
-        fromKey = fromDictionary.get(from);
-      }
-      //get the references
-      if (splits.length > refsIdx) {
-        String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
-        if (theRefs != null && theRefs.length > 0) {
-          //we have a reference, the first one is the original message id, so map to that one if it exists
-          msgIdKey = msgIdDictionary.get(theRefs[0]);
-          context.getCounter(Counters.REFERENCE).increment(1);
-        }
-      }
-    }
-    //we don't have any references, so use the msg id
-    if (msgIdKey == Integer.MIN_VALUE) {
-      //get the msg id and the from and output the associated ids
-      String keyStr = key.toString();
-      int idx = keyStr.lastIndexOf('/');
-      if (idx != -1) {
-        String msgId = keyStr.substring(idx + 1);
-        msgIdKey = msgIdDictionary.get(msgId);
-        context.getCounter(Counters.ORIGINAL).increment(1);
-      }
-    }
-
-    if (msgIdKey != Integer.MIN_VALUE && fromKey != Integer.MIN_VALUE) {
-      context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
-    }
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
deleted file mode 100644
index ee36a41..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-
-import java.io.IOException;
-
-public class MailToRecReducer extends Reducer<Text, LongWritable, Text, NullWritable> {
-  //if true, then output weight
-  private boolean useCounts = true;
-  /**
-   * We can either ignore how many times the user interacted (boolean) or output the number of times they interacted.
-   */
-  public static final String USE_COUNTS_PREFERENCE = "useBooleanPreferences";
-
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    useCounts = context.getConfiguration().getBoolean(USE_COUNTS_PREFERENCE, true);
-  }
-
-  @Override
-  protected void reduce(Text key, Iterable<LongWritable> values, Context context)
-    throws IOException, InterruptedException {
-    if (useCounts) {
-      long sum = 0;
-      for (LongWritable value : values) {
-        sum++;
-      }
-      context.write(new Text(key.toString() + ',' + sum), null);
-    } else {
-      context.write(new Text(key.toString()), null);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
deleted file mode 100644
index f3de847..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VarIntWritable;
-
-import java.io.IOException;
-
-/**
- * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
- */
-public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
-
-  @Override
-  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
-    //message id is in the key: /201008/AANLkTikvVnhNH+Y5AGEwqd2=u0CFv2mCm0ce6E6oBnj1@mail.gmail.com
-    String keyStr = key.toString();
-    int idx = keyStr.lastIndexOf('@'); //find the last @
-    if (idx == -1) {
-      context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
-    } else {
-      //found the @, now find the last slash before the @ and grab everything after that
-      idx = keyStr.lastIndexOf('/', idx);
-      String msgId = keyStr.substring(idx + 1);
-      if (EmailUtility.WHITESPACE.matcher(msgId).matches()) {
-        context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
-      } else {
-        context.write(new Text(msgId), new VarIntWritable(1));
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
deleted file mode 100644
index c358021..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-public final class DataFileIterable implements Iterable<Pair<PreferenceArray,long[]>> {
-
-  private final File dataFile;
-
-  public DataFileIterable(File dataFile) {
-    this.dataFile = dataFile;
-  }
-
-  @Override
-  public Iterator<Pair<PreferenceArray, long[]>> iterator() {
-    try {
-      return new DataFileIterator(dataFile);
-    } catch (IOException ioe) {
-      throw new IllegalStateException(ioe);
-    }
-  }
- 
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
deleted file mode 100644
index 786e080..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.Closeable;
-import java.io.File;
-import java.io.IOException;
-import java.util.regex.Pattern;
-
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>An {@link java.util.Iterator} which iterates over any of the KDD Cup's rating files. These include the files
- * {train,test,validation}Idx{1,2}}.txt. See http://kddcup.yahoo.com/. Each element in the iteration corresponds
- * to one user's ratings as a {@link PreferenceArray} and corresponding timestamps as a parallel {@code long}
- * array.</p>
- *
- * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
- * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
- */
-public final class DataFileIterator
-    extends AbstractIterator<Pair<PreferenceArray,long[]>>
-    implements SkippingIterator<Pair<PreferenceArray,long[]>>, Closeable {
-
-  private static final Pattern COLON_PATTERN = Pattern.compile(":");
-  private static final Pattern PIPE_PATTERN = Pattern.compile("\\|");
-  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
-
-  private final FileLineIterator lineIterator;
-
-  private static final Logger log = LoggerFactory.getLogger(DataFileIterator.class);
-
-  public DataFileIterator(File dataFile) throws IOException {
-    if (dataFile == null || dataFile.isDirectory() || !dataFile.exists()) {
-      throw new IllegalArgumentException("Bad data file: " + dataFile);
-    }
-    lineIterator = new FileLineIterator(dataFile);
-  }
-
-  @Override
-  protected Pair<PreferenceArray, long[]> computeNext() {
-
-    if (!lineIterator.hasNext()) {
-      return endOfData();
-    }
-
-    String line = lineIterator.next();
-    // First a userID|ratingsCount line
-    String[] tokens = PIPE_PATTERN.split(line);
-
-    long userID = Long.parseLong(tokens[0]);
-    int ratingsLeftToRead = Integer.parseInt(tokens[1]);
-    int ratingsRead = 0;
-
-    PreferenceArray currentUserPrefs = new GenericUserPreferenceArray(ratingsLeftToRead);
-    long[] timestamps = new long[ratingsLeftToRead];
-
-    while (ratingsLeftToRead > 0) {
-
-      line = lineIterator.next();
-
-      // Then a data line. May be 1-4 tokens depending on whether preference info is included (it's not in test data)
-      // or whether date info is included (not inluded in track 2). Item ID is always first, and date is the last
-      // two fields if it exists.
-      tokens = TAB_PATTERN.split(line);
-      boolean hasPref = tokens.length == 2 || tokens.length == 4;
-      boolean hasDate = tokens.length > 2;
-
-      long itemID = Long.parseLong(tokens[0]);
-
-      currentUserPrefs.setUserID(0, userID);
-      currentUserPrefs.setItemID(ratingsRead, itemID);
-      if (hasPref) {
-        float preference = Float.parseFloat(tokens[1]);
-        currentUserPrefs.setValue(ratingsRead, preference);
-      }
-
-      if (hasDate) {
-        long timestamp;
-        if (hasPref) {
-          timestamp = parseFakeTimestamp(tokens[2], tokens[3]);
-        } else {
-          timestamp = parseFakeTimestamp(tokens[1], tokens[2]);
-        }
-        timestamps[ratingsRead] = timestamp;
-      }
-
-      ratingsRead++;
-      ratingsLeftToRead--;
-    }
-
-    return new Pair<>(currentUserPrefs, timestamps);
-  }
-
-  @Override
-  public void skip(int n) {
-    for (int i = 0; i < n; i++) {
-      if (lineIterator.hasNext()) {
-        String line = lineIterator.next();
-        // First a userID|ratingsCount line
-        String[] tokens = PIPE_PATTERN.split(line);
-        int linesToSKip = Integer.parseInt(tokens[1]);
-        lineIterator.skip(linesToSKip);
-      } else {
-        break;
-      }
-    }
-  }
-
-  @Override
-  public void close() {
-    endOfData();
-    try {
-      Closeables.close(lineIterator, true);
-    } catch (IOException e) {
-      log.error(e.getMessage(), e);
-    }
-  }
-
-  /**
-   * @param dateString "date" in days since some undisclosed date, which we will arbitrarily assume to be the
-   *  epoch, January 1 1970.
-   * @param timeString time of day in HH:mm:ss format
-   * @return the UNIX timestamp for this moment in time
-   */
-  private static long parseFakeTimestamp(String dateString, CharSequence timeString) {
-    int days = Integer.parseInt(dateString);
-    String[] timeTokens = COLON_PATTERN.split(timeString);
-    int hours = Integer.parseInt(timeTokens[0]);
-    int minutes = Integer.parseInt(timeTokens[1]);
-    int seconds = Integer.parseInt(timeTokens[2]);
-    return 86400L * days + 3600L + hours + 60L * minutes + seconds;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
deleted file mode 100644
index 4b62050..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
+++ /dev/null
@@ -1,231 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Iterator;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.SamplingIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>An {@link DataModel} which reads into memory any of the KDD Cup's rating files; it is really
- * meant for use with training data in the files trainIdx{1,2}}.txt.
- * See http://kddcup.yahoo.com/.</p>
- *
- * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
- * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
- */
-public final class KDDCupDataModel implements DataModel {
-
-  private static final Logger log = LoggerFactory.getLogger(KDDCupDataModel.class);
-
-  private final File dataFileDirectory;
-  private final DataModel delegate;
-
-  /**
-   * @param dataFile training rating file
-   */
-  public KDDCupDataModel(File dataFile) throws IOException {
-    this(dataFile, false, 1.0);
-  }
-
-  /**
-   * @param dataFile training rating file
-   * @param storeDates if true, dates are parsed and stored, otherwise not
-   * @param samplingRate percentage of users to keep; can be used to reduce memory requirements
-   */
-  public KDDCupDataModel(File dataFile, boolean storeDates, double samplingRate) throws IOException {
-
-    Preconditions.checkArgument(!Double.isNaN(samplingRate) && samplingRate > 0.0 && samplingRate <= 1.0,
-        "Must be: 0.0 < samplingRate <= 1.0");
-
-    dataFileDirectory = dataFile.getParentFile();
-
-    Iterator<Pair<PreferenceArray,long[]>> dataIterator = new DataFileIterator(dataFile);
-    if (samplingRate < 1.0) {
-      dataIterator = new SamplingIterator<>(dataIterator, samplingRate);
-    }
-
-    FastByIDMap<PreferenceArray> userData = new FastByIDMap<>();
-    FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();
-
-    while (dataIterator.hasNext()) {
-
-      Pair<PreferenceArray,long[]> pair = dataIterator.next();
-      PreferenceArray userPrefs = pair.getFirst();
-      long[] timestampsForPrefs = pair.getSecond();
-
-      userData.put(userPrefs.getUserID(0), userPrefs);
-      if (storeDates) {
-        FastByIDMap<Long> itemTimestamps = new FastByIDMap<>();
-        for (int i = 0; i < timestampsForPrefs.length; i++) {
-          long timestamp = timestampsForPrefs[i];
-          if (timestamp > 0L) {
-            itemTimestamps.put(userPrefs.getItemID(i), timestamp);
-          }
-        }
-      }
-
-    }
-
-    if (storeDates) {
-      delegate = new GenericDataModel(userData, timestamps);
-    } else {
-      delegate = new GenericDataModel(userData);
-    }
-
-    Runtime runtime = Runtime.getRuntime();
-    log.info("Loaded data model in about {}MB heap", (runtime.totalMemory() - runtime.freeMemory()) / 1000000);
-  }
-
-  public File getDataFileDirectory() {
-    return dataFileDirectory;
-  }
-
-  public static File getTrainingFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "trainIdx");
-  }
-
-  public static File getValidationFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "validationIdx");
-  }
-
-  public static File getTestFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "testIdx");
-  }
-
-  public static File getTrackFile(File dataFileDirectory) {
-    return getFile(dataFileDirectory, "trackData");
-  }
-
-  private static File getFile(File dataFileDirectory, String prefix) {
-    // Works on set 1 or 2
-    for (int set : new int[] {1,2}) {
-      // Works on sample data from before contest or real data
-      for (String firstLinesOrNot : new String[] {"", ".firstLines"}) {
-        for (String gzippedOrNot : new String[] {".gz", ""}) {
-          File dataFile = new File(dataFileDirectory, prefix + set + firstLinesOrNot + ".txt" + gzippedOrNot);
-          if (dataFile.exists()) {
-            return dataFile;
-          }
-        }
-      }
-    }
-    throw new IllegalArgumentException("Can't find " + prefix + " file in " + dataFileDirectory);
-  }
-
-  @Override
-  public LongPrimitiveIterator getUserIDs() throws TasteException {
-    return delegate.getUserIDs();
-  }
-
-  @Override
-  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
-    return delegate.getPreferencesFromUser(userID);
-  }
-
-  @Override
-  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
-    return delegate.getItemIDsFromUser(userID);
-  }
-
-  @Override
-  public LongPrimitiveIterator getItemIDs() throws TasteException {
-    return delegate.getItemIDs();
-  }
-
-  @Override
-  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
-    return delegate.getPreferencesForItem(itemID);
-  }
-
-  @Override
-  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
-    return delegate.getPreferenceValue(userID, itemID);
-  }
-
-  @Override
-  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
-    return delegate.getPreferenceTime(userID, itemID);
-  }
-
-  @Override
-  public int getNumItems() throws TasteException {
-    return delegate.getNumItems();
-  }
-
-  @Override
-  public int getNumUsers() throws TasteException {
-    return delegate.getNumUsers();
-  }
-
-  @Override
-  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
-    return delegate.getNumUsersWithPreferenceFor(itemID);
-  }
-
-  @Override
-  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
-    return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
-  }
-
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    delegate.setPreference(userID, itemID, value);
-  }
-
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    delegate.removePreference(userID, itemID);
-  }
-
-  @Override
-  public boolean hasPreferenceValues() {
-    return delegate.hasPreferenceValues();
-  }
-
-  @Override
-  public float getMaxPreference() {
-    return 100.0f;
-  }
-
-  @Override
-  public float getMinPreference() {
-    return 0.0f;
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    // do nothing
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
deleted file mode 100644
index 3f4a732..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.zip.GZIPOutputStream;
-
-/**
- * <p>This class converts a KDD Cup input file into a compressed CSV format. The output format is
- * {@code userID,itemID,score,timestamp}. It can optionally restrict its output to exclude
- * score and/or timestamp.</p>
- *
- * <p>Run as: {@code ToCSV (input file) (output file) [num columns to output]}</p>
- */
-public final class ToCSV {
-
-  private ToCSV() {
-  }
-
-  public static void main(String[] args) throws Exception {
-
-    File inputFile = new File(args[0]);
-    File outputFile = new File(args[1]);
-    int columnsToOutput = 4;
-    if (args.length >= 3) {
-      columnsToOutput = Integer.parseInt(args[2]);
-    }
-
-    OutputStream outStream = new GZIPOutputStream(new FileOutputStream(outputFile));
-
-    try (Writer outWriter = new BufferedWriter(new OutputStreamWriter(outStream, Charsets.UTF_8))){
-      for (Pair<PreferenceArray,long[]> user : new DataFileIterable(inputFile)) {
-        PreferenceArray prefs = user.getFirst();
-        long[] timestamps = user.getSecond();
-        for (int i = 0; i < prefs.length(); i++) {
-          outWriter.write(String.valueOf(prefs.getUserID(i)));
-          outWriter.write(',');
-          outWriter.write(String.valueOf(prefs.getItemID(i)));
-          if (columnsToOutput > 2) {
-            outWriter.write(',');
-            outWriter.write(String.valueOf(prefs.getValue(i)));
-          }
-          if (columnsToOutput > 3) {
-            outWriter.write(',');
-            outWriter.write(String.valueOf(timestamps[i]));
-          }
-          outWriter.write('\n');
-        }
-      }
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
deleted file mode 100644
index 0112ab9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class EstimateConverter {
-
-  private static final Logger log = LoggerFactory.getLogger(EstimateConverter.class);
-
-  private EstimateConverter() {}
-
-  public static byte convert(double estimate, long userID, long itemID) {
-    if (Double.isNaN(estimate)) {
-      log.warn("Unable to compute estimate for user {}, item {}", userID, itemID);
-      return 0x7F;
-    } else {
-      int scaledEstimate = (int) (estimate * 2.55);
-      if (scaledEstimate > 255) {
-        scaledEstimate = 255;
-      } else if (scaledEstimate < 0) {
-        scaledEstimate = 0;
-      }
-      return (byte) scaledEstimate;
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
deleted file mode 100644
index 72056da..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-final class Track1Callable implements Callable<byte[]> {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1Callable.class);
-  private static final AtomicInteger COUNT = new AtomicInteger();
-
-  private final Recommender recommender;
-  private final PreferenceArray userTest;
-
-  Track1Callable(Recommender recommender, PreferenceArray userTest) {
-    this.recommender = recommender;
-    this.userTest = userTest;
-  }
-
-  @Override
-  public byte[] call() throws TasteException {
-    long userID = userTest.get(0).getUserID();
-    byte[] result = new byte[userTest.length()];
-    for (int i = 0; i < userTest.length(); i++) {
-      long itemID = userTest.getItemID(i);
-      double estimate;
-      try {
-        estimate = recommender.estimatePreference(userID, itemID);
-      } catch (NoSuchItemException nsie) {
-        // OK in the sample data provided before the contest, should never happen otherwise
-        log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
-        continue;
-      }
-      result[i] = EstimateConverter.convert(estimate, userID, itemID);
-    }
-
-    if (COUNT.incrementAndGet() % 10000 == 0) {
-      log.info("Completed {} users", COUNT.get());
-    }
-
-    return result;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
deleted file mode 100644
index 067daf5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.UncenteredCosineSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-public final class Track1Recommender implements Recommender {
-
-  private final Recommender recommender;
-
-  public Track1Recommender(DataModel dataModel) throws TasteException {
-    // Change this to whatever you like!
-    ItemSimilarity similarity = new UncenteredCosineSimilarity(dataModel);
-    recommender = new GenericItemBasedRecommender(dataModel, similarity);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
-    return recommender.recommend(userID, howMany);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
-    return recommend(userID, howMany, null, includeKnownItems);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, false);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
-    throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
-  }
-  
-  @Override
-  public float estimatePreference(long userID, long itemID) throws TasteException {
-    return recommender.estimatePreference(userID, itemID);
-  }
-  
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    recommender.setPreference(userID, itemID, value);
-  }
-  
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    recommender.removePreference(userID, itemID);
-  }
-  
-  @Override
-  public DataModel getDataModel() {
-    return recommender.getDataModel();
-  }
-  
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    recommender.refresh(alreadyRefreshed);
-  }
-  
-  @Override
-  public String toString() {
-    return "Track1Recommender[recommender:" + recommender + ']';
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
deleted file mode 100644
index 6b9fe1b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class Track1RecommenderBuilder implements RecommenderBuilder {
-  
-  @Override
-  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
-    return new Track1Recommender(dataModel);
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
deleted file mode 100644
index bcd0a3d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.io.File;
-import java.util.Collection;
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import com.google.common.collect.Lists;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.DataModelBuilder;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
-import org.apache.mahout.cf.taste.impl.eval.AbstractDifferenceRecommenderEvaluator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Attempts to run an evaluation just like that dictated for Yahoo's KDD Cup, Track 1.
- * It will compute the RMSE of a validation data set against the predicted ratings from
- * the training data set.
- */
-public final class Track1RecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluator.class);
-
-  private RunningAverage average;
-  private final File dataFileDirectory;
-
-  public Track1RecommenderEvaluator(File dataFileDirectory) {
-    setMaxPreference(100.0f);
-    setMinPreference(0.0f);
-    average = new FullRunningAverage();
-    this.dataFileDirectory = dataFileDirectory;
-  }
-
-  @Override
-  public double evaluate(RecommenderBuilder recommenderBuilder,
-                         DataModelBuilder dataModelBuilder,
-                         DataModel dataModel,
-                         double trainingPercentage,
-                         double evaluationPercentage) throws TasteException {
-
-    Recommender recommender = recommenderBuilder.buildRecommender(dataModel);
-
-    Collection<Callable<Void>> estimateCallables = Lists.newArrayList();
-    AtomicInteger noEstimateCounter = new AtomicInteger();
-    for (Pair<PreferenceArray,long[]> userData
-        : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
-      PreferenceArray validationPrefs = userData.getFirst();
-      long userID = validationPrefs.get(0).getUserID();
-      estimateCallables.add(
-          new PreferenceEstimateCallable(recommender, userID, validationPrefs, noEstimateCounter));
-    }
-
-    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
-    execute(estimateCallables, noEstimateCounter, timing);
-
-    double result = computeFinalEvaluation();
-    log.info("Evaluation result: {}", result);
-    return result;
-  }
-
-  // Use RMSE scoring:
-
-  @Override
-  protected void reset() {
-    average = new FullRunningAverage();
-  }
-
-  @Override
-  protected void processOneEstimate(float estimatedPreference, Preference realPref) {
-    double diff = realPref.getValue() - estimatedPreference;
-    average.addDatum(diff * diff);
-  }
-
-  @Override
-  protected double computeFinalEvaluation() {
-    return Math.sqrt(average.getAverage());
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
deleted file mode 100644
index deadc00..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.commons.cli2.OptionException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.example.TasteOptionParser;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Track1RecommenderEvaluatorRunner {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluatorRunner.class);
-
-  private Track1RecommenderEvaluatorRunner() {
-  }
-  
-  public static void main(String... args) throws IOException, TasteException, OptionException {
-    File dataFileDirectory = TasteOptionParser.getRatings(args);
-    if (dataFileDirectory == null) {
-      throw new IllegalArgumentException("No data directory");
-    }
-    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
-      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
-    }
-    Track1RecommenderEvaluator evaluator = new Track1RecommenderEvaluator(dataFileDirectory);
-    DataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
-    double evaluation = evaluator.evaluate(new Track1RecommenderBuilder(),
-      null,
-      model,
-      Float.NaN,
-      Float.NaN);
-    log.info(String.valueOf(evaluation));
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
deleted file mode 100644
index a0ff126..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-/**
- * <p>Runs "track 1" of the KDD Cup competition using whatever recommender is inside {@link Track1Recommender}
- * and attempts to output the result in the correct contest format.</p>
- *
- * <p>Run as: {@code Track1Runner [track 1 data file directory] [output file]}</p>
- */
-public final class Track1Runner {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1Runner.class);
-
-  private Track1Runner() {
-  }
-
-  public static void main(String[] args) throws Exception {
-
-    File dataFileDirectory = new File(args[0]);
-    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
-      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
-    }
-
-    long start = System.currentTimeMillis();
-
-    KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
-    Track1Recommender recommender = new Track1Recommender(model);
-
-    long end = System.currentTimeMillis();
-    log.info("Loaded model in {}s", (end - start) / 1000);
-    start = end;
-
-    Collection<Track1Callable> callables = new ArrayList<>();
-    for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
-      PreferenceArray userTest = tests.getFirst();
-      callables.add(new Track1Callable(recommender, userTest));
-    }
-
-    int cores = Runtime.getRuntime().availableProcessors();
-    log.info("Running on {} cores", cores);
-    ExecutorService executor = Executors.newFixedThreadPool(cores);
-    List<Future<byte[]>> results = executor.invokeAll(callables);
-    executor.shutdown();
-
-    end = System.currentTimeMillis();
-    log.info("Ran recommendations in {}s", (end - start) / 1000);
-    start = end;
-
-    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
-      for (Future<byte[]> result : results) {
-        for (byte estimate : result.get()) {
-          out.write(estimate);
-        }
-      }
-    }
-
-    end = System.currentTimeMillis();
-    log.info("Wrote output in {}s", (end - start) / 1000);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
deleted file mode 100644
index 022d78c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * can be used to drop {@link DataModel}s into {@link ParallelArraysSGDFactorizer}
- */
-public class DataModelFactorizablePreferences implements FactorizablePreferences {
-
-  private final FastIDSet userIDs;
-  private final FastIDSet itemIDs;
-
-  private final List<Preference> preferences;
-
-  private final float minPreference;
-  private final float maxPreference;
-
-  public DataModelFactorizablePreferences(DataModel dataModel) {
-
-    minPreference = dataModel.getMinPreference();
-    maxPreference = dataModel.getMaxPreference();
-
-    try {
-      userIDs = new FastIDSet(dataModel.getNumUsers());
-      itemIDs = new FastIDSet(dataModel.getNumItems());
-      preferences = new ArrayList<>();
-
-      LongPrimitiveIterator userIDsIterator = dataModel.getUserIDs();
-      while (userIDsIterator.hasNext()) {
-        long userID = userIDsIterator.nextLong();
-        userIDs.add(userID);
-        for (Preference preference : dataModel.getPreferencesFromUser(userID)) {
-          itemIDs.add(preference.getItemID());
-          preferences.add(new GenericPreference(userID, preference.getItemID(), preference.getValue()));
-        }
-      }
-    } catch (TasteException te) {
-      throw new IllegalStateException("Unable to create factorizable preferences!", te);
-    }
-  }
-
-  @Override
-  public LongPrimitiveIterator getUserIDs() {
-    return userIDs.iterator();
-  }
-
-  @Override
-  public LongPrimitiveIterator getItemIDs() {
-    return itemIDs.iterator();
-  }
-
-  @Override
-  public Iterable<Preference> getPreferences() {
-    return preferences;
-  }
-
-  @Override
-  public float getMinPreference() {
-    return minPreference;
-  }
-
-  @Override
-  public float getMaxPreference() {
-    return maxPreference;
-  }
-
-  @Override
-  public int numUsers() {
-    return userIDs.size();
-  }
-
-  @Override
-  public int numItems() {
-    return itemIDs.size();
-  }
-
-  @Override
-  public int numPreferences() {
-    return preferences.size();
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
deleted file mode 100644
index a126dec..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.model.Preference;
-
-/**
- * models the necessary input for {@link ParallelArraysSGDFactorizer}
- */
-public interface FactorizablePreferences {
-
-  LongPrimitiveIterator getUserIDs();
-
-  LongPrimitiveIterator getItemIDs();
-
-  Iterable<Preference> getPreferences();
-
-  float getMinPreference();
-
-  float getMaxPreference();
-
-  int numUsers();
-
-  int numItems();
-
-  int numPreferences();
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
deleted file mode 100644
index 6dcef6b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-import java.io.File;
-
-public class KDDCupFactorizablePreferences implements FactorizablePreferences {
-
-  private final File dataFile;
-
-  public KDDCupFactorizablePreferences(File dataFile) {
-    this.dataFile = dataFile;
-  }
-
-  @Override
-  public LongPrimitiveIterator getUserIDs() {
-    return new FixedSizeLongIterator(numUsers());
-  }
-
-  @Override
-  public LongPrimitiveIterator getItemIDs() {
-    return new FixedSizeLongIterator(numItems());
-  }
-
-  @Override
-  public Iterable<Preference> getPreferences() {
-    Iterable<Iterable<Preference>> prefIterators =
-        Iterables.transform(new DataFileIterable(dataFile),
-          new Function<Pair<PreferenceArray,long[]>,Iterable<Preference>>() {
-            @Override
-            public Iterable<Preference> apply(Pair<PreferenceArray,long[]> from) {
-              return from.getFirst();
-            }
-          });
-    return Iterables.concat(prefIterators);
-  }
-
-  @Override
-  public float getMinPreference() {
-    return 0;
-  }
-
-  @Override
-  public float getMaxPreference() {
-    return 100;
-  }
-
-  @Override
-  public int numUsers() {
-    return 1000990;
-  }
-
-  @Override
-  public int numItems() {
-    return 624961;
-  }
-
-  @Override
-  public int numPreferences() {
-    return 252800275;
-  }
-
-  static class FixedSizeLongIterator extends AbstractLongPrimitiveIterator {
-
-    private long currentValue;
-    private final long maximum;
-
-    FixedSizeLongIterator(long maximum) {
-      this.maximum = maximum;
-      currentValue = 0;
-    }
-
-    @Override
-    public long nextLong() {
-      return currentValue++;
-    }
-
-    @Override
-    public long peek() {
-      return currentValue;
-    }
-
-    @Override
-    public void skip(int n) {
-      currentValue += n;
-    }
-
-    @Override
-    public boolean hasNext() {
-      return currentValue < maximum;
-    }
-
-    @Override
-    public void remove() {
-      throw new UnsupportedOperationException();
-    }
-  }
-
-}


[29/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data b/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data
new file mode 100644
index 0000000..8885375
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data
@@ -0,0 +1,569 @@
+842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
+842517,M,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902
+84300903,M,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
+84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
+84358402,M,20.29,14.34,135.1,1297,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575,0.1374,0.205,0.4,0.1625,0.2364,0.07678
+843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
+844359,M,18.25,19.98,119.6,1040,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
+84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,0.5835,1.377,3.856,50.96,0.008805,0.03029,0.02488,0.01448,0.01486,0.005412,17.06,28.14,110.6,897,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
+844981,M,13,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
+84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075
+845636,M,16.02,23.24,102.7,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,0.3795,1.187,2.466,40.51,0.004029,0.009269,0.01101,0.007591,0.0146,0.003042,19.19,33.88,123.8,1150,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452
+84610002,M,15.78,17.89,103.6,781,0.0971,0.1292,0.09954,0.06606,0.1842,0.06082,0.5058,0.9849,3.564,54.16,0.005771,0.04061,0.02791,0.01282,0.02008,0.004144,20.42,27.28,136.5,1299,0.1396,0.5609,0.3965,0.181,0.3792,0.1048
+846226,M,19.17,24.8,132.4,1123,0.0974,0.2458,0.2065,0.1118,0.2397,0.078,0.9555,3.568,11.07,116.2,0.003139,0.08297,0.0889,0.0409,0.04484,0.01284,20.96,29.94,151.7,1332,0.1037,0.3903,0.3639,0.1767,0.3176,0.1023
+846381,M,15.85,23.95,103.7,782.7,0.08401,0.1002,0.09938,0.05364,0.1847,0.05338,0.4033,1.078,2.903,36.58,0.009769,0.03126,0.05051,0.01992,0.02981,0.003002,16.84,27.66,112,876.5,0.1131,0.1924,0.2322,0.1119,0.2809,0.06287
+84667401,M,13.73,22.61,93.6,578.3,0.1131,0.2293,0.2128,0.08025,0.2069,0.07682,0.2121,1.169,2.061,19.21,0.006429,0.05936,0.05501,0.01628,0.01961,0.008093,15.03,32.01,108.8,697.7,0.1651,0.7725,0.6943,0.2208,0.3596,0.1431
+84799002,M,14.54,27.54,96.73,658.8,0.1139,0.1595,0.1639,0.07364,0.2303,0.07077,0.37,1.033,2.879,32.55,0.005607,0.0424,0.04741,0.0109,0.01857,0.005466,17.46,37.13,124.1,943.2,0.1678,0.6577,0.7026,0.1712,0.4218,0.1341
+848406,M,14.68,20.13,94.74,684.5,0.09867,0.072,0.07395,0.05259,0.1586,0.05922,0.4727,1.24,3.195,45.4,0.005718,0.01162,0.01998,0.01109,0.0141,0.002085,19.07,30.88,123.4,1138,0.1464,0.1871,0.2914,0.1609,0.3029,0.08216
+84862001,M,16.13,20.68,108.1,798.8,0.117,0.2022,0.1722,0.1028,0.2164,0.07356,0.5692,1.073,3.854,54.18,0.007026,0.02501,0.03188,0.01297,0.01689,0.004142,20.96,31.48,136.8,1315,0.1789,0.4233,0.4784,0.2073,0.3706,0.1142
+849014,M,19.81,22.15,130,1260,0.09831,0.1027,0.1479,0.09498,0.1582,0.05395,0.7582,1.017,5.865,112.4,0.006494,0.01893,0.03391,0.01521,0.01356,0.001997,27.32,30.88,186.8,2398,0.1512,0.315,0.5372,0.2388,0.2768,0.07615
+8510426,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259
+8510653,B,13.08,15.71,85.63,520,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,0.1852,0.7477,1.383,14.67,0.004097,0.01898,0.01698,0.00649,0.01678,0.002425,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
+8510824,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,0.06905,0.2773,0.9768,1.909,15.7,0.009606,0.01432,0.01985,0.01421,0.02027,0.002968,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773
+8511133,M,15.34,14.26,102.5,704.4,0.1073,0.2135,0.2077,0.09756,0.2521,0.07032,0.4388,0.7096,3.384,44.91,0.006789,0.05328,0.06446,0.02252,0.03672,0.004394,18.07,19.08,125.1,980.9,0.139,0.5954,0.6305,0.2393,0.4667,0.09946
+851509,M,21.16,23.04,137.2,1404,0.09428,0.1022,0.1097,0.08632,0.1769,0.05278,0.6917,1.127,4.303,93.99,0.004728,0.01259,0.01715,0.01038,0.01083,0.001987,29.17,35.59,188,2615,0.1401,0.26,0.3155,0.2009,0.2822,0.07526
+852552,M,16.65,21.38,110,904.6,0.1121,0.1457,0.1525,0.0917,0.1995,0.0633,0.8068,0.9017,5.455,102.6,0.006048,0.01882,0.02741,0.0113,0.01468,0.002801,26.46,31.56,177,2215,0.1805,0.3578,0.4695,0.2095,0.3613,0.09564
+852631,M,17.14,16.4,116,912.7,0.1186,0.2276,0.2229,0.1401,0.304,0.07413,1.046,0.976,7.276,111.4,0.008029,0.03799,0.03732,0.02397,0.02308,0.007444,22.25,21.4,152.4,1461,0.1545,0.3949,0.3853,0.255,0.4066,0.1059
+852763,M,14.58,21.53,97.41,644.8,0.1054,0.1868,0.1425,0.08783,0.2252,0.06924,0.2545,0.9832,2.11,21.05,0.004452,0.03055,0.02681,0.01352,0.01454,0.003711,17.62,33.21,122.4,896.9,0.1525,0.6643,0.5539,0.2701,0.4264,0.1275
+852781,M,18.61,20.25,122.1,1094,0.0944,0.1066,0.149,0.07731,0.1697,0.05699,0.8529,1.849,5.632,93.54,0.01075,0.02722,0.05081,0.01911,0.02293,0.004217,21.31,27.26,139.9,1403,0.1338,0.2117,0.3446,0.149,0.2341,0.07421
+852973,M,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,0.439,1.012,3.498,43.5,0.005233,0.03057,0.03576,0.01083,0.01768,0.002967,20.27,36.71,149.3,1269,0.1641,0.611,0.6335,0.2024,0.4027,0.09876
+853201,M,17.57,15.05,115,955.1,0.09847,0.1157,0.09875,0.07953,0.1739,0.06149,0.6003,0.8225,4.655,61.1,0.005627,0.03033,0.03407,0.01354,0.01925,0.003742,20.01,19.52,134.9,1227,0.1255,0.2812,0.2489,0.1456,0.2756,0.07919
+853401,M,18.63,25.11,124.8,1088,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197,0.8307,1.466,5.574,105,0.006248,0.03374,0.05196,0.01158,0.02007,0.00456,23.15,34.01,160.5,1670,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782
+853612,M,11.84,18.7,77.93,440.6,0.1109,0.1516,0.1218,0.05182,0.2301,0.07799,0.4825,1.03,3.475,41,0.005551,0.03414,0.04205,0.01044,0.02273,0.005667,16.82,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.1402
+85382601,M,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,0.6009,1.398,3.999,67.78,0.008268,0.03082,0.05042,0.01112,0.02102,0.003854,20.88,32.09,136.1,1344,0.1634,0.3559,0.5588,0.1847,0.353,0.08482
+854002,M,19.27,26.47,127.9,1162,0.09401,0.1719,0.1657,0.07593,0.1853,0.06261,0.5558,0.6062,3.528,68.17,0.005015,0.03318,0.03497,0.009643,0.01543,0.003896,24.15,30.9,161.4,1813,0.1509,0.659,0.6091,0.1785,0.3672,0.1123
+854039,M,16.13,17.88,107,807.2,0.104,0.1559,0.1354,0.07752,0.1998,0.06515,0.334,0.6857,2.183,35.03,0.004185,0.02868,0.02664,0.009067,0.01703,0.003817,20.21,27.26,132.7,1261,0.1446,0.5804,0.5274,0.1864,0.427,0.1233
+854253,M,16.74,21.59,110.1,869.5,0.0961,0.1336,0.1348,0.06018,0.1896,0.05656,0.4615,0.9197,3.008,45.19,0.005776,0.02499,0.03695,0.01195,0.02789,0.002665,20.01,29.02,133.5,1229,0.1563,0.3835,0.5409,0.1813,0.4863,0.08633
+854268,M,14.25,21.72,93.63,633,0.09823,0.1098,0.1319,0.05598,0.1885,0.06125,0.286,1.019,2.657,24.91,0.005878,0.02995,0.04815,0.01161,0.02028,0.004022,15.89,30.36,116.2,799.6,0.1446,0.4238,0.5186,0.1447,0.3591,0.1014
+854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,0.1839,2.342,1.17,14.16,0.004352,0.004899,0.01343,0.01164,0.02671,0.001777,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
+855133,M,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504,1.214,2.188,8.077,106,0.006883,0.01094,0.01818,0.01917,0.007882,0.001754,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504
+855138,M,13.48,20.82,88.4,559.2,0.1016,0.1255,0.1063,0.05439,0.172,0.06419,0.213,0.5914,1.545,18.52,0.005367,0.02239,0.03049,0.01262,0.01377,0.003187,15.53,26.02,107.3,740.4,0.161,0.4225,0.503,0.2258,0.2807,0.1071
+855167,M,13.44,21.58,86.18,563,0.08162,0.06031,0.0311,0.02031,0.1784,0.05587,0.2385,0.8265,1.572,20.53,0.00328,0.01102,0.0139,0.006881,0.0138,0.001286,15.93,30.25,102.5,787.9,0.1094,0.2043,0.2085,0.1112,0.2994,0.07146
+855563,M,10.95,21.35,71.9,371.1,0.1227,0.1218,0.1044,0.05669,0.1895,0.0687,0.2366,1.428,1.822,16.97,0.008064,0.01764,0.02595,0.01037,0.01357,0.00304,12.84,35.34,87.22,514,0.1909,0.2698,0.4023,0.1424,0.2964,0.09606
+855625,M,19.07,24.81,128.3,1104,0.09081,0.219,0.2107,0.09961,0.231,0.06343,0.9811,1.666,8.83,104.9,0.006548,0.1006,0.09723,0.02638,0.05333,0.007646,24.09,33.17,177.4,1651,0.1247,0.7444,0.7242,0.2493,0.467,0.1038
+856106,M,13.28,20.28,87.32,545.2,0.1041,0.1436,0.09847,0.06158,0.1974,0.06782,0.3704,0.8249,2.427,31.33,0.005072,0.02147,0.02185,0.00956,0.01719,0.003317,17.38,28,113.1,907.2,0.153,0.3724,0.3664,0.1492,0.3739,0.1027
+85638502,M,13.17,21.81,85.42,531.5,0.09714,0.1047,0.08259,0.05252,0.1746,0.06177,0.1938,0.6123,1.334,14.49,0.00335,0.01384,0.01452,0.006853,0.01113,0.00172,16.23,29.89,105.5,740.7,0.1503,0.3904,0.3728,0.1607,0.3693,0.09618
+857010,M,18.65,17.6,123.7,1076,0.1099,0.1686,0.1974,0.1009,0.1907,0.06049,0.6289,0.6633,4.293,71.56,0.006294,0.03994,0.05554,0.01695,0.02428,0.003535,22.82,21.32,150.6,1567,0.1679,0.509,0.7345,0.2378,0.3799,0.09185
+85713702,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,0.06503,0.1563,0.9567,1.094,8.205,0.008968,0.01646,0.01588,0.005917,0.02574,0.002582,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409
+85715,M,13.17,18.66,85.98,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,0.2871,0.8937,1.897,24.25,0.006532,0.02336,0.02905,0.01215,0.01743,0.003643,15.67,27.95,102.8,759.4,0.1786,0.4166,0.5006,0.2088,0.39,0.1179
+857155,B,12.05,14.63,78.04,449.3,0.1031,0.09092,0.06592,0.02749,0.1675,0.06043,0.2636,0.7294,1.848,19.87,0.005488,0.01427,0.02322,0.00566,0.01428,0.002422,13.76,20.7,89.88,582.6,0.1494,0.2156,0.305,0.06548,0.2747,0.08301
+857156,B,13.49,22.3,86.91,561,0.08752,0.07698,0.04751,0.03384,0.1809,0.05718,0.2338,1.353,1.735,20.2,0.004455,0.01382,0.02095,0.01184,0.01641,0.001956,15.15,31.82,99,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917
+857343,B,11.76,21.6,74.72,427.9,0.08637,0.04966,0.01657,0.01115,0.1495,0.05888,0.4062,1.21,2.635,28.47,0.005857,0.009758,0.01168,0.007445,0.02406,0.001769,12.98,25.72,82.98,516.5,0.1085,0.08615,0.05523,0.03715,0.2433,0.06563
+857373,B,13.64,16.34,87.21,571.8,0.07685,0.06059,0.01857,0.01723,0.1353,0.05953,0.1872,0.9234,1.449,14.55,0.004477,0.01177,0.01079,0.007956,0.01325,0.002551,14.67,23.19,96.08,656.7,0.1089,0.1582,0.105,0.08586,0.2346,0.08025
+857374,B,11.94,18.24,75.71,437.6,0.08261,0.04751,0.01972,0.01349,0.1868,0.0611,0.2273,0.6329,1.52,17.47,0.00721,0.00838,0.01311,0.008,0.01996,0.002635,13.1,21.33,83.67,527.2,0.1144,0.08906,0.09203,0.06296,0.2785,0.07408
+857392,M,18.22,18.7,120.3,1033,0.1148,0.1485,0.1772,0.106,0.2092,0.0631,0.8337,1.593,4.877,98.81,0.003899,0.02961,0.02817,0.009222,0.02674,0.005126,20.6,24.13,135.1,1321,0.128,0.2297,0.2623,0.1325,0.3021,0.07987
+857438,M,15.1,22.02,97.26,712.8,0.09056,0.07081,0.05253,0.03334,0.1616,0.05684,0.3105,0.8339,2.097,29.91,0.004675,0.0103,0.01603,0.009222,0.01095,0.001629,18.1,31.69,117.7,1030,0.1389,0.2057,0.2712,0.153,0.2675,0.07873
+85759902,B,11.52,18.75,73.34,409,0.09524,0.05473,0.03036,0.02278,0.192,0.05907,0.3249,0.9591,2.183,23.47,0.008328,0.008722,0.01349,0.00867,0.03218,0.002386,12.84,22.47,81.81,506.2,0.1249,0.0872,0.09076,0.06316,0.3306,0.07036
+857637,M,19.21,18.57,125.5,1152,0.1053,0.1267,0.1323,0.08994,0.1917,0.05961,0.7275,1.193,4.837,102.5,0.006458,0.02306,0.02945,0.01538,0.01852,0.002608,26.14,28.14,170.1,2145,0.1624,0.3511,0.3879,0.2091,0.3537,0.08294
+857793,M,14.71,21.59,95.55,656.9,0.1137,0.1365,0.1293,0.08123,0.2027,0.06758,0.4226,1.15,2.735,40.09,0.003659,0.02855,0.02572,0.01272,0.01817,0.004108,17.87,30.7,115.7,985.5,0.1368,0.429,0.3587,0.1834,0.3698,0.1094
+857810,B,13.05,19.31,82.61,527.2,0.0806,0.03789,0.000692,0.004167,0.1819,0.05501,0.404,1.214,2.595,32.96,0.007491,0.008593,0.000692,0.004167,0.0219,0.00299,14.23,22.25,90.24,624.1,0.1021,0.06191,0.001845,0.01111,0.2439,0.06289
+858477,B,8.618,11.79,54.34,224.5,0.09752,0.05272,0.02061,0.007799,0.1683,0.07187,0.1559,0.5796,1.046,8.322,0.01011,0.01055,0.01981,0.005742,0.0209,0.002788,9.507,15.4,59.9,274.9,0.1733,0.1239,0.1168,0.04419,0.322,0.09026
+858970,B,10.17,14.88,64.55,311.9,0.1134,0.08061,0.01084,0.0129,0.2743,0.0696,0.5158,1.441,3.312,34.62,0.007514,0.01099,0.007665,0.008193,0.04183,0.005953,11.02,17.45,69.86,368.6,0.1275,0.09866,0.02168,0.02579,0.3557,0.0802
+858981,B,8.598,20.98,54.66,221.8,0.1243,0.08963,0.03,0.009259,0.1828,0.06757,0.3582,2.067,2.493,18.39,0.01193,0.03162,0.03,0.009259,0.03357,0.003048,9.565,27.04,62.06,273.9,0.1639,0.1698,0.09001,0.02778,0.2972,0.07712
+858986,M,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,0.07292,0.7036,1.268,5.373,60.78,0.009407,0.07056,0.06899,0.01848,0.017,0.006113,17.67,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132
+859196,B,9.173,13.86,59.2,260.9,0.07721,0.08751,0.05988,0.0218,0.2341,0.06963,0.4098,2.265,2.608,23.52,0.008738,0.03938,0.04312,0.0156,0.04192,0.005822,10.01,19.23,65.59,310.1,0.09836,0.1678,0.1397,0.05087,0.3282,0.0849
+85922302,M,12.68,23.84,82.69,499,0.1122,0.1262,0.1128,0.06873,0.1905,0.0659,0.4255,1.178,2.927,36.46,0.007781,0.02648,0.02973,0.0129,0.01635,0.003601,17.09,33.47,111.8,888.3,0.1851,0.4061,0.4024,0.1716,0.3383,0.1031
+859283,M,14.78,23.94,97.4,668.3,0.1172,0.1479,0.1267,0.09029,0.1953,0.06654,0.3577,1.281,2.45,35.24,0.006703,0.0231,0.02315,0.01184,0.019,0.003224,17.31,33.39,114.6,925.1,0.1648,0.3416,0.3024,0.1614,0.3321,0.08911
+859464,B,9.465,21.01,60.11,269.4,0.1044,0.07773,0.02172,0.01504,0.1717,0.06899,0.2351,2.011,1.66,14.2,0.01052,0.01755,0.01714,0.009333,0.02279,0.004237,10.41,31.56,67.03,330.7,0.1548,0.1664,0.09412,0.06517,0.2878,0.09211
+859465,B,11.31,19.04,71.8,394.1,0.08139,0.04701,0.03709,0.0223,0.1516,0.05667,0.2727,0.9429,1.831,18.15,0.009282,0.009216,0.02063,0.008965,0.02183,0.002146,12.33,23.84,78,466.7,0.129,0.09148,0.1444,0.06961,0.24,0.06641
+859471,B,9.029,17.33,58.79,250.5,0.1066,0.1413,0.313,0.04375,0.2111,0.08046,0.3274,1.194,1.885,17.67,0.009549,0.08606,0.3038,0.03322,0.04197,0.009559,10.31,22.65,65.5,324.7,0.1482,0.4365,1.252,0.175,0.4228,0.1175
+859487,B,12.78,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.159,0.05653,0.2368,0.8732,1.471,18.33,0.007962,0.005612,0.01585,0.008662,0.02254,0.001906,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.0641
+859575,M,18.94,21.31,123.6,1130,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,0.7888,0.7975,5.486,96.05,0.004444,0.01652,0.02269,0.0137,0.01386,0.001698,24.86,26.58,165.9,1866,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589
+859711,B,8.888,14.64,58.79,244,0.09783,0.1531,0.08606,0.02872,0.1902,0.0898,0.5262,0.8522,3.168,25.44,0.01721,0.09368,0.05671,0.01766,0.02541,0.02193,9.733,15.67,62.56,284.4,0.1207,0.2436,0.1434,0.04786,0.2254,0.1084
+859717,M,17.2,24.52,114.2,929.4,0.1071,0.183,0.1692,0.07944,0.1927,0.06487,0.5907,1.041,3.705,69.47,0.00582,0.05616,0.04252,0.01127,0.01527,0.006299,23.32,33.82,151.6,1681,0.1585,0.7394,0.6566,0.1899,0.3313,0.1339
+859983,M,13.8,15.79,90.43,584.1,0.1007,0.128,0.07789,0.05069,0.1662,0.06566,0.2787,0.6205,1.957,23.35,0.004717,0.02065,0.01759,0.009206,0.0122,0.00313,16.57,20.86,110.3,812.4,0.1411,0.3542,0.2779,0.1383,0.2589,0.103
+8610175,B,12.31,16.52,79.19,470.9,0.09172,0.06829,0.03372,0.02272,0.172,0.05914,0.2505,1.025,1.74,19.68,0.004854,0.01819,0.01826,0.007965,0.01386,0.002304,14.11,23.21,89.71,611.1,0.1176,0.1843,0.1703,0.0866,0.2618,0.07609
+8610404,M,16.07,19.65,104.1,817.7,0.09168,0.08424,0.09769,0.06638,0.1798,0.05391,0.7474,1.016,5.029,79.25,0.01082,0.02203,0.035,0.01809,0.0155,0.001948,19.77,24.56,128.8,1223,0.15,0.2045,0.2829,0.152,0.265,0.06387
+8610629,B,13.53,10.94,87.91,559.2,0.1291,0.1047,0.06877,0.06556,0.2403,0.06641,0.4101,1.014,2.652,32.65,0.0134,0.02839,0.01162,0.008239,0.02572,0.006164,14.08,12.49,91.36,605.5,0.1451,0.1379,0.08539,0.07407,0.271,0.07191
+8610637,M,18.05,16.15,120.2,1006,0.1065,0.2146,0.1684,0.108,0.2152,0.06673,0.9806,0.5505,6.311,134.8,0.00794,0.05839,0.04658,0.0207,0.02591,0.007054,22.39,18.91,150.1,1610,0.1478,0.5634,0.3786,0.2102,0.3751,0.1108
+8610862,M,20.18,23.97,143.7,1245,0.1286,0.3454,0.3754,0.1604,0.2906,0.08142,0.9317,1.885,8.649,116.4,0.01038,0.06835,0.1091,0.02593,0.07895,0.005987,23.37,31.72,170.3,1623,0.1639,0.6164,0.7681,0.2508,0.544,0.09964
+8610908,B,12.86,18,83.19,506.3,0.09934,0.09546,0.03889,0.02315,0.1718,0.05997,0.2655,1.095,1.778,20.35,0.005293,0.01661,0.02071,0.008179,0.01748,0.002848,14.24,24.82,91.88,622.1,0.1289,0.2141,0.1731,0.07926,0.2779,0.07918
+861103,B,11.45,20.97,73.81,401.5,0.1102,0.09362,0.04591,0.02233,0.1842,0.07005,0.3251,2.174,2.077,24.62,0.01037,0.01706,0.02586,0.007506,0.01816,0.003976,13.11,32.16,84.53,525.1,0.1557,0.1676,0.1755,0.06127,0.2762,0.08851
+8611161,B,13.34,15.86,86.49,520,0.1078,0.1535,0.1169,0.06987,0.1942,0.06902,0.286,1.016,1.535,12.96,0.006794,0.03575,0.0398,0.01383,0.02134,0.004603,15.53,23.19,96.66,614.9,0.1536,0.4791,0.4858,0.1708,0.3527,0.1016
+8611555,M,25.22,24.91,171.5,1878,0.1063,0.2665,0.3339,0.1845,0.1829,0.06782,0.8973,1.474,7.382,120,0.008166,0.05693,0.0573,0.0203,0.01065,0.005893,30,33.62,211.7,2562,0.1573,0.6076,0.6476,0.2867,0.2355,0.1051
+8611792,M,19.1,26.29,129.1,1132,0.1215,0.1791,0.1937,0.1469,0.1634,0.07224,0.519,2.91,5.801,67.1,0.007545,0.0605,0.02134,0.01843,0.03056,0.01039,20.33,32.72,141.3,1298,0.1392,0.2817,0.2432,0.1841,0.2311,0.09203
+8612080,B,12,15.65,76.95,443.3,0.09723,0.07165,0.04151,0.01863,0.2079,0.05968,0.2271,1.255,1.441,16.16,0.005969,0.01812,0.02007,0.007027,0.01972,0.002607,13.67,24.9,87.78,567.9,0.1377,0.2003,0.2267,0.07632,0.3379,0.07924
+8612399,M,18.46,18.52,121.1,1075,0.09874,0.1053,0.1335,0.08795,0.2132,0.06022,0.6997,1.475,4.782,80.6,0.006471,0.01649,0.02806,0.0142,0.0237,0.003755,22.93,27.68,152.2,1603,0.1398,0.2089,0.3157,0.1642,0.3695,0.08579
+86135501,M,14.48,21.46,94.25,648.2,0.09444,0.09947,0.1204,0.04938,0.2075,0.05636,0.4204,2.22,3.301,38.87,0.009369,0.02983,0.05371,0.01761,0.02418,0.003249,16.21,29.25,108.4,808.9,0.1306,0.1976,0.3349,0.1225,0.302,0.06846
+86135502,M,19.02,24.59,122,1076,0.09029,0.1206,0.1468,0.08271,0.1953,0.05629,0.5495,0.6636,3.055,57.65,0.003872,0.01842,0.0371,0.012,0.01964,0.003337,24.56,30.41,152.9,1623,0.1249,0.3206,0.5755,0.1956,0.3956,0.09288
+861597,B,12.36,21.8,79.78,466.1,0.08772,0.09445,0.06015,0.03745,0.193,0.06404,0.2978,1.502,2.203,20.95,0.007112,0.02493,0.02703,0.01293,0.01958,0.004463,13.83,30.5,91.46,574.7,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261
+861598,B,14.64,15.24,95.77,651.9,0.1132,0.1339,0.09966,0.07064,0.2116,0.06346,0.5115,0.7372,3.814,42.76,0.005508,0.04412,0.04436,0.01623,0.02427,0.004841,16.34,18.24,109.4,803.6,0.1277,0.3089,0.2604,0.1397,0.3151,0.08473
+861648,B,14.62,24.02,94.57,662.7,0.08974,0.08606,0.03102,0.02957,0.1685,0.05866,0.3721,1.111,2.279,33.76,0.004868,0.01818,0.01121,0.008606,0.02085,0.002893,16.11,29.11,102.9,803.7,0.1115,0.1766,0.09189,0.06946,0.2522,0.07246
+861799,M,15.37,22.76,100.2,728.2,0.092,0.1036,0.1122,0.07483,0.1717,0.06097,0.3129,0.8413,2.075,29.44,0.009882,0.02444,0.04531,0.01763,0.02471,0.002142,16.43,25.84,107.5,830.9,0.1257,0.1997,0.2846,0.1476,0.2556,0.06828
+861853,B,13.27,14.76,84.74,551.7,0.07355,0.05055,0.03261,0.02648,0.1386,0.05318,0.4057,1.153,2.701,36.35,0.004481,0.01038,0.01358,0.01082,0.01069,0.001435,16.36,22.35,104.5,830.6,0.1006,0.1238,0.135,0.1001,0.2027,0.06206
+862009,B,13.45,18.3,86.6,555.1,0.1022,0.08165,0.03974,0.0278,0.1638,0.0571,0.295,1.373,2.099,25.22,0.005884,0.01491,0.01872,0.009366,0.01884,0.001817,15.1,25.94,97.59,699.4,0.1339,0.1751,0.1381,0.07911,0.2678,0.06603
+862028,M,15.06,19.83,100.3,705.6,0.1039,0.1553,0.17,0.08815,0.1855,0.06284,0.4768,0.9644,3.706,47.14,0.00925,0.03715,0.04867,0.01851,0.01498,0.00352,18.23,24.23,123.5,1025,0.1551,0.4203,0.5203,0.2115,0.2834,0.08234
+86208,M,20.26,23.03,132.4,1264,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,0.7576,1.509,4.554,87.87,0.006016,0.03482,0.04232,0.01269,0.02657,0.004411,24.22,31.59,156.1,1750,0.119,0.3539,0.4098,0.1573,0.3689,0.08368
+86211,B,12.18,17.84,77.79,451.1,0.1045,0.07057,0.0249,0.02941,0.19,0.06635,0.3661,1.511,2.41,24.44,0.005433,0.01179,0.01131,0.01519,0.0222,0.003408,12.83,20.92,82.14,495.2,0.114,0.09358,0.0498,0.05882,0.2227,0.07376
+862261,B,9.787,19.94,62.11,294.5,0.1024,0.05301,0.006829,0.007937,0.135,0.0689,0.335,2.043,2.132,20.05,0.01113,0.01463,0.005308,0.00525,0.01801,0.005667,10.92,26.29,68.81,366.1,0.1316,0.09473,0.02049,0.02381,0.1934,0.08988
+862485,B,11.6,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.0335,0.162,0.06582,0.2315,0.5391,1.475,15.75,0.006153,0.0133,0.01693,0.006884,0.01651,0.002551,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756
+862548,M,14.42,19.77,94.48,642.5,0.09752,0.1141,0.09388,0.05839,0.1879,0.0639,0.2895,1.851,2.376,26.85,0.008005,0.02895,0.03321,0.01424,0.01462,0.004452,16.33,30.86,109.5,826.4,0.1431,0.3026,0.3194,0.1565,0.2718,0.09353
+862717,M,13.61,24.98,88.05,582.7,0.09488,0.08511,0.08625,0.04489,0.1609,0.05871,0.4565,1.29,2.861,43.14,0.005872,0.01488,0.02647,0.009921,0.01465,0.002355,16.99,35.27,108.6,906.5,0.1265,0.1943,0.3169,0.1184,0.2651,0.07397
+862722,B,6.981,13.43,43.79,143.5,0.117,0.07568,0,0,0.193,0.07818,0.2241,1.508,1.553,9.833,0.01019,0.01084,0,0,0.02659,0.0041,7.93,19.54,50.41,185.2,0.1584,0.1202,0,0,0.2932,0.09382
+862965,B,12.18,20.52,77.22,458.7,0.08013,0.04038,0.02383,0.0177,0.1739,0.05677,0.1924,1.571,1.183,14.68,0.00508,0.006098,0.01069,0.006797,0.01447,0.001532,13.34,32.84,84.58,547.8,0.1123,0.08862,0.1145,0.07431,0.2694,0.06878
+862980,B,9.876,19.4,63.95,298.3,0.1005,0.09697,0.06154,0.03029,0.1945,0.06322,0.1803,1.222,1.528,11.77,0.009058,0.02196,0.03029,0.01112,0.01609,0.00357,10.76,26.83,72.22,361.2,0.1559,0.2302,0.2644,0.09749,0.2622,0.0849
+862989,B,10.49,19.29,67.41,336.1,0.09989,0.08578,0.02995,0.01201,0.2217,0.06481,0.355,1.534,2.302,23.13,0.007595,0.02219,0.0288,0.008614,0.0271,0.003451,11.54,23.31,74.22,402.8,0.1219,0.1486,0.07987,0.03203,0.2826,0.07552
+863030,M,13.11,15.56,87.21,530.2,0.1398,0.1765,0.2071,0.09601,0.1925,0.07692,0.3908,0.9238,2.41,34.66,0.007162,0.02912,0.05473,0.01388,0.01547,0.007098,16.31,22.4,106.4,827.2,0.1862,0.4099,0.6376,0.1986,0.3147,0.1405
+863031,B,11.64,18.33,75.17,412.5,0.1142,0.1017,0.0707,0.03485,0.1801,0.0652,0.306,1.657,2.155,20.62,0.00854,0.0231,0.02945,0.01398,0.01565,0.00384,13.14,29.26,85.51,521.7,0.1688,0.266,0.2873,0.1218,0.2806,0.09097
+863270,B,12.36,18.54,79.01,466.7,0.08477,0.06815,0.02643,0.01921,0.1602,0.06066,0.1199,0.8944,0.8484,9.227,0.003457,0.01047,0.01167,0.005558,0.01251,0.001356,13.29,27.49,85.56,544.1,0.1184,0.1963,0.1937,0.08442,0.2983,0.07185
+86355,M,22.27,19.67,152.8,1509,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,1.215,1.545,10.05,170,0.006515,0.08668,0.104,0.0248,0.03112,0.005037,28.4,28.01,206.8,2360,0.1701,0.6997,0.9608,0.291,0.4055,0.09789
+864018,B,11.34,21.26,72.48,396.5,0.08759,0.06575,0.05133,0.01899,0.1487,0.06529,0.2344,0.9861,1.597,16.41,0.009113,0.01557,0.02443,0.006435,0.01568,0.002477,13.01,29.15,83.99,518.1,0.1699,0.2196,0.312,0.08278,0.2829,0.08832
+864033,B,9.777,16.99,62.5,290.2,0.1037,0.08404,0.04334,0.01778,0.1584,0.07065,0.403,1.424,2.747,22.87,0.01385,0.02932,0.02722,0.01023,0.03281,0.004638,11.05,21.47,71.68,367,0.1467,0.1765,0.13,0.05334,0.2533,0.08468
+86408,B,12.63,20.76,82.15,480.4,0.09933,0.1209,0.1065,0.06021,0.1735,0.0707,0.3424,1.803,2.711,20.48,0.01291,0.04042,0.05101,0.02295,0.02144,0.005891,13.33,25.47,89,527.4,0.1287,0.225,0.2216,0.1105,0.2226,0.08486
+86409,B,14.26,19.65,97.83,629.9,0.07837,0.2233,0.3003,0.07798,0.1704,0.07769,0.3628,1.49,3.399,29.25,0.005298,0.07446,0.1435,0.02292,0.02566,0.01298,15.3,23.73,107,709,0.08949,0.4193,0.6783,0.1505,0.2398,0.1082
+864292,B,10.51,20.19,68.64,334.2,0.1122,0.1303,0.06476,0.03068,0.1922,0.07782,0.3336,1.86,2.041,19.91,0.01188,0.03747,0.04591,0.01544,0.02287,0.006792,11.16,22.75,72.62,374.4,0.13,0.2049,0.1295,0.06136,0.2383,0.09026
+864496,B,8.726,15.83,55.84,230.9,0.115,0.08201,0.04132,0.01924,0.1649,0.07633,0.1665,0.5864,1.354,8.966,0.008261,0.02213,0.03259,0.0104,0.01708,0.003806,9.628,19.62,64.48,284.4,0.1724,0.2364,0.2456,0.105,0.2926,0.1017
+864685,B,11.93,21.53,76.53,438.6,0.09768,0.07849,0.03328,0.02008,0.1688,0.06194,0.3118,0.9227,2,24.79,0.007803,0.02507,0.01835,0.007711,0.01278,0.003856,13.67,26.15,87.54,583,0.15,0.2399,0.1503,0.07247,0.2438,0.08541
+864726,B,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,0.3132,0.9789,3.28,16.94,0.01835,0.0676,0.09263,0.02308,0.02384,0.005601,9.414,17.07,63.34,270,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722
+864729,M,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,0.4266,0.9489,2.989,41.18,0.006985,0.02563,0.03011,0.01271,0.01602,0.003884,18.81,27.37,127.1,1095,0.1878,0.448,0.4704,0.2027,0.3585,0.1065
+864877,M,15.78,22.91,105.7,782.6,0.1155,0.1752,0.2133,0.09479,0.2096,0.07331,0.552,1.072,3.598,58.63,0.008699,0.03976,0.0595,0.0139,0.01495,0.005984,20.19,30.5,130.3,1272,0.1855,0.4925,0.7356,0.2034,0.3274,0.1252
+865128,M,17.95,20.01,114.2,982,0.08402,0.06722,0.07293,0.05596,0.2129,0.05025,0.5506,1.214,3.357,54.04,0.004024,0.008422,0.02291,0.009863,0.05014,0.001902,20.58,27.83,129.2,1261,0.1072,0.1202,0.2249,0.1185,0.4882,0.06111
+865137,B,11.41,10.82,73.34,403.3,0.09373,0.06685,0.03512,0.02623,0.1667,0.06113,0.1408,0.4607,1.103,10.5,0.00604,0.01529,0.01514,0.00646,0.01344,0.002206,12.82,15.97,83.74,510.5,0.1548,0.239,0.2102,0.08958,0.3016,0.08523
+86517,M,18.66,17.12,121.4,1077,0.1054,0.11,0.1457,0.08665,0.1966,0.06213,0.7128,1.581,4.895,90.47,0.008102,0.02101,0.03342,0.01601,0.02045,0.00457,22.25,24.9,145.4,1549,0.1503,0.2291,0.3272,0.1674,0.2894,0.08456
+865423,M,24.25,20.2,166.2,1761,0.1447,0.2867,0.4268,0.2012,0.2655,0.06877,1.509,3.12,9.807,233,0.02333,0.09806,0.1278,0.01822,0.04547,0.009875,26.02,23.99,180.9,2073,0.1696,0.4244,0.5803,0.2248,0.3222,0.08009
+865432,B,14.5,10.89,94.28,640.7,0.1101,0.1099,0.08842,0.05778,0.1856,0.06402,0.2929,0.857,1.928,24.19,0.003818,0.01276,0.02882,0.012,0.0191,0.002808,15.7,15.98,102.8,745.5,0.1313,0.1788,0.256,0.1221,0.2889,0.08006
+865468,B,13.37,16.39,86.1,553.5,0.07115,0.07325,0.08092,0.028,0.1422,0.05823,0.1639,1.14,1.223,14.66,0.005919,0.0327,0.04957,0.01038,0.01208,0.004076,14.26,22.75,91.99,632.1,0.1025,0.2531,0.3308,0.08978,0.2048,0.07628
+86561,B,13.85,17.21,88.44,588.7,0.08785,0.06136,0.0142,0.01141,0.1614,0.0589,0.2185,0.8561,1.495,17.91,0.004599,0.009169,0.009127,0.004814,0.01247,0.001708,15.49,23.58,100.3,725.9,0.1157,0.135,0.08115,0.05104,0.2364,0.07182
+866083,M,13.61,24.69,87.76,572.6,0.09258,0.07862,0.05285,0.03085,0.1761,0.0613,0.231,1.005,1.752,19.83,0.004088,0.01174,0.01796,0.00688,0.01323,0.001465,16.89,35.64,113.2,848.7,0.1471,0.2884,0.3796,0.1329,0.347,0.079
+866203,M,19,18.91,123.4,1138,0.08217,0.08028,0.09271,0.05627,0.1946,0.05044,0.6896,1.342,5.216,81.23,0.004428,0.02731,0.0404,0.01361,0.0203,0.002686,22.32,25.73,148.2,1538,0.1021,0.2264,0.3207,0.1218,0.2841,0.06541
+866458,B,15.1,16.39,99.58,674.5,0.115,0.1807,0.1138,0.08534,0.2001,0.06467,0.4309,1.068,2.796,39.84,0.009006,0.04185,0.03204,0.02258,0.02353,0.004984,16.11,18.33,105.9,762.6,0.1386,0.2883,0.196,0.1423,0.259,0.07779
+866674,M,19.79,25.12,130.4,1192,0.1015,0.1589,0.2545,0.1149,0.2202,0.06113,0.4953,1.199,2.765,63.33,0.005033,0.03179,0.04755,0.01043,0.01578,0.003224,22.63,33.58,148.7,1589,0.1275,0.3861,0.5673,0.1732,0.3305,0.08465
+866714,B,12.19,13.29,79.08,455.8,0.1066,0.09509,0.02855,0.02882,0.188,0.06471,0.2005,0.8163,1.973,15.24,0.006773,0.02456,0.01018,0.008094,0.02662,0.004143,13.34,17.81,91.38,545.2,0.1427,0.2585,0.09915,0.08187,0.3469,0.09241
+8670,M,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,0.4743,0.7859,3.094,48.31,0.00624,0.01484,0.02813,0.01093,0.01397,0.002461,19.26,26,124.9,1156,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019
+86730502,M,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,0.4332,1.265,2.844,43.68,0.004877,0.01952,0.02219,0.009231,0.01535,0.002373,19.47,31.68,129.7,1175,0.1395,0.3055,0.2992,0.1312,0.348,0.07619
+867387,B,15.71,13.93,102,761.7,0.09462,0.09462,0.07135,0.05933,0.1816,0.05723,0.3117,0.8155,1.972,27.94,0.005217,0.01515,0.01678,0.01268,0.01669,0.00233,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071
+867739,M,18.45,21.91,120.2,1075,0.0943,0.09709,0.1153,0.06847,0.1692,0.05727,0.5959,1.202,3.766,68.35,0.006001,0.01422,0.02855,0.009148,0.01492,0.002205,22.52,31.39,145.6,1590,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761
+868202,M,12.77,22.47,81.72,506.3,0.09055,0.05761,0.04711,0.02704,0.1585,0.06065,0.2367,1.38,1.457,19.87,0.007499,0.01202,0.02332,0.00892,0.01647,0.002629,14.49,33.37,92.04,653.6,0.1419,0.1523,0.2177,0.09331,0.2829,0.08067
+868223,B,11.71,16.67,74.72,423.6,0.1051,0.06095,0.03592,0.026,0.1339,0.05945,0.4489,2.508,3.258,34.37,0.006578,0.0138,0.02662,0.01307,0.01359,0.003707,13.33,25.48,86.16,546.7,0.1271,0.1028,0.1046,0.06968,0.1712,0.07343
+868682,B,11.43,15.39,73.06,399.8,0.09639,0.06889,0.03503,0.02875,0.1734,0.05865,0.1759,0.9938,1.143,12.67,0.005133,0.01521,0.01434,0.008602,0.01501,0.001588,12.32,22.02,79.93,462,0.119,0.1648,0.1399,0.08476,0.2676,0.06765
+868826,M,14.95,17.57,96.85,678.1,0.1167,0.1305,0.1539,0.08624,0.1957,0.06216,1.296,1.452,8.419,101.9,0.01,0.0348,0.06577,0.02801,0.05168,0.002887,18.55,21.43,121.4,971.4,0.1411,0.2164,0.3355,0.1667,0.3414,0.07147
+868871,B,11.28,13.39,73,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,0.06072,0.3384,1.343,1.851,26.33,0.01127,0.03498,0.02187,0.01965,0.0158,0.003442,11.92,15.77,76.53,434,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
+868999,B,9.738,11.97,61.24,288.5,0.0925,0.04102,0,0,0.1903,0.06422,0.1988,0.496,1.218,12.26,0.00604,0.005656,0,0,0.02277,0.00322,10.62,14.1,66.53,342.9,0.1234,0.07204,0,0,0.3105,0.08151
+869104,M,16.11,18.05,105.1,813,0.09721,0.1137,0.09447,0.05943,0.1861,0.06248,0.7049,1.332,4.533,74.08,0.00677,0.01938,0.03067,0.01167,0.01875,0.003434,19.92,25.27,129,1233,0.1314,0.2236,0.2802,0.1216,0.2792,0.08158
+869218,B,11.43,17.31,73.66,398,0.1092,0.09486,0.02031,0.01861,0.1645,0.06562,0.2843,1.908,1.937,21.38,0.006664,0.01735,0.01158,0.00952,0.02282,0.003526,12.78,26.76,82.66,503,0.1413,0.1792,0.07708,0.06402,0.2584,0.08096
+869224,B,12.9,15.92,83.74,512.2,0.08677,0.09509,0.04894,0.03088,0.1778,0.06235,0.2143,0.7712,1.689,16.64,0.005324,0.01563,0.0151,0.007584,0.02104,0.001887,14.48,21.82,97.17,643.8,0.1312,0.2548,0.209,0.1012,0.3549,0.08118
+869254,B,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,0.2525,1.239,1.806,17.74,0.006547,0.01781,0.02018,0.005612,0.01671,0.00236,11.95,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769
+869476,B,11.9,14.65,78.11,432.8,0.1152,0.1296,0.0371,0.03003,0.1995,0.07839,0.3962,0.6538,3.021,25.03,0.01017,0.04741,0.02789,0.0111,0.03127,0.009423,13.15,16.51,86.26,509.6,0.1424,0.2517,0.0942,0.06042,0.2727,0.1036
+869691,M,11.8,16.58,78.99,432,0.1091,0.17,0.1659,0.07415,0.2678,0.07371,0.3197,1.426,2.281,24.72,0.005427,0.03633,0.04649,0.01843,0.05628,0.004635,13.74,26.38,91.93,591.7,0.1385,0.4092,0.4504,0.1865,0.5774,0.103
+86973701,B,14.95,18.77,97.84,689.5,0.08138,0.1167,0.0905,0.03562,0.1744,0.06493,0.422,1.909,3.271,39.43,0.00579,0.04877,0.05303,0.01527,0.03356,0.009368,16.25,25.47,107.1,809.7,0.0997,0.2521,0.25,0.08405,0.2852,0.09218
+86973702,B,14.44,15.18,93.97,640.1,0.0997,0.1021,0.08487,0.05532,0.1724,0.06081,0.2406,0.7394,2.12,21.2,0.005706,0.02297,0.03114,0.01493,0.01454,0.002528,15.85,19.85,108.6,766.9,0.1316,0.2735,0.3103,0.1599,0.2691,0.07683
+869931,B,13.74,17.91,88.12,585,0.07944,0.06376,0.02881,0.01329,0.1473,0.0558,0.25,0.7574,1.573,21.47,0.002838,0.01592,0.0178,0.005828,0.01329,0.001976,15.34,22.46,97.19,725.9,0.09711,0.1824,0.1564,0.06019,0.235,0.07014
+871001501,B,13,20.78,83.51,519.4,0.1135,0.07589,0.03136,0.02645,0.254,0.06087,0.4202,1.322,2.873,34.78,0.007017,0.01142,0.01949,0.01153,0.02951,0.001533,14.16,24.11,90.82,616.7,0.1297,0.1105,0.08112,0.06296,0.3196,0.06435
+871001502,B,8.219,20.7,53.27,203.9,0.09405,0.1305,0.1321,0.02168,0.2222,0.08261,0.1935,1.962,1.243,10.21,0.01243,0.05416,0.07753,0.01022,0.02309,0.01178,9.092,29.72,58.08,249.8,0.163,0.431,0.5381,0.07879,0.3322,0.1486
+8710441,B,9.731,15.34,63.78,300.2,0.1072,0.1599,0.4108,0.07857,0.2548,0.09296,0.8245,2.664,4.073,49.85,0.01097,0.09586,0.396,0.05279,0.03546,0.02984,11.02,19.49,71.04,380.5,0.1292,0.2772,0.8216,0.1571,0.3108,0.1259
+87106,B,11.15,13.08,70.87,381.9,0.09754,0.05113,0.01982,0.01786,0.183,0.06105,0.2251,0.7815,1.429,15.48,0.009019,0.008985,0.01196,0.008232,0.02388,0.001619,11.99,16.3,76.25,440.8,0.1341,0.08971,0.07116,0.05506,0.2859,0.06772
+8711002,B,13.15,15.34,85.31,538.9,0.09384,0.08498,0.09293,0.03483,0.1822,0.06207,0.271,0.7927,1.819,22.79,0.008584,0.02017,0.03047,0.009536,0.02769,0.003479,14.77,20.5,97.67,677.3,0.1478,0.2256,0.3009,0.09722,0.3849,0.08633
+8711003,B,12.25,17.94,78.27,460.3,0.08654,0.06679,0.03885,0.02331,0.197,0.06228,0.22,0.9823,1.484,16.51,0.005518,0.01562,0.01994,0.007924,0.01799,0.002484,13.59,25.22,86.6,564.2,0.1217,0.1788,0.1943,0.08211,0.3113,0.08132
+8711202,M,17.68,20.74,117.4,963.7,0.1115,0.1665,0.1855,0.1054,0.1971,0.06166,0.8113,1.4,5.54,93.91,0.009037,0.04954,0.05206,0.01841,0.01778,0.004968,20.47,25.11,132.9,1302,0.1418,0.3498,0.3583,0.1515,0.2463,0.07738
+8711216,B,16.84,19.46,108.4,880.2,0.07445,0.07223,0.0515,0.02771,0.1844,0.05268,0.4789,2.06,3.479,46.61,0.003443,0.02661,0.03056,0.0111,0.0152,0.001519,18.22,28.07,120.3,1032,0.08774,0.171,0.1882,0.08436,0.2527,0.05972
+871122,B,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,0.01963,0.159,0.05907,0.1822,0.7285,1.171,13.25,0.005528,0.009789,0.008342,0.006273,0.01465,0.00253,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898
+871149,B,10.9,12.96,68.69,366.8,0.07515,0.03718,0.00309,0.006588,0.1442,0.05743,0.2818,0.7614,1.808,18.54,0.006142,0.006134,0.001835,0.003576,0.01637,0.002665,12.36,18.2,78.07,470,0.1171,0.08294,0.01854,0.03953,0.2738,0.07685
+8711561,B,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,0.06453,0.5018,1.693,3.926,38.34,0.009433,0.02405,0.04167,0.01152,0.03397,0.005061,13.32,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987
+8711803,M,19.19,15.94,126.3,1157,0.08694,0.1185,0.1193,0.09667,0.1741,0.05176,1,0.6336,6.971,119.3,0.009406,0.03055,0.04344,0.02794,0.03156,0.003362,22.03,17.81,146.6,1495,0.1124,0.2016,0.2264,0.1777,0.2443,0.06251
+871201,M,19.59,18.15,130.7,1214,0.112,0.1666,0.2508,0.1286,0.2027,0.06082,0.7364,1.048,4.792,97.07,0.004057,0.02277,0.04029,0.01303,0.01686,0.003318,26.73,26.39,174.9,2232,0.1438,0.3846,0.681,0.2247,0.3643,0.09223
+8712064,B,12.34,22.22,79.85,464.5,0.1012,0.1015,0.0537,0.02822,0.1551,0.06761,0.2949,1.656,1.955,21.55,0.01134,0.03175,0.03125,0.01135,0.01879,0.005348,13.58,28.68,87.36,553,0.1452,0.2338,0.1688,0.08194,0.2268,0.09082
+8712289,M,23.27,22.04,152.1,1686,0.08439,0.1145,0.1324,0.09702,0.1801,0.05553,0.6642,0.8561,4.603,97.85,0.00491,0.02544,0.02822,0.01623,0.01956,0.00374,28.01,28.22,184.2,2403,0.1228,0.3583,0.3948,0.2346,0.3589,0.09187
+8712291,B,14.97,19.76,95.5,690.2,0.08421,0.05352,0.01947,0.01939,0.1515,0.05266,0.184,1.065,1.286,16.64,0.003634,0.007983,0.008268,0.006432,0.01924,0.00152,15.98,25.82,102.3,782.1,0.1045,0.09995,0.0775,0.05754,0.2646,0.06085
+87127,B,10.8,9.71,68.77,357.6,0.09594,0.05736,0.02531,0.01698,0.1381,0.064,0.1728,0.4064,1.126,11.48,0.007809,0.009816,0.01099,0.005344,0.01254,0.00212,11.6,12.02,73.66,414,0.1436,0.1257,0.1047,0.04603,0.209,0.07699
+8712729,M,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,0.1893,0.05534,0.599,1.391,4.129,67.34,0.006123,0.0247,0.02626,0.01604,0.02091,0.003493,20.05,26.3,130.7,1260,0.1168,0.2119,0.2318,0.1474,0.281,0.07228
+8712766,M,17.47,24.68,116.1,984.6,0.1049,0.1603,0.2159,0.1043,0.1538,0.06365,1.088,1.41,7.337,122.3,0.006174,0.03634,0.04644,0.01569,0.01145,0.00512,23.14,32.33,155.3,1660,0.1376,0.383,0.489,0.1721,0.216,0.093
+8712853,B,14.97,16.95,96.22,685.9,0.09855,0.07885,0.02602,0.03781,0.178,0.0565,0.2713,1.217,1.893,24.28,0.00508,0.0137,0.007276,0.009073,0.0135,0.001706,16.11,23,104.6,793.7,0.1216,0.1637,0.06648,0.08485,0.2404,0.06428
+87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,0.05955,0.236,0.6656,1.67,17.43,0.008045,0.0118,0.01683,0.01241,0.01924,0.002248,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
+87163,M,13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,0.4697,1.147,3.142,43.4,0.006003,0.01063,0.02151,0.009443,0.0152,0.001868,17.98,29.87,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371
+87164,M,15.46,11.89,102.5,736.9,0.1257,0.1555,0.2032,0.1097,0.1966,0.07069,0.4209,0.6583,2.805,44.64,0.005393,0.02321,0.04303,0.0132,0.01792,0.004168,18.79,17.04,125,1102,0.1531,0.3583,0.583,0.1827,0.3216,0.101
+871641,B,11.08,14.71,70.21,372.7,0.1006,0.05743,0.02363,0.02583,0.1566,0.06669,0.2073,1.805,1.377,19.08,0.01496,0.02121,0.01453,0.01583,0.03082,0.004785,11.35,16.82,72.01,396.5,0.1216,0.0824,0.03938,0.04306,0.1902,0.07313
+871642,B,10.66,15.15,67.49,349.6,0.08792,0.04302,0,0,0.1928,0.05975,0.3309,1.925,2.155,21.98,0.008713,0.01017,0,0,0.03265,0.001002,11.54,19.2,73.2,408.3,0.1076,0.06791,0,0,0.271,0.06164
+872113,B,8.671,14.45,54.42,227.2,0.09138,0.04276,0,0,0.1722,0.06724,0.2204,0.7873,1.435,11.36,0.009172,0.008007,0,0,0.02711,0.003399,9.262,17.04,58.36,259.2,0.1162,0.07057,0,0,0.2592,0.07848
+872608,B,9.904,18.06,64.6,302.4,0.09699,0.1294,0.1307,0.03716,0.1669,0.08116,0.4311,2.261,3.132,27.48,0.01286,0.08808,0.1197,0.0246,0.0388,0.01792,11.26,24.39,73.07,390.2,0.1301,0.295,0.3486,0.0991,0.2614,0.1162
+87281702,M,16.46,20.11,109.3,832.9,0.09831,0.1556,0.1793,0.08866,0.1794,0.06323,0.3037,1.284,2.482,31.59,0.006627,0.04094,0.05371,0.01813,0.01682,0.004584,17.79,28.45,123.5,981.2,0.1415,0.4667,0.5862,0.2035,0.3054,0.09519
+873357,B,13.01,22.22,82.01,526.4,0.06251,0.01938,0.001595,0.001852,0.1395,0.05234,0.1731,1.142,1.101,14.34,0.003418,0.002252,0.001595,0.001852,0.01613,0.0009683,14,29.02,88.18,608.8,0.08125,0.03432,0.007977,0.009259,0.2295,0.05843
+873586,B,12.81,13.06,81.29,508.8,0.08739,0.03774,0.009193,0.0133,0.1466,0.06133,0.2889,0.9899,1.778,21.79,0.008534,0.006364,0.00618,0.007408,0.01065,0.003351,13.63,16.15,86.7,570.7,0.1162,0.05445,0.02758,0.0399,0.1783,0.07319
+873592,M,27.22,21.87,182.1,2250,0.1094,0.1914,0.2871,0.1878,0.18,0.0577,0.8361,1.481,5.82,128.7,0.004631,0.02537,0.03109,0.01241,0.01575,0.002747,33.12,32.85,220.8,3216,0.1472,0.4034,0.534,0.2688,0.2856,0.08082
+873593,M,21.09,26.57,142.7,1311,0.1141,0.2832,0.2487,0.1496,0.2395,0.07398,0.6298,0.7629,4.414,81.46,0.004253,0.04759,0.03872,0.01567,0.01798,0.005295,26.68,33.48,176.5,2089,0.1491,0.7584,0.678,0.2903,0.4098,0.1284
+873701,M,15.7,20.31,101.2,766.6,0.09597,0.08799,0.06593,0.05189,0.1618,0.05549,0.3699,1.15,2.406,40.98,0.004626,0.02263,0.01954,0.009767,0.01547,0.00243,20.11,32.82,129.3,1269,0.1414,0.3547,0.2902,0.1541,0.3437,0.08631
+873843,B,11.41,14.92,73.53,402,0.09059,0.08155,0.06181,0.02361,0.1167,0.06217,0.3344,1.108,1.902,22.77,0.007356,0.03728,0.05915,0.01712,0.02165,0.004784,12.37,17.7,79.12,467.2,0.1121,0.161,0.1648,0.06296,0.1811,0.07427
+873885,M,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,0.1727,0.06317,0.2054,0.4956,1.344,19.53,0.00329,0.01395,0.01774,0.006009,0.01172,0.002575,17.8,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772
+874158,B,10.08,15.11,63.76,317.5,0.09267,0.04695,0.001597,0.002404,0.1703,0.06048,0.4245,1.268,2.68,26.43,0.01439,0.012,0.001597,0.002404,0.02538,0.00347,11.87,21.18,75.39,437,0.1521,0.1019,0.00692,0.01042,0.2933,0.07697
+874217,M,18.31,18.58,118.6,1041,0.08588,0.08468,0.08169,0.05814,0.1621,0.05425,0.2577,0.4757,1.817,28.92,0.002866,0.009181,0.01412,0.006719,0.01069,0.001087,21.31,26.36,139.2,1410,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
+874373,B,11.71,17.19,74.68,420.3,0.09774,0.06141,0.03809,0.03239,0.1516,0.06095,0.2451,0.7655,1.742,17.86,0.006905,0.008704,0.01978,0.01185,0.01897,0.001671,13.01,21.39,84.42,521.5,0.1323,0.104,0.1521,0.1099,0.2572,0.07097
+874662,B,11.81,17.39,75.27,428.9,0.1007,0.05562,0.02353,0.01553,0.1718,0.0578,0.1859,1.926,1.011,14.47,0.007831,0.008776,0.01556,0.00624,0.03139,0.001988,12.57,26.48,79.57,489.5,0.1356,0.1,0.08803,0.04306,0.32,0.06576
+874839,B,12.3,15.9,78.83,463.7,0.0808,0.07253,0.03844,0.01654,0.1667,0.05474,0.2382,0.8355,1.687,18.32,0.005996,0.02212,0.02117,0.006433,0.02025,0.001725,13.35,19.59,86.65,546.7,0.1096,0.165,0.1423,0.04815,0.2482,0.06306
+874858,M,14.22,23.12,94.37,609.9,0.1075,0.2413,0.1981,0.06618,0.2384,0.07542,0.286,2.11,2.112,31.72,0.00797,0.1354,0.1166,0.01666,0.05113,0.01172,15.74,37.18,106.4,762.4,0.1533,0.9327,0.8488,0.1772,0.5166,0.1446
+875093,B,12.77,21.41,82.02,507.4,0.08749,0.06601,0.03112,0.02864,0.1694,0.06287,0.7311,1.748,5.118,53.65,0.004571,0.0179,0.02176,0.01757,0.03373,0.005875,13.75,23.5,89.04,579.5,0.09388,0.08978,0.05186,0.04773,0.2179,0.06871
+875099,B,9.72,18.22,60.73,288.1,0.0695,0.02344,0,0,0.1653,0.06447,0.3539,4.885,2.23,21.69,0.001713,0.006736,0,0,0.03799,0.001688,9.968,20.83,62.25,303.8,0.07117,0.02729,0,0,0.1909,0.06559
+875263,M,12.34,26.86,81.15,477.4,0.1034,0.1353,0.1085,0.04562,0.1943,0.06937,0.4053,1.809,2.642,34.44,0.009098,0.03845,0.03763,0.01321,0.01878,0.005672,15.65,39.34,101.7,768.9,0.1785,0.4706,0.4425,0.1459,0.3215,0.1205
+87556202,M,14.86,23.21,100.4,671.4,0.1044,0.198,0.1697,0.08878,0.1737,0.06672,0.2796,0.9622,3.591,25.2,0.008081,0.05122,0.05551,0.01883,0.02545,0.004312,16.08,27.78,118.6,784.7,0.1316,0.4648,0.4589,0.1727,0.3,0.08701
+875878,B,12.91,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,0.1829,0.05667,0.1942,0.9086,1.493,15.75,0.005298,0.01587,0.02321,0.00842,0.01853,0.002152,13.88,22,90.81,600.6,0.1097,0.1506,0.1764,0.08235,0.3024,0.06949
+875938,M,13.77,22.29,90.63,588.9,0.12,0.1267,0.1385,0.06526,0.1834,0.06877,0.6191,2.112,4.906,49.7,0.0138,0.03348,0.04665,0.0206,0.02689,0.004306,16.39,34.01,111.6,806.9,0.1737,0.3122,0.3809,0.1673,0.308,0.09333
+877159,M,18.08,21.84,117.4,1024,0.07371,0.08642,0.1103,0.05778,0.177,0.0534,0.6362,1.305,4.312,76.36,0.00553,0.05296,0.0611,0.01444,0.0214,0.005036,19.76,24.7,129.1,1228,0.08822,0.1963,0.2535,0.09181,0.2369,0.06558
+877486,M,19.18,22.49,127.5,1148,0.08523,0.1428,0.1114,0.06772,0.1767,0.05529,0.4357,1.073,3.833,54.22,0.005524,0.03698,0.02706,0.01221,0.01415,0.003397,23.36,32.06,166.4,1688,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221
+877500,M,14.45,20.22,94.49,642.7,0.09872,0.1206,0.118,0.0598,0.195,0.06466,0.2092,0.6509,1.446,19.42,0.004044,0.01597,0.02,0.007303,0.01522,0.001976,18.33,30.12,117.9,1044,0.1552,0.4056,0.4967,0.1838,0.4753,0.1013
+877501,B,12.23,19.56,78.54,461,0.09586,0.08087,0.04187,0.04107,0.1979,0.06013,0.3534,1.326,2.308,27.24,0.007514,0.01779,0.01401,0.0114,0.01503,0.003338,14.44,28.36,92.15,638.4,0.1429,0.2042,0.1377,0.108,0.2668,0.08174
+877989,M,17.54,19.32,115.1,951.6,0.08968,0.1198,0.1036,0.07488,0.1506,0.05491,0.3971,0.8282,3.088,40.73,0.00609,0.02569,0.02713,0.01345,0.01594,0.002658,20.42,25.84,139.5,1239,0.1381,0.342,0.3508,0.1939,0.2928,0.07867
+878796,M,23.29,26.67,158.9,1685,0.1141,0.2084,0.3523,0.162,0.22,0.06229,0.5539,1.56,4.667,83.16,0.009327,0.05121,0.08958,0.02465,0.02175,0.005195,25.12,32.68,177,1986,0.1536,0.4167,0.7892,0.2733,0.3198,0.08762
+87880,M,13.81,23.75,91.56,597.8,0.1323,0.1768,0.1558,0.09176,0.2251,0.07421,0.5648,1.93,3.909,52.72,0.008824,0.03108,0.03112,0.01291,0.01998,0.004506,19.2,41.85,128.5,1153,0.2226,0.5209,0.4646,0.2013,0.4432,0.1086
+87930,B,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,0.3961,1.044,2.497,30.29,0.006953,0.01911,0.02701,0.01037,0.01782,0.003586,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875
+879523,M,15.12,16.68,98.78,716.6,0.08876,0.09588,0.0755,0.04079,0.1594,0.05986,0.2711,0.3621,1.974,26.44,0.005472,0.01919,0.02039,0.00826,0.01523,0.002881,17.77,20.24,117.7,989.5,0.1491,0.3331,0.3327,0.1252,0.3415,0.0974
+879804,B,9.876,17.27,62.92,295.4,0.1089,0.07232,0.01756,0.01952,0.1934,0.06285,0.2137,1.342,1.517,12.33,0.009719,0.01249,0.007975,0.007527,0.0221,0.002472,10.42,23.22,67.08,331.6,0.1415,0.1247,0.06213,0.05588,0.2989,0.0738
+879830,M,17.01,20.26,109.7,904.3,0.08772,0.07304,0.0695,0.0539,0.2026,0.05223,0.5858,0.8554,4.106,68.46,0.005038,0.01503,0.01946,0.01123,0.02294,0.002581,19.8,25.05,130,1210,0.1111,0.1486,0.1932,0.1096,0.3275,0.06469
+8810158,B,13.11,22.54,87.02,529.4,0.1002,0.1483,0.08705,0.05102,0.185,0.0731,0.1931,0.9223,1.491,15.09,0.005251,0.03041,0.02526,0.008304,0.02514,0.004198,14.55,29.16,99.48,639.3,0.1349,0.4402,0.3162,0.1126,0.4128,0.1076
+8810436,B,15.27,12.91,98.17,725.5,0.08182,0.0623,0.05892,0.03157,0.1359,0.05526,0.2134,0.3628,1.525,20,0.004291,0.01236,0.01841,0.007373,0.009539,0.001656,17.38,15.92,113.7,932.7,0.1222,0.2186,0.2962,0.1035,0.232,0.07474
+881046502,M,20.58,22.14,134.7,1290,0.0909,0.1348,0.164,0.09561,0.1765,0.05024,0.8601,1.48,7.029,111.7,0.008124,0.03611,0.05489,0.02765,0.03176,0.002365,23.24,27.84,158.3,1656,0.1178,0.292,0.3861,0.192,0.2909,0.05865
+8810528,B,11.84,18.94,75.51,428,0.08871,0.069,0.02669,0.01393,0.1533,0.06057,0.2222,0.8652,1.444,17.12,0.005517,0.01727,0.02045,0.006747,0.01616,0.002922,13.3,24.99,85.22,546.3,0.128,0.188,0.1471,0.06913,0.2535,0.07993
+8810703,M,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525,2.873,1.476,21.98,525.6,0.01345,0.02772,0.06389,0.01407,0.04783,0.004476,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525
+881094802,M,17.42,25.56,114.5,948,0.1006,0.1146,0.1682,0.06597,0.1308,0.05866,0.5296,1.667,3.767,58.53,0.03113,0.08555,0.1438,0.03927,0.02175,0.01256,18.07,28.07,120.4,1021,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818
+8810955,M,14.19,23.81,92.87,610.7,0.09463,0.1306,0.1115,0.06462,0.2235,0.06433,0.4207,1.845,3.534,31,0.01088,0.0371,0.03688,0.01627,0.04499,0.004768,16.86,34.85,115,811.3,0.1559,0.4059,0.3744,0.1772,0.4724,0.1026
+8810987,M,13.86,16.93,90.96,578.9,0.1026,0.1517,0.09901,0.05602,0.2106,0.06916,0.2563,1.194,1.933,22.69,0.00596,0.03438,0.03909,0.01435,0.01939,0.00456,15.75,26.93,104.4,750.1,0.146,0.437,0.4636,0.1654,0.363,0.1059
+8811523,B,11.89,18.35,77.32,432.2,0.09363,0.1154,0.06636,0.03142,0.1967,0.06314,0.2963,1.563,2.087,21.46,0.008872,0.04192,0.05946,0.01785,0.02793,0.004775,13.25,27.1,86.2,531.2,0.1405,0.3046,0.2806,0.1138,0.3397,0.08365
+8811779,B,10.2,17.48,65.05,321.2,0.08054,0.05907,0.05774,0.01071,0.1964,0.06315,0.3567,1.922,2.747,22.79,0.00468,0.0312,0.05774,0.01071,0.0256,0.004613,11.48,24.47,75.4,403.7,0.09527,0.1397,0.1925,0.03571,0.2868,0.07809
+8811842,M,19.8,21.56,129.7,1230,0.09383,0.1306,0.1272,0.08691,0.2094,0.05581,0.9553,1.186,6.487,124.4,0.006804,0.03169,0.03446,0.01712,0.01897,0.004045,25.73,28.64,170.3,2009,0.1353,0.3235,0.3617,0.182,0.307,0.08255
+88119002,M,19.53,32.47,128,1223,0.0842,0.113,0.1145,0.06637,0.1428,0.05313,0.7392,1.321,4.722,109.9,0.005539,0.02644,0.02664,0.01078,0.01332,0.002256,27.9,45.41,180.2,2477,0.1408,0.4097,0.3995,0.1625,0.2713,0.07568
+8812816,B,13.65,13.16,87.88,568.9,0.09646,0.08711,0.03888,0.02563,0.136,0.06344,0.2102,0.4336,1.391,17.4,0.004133,0.01695,0.01652,0.006659,0.01371,0.002735,15.34,16.35,99.71,706.2,0.1311,0.2474,0.1759,0.08056,0.238,0.08718
+8812818,B,13.56,13.9,88.59,561.3,0.1051,0.1192,0.0786,0.04451,0.1962,0.06303,0.2569,0.4981,2.011,21.03,0.005851,0.02314,0.02544,0.00836,0.01842,0.002918,14.98,17.13,101.1,686.6,0.1376,0.2698,0.2577,0.0909,0.3065,0.08177
+8812844,B,10.18,17.53,65.12,313.1,0.1061,0.08502,0.01768,0.01915,0.191,0.06908,0.2467,1.217,1.641,15.05,0.007899,0.014,0.008534,0.007624,0.02637,0.003761,11.17,22.84,71.94,375.6,0.1406,0.144,0.06572,0.05575,0.3055,0.08797
+8812877,M,15.75,20.25,102.6,761.3,0.1025,0.1204,0.1147,0.06462,0.1935,0.06303,0.3473,0.9209,2.244,32.19,0.004766,0.02374,0.02384,0.008637,0.01772,0.003131,19.56,30.29,125.9,1088,0.1552,0.448,0.3976,0.1479,0.3993,0.1064
+8813129,B,13.27,17.02,84.55,546.4,0.08445,0.04994,0.03554,0.02456,0.1496,0.05674,0.2927,0.8907,2.044,24.68,0.006032,0.01104,0.02259,0.009057,0.01482,0.002496,15.14,23.6,98.84,708.8,0.1276,0.1311,0.1786,0.09678,0.2506,0.07623
+88143502,B,14.34,13.47,92.51,641.2,0.09906,0.07624,0.05724,0.04603,0.2075,0.05448,0.522,0.8121,3.763,48.29,0.007089,0.01428,0.0236,0.01286,0.02266,0.001463,16.77,16.9,110.4,873.2,0.1297,0.1525,0.1632,0.1087,0.3062,0.06072
+88147101,B,10.44,15.46,66.62,329.6,0.1053,0.07722,0.006643,0.01216,0.1788,0.0645,0.1913,0.9027,1.208,11.86,0.006513,0.008061,0.002817,0.004972,0.01502,0.002821,11.52,19.8,73.47,395.4,0.1341,0.1153,0.02639,0.04464,0.2615,0.08269
+88147102,B,15,15.51,97.45,684.5,0.08371,0.1096,0.06505,0.0378,0.1881,0.05907,0.2318,0.4966,2.276,19.88,0.004119,0.03207,0.03644,0.01155,0.01391,0.003204,16.41,19.31,114.2,808.2,0.1136,0.3627,0.3402,0.1379,0.2954,0.08362
+88147202,B,12.62,23.97,81.35,496.4,0.07903,0.07529,0.05438,0.02036,0.1514,0.06019,0.2449,1.066,1.445,18.51,0.005169,0.02294,0.03016,0.008691,0.01365,0.003407,14.2,31.31,90.67,624,0.1227,0.3454,0.3911,0.118,0.2826,0.09585
+881861,M,12.83,22.33,85.26,503.2,0.1088,0.1799,0.1695,0.06861,0.2123,0.07254,0.3061,1.069,2.257,25.13,0.006983,0.03858,0.04683,0.01499,0.0168,0.005617,15.2,30.15,105.3,706,0.1777,0.5343,0.6282,0.1977,0.3407,0.1243
+881972,M,17.05,19.08,113.4,895,0.1141,0.1572,0.191,0.109,0.2131,0.06325,0.2959,0.679,2.153,31.98,0.005532,0.02008,0.03055,0.01384,0.01177,0.002336,19.59,24.89,133.5,1189,0.1703,0.3934,0.5018,0.2543,0.3109,0.09061
+88199202,B,11.32,27.08,71.76,395.7,0.06883,0.03813,0.01633,0.003125,0.1869,0.05628,0.121,0.8927,1.059,8.605,0.003653,0.01647,0.01633,0.003125,0.01537,0.002052,12.08,33.75,79.82,452.3,0.09203,0.1432,0.1089,0.02083,0.2849,0.07087
+88203002,B,11.22,33.81,70.79,386.8,0.0778,0.03574,0.004967,0.006434,0.1845,0.05828,0.2239,1.647,1.489,15.46,0.004359,0.006813,0.003223,0.003419,0.01916,0.002534,12.36,41.78,78.44,470.9,0.09994,0.06885,0.02318,0.03002,0.2911,0.07307
+88206102,M,20.51,27.81,134.4,1319,0.09159,0.1074,0.1554,0.0834,0.1448,0.05592,0.524,1.189,3.767,70.01,0.00502,0.02062,0.03457,0.01091,0.01298,0.002887,24.47,37.38,162.7,1872,0.1223,0.2761,0.4146,0.1563,0.2437,0.08328
+882488,B,9.567,15.91,60.21,279.6,0.08464,0.04087,0.01652,0.01667,0.1551,0.06403,0.2152,0.8301,1.215,12.64,0.01164,0.0104,0.01186,0.009623,0.02383,0.00354,10.51,19.16,65.74,335.9,0.1504,0.09515,0.07161,0.07222,0.2757,0.08178
+88249602,B,14.03,21.25,89.79,603.4,0.0907,0.06945,0.01462,0.01896,0.1517,0.05835,0.2589,1.503,1.667,22.07,0.007389,0.01383,0.007302,0.01004,0.01263,0.002925,15.33,30.28,98.27,715.5,0.1287,0.1513,0.06231,0.07963,0.2226,0.07617
+88299702,M,23.21,26.97,153.5,1670,0.09509,0.1682,0.195,0.1237,0.1909,0.06309,1.058,0.9635,7.247,155.8,0.006428,0.02863,0.04497,0.01716,0.0159,0.003053,31.01,34.51,206,2944,0.1481,0.4126,0.582,0.2593,0.3103,0.08677
+883263,M,20.48,21.46,132.5,1306,0.08355,0.08348,0.09042,0.06022,0.1467,0.05177,0.6874,1.041,5.144,83.5,0.007959,0.03133,0.04257,0.01671,0.01341,0.003933,24.22,26.17,161.7,1750,0.1228,0.2311,0.3158,0.1445,0.2238,0.07127
+883270,B,14.22,27.85,92.55,623.9,0.08223,0.1039,0.1103,0.04408,0.1342,0.06129,0.3354,2.324,2.105,29.96,0.006307,0.02845,0.0385,0.01011,0.01185,0.003589,15.75,40.54,102.5,764,0.1081,0.2426,0.3064,0.08219,0.189,0.07796
+88330202,M,17.46,39.28,113.4,920.6,0.09812,0.1298,0.1417,0.08811,0.1809,0.05966,0.5366,0.8561,3.002,49,0.00486,0.02785,0.02602,0.01374,0.01226,0.002759,22.51,44.87,141.2,1408,0.1365,0.3735,0.3241,0.2066,0.2853,0.08496
+88350402,B,13.64,15.6,87.38,575.3,0.09423,0.0663,0.04705,0.03731,0.1717,0.0566,0.3242,0.6612,1.996,27.19,0.00647,0.01248,0.0181,0.01103,0.01898,0.001794,14.85,19.05,94.11,683.4,0.1278,0.1291,0.1533,0.09222,0.253,0.0651
+883539,B,12.42,15.04,78.61,476.5,0.07926,0.03393,0.01053,0.01108,0.1546,0.05754,0.1153,0.6745,0.757,9.006,0.003265,0.00493,0.006493,0.003762,0.0172,0.00136,13.2,20.37,83.85,543.4,0.1037,0.07776,0.06243,0.04052,0.2901,0.06783
+883852,B,11.3,18.19,73.93,389.4,0.09592,0.1325,0.1548,0.02854,0.2054,0.07669,0.2428,1.642,2.369,16.39,0.006663,0.05914,0.0888,0.01314,0.01995,0.008675,12.58,27.96,87.16,472.9,0.1347,0.4848,0.7436,0.1218,0.3308,0.1297
+88411702,B,13.75,23.77,88.54,590,0.08043,0.06807,0.04697,0.02344,0.1773,0.05429,0.4347,1.057,2.829,39.93,0.004351,0.02667,0.03371,0.01007,0.02598,0.003087,15.01,26.34,98,706,0.09368,0.1442,0.1359,0.06106,0.2663,0.06321
+884180,M,19.4,23.5,129.1,1155,0.1027,0.1558,0.2049,0.08886,0.1978,0.06,0.5243,1.802,4.037,60.41,0.01061,0.03252,0.03915,0.01559,0.02186,0.003949,21.65,30.53,144.9,1417,0.1463,0.2968,0.3458,0.1564,0.292,0.07614
+884437,B,10.48,19.86,66.72,337.7,0.107,0.05971,0.04831,0.0307,0.1737,0.0644,0.3719,2.612,2.517,23.22,0.01604,0.01386,0.01865,0.01133,0.03476,0.00356,11.48,29.46,73.68,402.8,0.1515,0.1026,0.1181,0.06736,0.2883,0.07748
+884448,B,13.2,17.43,84.13,541.6,0.07215,0.04524,0.04336,0.01105,0.1487,0.05635,0.163,1.601,0.873,13.56,0.006261,0.01569,0.03079,0.005383,0.01962,0.00225,13.94,27.82,88.28,602,0.1101,0.1508,0.2298,0.0497,0.2767,0.07198
+884626,B,12.89,14.11,84.95,512.2,0.0876,0.1346,0.1374,0.0398,0.1596,0.06409,0.2025,0.4402,2.393,16.35,0.005501,0.05592,0.08158,0.0137,0.01266,0.007555,14.39,17.7,105,639.1,0.1254,0.5849,0.7727,0.1561,0.2639,0.1178
+88466802,B,10.65,25.22,68.01,347,0.09657,0.07234,0.02379,0.01615,0.1897,0.06329,0.2497,1.493,1.497,16.64,0.007189,0.01035,0.01081,0.006245,0.02158,0.002619,12.25,35.19,77.98,455.7,0.1499,0.1398,0.1125,0.06136,0.3409,0.08147
+884689,B,11.52,14.93,73.87,406.3,0.1013,0.07808,0.04328,0.02929,0.1883,0.06168,0.2562,1.038,1.686,18.62,0.006662,0.01228,0.02105,0.01006,0.01677,0.002784,12.65,21.19,80.88,491.8,0.1389,0.1582,0.1804,0.09608,0.2664,0.07809
+884948,M,20.94,23.56,138.9,1364,0.1007,0.1606,0.2712,0.131,0.2205,0.05898,1.004,0.8208,6.372,137.9,0.005283,0.03908,0.09518,0.01864,0.02401,0.005002,25.58,27,165.3,2010,0.1211,0.3172,0.6991,0.2105,0.3126,0.07849
+88518501,B,11.5,18.45,73.28,407.4,0.09345,0.05991,0.02638,0.02069,0.1834,0.05934,0.3927,0.8429,2.684,26.99,0.00638,0.01065,0.01245,0.009175,0.02292,0.001461,12.97,22.46,83.12,508.9,0.1183,0.1049,0.08105,0.06544,0.274,0.06487
+885429,M,19.73,19.82,130.7,1206,0.1062,0.1849,0.2417,0.0974,0.1733,0.06697,0.7661,0.78,4.115,92.81,0.008482,0.05057,0.068,0.01971,0.01467,0.007259,25.28,25.59,159.8,1933,0.171,0.5955,0.8489,0.2507,0.2749,0.1297
+8860702,M,17.3,17.08,113,928.2,0.1008,0.1041,0.1266,0.08353,0.1813,0.05613,0.3093,0.8568,2.193,33.63,0.004757,0.01503,0.02332,0.01262,0.01394,0.002362,19.85,25.09,130.9,1222,0.1416,0.2405,0.3378,0.1857,0.3138,0.08113
+886226,M,19.45,19.33,126.5,1169,0.1035,0.1188,0.1379,0.08591,0.1776,0.05647,0.5959,0.6342,3.797,71,0.004649,0.018,0.02749,0.01267,0.01365,0.00255,25.7,24.57,163.1,1972,0.1497,0.3161,0.4317,0.1999,0.3379,0.0895
+886452,M,13.96,17.05,91.43,602.4,0.1096,0.1279,0.09789,0.05246,0.1908,0.0613,0.425,0.8098,2.563,35.74,0.006351,0.02679,0.03119,0.01342,0.02062,0.002695,16.39,22.07,108.1,826,0.1512,0.3262,0.3209,0.1374,0.3068,0.07957
+88649001,M,19.55,28.77,133.6,1207,0.0926,0.2063,0.1784,0.1144,0.1893,0.06232,0.8426,1.199,7.158,106.4,0.006356,0.04765,0.03863,0.01519,0.01936,0.005252,25.05,36.27,178.6,1926,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005
+886776,M,15.32,17.27,103.2,713.3,0.1335,0.2284,0.2448,0.1242,0.2398,0.07596,0.6592,1.059,4.061,59.46,0.01015,0.04588,0.04983,0.02127,0.01884,0.00866,17.73,22.66,119.8,928.8,0.1765,0.4503,0.4429,0.2229,0.3258,0.1191
+887181,M,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,0.2495,0.08104,1.292,2.454,10.12,138.5,0.01236,0.05995,0.08232,0.03024,0.02337,0.006042,19.85,31.64,143.7,1226,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019
+88725602,M,15.53,33.56,103.7,744.9,0.1063,0.1639,0.1751,0.08399,0.2091,0.0665,0.2419,1.278,1.903,23.02,0.005345,0.02556,0.02889,0.01022,0.009947,0.003359,18.49,49.54,126.3,1035,0.1883,0.5564,0.5703,0.2014,0.3512,0.1204
+887549,M,20.31,27.06,132.9,1288,0.1,0.1088,0.1519,0.09333,0.1814,0.05572,0.3977,1.033,2.587,52.34,0.005043,0.01578,0.02117,0.008185,0.01282,0.001892,24.33,39.16,162.3,1844,0.1522,0.2945,0.3788,0.1697,0.3151,0.07999
+888264,M,17.35,23.06,111,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,0.4007,1.317,2.577,44.41,0.005726,0.01106,0.01246,0.007671,0.01411,0.001578,19.85,31.47,128.2,1218,0.124,0.1486,0.1211,0.08235,0.2452,0.06515
+888570,M,17.29,22.13,114.4,947.8,0.08999,0.1273,0.09697,0.07507,0.2108,0.05464,0.8348,1.633,6.146,90.94,0.006717,0.05981,0.04638,0.02149,0.02747,0.005838,20.39,27.24,137.9,1295,0.1134,0.2867,0.2298,0.1528,0.3067,0.07484
+889403,M,15.61,19.38,100,758.6,0.0784,0.05616,0.04209,0.02847,0.1547,0.05443,0.2298,0.9988,1.534,22.18,0.002826,0.009105,0.01311,0.005174,0.01013,0.001345,17.91,31.67,115.9,988.6,0.1084,0.1807,0.226,0.08568,0.2683,0.06829
+889719,M,17.19,22.07,111.6,928.3,0.09726,0.08995,0.09061,0.06527,0.1867,0.0558,0.4203,0.7383,2.819,45.42,0.004493,0.01206,0.02048,0.009875,0.01144,0.001575,21.58,29.33,140.5,1436,0.1558,0.2567,0.3889,0.1984,0.3216,0.0757
+88995002,M,20.73,31.12,135.7,1419,0.09469,0.1143,0.1367,0.08646,0.1769,0.05674,1.172,1.617,7.749,199.7,0.004551,0.01478,0.02143,0.00928,0.01367,0.002299,32.49,47.16,214,3432,0.1401,0.2644,0.3442,0.1659,0.2868,0.08218
+8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,0.06491,0.4505,1.197,3.43,27.1,0.00747,0.03581,0.03354,0.01365,0.03504,0.003318,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
+8910499,B,13.59,21.84,87.16,561,0.07956,0.08259,0.04072,0.02142,0.1635,0.05859,0.338,1.916,2.591,26.76,0.005436,0.02406,0.03099,0.009919,0.0203,0.003009,14.8,30.04,97.66,661.5,0.1005,0.173,0.1453,0.06189,0.2446,0.07024
+8910506,B,12.87,16.21,82.38,512.2,0.09425,0.06219,0.039,0.01615,0.201,0.05769,0.2345,1.219,1.546,18.24,0.005518,0.02178,0.02589,0.00633,0.02593,0.002157,13.9,23.64,89.27,597.5,0.1256,0.1808,0.1992,0.0578,0.3604,0.07062
+8910720,B,10.71,20.39,69.5,344.9,0.1082,0.1289,0.08448,0.02867,0.1668,0.06862,0.3198,1.489,2.23,20.74,0.008902,0.04785,0.07339,0.01745,0.02728,0.00761,11.69,25.21,76.51,410.4,0.1335,0.255,0.2534,0.086,0.2605,0.08701
+8910721,B,14.29,16.82,90.3,632.6,0.06429,0.02675,0.00725,0.00625,0.1508,0.05376,0.1302,0.7198,0.8439,10.77,0.003492,0.00371,0.004826,0.003608,0.01536,0.001381,14.91,20.65,94.44,684.6,0.08567,0.05036,0.03866,0.03333,0.2458,0.0612
+8910748,B,11.29,13.04,72.23,388,0.09834,0.07608,0.03265,0.02755,0.1769,0.0627,0.1904,0.5293,1.164,13.17,0.006472,0.01122,0.01282,0.008849,0.01692,0.002817,12.32,16.18,78.27,457.5,0.1358,0.1507,0.1275,0.0875,0.2733,0.08022
+8910988,M,21.75,20.99,147.3,1491,0.09401,0.1961,0.2195,0.1088,0.1721,0.06194,1.167,1.352,8.867,156.8,0.005687,0.0496,0.06329,0.01561,0.01924,0.004614,28.19,28.18,195.9,2384,0.1272,0.4725,0.5807,0.1841,0.2833,0.08858
+8910996,B,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,0.06312,0.2684,1.409,1.75,16.39,0.0138,0.01067,0.008347,0.009472,0.01798,0.004261,10.75,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175
+8911163,M,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,0.4212,1.433,2.765,45.81,0.005444,0.01169,0.01622,0.008522,0.01419,0.002751,20.92,34.69,135.1,1320,0.1315,0.1806,0.208,0.1136,0.2504,0.07948
+8911164,B,11.89,17.36,76.2,435.6,0.1225,0.0721,0.05929,0.07404,0.2015,0.05875,0.6412,2.293,4.021,48.84,0.01418,0.01489,0.01267,0.0191,0.02678,0.003002,12.4,18.99,79.46,472.4,0.1359,0.08368,0.07153,0.08946,0.222,0.06033
+8911230,B,11.33,14.16,71.79,396.6,0.09379,0.03872,0.001487,0.003333,0.1954,0.05821,0.2375,1.28,1.565,17.09,0.008426,0.008998,0.001487,0.003333,0.02358,0.001627,12.2,18.99,77.37,458,0.1259,0.07348,0.004955,0.01111,0.2758,0.06386
+8911670,M,18.81,19.98,120.9,1102,0.08923,0.05884,0.0802,0.05843,0.155,0.04996,0.3283,0.828,2.363,36.74,0.007571,0.01114,0.02623,0.01463,0.0193,0.001676,19.96,24.3,129,1236,0.1243,0.116,0.221,0.1294,0.2567,0.05737
+8911800,B,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,0.258,1.166,1.683,22.22,0.003741,0.005274,0.01065,0.005044,0.01344,0.001126,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
+8911834,B,13.85,15.18,88.99,587.4,0.09516,0.07688,0.04479,0.03711,0.211,0.05853,0.2479,0.9195,1.83,19.41,0.004235,0.01541,0.01457,0.01043,0.01528,0.001593,14.98,21.74,98.37,670,0.1185,0.1724,0.1456,0.09993,0.2955,0.06912
+8912049,M,19.16,26.6,126.2,1138,0.102,0.1453,0.1921,0.09664,0.1902,0.0622,0.6361,1.001,4.321,69.65,0.007392,0.02449,0.03988,0.01293,0.01435,0.003446,23.72,35.9,159.8,1724,0.1782,0.3841,0.5754,0.1872,0.3258,0.0972
+8912055,B,11.74,14.02,74.24,427.3,0.07813,0.0434,0.02245,0.02763,0.2101,0.06113,0.5619,1.268,3.717,37.83,0.008034,0.01442,0.01514,0.01846,0.02921,0.002005,13.31,18.26,84.7,533.7,0.1036,0.085,0.06735,0.0829,0.3101,0.06688
+89122,M,19.4,18.18,127.2,1145,0.1037,0.1442,0.1626,0.09464,0.1893,0.05892,0.4709,0.9951,2.903,53.16,0.005654,0.02199,0.03059,0.01499,0.01623,0.001965,23.79,28.65,152.4,1628,0.1518,0.3749,0.4316,0.2252,0.359,0.07787
+8912280,M,16.24,18.77,108.8,805.1,0.1066,0.1802,0.1948,0.09052,0.1876,0.06684,0.2873,0.9173,2.464,28.09,0.004563,0.03481,0.03872,0.01209,0.01388,0.004081,18.55,25.09,126.9,1031,0.1365,0.4706,0.5026,0.1732,0.277,0.1063
+8912284,B,12.89,15.7,84.08,516.6,0.07818,0.0958,0.1115,0.0339,0.1432,0.05935,0.2913,1.389,2.347,23.29,0.006418,0.03961,0.07927,0.01774,0.01878,0.003696,13.9,19.69,92.12,595.6,0.09926,0.2317,0.3344,0.1017,0.1999,0.07127
+8912521,B,12.58,18.4,79.83,489,0.08393,0.04216,0.00186,0.002924,0.1697,0.05855,0.2719,1.35,1.721,22.45,0.006383,0.008008,0.00186,0.002924,0.02571,0.002015,13.5,23.08,85.56,564.1,0.1038,0.06624,0.005579,0.008772,0.2505,0.06431
+8912909,B,11.94,20.76,77.87,441,0.08605,0.1011,0.06574,0.03791,0.1588,0.06766,0.2742,1.39,3.198,21.91,0.006719,0.05156,0.04387,0.01633,0.01872,0.008015,13.24,27.29,92.2,546.1,0.1116,0.2813,0.2365,0.1155,0.2465,0.09981
+8913,B,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,0.1532,0.469,1.115,12.68,0.004731,0.01345,0.01652,0.005905,0.01619,0.002081,13.62,15.54,87.4,577,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915
+8913049,B,11.26,19.96,73.72,394.1,0.0802,0.1181,0.09274,0.05588,0.2595,0.06233,0.4866,1.905,2.877,34.68,0.01574,0.08262,0.08099,0.03487,0.03418,0.006517,11.86,22.33,78.27,437.6,0.1028,0.1843,0.1546,0.09314,0.2955,0.07009
+89143601,B,11.37,18.89,72.17,396,0.08713,0.05008,0.02399,0.02173,0.2013,0.05955,0.2656,1.974,1.954,17.49,0.006538,0.01395,0.01376,0.009924,0.03416,0.002928,12.36,26.14,79.29,459.3,0.1118,0.09708,0.07529,0.06203,0.3267,0.06994
+89143602,B,14.41,19.73,96.03,651,0.08757,0.1676,0.1362,0.06602,0.1714,0.07192,0.8811,1.77,4.36,77.11,0.007762,0.1064,0.0996,0.02771,0.04077,0.02286,15.77,22.13,101.7,767.3,0.09983,0.2472,0.222,0.1021,0.2272,0.08799
+8915,B,14.96,19.1,97.03,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,0.2877,0.948,2.171,24.87,0.005332,0.02115,0.01536,0.01187,0.01522,0.002815,16.25,26.19,109.1,809.8,0.1313,0.303,0.1804,0.1489,0.2962,0.08472
+891670,B,12.95,16.02,83.14,513.7,0.1005,0.07943,0.06155,0.0337,0.173,0.0647,0.2094,0.7636,1.231,17.67,0.008725,0.02003,0.02335,0.01132,0.02625,0.004726,13.74,19.93,88.81,585.4,0.1483,0.2068,0.2241,0.1056,0.338,0.09584
+891703,B,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,0.207,1.238,1.234,13.88,0.007595,0.015,0.01412,0.008578,0.01792,0.001784,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
+891716,B,12.72,13.78,81.78,492.1,0.09667,0.08393,0.01288,0.01924,0.1638,0.061,0.1807,0.6931,1.34,13.38,0.006064,0.0118,0.006564,0.007978,0.01374,0.001392,13.5,17.48,88.54,553.7,0.1298,0.1472,0.05233,0.06343,0.2369,0.06922
+891923,B,13.77,13.27,88.06,582.7,0.09198,0.06221,0.01063,0.01917,0.1592,0.05912,0.2191,0.6946,1.479,17.74,0.004348,0.008153,0.004272,0.006829,0.02154,0.001802,14.67,16.93,94.17,661.1,0.117,0.1072,0.03732,0.05802,0.2823,0.06794
+891936,B,10.91,12.35,69.14,363.7,0.08518,0.04721,0.01236,0.01369,0.1449,0.06031,0.1753,1.027,1.267,11.09,0.003478,0.01221,0.01072,0.009393,0.02941,0.003428,11.37,14.82,72.42,392.2,0.09312,0.07506,0.02884,0.03194,0.2143,0.06643
+892189,M,11.76,18.14,75,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,0.06287,0.645,2.105,4.138,49.11,0.005596,0.01005,0.01272,0.01432,0.01575,0.002758,13.36,23.39,85.1,553.6,0.1137,0.07974,0.0612,0.0716,0.1978,0.06915
+892214,B,14.26,18.17,91.22,633.1,0.06576,0.0522,0.02475,0.01374,0.1635,0.05586,0.23,0.669,1.661,20.56,0.003169,0.01377,0.01079,0.005243,0.01103,0.001957,16.22,25.26,105.8,819.7,0.09445,0.2167,0.1565,0.0753,0.2636,0.07676
+892399,B,10.51,23.09,66.85,334.2,0.1015,0.06797,0.02495,0.01875,0.1695,0.06556,0.2868,1.143,2.289,20.56,0.01017,0.01443,0.01861,0.0125,0.03464,0.001971,10.93,24.22,70.1,362.7,0.1143,0.08614,0.04158,0.03125,0.2227,0.06777
+892438,M,19.53,18.9,129.5,1217,0.115,0.1642,0.2197,0.1062,0.1792,0.06552,1.111,1.161,7.237,133,0.006056,0.03203,0.05638,0.01733,0.01884,0.004787,25.93,26.24,171.1,2053,0.1495,0.4116,0.6121,0.198,0.2968,0.09929
+892604,B,12.46,19.89,80.43,471.3,0.08451,0.1014,0.0683,0.03099,0.1781,0.06249,0.3642,1.04,2.579,28.32,0.00653,0.03369,0.04712,0.01403,0.0274,0.004651,13.46,23.07,88.13,551.3,0.105,0.2158,0.1904,0.07625,0.2685,0.07764
+89263202,M,20.09,23.86,134.7,1247,0.108,0.1838,0.2283,0.128,0.2249,0.07469,1.072,1.743,7.804,130.8,0.007964,0.04732,0.07649,0.01936,0.02736,0.005928,23.68,29.43,158.8,1696,0.1347,0.3391,0.4932,0.1923,0.3294,0.09469
+892657,B,10.49,18.61,66.86,334.3,0.1068,0.06678,0.02297,0.0178,0.1482,0.066,0.1485,1.563,1.035,10.08,0.008875,0.009362,0.01808,0.009199,0.01791,0.003317,11.06,24.54,70.76,375.4,0.1413,0.1044,0.08423,0.06528,0.2213,0.07842
+89296,B,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,0.06243,0.3278,1.059,2.475,22.93,0.006652,0.02652,0.02221,0.007807,0.01894,0.003411,12.68,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638
+893061,B,11.6,24.49,74.23,417.2,0.07474,0.05688,0.01974,0.01313,0.1935,0.05878,0.2512,1.786,1.961,18.21,0.006122,0.02337,0.01596,0.006998,0.03194,0.002211,12.44,31.62,81.39,476.5,0.09545,0.1361,0.07239,0.04815,0.3244,0.06745
+89344,B,13.2,15.82,84.07,537.3,0.08511,0.05251,0.001461,0.003261,0.1632,0.05894,0.1903,0.5735,1.204,15.5,0.003632,0.007861,0.001128,0.002386,0.01344,0.002585,14.41,20.45,92,636.9,0.1128,0.1346,0.0112,0.025,0.2651,0.08385
+89346,B,9,14.4,56.36,246.3,0.07005,0.03116,0.003681,0.003472,0.1788,0.06833,0.1746,1.305,1.144,9.789,0.007389,0.004883,0.003681,0.003472,0.02701,0.002153,9.699,20.07,60.9,285.5,0.09861,0.05232,0.01472,0.01389,0.2991,0.07804
+893526,B,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,0.2244,0.6864,1.509,20.39,0.003338,0.003746,0.00203,0.003242,0.0148,0.001566,14.97,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192
+893548,B,13.05,13.84,82.71,530.6,0.08352,0.03735,0.004559,0.008829,0.1453,0.05518,0.3975,0.8285,2.567,33.01,0.004148,0.004711,0.002831,0.004821,0.01422,0.002273,14.73,17.4,93.96,672.4,0.1016,0.05847,0.01824,0.03532,0.2107,0.0658
+893783,B,11.7,19.11,74.33,418.7,0.08814,0.05253,0.01583,0.01148,0.1936,0.06128,0.1601,1.43,1.109,11.28,0.006064,0.00911,0.01042,0.007638,0.02349,0.001661,12.61,26.55,80.92,483.1,0.1223,0.1087,0.07915,0.05741,0.3487,0.06958
+89382601,B,14.61,15.69,92.68,664.9,0.07618,0.03515,0.01447,0.01877,0.1632,0.05255,0.316,0.9115,1.954,28.9,0.005031,0.006021,0.005325,0.006324,0.01494,0.0008948,16.46,21.75,103.7,840.8,0.1011,0.07087,0.04746,0.05813,0.253,0.05695
+89382602,B,12.76,13.37,82.29,504.1,0.08794,0.07948,0.04052,0.02548,0.1601,0.0614,0.3265,0.6594,2.346,25.18,0.006494,0.02768,0.03137,0.01069,0.01731,0.004392,14.19,16.4,92.04,618.8,0.1194,0.2208,0.1769,0.08411,0.2564,0.08253
+893988,B,11.54,10.72,73.73,409.1,0.08597,0.05969,0.01367,0.008907,0.1833,0.061,0.1312,0.3602,1.107,9.438,0.004124,0.0134,0.01003,0.004667,0.02032,0.001952,12.34,12.87,81.23,467.8,0.1092,0.1626,0.08324,0.04715,0.339,0.07434
+894047,B,8.597,18.6,54.09,221.2,0.1074,0.05847,0,0,0.2163,0.07359,0.3368,2.777,2.222,17.81,0.02075,0.01403,0,0,0.06146,0.00682,8.952,22.44,56.65,240.1,0.1347,0.07767,0,0,0.3142,0.08116
+894089,B,12.49,16.85,79.19,481.6,0.08511,0.03834,0.004473,0.006423,0.1215,0.05673,0.1716,0.7151,1.047,12.69,0.004928,0.003012,0.00262,0.00339,0.01393,0.001344,13.34,19.71,84.48,544.2,0.1104,0.04953,0.01938,0.02784,0.1917,0.06174
+894090,B,12.18,14.08,77.25,461.4,0.07734,0.03212,0.01123,0.005051,0.1673,0.05649,0.2113,0.5996,1.438,15.82,0.005343,0.005767,0.01123,0.005051,0.01977,0.0009502,12.85,16.47,81.6,513.1,0.1001,0.05332,0.04116,0.01852,0.2293,0.06037
+894326,M,18.22,18.87,118.7,1027,0.09746,0.1117,0.113,0.0795,0.1807,0.05664,0.4041,0.5503,2.547,48.9,0.004821,0.01659,0.02408,0.01143,0.01275,0.002451,21.84,25,140.9,1485,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
+894329,B,9.042,18.9,60.07,244.5,0.09968,0.1972,0.1975,0.04908,0.233,0.08743,0.4653,1.911,3.769,24.2,0.009845,0.0659,0.1027,0.02527,0.03491,0.007877,10.06,23.4,68.62,297.1,0.1221,0.3748,0.4609,0.1145,0.3135,0.1055
+894335,B,12.43,17,78.6,477.3,0.07557,0.03454,0.01342,0.01699,0.1472,0.05561,0.3778,2.2,2.487,31.16,0.007357,0.01079,0.009959,0.0112,0.03433,0.002961,12.9,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932
+894604,B,10.25,16.18,66.52,324.2,0.1061,0.1111,0.06726,0.03965,0.1743,0.07279,0.3677,1.471,1.597,22.68,0.01049,0.04265,0.04004,0.01544,0.02719,0.007596,11.28,20.61,71.53,390.4,0.1402,0.236,0.1898,0.09744,0.2608,0.09702
+894618,M,20.16,19.66,131.1,1274,0.0802,0.08564,0.1155,0.07726,0.1928,0.05096,0.5925,0.6863,3.868,74.85,0.004536,0.01376,0.02645,0.01247,0.02193,0.001589,23.06,23.03,150.2,1657,0.1054,0.1537,0.2606,0.1425,0.3055,0.05933
+894855,B,12.86,13.32,82.82,504.8,0.1134,0.08834,0.038,0.034,0.1543,0.06476,0.2212,1.042,1.614,16.57,0.00591,0.02016,0.01902,0.01011,0.01202,0.003107,14.04,21.08,92.8,599.5,0.1547,0.2231,0.1791,0.1155,0.2382,0.08553
+895100,M,20.34,21.51,135.9,1264,0.117,0.1875,0.2565,0.1504,0.2569,0.0667,0.5702,1.023,4.012,69.06,0.005485,0.02431,0.0319,0.01369,0.02768,0.003345,25.3,31.86,171.1,1938,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024
+89511501,B,12.2,15.21,78.01,457.9,0.08673,0.06545,0.01994,0.01692,0.1638,0.06129,0.2575,0.8073,1.959,19.01,0.005403,0.01418,0.01051,0.005142,0.01333,0.002065,13.75,21.38,91.11,583.1,0.1256,0.1928,0.1167,0.05556,0.2661,0.07961
+89511502,B,12.67,17.3,81.25,489.9,0.1028,0.07664,0.03193,0.02107,0.1707,0.05984,0.21,0.9505,1.566,17.61,0.006809,0.009514,0.01329,0.006474,0.02057,0.001784,13.71,21.1,88.7,574.4,0.1384,0.1212,0.102,0.05602,0.2688,0.06888
+89524,B,14.11,12.88,90.03,616.5,0.09309,0.05306,0.01765,0.02733,0.1373,0.057,0.2571,1.081,1.558,23.92,0.006692,0.01132,0.005717,0.006627,0.01416,0.002476,15.53,18,98.4,749.9,0.1281,0.1109,0.05307,0.0589,0.21,0.07083
+895299,B,12.03,17.93,76.09,446,0.07683,0.03892,0.001546,0.005592,0.1382,0.0607,0.2335,0.9097,1.466,16.97,0.004729,0.006887,0.001184,0.003951,0.01466,0.001755,13.07,22.25,82.74,523.4,0.1013,0.0739,0.007732,0.02796,0.2171,0.07037
+8953902,M,16.27,20.71,106.9,813.7,0.1169,0.1319,0.1478,0.08488,0.1948,0.06277,0.4375,1.232,3.27,44.41,0.006697,0.02083,0.03248,0.01392,0.01536,0.002789,19.28,30.38,129.8,1121,0.159,0.2947,0.3597,0.1583,0.3103,0.082
+895633,M,16.26,21.88,107.5,826.8,0.1165,0.1283,0.1799,0.07981,0.1869,0.06532,0.5706,1.457,2.961,57.72,0.01056,0.03756,0.05839,0.01186,0.04022,0.006187,17.73,25.21,113.7,975.2,0.1426,0.2116,0.3344,0.1047,0.2736,0.07953
+896839,M,16.03,15.51,105.8,793.2,0.09491,0.1371,0.1204,0.07041,0.1782,0.05976,0.3371,0.7476,2.629,33.27,0.005839,0.03245,0.03715,0.01459,0.01467,0.003121,18.76,21.98,124.3,1070,0.1435,0.4478,0.4956,0.1981,0.3019,0.09124
+896864,B,12.98,19.35,84.52,514,0.09579,0.1125,0.07107,0.0295,0.1761,0.0654,0.2684,0.5664,2.465,20.65,0.005727,0.03255,0.04393,0.009811,0.02751,0.004572,14.42,21.95,99.21,634.3,0.1288,0.3253,0.3439,0.09858,0.3596,0.09166
+897132,B,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,0.2976,1.966,1.959,19.62,0.01289,0.01104,0.003297,0.004967,0.04243,0.001963,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
+897137,B,11.25,14.78,71.38,390,0.08306,0.04458,0.0009737,0.002941,0.1773,0.06081,0.2144,0.9961,1.529,15.07,0.005617,0.007124,0.0009737,0.002941,0.017,0.00203,12.76,22.06,82.08,492.7,0.1166,0.09794,0.005518,0.01667,0.2815,0.07418
+897374,B,12.3,19.02,77.88,464.4,0.08313,0.04202,0.007756,0.008535,0.1539,0.05945,0.184,1.532,1.199,13.24,0.007881,0.008432,0.007004,0.006522,0.01939,0.002222,13.35,28.46,84.53,544.3,0.1222,0.09052,0.03619,0.03983,0.2554,0.07207
+89742801,M,17.06,21,111.8,918.6,0.1119,0.1056,0.1508,0.09934,0.1727,0.06071,0.8161,2.129,6.076,87.17,0.006455,0.01797,0.04502,0.01744,0.01829,0.003733,20.99,33.15,143.2,1362,0.1449,0.2053,0.392,0.1827,0.2623,0.07599
+897604,B,12.99,14.23,84.08,514.3,0.09462,0.09965,0.03738,0.02098,0.1652,0.07238,0.1814,0.6412,0.9219,14.41,0.005231,0.02305,0.03113,0.007315,0.01639,0.005701,13.72,16.91,87.38,576,0.1142,0.1975,0.145,0.0585,0.2432,0.1009
+897630,M,18.77,21.43,122.9,1092,0.09116,0.1402,0.106,0.0609,0.1953,0.06083,0.6422,1.53,4.369,88.25,0.007548,0.03897,0.03914,0.01816,0.02168,0.004445,24.54,34.37,161.1,1873,0.1498,0.4827,0.4634,0.2048,0.3679,0.0987
+897880,B,10.05,17.53,64.41,310.8,0.1007,0.07326,0.02511,0.01775,0.189,0.06331,0.2619,2.015,1.778,16.85,0.007803,0.01449,0.0169,0.008043,0.021,0.002778,11.16,26.84,71.98,384,0.1402,0.1402,0.1055,0.06499,0.2894,0.07664
+89812,M,23.51,24.27,155.1,1747,0.1069,0.1283,0.2308,0.141,0.1797,0.05506,1.009,0.9245,6.462,164.1,0.006292,0.01971,0.03582,0.01301,0.01479,0.003118,30.67,30.73,202.4,2906,0.1515,0.2678,0.4819,0.2089,0.2593,0.07738
+89813,B,14.42,16.54,94.15,641.2,0.09751,0.1139,0.08007,0.04223,0.1912,0.06412,0.3491,0.7706,2.677,32.14,0.004577,0.03053,0.0384,0.01243,0.01873,0.003373,16.67,21.51,111.4,862.1,0.1294,0.3371,0.3755,0.1414,0.3053,0.08764
+898143,B,9.606,16.84,61.64,280.5,0.08481,0.09228,0.08422,0.02292,0.2036,0.07125,0.1844,0.9429,1.429,12.07,0.005954,0.03471,0.05028,0.00851,0.0175,0.004031,10.75,23.07,71.25,353.6,0.1233,0.3416,0.4341,0.0812,0.2982,0.09825
+89827,B,11.06,14.96,71.49,373.9,0.1033,0.09097,0.05397,0.03341,0.1776,0.06907,0.1601,0.8225,1.355,10.8,0.007416,0.01877,0.02758,0.0101,0.02348,0.002917,11.92,19.9,79.76,440,0.1418,0.221,0.2299,0.1075,0.3301,0.0908
+898431,M,19.68,21.68,129.9,1194,0.09797,0.1339,0.1863,0.1103,0.2082,0.05715,0.6226,2.284,5.173,67.66,0.004756,0.03368,0.04345,0.01806,0.03756,0.003288,22.75,34.66,157.6,1540,0.1218,0.3458,0.4734,0.2255,0.4045,0.07918
+89864002,B,11.71,15.45,75.03,420.3,0.115,0.07281,0.04006,0.0325,0.2009,0.06506,0.3446,0.7395,2.355,24.53,0.009536,0.01097,0.01651,0.01121,0.01953,0.0031,13.06,18.16,84.16,516.4,0.146,0.1115,0.1087,0.07864,0.2765,0.07806
+898677,B,10.26,14.71,66.2,321.6,0.09882,0.09159,0.03581,0.02037,0.1633,0.07005,0.338,2.509,2.394,19.33,0.01736,0.04671,0.02611,0.01296,0.03675,0.006758,10.88,19.48,70.89,357.1,0.136,0.1636,0.07162,0.04074,0.2434,0.08488
+898678,B,12.06,18.9,76.66,445.3,0.08386,0.05794,0.00751,0.008488,0.1555,0.06048,0.243,1.152,1.559,18.02,0.00718,0.01096,0.005832,0.005495,0.01982,0.002754,13.64,27.06,86.54,562.6,0.1289,0.1352,0.04506,0.05093,0.288,0.08083
+89869,B,14.76,14.74,94.87,668.7,0.08875,0.0778,0.04608,0.03528,0.1521,0.05912,0.3428,0.3981,2.537,29.06,0.004732,0.01506,0.01855,0.01067,0.02163,0.002783,17.27,17.93,114.2,880.8,0.122,0.2009,0.2151,0.1251,0.3109,0.08187
+898690,B,11.47,16.03,73.02,402.7,0.09076,0.05886,0.02587,0.02322,0.1634,0.06372,0.1707,0.7615,1.09,12.25,0.009191,0.008548,0.0094,0.006315,0.01755,0.003009,12.51,20.79,79.67,475.8,0.1531,0.112,0.09823,0.06548,0.2851,0.08763
+899147,B,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,0.361,1.05,2.455,26.65,0.0058,0.02417,0.007816,0.01052,0.02734,0.003114,12.81,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759
+899187,B,11.66,17.07,73.7,421,0.07561,0.0363,0.008306,0.01162,0.1671,0.05731,0.3534,0.6724,2.225,26.03,0.006583,0.006991,0.005949,0.006296,0.02216,0.002668,13.28,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
+899667,M,15.75,19.22,107.1,758.6,0.1243,0.2364,0.2914,0.1242,0.2375,0.07603,0.5204,1.324,3.477,51.22,0.009329,0.06559,0.09953,0.02283,0.05543,0.00733,17.36,24.17,119.4,915.3,0.155,0.5046,0.6872,0.2135,0.4245,0.105
+899987,M,25.73,17.46,174.2,2010,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,0.9948,0.8509,7.222,153.1,0.006369,0.04243,0.04266,0.01508,0.02335,0.003385,33.13,23.58,229.3,3234,0.153,0.5937,0.6451,0.2756,0.369,0.08815
+9010018,M,15.08,25.74,98,716.6,0.1024,0.09769,0.1235,0.06553,0.1647,0.06464,0.6534,1.506,4.174,63.37,0.01052,0.02431,0.04912,0.01746,0.0212,0.004867,18.51,33.22,121.2,1050,0.166,0.2356,0.4029,0.1526,0.2654,0.09438
+901011,B,11.14,14.07,71.24,384.6,0.07274,0.06064,0.04505,0.01471,0.169,0.06083,0.4222,0.8092,3.33,28.84,0.005541,0.03387,0.04505,0.01471,0.03102,0.004831,12.12,15.82,79.62,453.5,0.08864,0.1256,0.1201,0.03922,0.2576,0.07018
+9010258,B,12.56,19.07,81.92,485.8,0.0876,0.1038,0.103,0.04391,0.1533,0.06184,0.3602,1.478,3.212,27.49,0.009853,0.04235,0.06271,0.01966,0.02639,0.004205,13.37,22.43,89.02,547.4,0.1096,0.2002,0.2388,0.09265,0.2121,0.07188
+9010259,B,13.05,18.59,85.09,512,0.1082,0.1304,0.09603,0.05603,0.2035,0.06501,0.3106,1.51,2.59,21.57,0.007807,0.03932,0.05112,0.01876,0.0286,0.005715,14.19,24.85,94.22,591.2,0.1343,0.2658,0.2573,0.1258,0.3113,0.08317
+901028,B,13.87,16.21,88.52,593.7,0.08743,0.05492,0.01502,0.02088,0.1424,0.05883,0.2543,1.363,1.737,20.74,0.005638,0.007939,0.005254,0.006042,0.01544,0.002087,15.11,25.58,96.74,694.4,0.1153,0.1008,0.05285,0.05556,0.2362,0.07113
+9010333,B,8.878,15.49,56.74,241,0.08293,0.07698,0.04721,0.02381,0.193,0.06621,0.5381,1.2,4.277,30.18,0.01093,0.02899,0.03214,0.01506,0.02837,0.004174,9.981,17.7,65.27,302,0.1015,0.1248,0.09441,0.04762,0.2434,0.07431
+901034301,B,9.436,18.32,59.82,278.6,0.1009,0.05956,0.0271,0.01406,0.1506,0.06959,0.5079,1.247,3.267,30.48,0.006836,0.008982,0.02348,0.006565,0.01942,0.002713,12.02,25.02,75.79,439.6,0.1333,0.1049,0.1144,0.05052,0.2454,0.08136
+901034302,B,12.54,18.07,79.42,491.9,0.07436,0.0265,0.001194,0.005449,0.1528,0.05185,0.3511,0.9527,2.329,28.3,0.005783,0.004693,0.0007929,0.003617,0.02043,0.001058,13.72,20.98,86.82,585.7,0.09293,0.04327,0.003581,0.01635,0.2233,0.05521
+901041,B,13.3,21.57,85.24,546.1,0.08582,0.06373,0.03344,0.02424,0.1815,0.05696,0.2621,1.539,2.028,20.98,0.005498,0.02045,0.01795,0.006399,0.01829,0.001956,14.2,29.2,92.94,621.2,0.114,0.1667,0.1212,0.05614,0.2637,0.06658
+9010598,B,12.76,18.84,81.87,496.6,0.09676,0.07952,0.02688,0.01781,0.1759,0.06183,0.2213,1.285,1.535,17.26,0.005608,0.01646,0.01529,0.009997,0.01909,0.002133,13.75,25.99,87.82,579.7,0.1298,0.1839,0.1255,0.08312,0.2744,0.07238
+9010872,B,16.5,18.29,106.6,838.1,0.09686,0.08468,0.05862,0.04835,0.1495,0.05593,0.3389,1.439,2.344,33.58,0.007257,0.01805,0.01832,0.01033,0.01694,0.002001,18.13,25.45,117.2,1009,0.1338,0.1679,0.1663,0.09123,0.2394,0.06469
+9010877,B,13.4,16.95,85.48,552.4,0.07937,0.05696,0.02181,0.01473,0.165,0.05701,0.1584,0.6124,1.036,13.22,0.004394,0.0125,0.01451,0.005484,0.01291,0.002074,14.73,21.7,93.76,663.5,0.1213,0.1676,0.1364,0.06987,0.2741,0.07582
+901088,M,20.44,21.78,133.8,1293,0.0915,0.1131,0.09799,0.07785,0.1618,0.05557,0.5781,0.9168,4.218,72.44,0.006208,0.01906,0.02375,0.01461,0.01445,0.001906,24.31,26.37,161.2,1780,0.1327,0.2376,0.2702,0.1765,0.2609,0.06735
+9011494,M,20.2,26.83,133.7,1234,0.09905,0.1669,0.1641,0.1265,0.1875,0.0602,0.9761,1.892,7.128,103.6,0.008439,0.04674,0.05904,0.02536,0.0371,0.004286,24.19,33.81,160,1671,0.1278,0.3416,0.3703,0.2152,0.3271,0.07632
+9011495,B,12.21,18.02,78.31,458.4,0.09231,0.07175,0.04392,0.02027,0.1695,0.05916,0.2527,0.7786,1.874,18.57,0.005833,0.01388,0.02,0.007087,0.01938,0.00196,14.29,24.04,93.85,624.6,0.1368,0.217,0.2413,0.08829,0.3218,0.0747
+9011971,M,21.71,17.25,140.9,1546,0.09384,0.08562,0.1168,0.08465,0.1717,0.05054,1.207,1.051,7.733,224.1,0.005568,0.01112,0.02096,0.01197,0.01263,0.001803,30.75,26.44,199.5,3143,0.1363,0.1628,0.2861,0.182,0.251,0.06494
+9012000,M,22.01,21.9,147.2,1482,0.1063,0.1954,0.2448,0.1501,0.1824,0.0614,1.008,0.6999,7.561,130.2,0.003978,0.02821,0.03576,0.01471,0.01518,0.003796,27.66,25.8,195,2227,0.1294,0.3885,0.4756,0.2432,0.2741,0.08574
+9012315,M,16.35,23.29,109,840.4,0.09742,0.1497,0.1811,0.08773,0.2175,0.06218,0.4312,1.022,2.972,45.5,0.005635,0.03917,0.06072,0.01656,0.03197,0.004085,19.38,31.03,129.3,1165,0.1415,0.4665,0.7087,0.2248,0.4824,0.09614
+9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,0.05544,0.1783,0.4125,1.338,17.72,0.005012,0.01485,0.01551,0.009155,0.01647,0.001767,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766
+9012795,M,21.37,15.1,141.3,1386,0.1001,0.1515,0.1932,0.1255,0.1973,0.06183,0.3414,1.309,2.407,39.06,0.004426,0.02675,0.03437,0.01343,0.01675,0.004367,22.69,21.84,152.1,1535,0.1192,0.284,0.4024,0.1966,0.273,0.08666
+901288,M,20.64,17.35,134.8,1335,0.09446,0.1076,0.1527,0.08941,0.1571,0.05478,0.6137,0.6575,4.119,77.02,0.006211,0.01895,0.02681,0.01232,0.01276,0.001711,25.37,23.17,166.8,1946,0.1562,0.3055,0.4159,0.2112,0.2689,0.07055
+9013005,B,13.69,16.07,87.84,579.1,0.08302,0.06374,0.02556,0.02031,0.1872,0.05669,0.1705,0.5066,1.372,14,0.00423,0.01587,0.01169,0.006335,0.01943,0.002177,14.84,20.21,99.16,670.6,0.1105,0.2096,0.1346,0.06987,0.3323,0.07701
+901303,B,16.17,16.07,106.3,788.5,0.0988,0.1438,0.06651,0.05397,0.199,0.06572,0.1745,0.489,1.349,14.91,0.00451,0.01812,0.01951,0.01196,0.01934,0.003696,16.97,19.14,113.1,861.5,0.1235,0.255,0.2114,0.1251,0.3153,0.0896
+901315,B,10.57,20.22,70.15,338.3,0.09073,0.166,0.228,0.05941,0.2188,0.0845,0.1115,1.231,2.363,7.228,0.008499,0.07643,0.1535,0.02919,0.01617,0.0122,10.85,22.82,76.51,351.9,0.1143,0.3619,0.603,0.1465,0.2597,0.12
+9013579,B,13.46,28.21,85.89,562.1,0.07517,0.04726,0.01271,0.01117,0.1421,0.05763,0.1689,1.15,1.4,14.91,0.004942,0.01203,0.007508,0.005179,0.01442,0.001684,14.69,35.63,97.11,680.6,0.1108,0.1457,0.07934,0.05781,0.2694,0.07061
+9013594,B,13.66,15.15,88.27,580.6,0.08268,0.07548,0.04249,0.02471,0.1792,0.05897,0.1402,0.5417,1.101,11.35,0.005212,0.02984,0.02443,0.008356,0.01818,0.004868,14.54,19.64,97.96,657,0.1275,0.3104,0.2569,0.1054,0.3387,0.09638
+9013838,M,11.08,18.83,73.3,361.6,0.1216,0.2154,0.1689,0.06367,0.2196,0.0795,0.2114,1.027,1.719,13.99,0.007405,0.04549,0.04588,0.01339,0.01738,0.004435,13.24,32.82,91.76,508.1,0.2184,0.9379,0.8402,0.2524,0.4154,0.1403
+901549,B,11.27,12.96,73.16,386.3,0.1237,0.1111,0.079,0.0555,0.2018,0.06914,0.2562,0.9858,1.809,16.04,0.006635,0.01777,0.02101,0.01164,0.02108,0.003721,12.84,20.53,84.93,476.1,0.161,0.2429,0.2247,0.1318,0.3343,0.09215
+901836,B,11.04,14.93,70.67,372.7,0.07987,0.07079,0.03546,0.02074,0.2003,0.06246,0.1642,1.031,1.281,11.68,0.005296,0.01903,0.01723,0.00696,0.0188,0.001941,12.09,20.83,79.73,447.1,0.1095,0.1982,0.1553,0.06754,0.3202,0.07287
+90250,B,12.05,22.72,78.75,447.8,0.06935,0.1073,0.07943,0.02978,0.1203,0.06659,0.1194,1.434,1.778,9.549,0.005042,0.0456,0.04305,0.01667,0.0247,0.007358,12.57,28.71,87.36,488.4,0.08799,0.3214,0.2912,0.1092,0.2191,0.09349
+90251,B,12.39,17.48,80.64,462.9,0.1042,0.1297,0.05892,0.0288,0.1779,0.06588,0.2608,0.873,2.117,19.2,0.006715,0.03705,0.04757,0.01051,0.01838,0.006884,14.18,23.13,95.23,600.5,0.1427,0.3593,0.3206,0.09804,0.2819,0.1118
+902727,B,13.28,13.72,85.79,541.8,0.08363,0.08575,0.05077,0.02864,0.1617,0.05594,0.1833,0.5308,1.592,15.26,0.004271,0.02073,0.02828,0.008468,0.01461,0.002613,14.24,17.37,96.59,623.7,0.1166,0.2685,0.2866,0.09173,0.2736,0.0732
+90291,M,14.6,23.29,93.97,664.7,0.08682,0.06636,0.0839,0.05271,0.1627,0.05416,0.4157,1.627,2.914,33.01,0.008312,0.01742,0.03389,0.01576,0.0174,0.002871,15.79,31.71,102.2,758.2,0.1312,0.1581,0.2675,0.1359,0.2477,0.06836
+902975,B,12.21,14.09,78.78,462,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,0.2666,0.8309,2.097,19.96,0.004405,0.03026,0.04344,0.01087,0.01921,0.004622,13.13,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824
+902976,B,13.88,16.16,88.37,596.6,0.07026,0.04831,0.02045,0.008507,0.1607,0.05474,0.2541,0.6218,1.709,23.12,0.003728,0.01415,0.01988,0.007016,0.01647,0.00197,15.51,19.97,99.66,745.3,0.08484,0.1233,0.1091,0.04537,0.2542,0.06623
+903011,B,11.27,15.5,73.38,392,0.08365,0.1114,0.1007,0.02757,0.181,0.07252,0.3305,1.067,2.569,22.97,0.01038,0.06669,0.09472,0.02047,0.01219,0.01233,12.04,18.93,79.73,450,0.1102,0.2809,0.3021,0.08272,0.2157,0.1043
+90312,M,19.55,23.21,128.9,1174,0.101,0.1318,0.1856,0.1021,0.1989,0.05884,0.6107,2.836,5.383,70.1,0.01124,0.04097,0.07469,0.03441,0.02768,0.00624,20.82,30.44,142,1313,0.1251,0.2414,0.3829,0.1825,0.2576,0.07602
+90317302,B,10.26,12.22,65.75,321.6,0.09996,0.07542,0.01923,0.01968,0.18,0.06569,0.1911,0.5477,1.348,11.88,0.005682,0.01365,0.008496,0.006929,0.01938,0.002371,11.38,15.65,73.23,394.5,0.1343,0.165,0.08615,0.06696,0.2937,0.07722
+903483,B,8.734,16.84,55.27,234.3,0.1039,0.07428,0,0,0.1985,0.07098,0.5169,2.079,3.167,28.85,0.01582,0.01966,0,0,0.01865,0.006736,10.17,22.8,64.01,317,0.146,0.131,0,0,0.2445,0.08865
+903507,M,15.49,19.97,102.4,744.7,0.116,0.1562,0.1891,0.09113,0.1929,0.06744,0.647,1.331,4.675,66.91,0.007269,0.02928,0.04972,0.01639,0.01852,0.004232,21.2,29.41,142.1,1359,0.1681,0.3913,0.5553,0.2121,0.3187,0.1019
+903516,M,21.61,22.28,144.4,1407,0.1167,0.2087,0.281,0.1562,0.2162,0.06606,0.6242,0.9209,4.158,80.99,0.005215,0.03726,0.04718,0.01288,0.02045,0.004028,26.23,28.74,172,2081,0.1502,0.5717,0.7053,0.2422,0.3828,0.1007
+903554,B,12.1,17.72,78.07,446.2,0.1029,0.09758,0.04783,0.03326,0.1937,0.06161,0.2841,1.652,1.869,22.22,0.008146,0.01631,0.01843,0.007513,0.02015,0.001798,13.56,25.8,88.33,559.5,0.1432,0.1773,0.1603,0.06266,0.3049,0.07081
+903811,B,14.06,17.18,89.75,609.1,0.08045,0.05361,0.02681,0.03251,0.1641,0.05764,0.1504,1.685,1.237,12.67,0.005371,0.01273,0.01132,0.009155,0.01719,0.001444,14.92,25.34,96.42,684.5,0.1066,0.1231,0.0846,0.07911,0.2523,0.06609
+90401601,B,13.51,18.89,88.1,558.1,0.1059,0.1147,0.0858,0.05381,0.1806,0.06079,0.2136,1.332,1.513,19.29,0.005442,0.01957,0.03304,0.01367,0.01315,0.002464,14.8,27.2,97.33,675.2,0.1428,0.257,0.3438,0.1453,0.2666,0.07686
+90401602,B,12.8,17.46,83.05,508.3,0.08044,0.08895,0.0739,0.04083,0.1574,0.0575,0.3639,1.265,2.668,30.57,0.005421,0.03477,0.04545,0.01384,0.01869,0.004067,13.74,21.06,90.72,591,0.09534,0.1812,0.1901,0.08296,0.1988,0.07053
+904302,B,11.06,14.83,70.31,378.2,0.07741,0.04768,0.02712,0.007246,0.1535,0.06214,0.1855,0.6881,1.263,12.98,0.004259,0.01469,0.0194,0.004168,0.01191,0.003537,12.68,20.35,80.79,496.7,0.112,0.1879,0.2079,0.05556,0.259,0.09158
+904357,B,11.8,17.26,75.26,431.9,0.09087,0.06232,0.02853,0.01638,0.1847,0.06019,0.3438,1.14,2.225,25.06,0.005463,0.01964,0.02079,0.005398,0.01477,0.003071,13.45,24.49,86,562,0.1244,0.1726,0.1449,0.05356,0.2779,0.08121
+90439701,M,17.91,21.02,124.4,994,0.123,0.2576,0.3189,0.1198,0.2113,0.07115,0.403,0.7747,3.123,41.51,0.007159,0.03718,0.06165,0.01051,0.01591,0.005099,20.8,27.78,149.6,1304,0.1873,0.5917,0.9034,0.1964,0.3245,0.1198
+904647,B,11.93,10.91,76.14,442.7,0.08872,0.05242,0.02606,0.01796,0.1601,0.05541,0.2522,1.045,1.649,18.95,0.006175,0.01204,0.01376,0.005832,0.01096,0.001857,13.8,20.14,87.64,589.5,0.1374,0.1575,0.1514,0.06876,0.246,0.07262
+904689,B,12.96,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,0.1874,0.05899,0.2357,1.299,2.397,20.21,0.003629,0.03713,0.03452,0.01065,0.02632,0.003705,14.13,24.61,96.31,621.9,0.09329,0.2318,0.1604,0.06608,0.3207,0.07247
+9047,B,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,0.1458,0.905,0.9975,11.36,0.002887,0.01285,0.01613,0.007308,0.0187,0.001972,13.86,23.02,89.69,580.9,0.1172,0.1958,0.181,0.08388,0.3297,0.07834
+904969,B,12.34,14.95,78.29,469.1,0.08682,0.04571,0.02109,0.02054,0.1571,0.05708,0.3833,0.9078,2.602,30.15,0.007702,0.008491,0.01307,0.0103,0.0297,0.001432,13.18,16.85,84.11,533.1,0.1048,0.06744,0.04921,0.04793,0.2298,0.05974
+904971,B,10.94,18.59,70.39,370,0.1004,0.0746,0.04944,0.02932,0.1486,0.06615,0.3796,1.743,3.018,25.78,0.009519,0.02134,0.0199,0.01155,0.02079,0.002701,12.4,25.58,82.76,472.4,0.1363,0.1644,0.1412,0.07887,0.2251,0.07732
+905189,B,16.14,14.86,104.3,80

<TRUNCATED>

[07/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java
new file mode 100644
index 0000000..516177f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java
@@ -0,0 +1,219 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.io.Closeables;
+
+/**
+ * This is a clustering iterator which works with a set of Vector data and a prior ClusterClassifier which has been
+ * initialized with a set of models. Its implementation is algorithm-neutral and works for any iterative clustering
+ * algorithm (currently k-means and fuzzy-k-means) that processes all the input vectors in each iteration.
+ * The cluster classifier is configured with a ClusteringPolicy to select the desired clustering algorithm.
+ */
+public final class ClusterIterator {
+  
+  /** Configuration key under which the prior-classifier path is handed to the MR job. */
+  public static final String PRIOR_PATH_KEY = "org.apache.mahout.clustering.prior.path";
+
+  // Utility class: static methods only, no instances.
+  private ClusterIterator() {
+  }
+  
+  /**
+   * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations
+   *
+   * @param data
+   *          a {@code List<Vector>} of input vectors
+   * @param classifier
+   *          a prior ClusterClassifier
+   * @param numIterations
+   *          the int number of iterations to perform
+   * 
+   * @return the posterior ClusterClassifier
+   */
+  public static ClusterClassifier iterate(Iterable<Vector> data, ClusterClassifier classifier, int numIterations) {
+    ClusteringPolicy policy = classifier.getPolicy();
+    for (int iteration = 1; iteration <= numIterations; iteration++) {
+      for (Vector vector : data) {
+        // update the policy based upon the prior
+        policy.update(classifier);
+        // classification yields probabilities
+        Vector probabilities = classifier.classify(vector);
+        // policy selects weights for models given those probabilities
+        Vector weights = policy.select(probabilities);
+        // training causes all models to observe data
+        for (Vector.Element e : weights.nonZeroes()) {
+          int index = e.index();
+          classifier.train(index, vector, weights.get(index));
+        }
+      }
+      // compute the posterior models
+      classifier.close();
+    }
+    return classifier;
+  }
+  
+  /**
+   * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations using a sequential
+   * implementation
+   * 
+   * @param conf
+   *          the Configuration
+   * @param inPath
+   *          a Path to input VectorWritables
+   * @param priorPath
+   *          a Path to the prior classifier
+   * @param outPath
+   *          a Path of output directory
+   * @param numIterations
+   *          the int number of iterations to perform; must be at least 1
+   * @throws IllegalArgumentException
+   *           if {@code numIterations < 1}
+   */
+  public static void iterateSeq(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
+    throws IOException {
+    // Guard: with zero iterations the loop below never runs and clustersOut would stay
+    // null, producing an NPE at the final rename. Fail fast with a clear message instead.
+    if (numIterations < 1) {
+      throw new IllegalArgumentException("numIterations must be >= 1 but was " + numIterations);
+    }
+    ClusterClassifier classifier = new ClusterClassifier();
+    classifier.readFromSeqFiles(conf, priorPath);
+    // Hoisted out of the loop: the FileSystem for outPath does not change between iterations.
+    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
+    Path clustersOut = null;
+    int iteration = 1;
+    while (iteration <= numIterations) {
+      for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(inPath, PathType.LIST,
+          PathFilters.logsCRCFilter(), conf)) {
+        Vector vector = vw.get();
+        // classification yields probabilities
+        Vector probabilities = classifier.classify(vector);
+        // policy selects weights for models given those probabilities
+        Vector weights = classifier.getPolicy().select(probabilities);
+        // training causes all models to observe data
+        for (Vector.Element e : weights.nonZeroes()) {
+          int index = e.index();
+          classifier.train(index, vector, weights.get(index));
+        }
+      }
+      // compute the posterior models
+      classifier.close();
+      // update the policy
+      classifier.getPolicy().update(classifier);
+      // output the classifier
+      clustersOut = new Path(outPath, Cluster.CLUSTERS_DIR + iteration);
+      classifier.writeToSeqFiles(clustersOut);
+      iteration++;
+      if (isConverged(clustersOut, conf, fs)) {
+        break;
+      }
+    }
+    // Mark the last set of clusters written as the final iteration's output.
+    Path finalClustersIn = new Path(outPath, Cluster.CLUSTERS_DIR + (iteration - 1) + Cluster.FINAL_ITERATION_SUFFIX);
+    fs.rename(clustersOut, finalClustersIn);
+  }
+  
+  /**
+   * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations using a mapreduce
+   * implementation
+   * 
+   * @param conf
+   *          the Configuration
+   * @param inPath
+   *          a Path to input VectorWritables
+   * @param priorPath
+   *          a Path to the prior classifier
+   * @param outPath
+   *          a Path of output directory
+   * @param numIterations
+   *          the int number of iterations to perform; must be at least 1
+   * @throws IllegalArgumentException
+   *           if {@code numIterations < 1}
+   */
+  public static void iterateMR(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    // Guard: with zero iterations clustersOut would remain null and the final rename would NPE.
+    if (numIterations < 1) {
+      throw new IllegalArgumentException("numIterations must be >= 1 but was " + numIterations);
+    }
+    ClusteringPolicy policy = ClusterClassifier.readPolicy(priorPath);
+    // Hoisted out of the loop: the FileSystem for outPath does not change between iterations.
+    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
+    Path clustersOut = null;
+    int iteration = 1;
+    while (iteration <= numIterations) {
+      conf.set(PRIOR_PATH_KEY, priorPath.toString());
+      
+      String jobName = "Cluster Iterator running iteration " + iteration + " over priorPath: " + priorPath;
+      // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
+      Job job = Job.getInstance(conf, jobName);
+      job.setMapOutputKeyClass(IntWritable.class);
+      job.setMapOutputValueClass(ClusterWritable.class);
+      job.setOutputKeyClass(IntWritable.class);
+      job.setOutputValueClass(ClusterWritable.class);
+      
+      job.setInputFormatClass(SequenceFileInputFormat.class);
+      job.setOutputFormatClass(SequenceFileOutputFormat.class);
+      job.setMapperClass(CIMapper.class);
+      job.setReducerClass(CIReducer.class);
+      
+      FileInputFormat.addInputPath(job, inPath);
+      clustersOut = new Path(outPath, Cluster.CLUSTERS_DIR + iteration);
+      // This iteration's output becomes the next iteration's prior.
+      priorPath = clustersOut;
+      FileOutputFormat.setOutputPath(job, clustersOut);
+      
+      job.setJarByClass(ClusterIterator.class);
+      if (!job.waitForCompletion(true)) {
+        throw new InterruptedException("Cluster Iteration " + iteration + " failed processing " + priorPath);
+      }
+      ClusterClassifier.writePolicy(policy, clustersOut);
+      iteration++;
+      if (isConverged(clustersOut, conf, fs)) {
+        break;
+      }
+    }
+    // Mark the last set of clusters written as the final iteration's output.
+    Path finalClustersIn = new Path(outPath, Cluster.CLUSTERS_DIR + (iteration - 1) + Cluster.FINAL_ITERATION_SUFFIX);
+    fs.rename(clustersOut, finalClustersIn);
+  }
+  
+  /**
+   * Return if all of the Clusters in the parts in the filePath have converged or not
+   * 
+   * @param filePath
+   *          the file path to the single file containing the clusters
+   * @return true if all Clusters are converged
+   * @throws IOException
+   *           if there was an IO error
+   */
+  private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
+    for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) {
+      SequenceFileValueIterator<ClusterWritable> iterator = new SequenceFileValueIterator<>(
+          part.getPath(), true, conf);
+      try {
+        while (iterator.hasNext()) {
+          if (!iterator.next().getValue().isConverged()) {
+            return false;
+          }
+        }
+      } finally {
+        // Always release the underlying sequence-file reader. Previously the iterator was
+        // closed only on the not-converged early return, leaking the reader whenever every
+        // cluster in the part had converged.
+        Closeables.close(iterator, true);
+      }
+    }
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java
new file mode 100644
index 0000000..855685f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.sgd.PolymorphicWritable;
+import org.apache.mahout.clustering.Cluster;
+
+public class ClusterWritable implements Writable {
+  
+  private Cluster value;
+  
+  public ClusterWritable(Cluster first) {
+    value = first;
+  }
+
+  public ClusterWritable() {
+  }
+
+  public Cluster getValue() {
+    return value;
+  }
+  
+  public void setValue(Cluster value) {
+    this.value = value;
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    PolymorphicWritable.write(out, value);
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    value = PolymorphicWritable.read(in, Cluster.class);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java
new file mode 100644
index 0000000..6e15838
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.math.Vector;
+
/**
 * A ClusteringPolicy captures the semantics of assignment of points to
 * clusters: how a point's per-cluster probabilities are computed, how those
 * probabilities become training weights, and how the policy reacts to a new
 * classifier between iterations. Extends {@link Writable} so a policy can be
 * serialized alongside its clusters.
 */
public interface ClusteringPolicy extends Writable {
  
  /**
   * Classify the data vector given the classifier's models
   * 
   * @param data
   *          a data Vector
   * @param prior
   *          a prior ClusterClassifier whose models describe the current clusters
   * @return a Vector of probabilities, one per model, that the data is
   *         described by each of the models
   */
  Vector classify(Vector data, ClusterClassifier prior);
  
  /**
   * Return a vector of weights for each of the models given those probabilities
   * (e.g. winner-take-all for k-means, identity for fuzzy k-means)
   * 
   * @param probabilities
   *          a Vector of pdfs as returned by {@link #classify}
   * @return a Vector of weights used to train the corresponding models
   */
  Vector select(Vector probabilities);
  
  /**
   * Update the policy with the given classifier, typically once per iteration
   * 
   * @param posterior
   *          a ClusterClassifier
   */
  void update(ClusterClassifier posterior);
  
  /**
   * Close the policy using the classifier's models, finalizing cluster state
   * at the end of an iteration
   * 
   * @param posterior
   *          a posterior ClusterClassifier
   */
  void close(ClusterClassifier posterior);
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java
new file mode 100644
index 0000000..f69442d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.sgd.PolymorphicWritable;
+
+public class ClusteringPolicyWritable implements Writable {
+  
+  private ClusteringPolicy value;
+  
+  public ClusteringPolicyWritable(ClusteringPolicy policy) {
+    this.value = policy;
+  }
+
+  public ClusteringPolicyWritable() {
+  }
+
+  public ClusteringPolicy getValue() {
+    return value;
+  }
+  
+  public void setValue(ClusteringPolicy value) {
+    this.value = value;
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    PolymorphicWritable.write(out, value);
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    value = PolymorphicWritable.read(in, ClusteringPolicy.class);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java
new file mode 100644
index 0000000..f61aa27
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Model;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public class DistanceMeasureCluster extends AbstractCluster {
+
+  private DistanceMeasure measure;
+
+  public DistanceMeasureCluster(Vector point, int id, DistanceMeasure measure) {
+    super(point, id);
+    this.measure = measure;
+  }
+
+  public DistanceMeasureCluster() {
+  }
+
+  @Override
+  public void configure(Configuration job) {
+    if (measure != null) {
+      measure.configure(job);
+    }
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    String dm = in.readUTF();
+    this.measure = ClassUtils.instantiateAs(dm, DistanceMeasure.class);
+    super.readFields(in);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeUTF(measure.getClass().getName());
+    super.write(out);
+  }
+
+  @Override
+  public double pdf(VectorWritable vw) {
+    return 1 / (1 + measure.distance(vw.get(), getCenter()));
+  }
+
+  @Override
+  public Model<VectorWritable> sampleFromPosterior() {
+    return new DistanceMeasureCluster(getCenter(), getId(), measure);
+  }
+
+  public DistanceMeasure getMeasure() {
+    return measure;
+  }
+
+  /**
+   * @param measure
+   *          the measure to set
+   */
+  public void setMeasure(DistanceMeasure measure) {
+    this.measure = measure;
+  }
+
+  @Override
+  public String getIdentifier() {
+    return "DMC:" + getId();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java
new file mode 100644
index 0000000..b4e41b6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansClusterer;
+import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
+import org.apache.mahout.math.Vector;
+
+/**
+ * This is a probability-weighted clustering policy, suitable for fuzzy k-means
+ * clustering
+ * 
+ */
+public class FuzzyKMeansClusteringPolicy extends AbstractClusteringPolicy {
+
+  private double m = 2;
+  private double convergenceDelta = 0.05;
+
+  public FuzzyKMeansClusteringPolicy() {
+  }
+
+  public FuzzyKMeansClusteringPolicy(double m, double convergenceDelta) {
+    this.m = m;
+    this.convergenceDelta = convergenceDelta;
+  }
+
+  @Override
+  public Vector select(Vector probabilities) {
+    return probabilities;
+  }
+  
+  @Override
+  public Vector classify(Vector data, ClusterClassifier prior) {
+    Collection<SoftCluster> clusters = new ArrayList<>();
+    List<Double> distances = new ArrayList<>();
+    for (Cluster model : prior.getModels()) {
+      SoftCluster sc = (SoftCluster) model;
+      clusters.add(sc);
+      distances.add(sc.getMeasure().distance(data, sc.getCenter()));
+    }
+    FuzzyKMeansClusterer fuzzyKMeansClusterer = new FuzzyKMeansClusterer();
+    fuzzyKMeansClusterer.setM(m);
+    return fuzzyKMeansClusterer.computePi(clusters, distances);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(m);
+    out.writeDouble(convergenceDelta);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    this.m = in.readDouble();
+    this.convergenceDelta = in.readDouble();
+  }
+
+  @Override
+  public void close(ClusterClassifier posterior) {
+    for (Cluster cluster : posterior.getModels()) {
+      ((org.apache.mahout.clustering.kmeans.Kluster) cluster).calculateConvergence(convergenceDelta);
+      cluster.computeParameters();
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java
new file mode 100644
index 0000000..1cc9faf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+
+/**
+ * This is a simple maximum likelihood clustering policy, suitable for k-means
+ * clustering
+ * 
+ */
+public class KMeansClusteringPolicy extends AbstractClusteringPolicy {
+  
+  public KMeansClusteringPolicy() {
+  }
+  
+  public KMeansClusteringPolicy(double convergenceDelta) {
+    this.convergenceDelta = convergenceDelta;
+  }
+  
+  private double convergenceDelta = 0.001;
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(convergenceDelta);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    this.convergenceDelta = in.readDouble();
+  }
+  
+  @Override
+  public void close(ClusterClassifier posterior) {
+    boolean allConverged = true;
+    for (Cluster cluster : posterior.getModels()) {
+      org.apache.mahout.clustering.kmeans.Kluster kluster = (org.apache.mahout.clustering.kmeans.Kluster) cluster;
+      boolean converged = kluster.calculateConvergence(convergenceDelta);
+      allConverged = allConverged && converged;
+      cluster.computeParameters();
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java
new file mode 100644
index 0000000..96c4082
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kernel;
+
public interface IKernelProfile {

  /**
   * Compute the derivative of the kernel profile at the given distance.
   *
   * @param distance
   *          the distance from the kernel's center
   * @param h
   *          the kernel bandwidth (window radius)
   * @return the calculated derivative value of the kernel
   */
  double calculateDerivativeValue(double distance, double h);

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java
new file mode 100644
index 0000000..46909bb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kernel;
+
+public class TriangularKernelProfile implements IKernelProfile {
+  
+  @Override
+  public double calculateDerivativeValue(double distance, double h) {
+    return distance < h ? 1.0 : 0.0;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
new file mode 100644
index 0000000..3b9094e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
@@ -0,0 +1,257 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class KMeansDriver extends AbstractJob {
+  
+  private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class);
+  
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new KMeansDriver(), args);
+  }
+  
+  @Override
+  public int run(String[] args) throws Exception {
+    
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator
+        .clustersInOption()
+        .withDescription(
+            "The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
+                + "If k is also specified, then a random set of vectors will be selected"
+                + " and written out to this path first").create());
+    addOption(DefaultOptionCreator
+        .numClustersOption()
+        .withDescription(
+            "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
+                + " as the Centroid and written to the clusters input path.").create());
+    addOption(DefaultOptionCreator.useSetRandomSeedOption().create());
+    addOption(DefaultOptionCreator.convergenceOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption(DefaultOptionCreator.clusteringOption().create());
+    addOption(DefaultOptionCreator.methodOption().create());
+    addOption(DefaultOptionCreator.outlierThresholdOption().create());
+   
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+    
+    Path input = getInputPath();
+    Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
+    Path output = getOutputPath();
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    if (measureClass == null) {
+      measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+    }
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+    
+    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+      int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+
+      Long seed = null;
+      if (hasOption(DefaultOptionCreator.RANDOM_SEED)) {
+        seed = Long.parseLong(getOption(DefaultOptionCreator.RANDOM_SEED));
+      }
+
+      clusters = RandomSeedGenerator.buildRandom(getConf(), input, clusters, numClusters, measure, seed);
+    }
+    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
+    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+        DefaultOptionCreator.SEQUENTIAL_METHOD);
+    double clusterClassificationThreshold = 0.0;
+    if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+      clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+    }
+    run(getConf(), input, clusters, output, convergenceDelta, maxIterations, runClustering,
+        clusterClassificationThreshold, runSequential);
+    return 0;
+  }
+  
+  /**
+   * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
+   * cluster the input vectors.
+   *
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the directory pathname for initial & computed clusters
+   * @param output
+   *          the directory pathname for output points
+   * @param convergenceDelta
+   *          the convergence delta value
+   * @param maxIterations
+   *          the maximum number of iterations
+   * @param runClustering
+   *          true if points are to be clustered after iterations are completed
+   * @param clusterClassificationThreshold
+   *          Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
+   *          having pdf below this value will not be clustered.
+   * @param runSequential
+   *          if true execute sequential algorithm
+   */
+  public static void run(Configuration conf, Path input, Path clustersIn, Path output,
+    double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold,
+    boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
+    
+    // iterate until the clusters converge
+    String delta = Double.toString(convergenceDelta);
+    if (log.isInfoEnabled()) {
+      log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
+      log.info("convergence: {} max Iterations: {}", convergenceDelta, maxIterations);
+    }
+    Path clustersOut = buildClusters(conf, input, clustersIn, output, maxIterations, delta, runSequential);
+    if (runClustering) {
+      log.info("Clustering data");
+      clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential);
+    }
+  }
+  
+  /**
+   * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
+   * cluster the input vectors.
+   *
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the directory pathname for initial & computed clusters
+   * @param output
+   *          the directory pathname for output points
+   * @param convergenceDelta
+   *          the convergence delta value
+   * @param maxIterations
+   *          the maximum number of iterations
+   * @param runClustering
+   *          true if points are to be clustered after iterations are completed
+   * @param clusterClassificationThreshold
+   *          Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
+   *          having pdf below this value will not be clustered.
+   * @param runSequential
+   *          if true execute sequential algorithm
+   */
+  public static void run(Path input, Path clustersIn, Path output, double convergenceDelta,
+    int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    run(new Configuration(), input, clustersIn, output, convergenceDelta, maxIterations, runClustering,
+        clusterClassificationThreshold, runSequential);
+  }
+  
+  /**
+   * Iterate over the input vectors to produce cluster directories for each iteration
+   * 
+   *
+   * @param conf
+   *          the Configuration to use
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the directory pathname for initial & computed clusters
+   * @param output
+   *          the directory pathname for output points
+   * @param maxIterations
+   *          the maximum number of iterations
+   * @param delta
+   *          the convergence delta value
+   * @param runSequential
+   *          if true execute sequential algorithm
+   *
+   * @return the Path of the final clusters directory
+   */
+  public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output,
+    int maxIterations, String delta, boolean runSequential) throws IOException,
+    InterruptedException, ClassNotFoundException {
+    
+    double convergenceDelta = Double.parseDouble(delta);
+    List<Cluster> clusters = new ArrayList<>();
+    KMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
+    
+    if (clusters.isEmpty()) {
+      throw new IllegalStateException("No input clusters found in " + clustersIn + ". Check your -c argument.");
+    }
+    
+    Path priorClustersPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
+    ClusteringPolicy policy = new KMeansClusteringPolicy(convergenceDelta);
+    ClusterClassifier prior = new ClusterClassifier(clusters, policy);
+    prior.writeToSeqFiles(priorClustersPath);
+    
+    if (runSequential) {
+      ClusterIterator.iterateSeq(conf, input, priorClustersPath, output, maxIterations);
+    } else {
+      ClusterIterator.iterateMR(conf, input, priorClustersPath, output, maxIterations);
+    }
+    return output;
+  }
+  
+  /**
+   * Run the job using supplied arguments
+   *
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the directory pathname for input clusters
+   * @param output
+   *          the directory pathname for output points
+   * @param clusterClassificationThreshold
+   *          Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
+   *          having pdf below this value will not be clustered.
+   * @param runSequential
+   *          if true execute sequential algorithm
+   */
+  public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output,
+    double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
+    ClassNotFoundException {
+    
+    if (log.isInfoEnabled()) {
+      log.info("Running Clustering");
+      log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
+    }
+    ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
+    ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
+        clusterClassificationThreshold, true, runSequential);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
new file mode 100644
index 0000000..3365f70
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kmeans;
+
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+final class KMeansUtil {
+  
+  private static final Logger log = LoggerFactory.getLogger(KMeansUtil.class);
+
+  private KMeansUtil() {}
+  
+  /**
+   * Create a list of Klusters from whatever Cluster type is passed in as the prior
+   * 
+   * @param conf
+   *          the Configuration
+   * @param clusterPath
+   *          the path to the prior Clusters
+   * @param clusters
+   *          a List<Cluster> to put values into
+   */
+  public static void configureWithClusterInfo(Configuration conf, Path clusterPath, Collection<Cluster> clusters) {
+    for (Writable value : new SequenceFileDirValueIterable<>(clusterPath, PathType.LIST,
+        PathFilters.partFilter(), conf)) {
+      Class<? extends Writable> valueClass = value.getClass();
+      if (valueClass.equals(ClusterWritable.class)) {
+        ClusterWritable clusterWritable = (ClusterWritable) value;
+        value = clusterWritable.getValue();
+        valueClass = value.getClass();
+      }
+      log.debug("Read 1 Cluster from {}", clusterPath);
+      
+      if (valueClass.equals(Kluster.class)) {
+        // get the cluster info
+        clusters.add((Kluster) value);
+      } else if (valueClass.equals(Canopy.class)) {
+        // get the cluster info
+        Canopy canopy = (Canopy) value;
+        clusters.add(new Kluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
+      } else {
+        throw new IllegalStateException("Bad value class: " + valueClass);
+      }
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java
new file mode 100644
index 0000000..15daec5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java
@@ -0,0 +1,117 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+public class Kluster extends DistanceMeasureCluster {
+  
+  /** Has the centroid converged with the center? */
+  private boolean converged;
+  
+  /** For (de)serialization as a Writable */
+  public Kluster() {
+  }
+  
+  /**
+   * Construct a new cluster with the given point as its center
+   * 
+   * @param center
+   *          the Vector center
+   * @param clusterId
+   *          the int cluster id
+   * @param measure
+   *          a DistanceMeasure
+   */
+  public Kluster(Vector center, int clusterId, DistanceMeasure measure) {
+    super(center, clusterId, measure);
+  }
+  
+  /**
+   * Format the cluster for output
+   * 
+   * @param cluster
+   *          the Cluster
+   * @return the String representation of the Cluster
+   */
+  public static String formatCluster(Kluster cluster) {
+    return cluster.getIdentifier() + ": " + cluster.computeCentroid().asFormatString();
+  }
+  
+  public String asFormatString() {
+    return formatCluster(this);
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    super.write(out);
+    out.writeBoolean(converged);
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    super.readFields(in);
+    this.converged = in.readBoolean();
+  }
+  
+  @Override
+  public String toString() {
+    return asFormatString(null);
+  }
+  
+  @Override
+  public String getIdentifier() {
+    return (converged ? "VL-" : "CL-") + getId();
+  }
+  
+  /**
+   * Return if the cluster is converged by comparing its center and centroid.
+   * 
+   * @param measure
+   *          The distance measure to use for cluster-point comparisons.
+   * @param convergenceDelta
+   *          the convergence delta to use for stopping.
+   * @return if the cluster is converged
+   */
+  public boolean computeConvergence(DistanceMeasure measure, double convergenceDelta) {
+    Vector centroid = computeCentroid();
+    converged = measure.distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta;
+    return converged;
+  }
+  
+  @Override
+  public boolean isConverged() {
+    return converged;
+  }
+  
+  protected void setConverged(boolean converged) {
+    this.converged = converged;
+  }
+  
+  public boolean calculateConvergence(double convergenceDelta) {
+    Vector centroid = computeCentroid();
+    converged = getMeasure().distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta;
+    return converged;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
new file mode 100644
index 0000000..fbbabc5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, randomly select k vectors and
+ * write them to the output file as a {@link org.apache.mahout.clustering.kmeans.Kluster} representing the
+ * initial centroid to use.
+ *
+ * This implementation uses reservoir sampling as described in http://en.wikipedia.org/wiki/Reservoir_sampling
+ */
+public final class RandomSeedGenerator {
+  
+  private static final Logger log = LoggerFactory.getLogger(RandomSeedGenerator.class);
+  
+  public static final String K = "k";
+  
+  private RandomSeedGenerator() {}
+
+  public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
+    throws IOException {
+    return buildRandom(conf, input, output, k, measure, null);
+  }
+
+  public static Path buildRandom(Configuration conf,
+                                 Path input,
+                                 Path output,
+                                 int k,
+                                 DistanceMeasure measure,
+                                 Long seed) throws IOException {
+
+    Preconditions.checkArgument(k > 0, "Must be: k > 0, but k = " + k);
+    // delete the output directory
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    HadoopUtil.delete(conf, output);
+    Path outFile = new Path(output, "part-randomSeed");
+    boolean newFile = fs.createNewFile(outFile);
+    if (newFile) {
+      Path inputPathPattern;
+
+      if (fs.getFileStatus(input).isDir()) {
+        inputPathPattern = new Path(input, "*");
+      } else {
+        inputPathPattern = input;
+      }
+      
+      FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
+
+      Random random = (seed != null) ? RandomUtils.getRandom(seed) : RandomUtils.getRandom();
+
+      List<Text> chosenTexts = new ArrayList<>(k);
+      List<ClusterWritable> chosenClusters = new ArrayList<>(k);
+      int nextClusterId = 0;
+
+      int index = 0;
+      for (FileStatus fileStatus : inputFiles) {
+        if (!fileStatus.isDir()) {
+          for (Pair<Writable, VectorWritable> record
+              : new SequenceFileIterable<Writable, VectorWritable>(fileStatus.getPath(), true, conf)) {
+            Writable key = record.getFirst();
+            VectorWritable value = record.getSecond();
+            Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
+            newCluster.observe(value.get(), 1);
+            Text newText = new Text(key.toString());
+            int currentSize = chosenTexts.size();
+            if (currentSize < k) {
+              chosenTexts.add(newText);
+              ClusterWritable clusterWritable = new ClusterWritable();
+              clusterWritable.setValue(newCluster);
+              chosenClusters.add(clusterWritable);
+            } else {
+              int j = random.nextInt(index);
+              if (j < k) {
+                chosenTexts.set(j, newText);
+                ClusterWritable clusterWritable = new ClusterWritable();
+                clusterWritable.setValue(newCluster);
+                chosenClusters.set(j, clusterWritable);
+              }
+            }
+            index++;
+          }
+        }
+      }
+
+      try (SequenceFile.Writer writer =
+               SequenceFile.createWriter(fs, conf, outFile, Text.class, ClusterWritable.class)){
+        for (int i = 0; i < chosenTexts.size(); i++) {
+          writer.append(chosenTexts.get(i), chosenClusters.get(i));
+        }
+        log.info("Wrote {} Klusters to {}", k, outFile);
+      }
+    }
+    
+    return outFile;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java
new file mode 100644
index 0000000..d6921b6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java
@@ -0,0 +1,5 @@
/**
 * This package provides an implementation of the
 * <a href="http://en.wikipedia.org/wiki/K-means_clustering">k-means</a> clustering
 * algorithm.
 */
package org.apache.mahout.clustering.kmeans;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java
new file mode 100644
index 0000000..46fcc7f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+public class CVB0DocInferenceMapper extends CachingCVB0Mapper {
+
+  private final VectorWritable topics = new VectorWritable();
+
+  @Override
+  public void map(IntWritable docId, VectorWritable doc, Context context)
+    throws IOException, InterruptedException {
+    int numTopics = getNumTopics();
+    Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics);
+    Matrix docModel = new SparseRowMatrix(numTopics, doc.get().size());
+    int maxIters = getMaxIters();
+    ModelTrainer modelTrainer = getModelTrainer();
+    for (int i = 0; i < maxIters; i++) {
+      modelTrainer.getReadModel().trainDocTopicModel(doc.get(), docTopics, docModel);
+    }
+    topics.set(docTopics);
+    context.write(docId, topics);
+  }
+
+  @Override
+  protected void cleanup(Context context) {
+    getModelTrainer().stop();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java
new file mode 100644
index 0000000..31c0d60
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java
@@ -0,0 +1,536 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.mapreduce.VectorSumReducer;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * See {@link CachingCVB0Mapper} for more details on scalability and room for improvement.
+ * To try out this LDA implementation without using Hadoop, check out
+ * {@link InMemoryCollapsedVariationalBayes0}.  If you want to do training directly in java code
+ * with your own main(), then look to {@link ModelTrainer} and {@link TopicModel}.
+ *
+ * Usage: {@code ./bin/mahout cvb <i>options</i>}
+ * <p>
+ * Valid options include:
+ * <dl>
+ * <dt>{@code --input path}</td>
+ * <dd>Input path for {@code SequenceFile<IntWritable, VectorWritable>} document vectors. See
+ * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles}
+ *  for details on how to generate this input format.</dd>
+ * <dt>{@code --dictionary path}</dt>
+ * <dd>Path to dictionary file(s) generated during construction of input document vectors (glob
+ * expression supported). If set, this data is scanned to determine an appropriate value for option
+ * {@code --num_terms}.</dd>
+ * <dt>{@code --output path}</dt>
+ * <dd>Output path for topic-term distributions.</dd>
+ * <dt>{@code --doc_topic_output path}</dt>
+ * <dd>Output path for doc-topic distributions.</dd>
+ * <dt>{@code --num_topics k}</dt>
+ * <dd>Number of latent topics.</dd>
+ * <dt>{@code --num_terms nt}</dt>
+ * <dd>Number of unique features defined by input document vectors. If option {@code --dictionary}
+ * is defined and this option is unspecified, term count is calculated from dictionary.</dd>
+ * <dt>{@code --topic_model_temp_dir path}</dt>
+ * <dd>Path in which to store model state after each iteration.</dd>
+ * <dt>{@code --maxIter i}</dt>
+ * <dd>Maximum number of iterations to perform. If this value is less than or equal to the number of
+ * iteration states found beneath the path specified by option {@code --topic_model_temp_dir}, no
+ * further iterations are performed. Instead, output topic-term and doc-topic distributions are
+ * generated using data from the specified iteration.</dd>
+ * <dt>{@code --max_doc_topic_iters i}</dt>
+ * <dd>Maximum number of iterations per doc for p(topic|doc) learning. Defaults to {@code 10}.</dd>
+ * <dt>{@code --doc_topic_smoothing a}</dt>
+ * <dd>Smoothing for doc-topic distribution. Defaults to {@code 0.0001}.</dd>
+ * <dt>{@code --term_topic_smoothing e}</dt>
+ * <dd>Smoothing for topic-term distribution. Defaults to {@code 0.0001}.</dd>
+ * <dt>{@code --random_seed seed}</dt>
+ * <dd>Integer seed for random number generation.</dd>
+ * <dt>{@code --test_set_percentage p}</dt>
+ * <dd>Fraction of data to hold out for testing. Defaults to {@code 0.0}.</dd>
+ * <dt>{@code --iteration_block_size block}</dt>
+ * <dd>Number of iterations between perplexity checks. Defaults to {@code 10}. This option is
+ * ignored unless option {@code --test_set_percentage} is greater than zero.</dd>
+ * </dl>
+ */
+public class CVB0Driver extends AbstractJob {
+  private static final Logger log = LoggerFactory.getLogger(CVB0Driver.class);
+
+  public static final String NUM_TOPICS = "num_topics";
+  public static final String NUM_TERMS = "num_terms";
+  public static final String DOC_TOPIC_SMOOTHING = "doc_topic_smoothing";
+  public static final String TERM_TOPIC_SMOOTHING = "term_topic_smoothing";
+  public static final String DICTIONARY = "dictionary";
+  public static final String DOC_TOPIC_OUTPUT = "doc_topic_output";
+  public static final String MODEL_TEMP_DIR = "topic_model_temp_dir";
+  public static final String ITERATION_BLOCK_SIZE = "iteration_block_size";
+  public static final String RANDOM_SEED = "random_seed";
+  public static final String TEST_SET_FRACTION = "test_set_fraction";
+  public static final String NUM_TRAIN_THREADS = "num_train_threads";
+  public static final String NUM_UPDATE_THREADS = "num_update_threads";
+  public static final String MAX_ITERATIONS_PER_DOC = "max_doc_topic_iters";
+  public static final String MODEL_WEIGHT = "prev_iter_mult";
+  public static final String NUM_REDUCE_TASKS = "num_reduce_tasks";
+  public static final String BACKFILL_PERPLEXITY = "backfill_perplexity";
+  private static final String MODEL_PATHS = "mahout.lda.cvb.modelPath";
+
+  private static final double DEFAULT_CONVERGENCE_DELTA = 0;
+  private static final double DEFAULT_DOC_TOPIC_SMOOTHING = 0.0001;
+  private static final double DEFAULT_TERM_TOPIC_SMOOTHING = 0.0001;
+  private static final int DEFAULT_ITERATION_BLOCK_SIZE = 10;
+  private static final double DEFAULT_TEST_SET_FRACTION = 0;
+  private static final int DEFAULT_NUM_TRAIN_THREADS = 4;
+  private static final int DEFAULT_NUM_UPDATE_THREADS = 1;
+  private static final int DEFAULT_MAX_ITERATIONS_PER_DOC = 10;
+  private static final int DEFAULT_NUM_REDUCE_TASKS = 10;
+
  /**
   * Parses command-line options and dispatches to the long-form
   * {@code run(Configuration, ...)} overload with fully-resolved parameters.
   *
   * @param args command-line arguments (see class Javadoc for the option list)
   * @return 0 on success, -1 if argument parsing fails; otherwise the delegate's exit code
   */
  @Override
  public int run(String[] args) throws Exception {
    // Standard AbstractJob options: -i/-o, iteration cap, convergence delta, overwrite.
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value",
              String.valueOf(DEFAULT_CONVERGENCE_DELTA));
    addOption(DefaultOptionCreator.overwriteOption().create());

    // CVB-specific options; only --num_topics is required.
    addOption(NUM_TOPICS, "k", "Number of topics to learn", true);
    addOption(NUM_TERMS, "nt", "Vocabulary size", false);
    addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution",
              String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING));
    addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution",
              String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING));
    addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false);
    addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false);
    addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false);
    addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check",
              String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE));
    addOption(RANDOM_SEED, "seed", "Random seed", false);
    addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing",
              String.valueOf(DEFAULT_TEST_SET_FRACTION));
    addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with",
              String.valueOf(DEFAULT_NUM_TRAIN_THREADS));
    addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with",
              String.valueOf(DEFAULT_NUM_UPDATE_THREADS));
    addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning",
              String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC));
    addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
              String.valueOf(DEFAULT_NUM_REDUCE_TASKS));
    addOption(buildOption(BACKFILL_PERPLEXITY, null, "enable backfilling of missing perplexity values", false, false,
              null));

    if (parseArguments(args) == null) {
      return -1;
    }

    int numTopics = Integer.parseInt(getOption(NUM_TOPICS));
    Path inputPath = getInputPath();
    Path topicModelOutputPath = getOutputPath();
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    int iterationBlockSize = Integer.parseInt(getOption(ITERATION_BLOCK_SIZE));
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    double alpha = Double.parseDouble(getOption(DOC_TOPIC_SMOOTHING));
    double eta = Double.parseDouble(getOption(TERM_TOPIC_SMOOTHING));
    int numTrainThreads = Integer.parseInt(getOption(NUM_TRAIN_THREADS));
    int numUpdateThreads = Integer.parseInt(getOption(NUM_UPDATE_THREADS));
    int maxItersPerDoc = Integer.parseInt(getOption(MAX_ITERATIONS_PER_DOC));
    Path dictionaryPath = hasOption(DICTIONARY) ? new Path(getOption(DICTIONARY)) : null;
    // Vocabulary size: explicit --num_terms wins; otherwise derived by scanning the dictionary.
    int numTerms = hasOption(NUM_TERMS)
                 ? Integer.parseInt(getOption(NUM_TERMS))
                 : getNumTerms(getConf(), dictionaryPath);
    Path docTopicOutputPath = hasOption(DOC_TOPIC_OUTPUT) ? new Path(getOption(DOC_TOPIC_OUTPUT)) : null;
    Path modelTempPath = hasOption(MODEL_TEMP_DIR)
                       ? new Path(getOption(MODEL_TEMP_DIR))
                       : getTempPath("topicModelState");
    // Without an explicit seed, derive a small pseudo-random one from the clock.
    long seed = hasOption(RANDOM_SEED)
              ? Long.parseLong(getOption(RANDOM_SEED))
              : System.nanoTime() % 10000;
    float testFraction = hasOption(TEST_SET_FRACTION)
                       ? Float.parseFloat(getOption(TEST_SET_FRACTION))
                       : 0.0f;
    int numReduceTasks = Integer.parseInt(getOption(NUM_REDUCE_TASKS));
    boolean backfillPerplexity = hasOption(BACKFILL_PERPLEXITY);

    return run(getConf(), inputPath, topicModelOutputPath, numTopics, numTerms, alpha, eta,
        maxIterations, iterationBlockSize, convergenceDelta, dictionaryPath, docTopicOutputPath,
        modelTempPath, seed, testFraction, numTrainThreads, numUpdateThreads, maxItersPerDoc,
        numReduceTasks, backfillPerplexity);
  }
+
+  private static int getNumTerms(Configuration conf, Path dictionaryPath) throws IOException {
+    FileSystem fs = dictionaryPath.getFileSystem(conf);
+    Text key = new Text();
+    IntWritable value = new IntWritable();
+    int maxTermId = -1;
+    for (FileStatus stat : fs.globStatus(dictionaryPath)) {
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, stat.getPath(), conf);
+      while (reader.next(key, value)) {
+        maxTermId = Math.max(maxTermId, value.get());
+      }
+    }
+    return maxTermId + 1;
+  }
+
+  public int run(Configuration conf,
+                 Path inputPath,
+                 Path topicModelOutputPath,
+                 int numTopics,
+                 int numTerms,
+                 double alpha,
+                 double eta,
+                 int maxIterations,
+                 int iterationBlockSize,
+                 double convergenceDelta,
+                 Path dictionaryPath,
+                 Path docTopicOutputPath,
+                 Path topicModelStateTempPath,
+                 long randomSeed,
+                 float testFraction,
+                 int numTrainThreads,
+                 int numUpdateThreads,
+                 int maxItersPerDoc,
+                 int numReduceTasks,
+                 boolean backfillPerplexity)
+    throws ClassNotFoundException, IOException, InterruptedException {
+
+    setConf(conf);
+
+    // verify arguments
+    Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0,
+        "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction);
+    Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0,
+        "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction);
+
+    String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) " 
+      + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, " 
+      + "topic/term prior {}.  Maximum iterations to run will be {}, unless the change in " 
+      + "perplexity is less than {}.  Topic model output (p(term|topic) for each topic) will be " 
+      + "stored {}.  Random initialization seed is {}, holding out {} of the data for perplexity " 
+      + "check\n";
+    log.info(infoString, inputPath, numTerms, numTopics, alpha, eta, maxIterations,
+             convergenceDelta, topicModelOutputPath, randomSeed, testFraction);
+    infoString = dictionaryPath == null
+               ? "" : "Dictionary to be used located " + dictionaryPath.toString() + '\n';
+    infoString += docTopicOutputPath == null
+               ? "" : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n';
+    log.info(infoString);
+
+    FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf);
+    int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations);
+    log.info("Current iteration number: {}", iterationNumber);
+
+    conf.set(NUM_TOPICS, String.valueOf(numTopics));
+    conf.set(NUM_TERMS, String.valueOf(numTerms));
+    conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha));
+    conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta));
+    conf.set(RANDOM_SEED, String.valueOf(randomSeed));
+    conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads));
+    conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads));
+    conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc));
+    conf.set(MODEL_WEIGHT, "1"); // TODO
+    conf.set(TEST_SET_FRACTION, String.valueOf(testFraction));
+
+    List<Double> perplexities = new ArrayList<>();
+    for (int i = 1; i <= iterationNumber; i++) {
+      // form path to model
+      Path modelPath = modelPath(topicModelStateTempPath, i);
+
+      // read perplexity
+      double perplexity = readPerplexity(conf, topicModelStateTempPath, i);
+      if (Double.isNaN(perplexity)) {
+        if (!(backfillPerplexity && i % iterationBlockSize == 0)) {
+          continue;
+        }
+        log.info("Backfilling perplexity at iteration {}", i);
+        if (!fs.exists(modelPath)) {
+          log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation",
+              modelPath.toString(), i);
+          continue;
+        }
+        perplexity = calculatePerplexity(conf, inputPath, modelPath, i);
+      }
+
+      // register and log perplexity
+      perplexities.add(perplexity);
+      log.info("Perplexity at iteration {} = {}", i, perplexity);
+    }
+
+    long startTime = System.currentTimeMillis();
+    while (iterationNumber < maxIterations) {
+      // test convergence
+      if (convergenceDelta > 0.0) {
+        double delta = rateOfChange(perplexities);
+        if (delta < convergenceDelta) {
+          log.info("Convergence achieved at iteration {} with perplexity {} and delta {}",
+                   iterationNumber, perplexities.get(perplexities.size() - 1), delta);
+          break;
+        }
+      }
+
+      // update model
+      iterationNumber++;
+      log.info("About to run iteration {} of {}", iterationNumber, maxIterations);
+      Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1);
+      Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber);
+      runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber,
+          maxIterations, numReduceTasks);
+
+      // calculate perplexity
+      if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) {
+        perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber));
+        log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1));
+        log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize,
+            rateOfChange(perplexities), convergenceDelta);
+      }
+    }
+    log.info("Completed {} iterations in {} seconds", iterationNumber,
+        (System.currentTimeMillis() - startTime) / 1000);
+    log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities));
+
+    // write final topic-term and doc-topic distributions
+    Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber);
+    Job topicModelOutputJob = topicModelOutputPath != null
+        ? writeTopicModel(conf, finalIterationData, topicModelOutputPath)
+        : null;
+    Job docInferenceJob = docTopicOutputPath != null
+        ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath)
+        : null;
+    if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) {
+      return -1;
+    }
+    if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) {
+      return -1;
+    }
+    return 0;
+  }
+
+  /**
+   * Computes the relative change between the two most recent perplexity values,
+   * normalized by the first recorded perplexity: |p_last - p_prev| / p_0.
+   *
+   * @param perplexities perplexity history, oldest first; must be non-empty when size >= 2
+   * @return the normalized absolute change, or {@code Double.MAX_VALUE} when fewer than
+   *         two values have been recorded (ensures convergence is never declared early)
+   */
+  private static double rateOfChange(List<Double> perplexities) {
+    int sz = perplexities.size();
+    if (sz < 2) {
+      return Double.MAX_VALUE;
+    }
+    return Math.abs(perplexities.get(sz - 1) - perplexities.get(sz - 2)) / perplexities.get(0);
+  }
+
+  /**
+   * Runs a single-reducer MapReduce job that scores the corpus against the model at
+   * {@code modelPath}, writes the result under the model's parent directory, and reads
+   * the resulting perplexity back.
+   *
+   * @param conf       job configuration; also used to delete any stale output
+   * @param corpusPath input corpus of document vectors
+   * @param modelPath  model directory for the iteration being scored
+   * @param iteration  iteration number; determines the perplexity output path
+   * @return perplexity as computed by {@link #readPerplexity}
+   * @throws InterruptedException if interrupted, or — by this class's convention — if the
+   *         MapReduce job itself fails
+   */
+  private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    String jobName = "Calculating perplexity for " + modelPath;
+    log.info("About to run: {}", jobName);
+
+    // Output lives next to the model dir: <parent>/perplexity-<iteration>.
+    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
+    Job job = prepareJob(corpusPath, outputPath, CachingCVB0PerplexityMapper.class, DoubleWritable.class,
+        DoubleWritable.class, DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
+
+    job.setJobName(jobName);
+    // Combiner is safe here because DualDoubleSumReducer is a pure sum (associative/commutative).
+    job.setCombinerClass(DualDoubleSumReducer.class);
+    // Single reducer so the final sums land in one output file.
+    job.setNumReduceTasks(1);
+    setModelPaths(job, modelPath);
+    // Remove any previous output for this iteration so the job can re-run.
+    HadoopUtil.delete(conf, outputPath);
+    if (!job.waitForCompletion(true)) {
+      throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
+    }
+    return readPerplexity(conf, modelPath.getParent(), iteration);
+  }
+
+  /**
+   * Reducer that sums keys and values independently across ALL input groups, emitting a
+   * single (keySum, valueSum) pair. It overrides {@link #run} rather than {@code reduce}
+   * because the accumulation spans every key, not just one group. Also used as a combiner
+   * (summation is associative and commutative).
+   */
+  public static class DualDoubleSumReducer extends
+    Reducer<DoubleWritable, DoubleWritable, DoubleWritable, DoubleWritable> {
+    // Reused output writables to avoid per-record allocation.
+    private final DoubleWritable outKey = new DoubleWritable();
+    private final DoubleWritable outValue = new DoubleWritable();
+
+    @Override
+    public void run(Context context) throws IOException,
+        InterruptedException {
+      double keySum = 0.0;
+      double valueSum = 0.0;
+      // Iterate every key group in this reducer's input, summing keys and values separately.
+      while (context.nextKey()) {
+        keySum += context.getCurrentKey().get();
+        for (DoubleWritable value : context.getValues()) {
+          valueSum += value.get();
+        }
+      }
+      // Emit exactly one record containing both totals.
+      outKey.set(keySum);
+      outValue.set(valueSum);
+      context.write(outKey, outValue);
+    }
+  }
+
+  /**
+   * Reads the perplexity-job output for the given iteration and returns the aggregate
+   * perplexity normalized by the total model weight of the sampled documents.
+   *
+   * <p>Note: despite summing two quantities internally, this returns a single
+   * {@code double} — total perplexity divided by total model weight — not an array.
+   *
+   * @param conf configuration used to access the file system
+   * @param topicModelStateTemp directory containing per-iteration state subdirectories
+   * @param iteration iteration whose {@code perplexity-<iteration>} output is read
+   * @return perplexity normalized by model weight, or {@code Double.NaN} if no
+   *         perplexity data exists for the given iteration
+   * @throws IOException on file system access errors
+   */
+  public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
+    throws IOException {
+    Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
+    FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
+    if (!fs.exists(perplexityPath)) {
+      log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
+      return Double.NaN;
+    }
+    double perplexity = 0;
+    double modelWeight = 0;
+    long n = 0;
+    // NOTE(review): consumed as (key = model weight, value = perplexity); confirm this
+    // matches the emit order of CachingCVB0PerplexityMapper / DualDoubleSumReducer.
+    for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
+        perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
+      modelWeight += pair.getFirst().get();
+      perplexity += pair.getSecond().get();
+      n++;
+    }
+    log.info("Read {} entries with total perplexity {} and model weight {}", n,
+             perplexity, modelWeight);
+    return perplexity / modelWeight;
+  }
+
+  /**
+   * Submits (asynchronously) a job that L1-normalizes each topic's term vector from the
+   * final model and writes the resulting p(term|topic) distributions to {@code output}.
+   *
+   * @param conf       configuration (unused directly here; jobs are built via prepareJob)
+   * @param modelInput final-iteration model directory
+   * @param output     destination for normalized topic/term distributions
+   * @return the submitted {@link Job}; caller is responsible for awaiting completion
+   */
+  private Job writeTopicModel(Configuration conf, Path modelInput, Path output)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
+    log.info("About to run: {}", jobName);
+
+    Job job = prepareJob(modelInput, output, SequenceFileInputFormat.class, CVB0TopicTermVectorNormalizerMapper.class,
+        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, jobName);
+    // Submit without waiting: caller overlaps this with the doc/topic inference job.
+    job.submit();
+    return job;
+  }
+
+  /**
+   * Submits (asynchronously) a job that infers p(topic|docId) for every document in the
+   * corpus using the final model, writing the distributions to {@code output}. Model part
+   * files are shipped to mappers via the distributed cache and registered as model paths.
+   *
+   * @param conf       configuration used for file-system access and distributed cache setup
+   * @param corpus     input corpus of document vectors
+   * @param modelInput final-iteration model directory; skipped if null or absent
+   * @param output     destination for document/topic distributions
+   * @return the submitted {@link Job}; caller is responsible for awaiting completion
+   */
+  private Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
+    log.info("About to run: {}", jobName);
+
+    Job job = prepareJob(corpus, output, SequenceFileInputFormat.class, CVB0DocInferenceMapper.class,
+        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, jobName);
+
+    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
+    if (modelInput != null && fs.exists(modelInput)) {
+      // Collect the model's part-file URIs and push them to the distributed cache so
+      // each mapper can load the model locally.
+      FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
+      URI[] modelUris = new URI[statuses.length];
+      for (int i = 0; i < statuses.length; i++) {
+        modelUris[i] = statuses[i].getPath().toUri();
+      }
+      DistributedCache.setCacheFiles(modelUris, conf);
+      setModelPaths(job, modelInput);
+    }
+    // Submit without waiting: caller overlaps this with the topic-model output job.
+    job.submit();
+    return job;
+  }
+
+  /** Returns the model directory for an iteration: {@code <temp>/model-<iterationNumber>}. */
+  public static Path modelPath(Path topicModelStateTempPath, int iterationNumber) {
+    return new Path(topicModelStateTempPath, "model-" + iterationNumber);
+  }
+
+  /** Returns the perplexity output directory for an iteration: {@code <temp>/perplexity-<iterationNumber>}. */
+  public static Path perplexityPath(Path topicModelStateTempPath, int iterationNumber) {
+    return new Path(topicModelStateTempPath, "perplexity-" + iterationNumber);
+  }
+
+  /**
+   * Determines how many iterations have already completed by probing for existing
+   * {@code model-<i>} directories, enabling restart/resume of interrupted runs.
+   *
+   * @param config        configuration used to access the file system
+   * @param modelTempDir  directory holding per-iteration model state
+   * @param maxIterations upper bound on iterations to probe for
+   * @return the number of the last completed iteration (0 if no prior state exists)
+   */
+  private static int getCurrentIterationNumber(Configuration config, Path modelTempDir, int maxIterations)
+    throws IOException {
+    FileSystem fs = FileSystem.get(modelTempDir.toUri(), config);
+    int iterationNumber = 1;
+    Path iterationPath = modelPath(modelTempDir, iterationNumber);
+    // Walk forward while model dirs exist; stop at the first missing one (or maxIterations).
+    while (fs.exists(iterationPath) && iterationNumber <= maxIterations) {
+      log.info("Found previous state: {}", iterationPath);
+      iterationNumber++;
+      iterationPath = modelPath(modelTempDir, iterationNumber);
+    }
+    // Loop exits one past the last existing model, so subtract one.
+    return iterationNumber - 1;
+  }
+
+  /**
+   * Runs one training iteration as a blocking MapReduce job: reads the corpus and the
+   * previous iteration's model, writes the updated model to {@code modelOutput}.
+   *
+   * @param conf            configuration; used to delete stale output before running
+   * @param corpusInput     input corpus of document vectors
+   * @param modelInput      model from the previous iteration (may not exist on iteration 1)
+   * @param modelOutput     destination for the updated model
+   * @param iterationNumber 1-based iteration being run (for logging/job naming)
+   * @param maxIterations   total planned iterations (for logging/job naming)
+   * @param numReduceTasks  reducer parallelism for the model-update job
+   * @throws InterruptedException if interrupted, or — by this class's convention — if the
+   *         MapReduce job itself fails
+   */
+  public void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
+                           int iterationNumber, int maxIterations, int numReduceTasks)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    String jobName = String.format("Iteration %d of %d, input path: %s",
+        iterationNumber, maxIterations, modelInput);
+    log.info("About to run: {}", jobName);
+    Job job = prepareJob(corpusInput, modelOutput, CachingCVB0Mapper.class, IntWritable.class, VectorWritable.class,
+        VectorSumReducer.class, IntWritable.class, VectorWritable.class);
+    // Vector summation is associative/commutative, so the reducer doubles as a combiner.
+    job.setCombinerClass(VectorSumReducer.class);
+    job.setNumReduceTasks(numReduceTasks);
+    job.setJobName(jobName);
+    setModelPaths(job, modelInput);
+    // Remove any previous output for this iteration so the job can re-run after a crash.
+    HadoopUtil.delete(conf, modelOutput);
+    if (!job.waitForCompletion(true)) {
+      throw new InterruptedException(String.format("Failed to complete iteration %d stage 1",
+          iterationNumber));
+    }
+  }
+
+  /**
+   * Registers the model's part-file URIs in the job configuration under
+   * {@code MODEL_PATHS} so mappers can locate and load the model.
+   *
+   * <p>Silently returns when {@code modelPath} is null or absent — this is the normal
+   * case on the first iteration, when no prior model exists and mappers start from a
+   * random initialization.
+   *
+   * @param job       job whose configuration receives the model paths
+   * @param modelPath model directory containing part files; may be null/nonexistent
+   * @throws IllegalStateException if the directory exists but contains no part files
+   */
+  private static void setModelPaths(Job job, Path modelPath) throws IOException {
+    Configuration conf = job.getConfiguration();
+    if (modelPath == null || !FileSystem.get(modelPath.toUri(), conf).exists(modelPath)) {
+      return;
+    }
+    FileStatus[] statuses = FileSystem.get(modelPath.toUri(), conf).listStatus(modelPath, PathFilters.partFilter());
+    Preconditions.checkState(statuses.length > 0, "No part files found in model path '%s'", modelPath.toString());
+    String[] modelPaths = new String[statuses.length];
+    for (int i = 0; i < statuses.length; i++) {
+      modelPaths[i] = statuses[i].getPath().toUri().toString();
+    }
+    conf.setStrings(MODEL_PATHS, modelPaths);
+  }
+
+  /**
+   * Reads the model part-file paths previously registered via {@link #setModelPaths}.
+   *
+   * @param conf configuration holding the {@code MODEL_PATHS} entry
+   * @return the model paths, or {@code null} if none were registered (e.g. the first
+   *         iteration, where mappers initialize the model randomly)
+   */
+  public static Path[] getModelPaths(Configuration conf) {
+    String[] modelPathNames = conf.getStrings(MODEL_PATHS);
+    if (modelPathNames == null || modelPathNames.length == 0) {
+      return null;
+    }
+    Path[] modelPaths = new Path[modelPathNames.length];
+    for (int i = 0; i < modelPathNames.length; i++) {
+      modelPaths[i] = new Path(modelPathNames[i]);
+    }
+    return modelPaths;
+  }
+
+  /** Command-line entry point; delegates argument parsing and execution to {@link ToolRunner}. */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new CVB0Driver(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java
new file mode 100644
index 0000000..1253942
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+
+import java.io.IOException;
+
+/**
+ * Mapper that performs L1 normalization of input vectors: each topic's term vector is
+ * divided by its L1 norm, converting raw topic/term counts into a p(term|topic)
+ * distribution. Identity on keys.
+ */
+public class CVB0TopicTermVectorNormalizerMapper extends
+    Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+  @Override
+  protected void map(IntWritable key, VectorWritable value, Context context) throws IOException,
+      InterruptedException {
+    // Mutates the wrapped vector in place: v := v / ||v||_1, then re-emits it.
+    value.get().assign(Functions.div(value.get().norm(1.0)));
+    context.write(key, value);
+  }
+}


[50/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/conf/log4j.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/log4j.xml b/community/mahout-mr/conf/log4j.xml
index 6231b48..179f1a9 100644
--- a/community/mahout-mr/conf/log4j.xml
+++ b/community/mahout-mr/conf/log4j.xml
@@ -1,4 +1,21 @@
 <?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
 <!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
 <log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
     <appender class="org.apache.log4j.ConsoleAppender" name="console">

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/README.txt b/community/mahout-mr/examples/bin/README.txt
deleted file mode 100644
index 7ad3a38..0000000
--- a/community/mahout-mr/examples/bin/README.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-This directory contains helpful shell scripts for working with some of Mahout's examples.  
-
-To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir`
-  Note that this requires the same path to be writable both on the local file system as well as on HDFS.
-
-Here's a description of what each does:
-
-classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups.  Downloads the data set automatically.
-cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms.  Downloads the data set automatically.
-cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set.  Downloads the data set automatically.
-factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
-factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
-spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-20newsgroups.sh b/community/mahout-mr/examples/bin/classify-20newsgroups.sh
deleted file mode 100755
index f47d5c5..0000000
--- a/community/mahout-mr/examples/bin/classify-20newsgroups.sh
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the 20newsgroups dataset, trains and tests a classifier.
-#
-# To run:  change into the mahout directory and type:
-# examples/bin/classify-20newsgroups.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
-  exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding task to run"
-  echo "1. ${algorithm[0]}"
-  echo "2. ${algorithm[1]}"
-  echo "3. ${algorithm[2]}"
-  echo "4. ${algorithm[3]}"
-  echo "5. ${algorithm[4]}"
-  echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"
-  read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-# Spark specific check and work 
-if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
-  if [ "$MASTER" == "" ] ; then
-    echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
-    exit 1
-  fi
-  if [ "$MAHOUT_LOCAL" != "" ] ; then
-    echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
-    exit 1
-  fi
-fi
-
-if [ "x$alg" != "xclean" ]; then
-  echo "creating work directory at ${WORK_DIR}"
-
-  mkdir -p ${WORK_DIR}
-  if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
-    if [ ! -e ${WORK_DIR}/20news-bydate ]; then
-      if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
-        echo "Downloading 20news-bydate"
-        curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
-      fi
-      mkdir -p ${WORK_DIR}/20news-bydate
-      echo "Extracting..."
-      cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
-    fi
-  fi
-fi
-#echo $START_PATH
-cd $START_PATH
-cd ../..
-
-set -e
-
-if  ( [ "x$alg" == "xnaivebayes-MapReduce" ] ||  [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark"  ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then
-  c=""
-
-  if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then
-    c=" -c"
-  fi
-
-  set -x
-  echo "Preparing 20newsgroups data"
-  rm -rf ${WORK_DIR}/20news-all
-  mkdir ${WORK_DIR}/20news-all
-  cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
-
-  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-    echo "Copying 20newsgroups data to HDFS"
-    set +e
-    $DFSRM ${WORK_DIR}/20news-all
-    $DFS -mkdir -p ${WORK_DIR}
-    $DFS -mkdir ${WORK_DIR}/20news-all
-    set -e
-    if [ $HVERSION -eq "1" ] ; then
-      echo "Copying 20newsgroups data to Hadoop 1 HDFS"
-      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
-    elif [ $HVERSION -eq "2" ] ; then
-      echo "Copying 20newsgroups data to Hadoop 2 HDFS"
-      $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
-    fi
-  fi
-
-  echo "Creating sequence files from 20newsgroups data"
-  ./bin/mahout seqdirectory \
-    -i ${WORK_DIR}/20news-all \
-    -o ${WORK_DIR}/20news-seq -ow
-
-  echo "Converting sequence files to vectors"
-  ./bin/mahout seq2sparse \
-    -i ${WORK_DIR}/20news-seq \
-    -o ${WORK_DIR}/20news-vectors  -lnorm -nv  -wt tfidf
-
-  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
-  ./bin/mahout split \
-    -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
-    --trainingOutput ${WORK_DIR}/20news-train-vectors \
-    --testOutput ${WORK_DIR}/20news-test-vectors  \
-    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
-
-    if [ "x$alg" == "xnaivebayes-MapReduce"  -o  "x$alg" == "xcnaivebayes-MapReduce" ]; then
-
-      echo "Training Naive Bayes model"
-      ./bin/mahout trainnb \
-        -i ${WORK_DIR}/20news-train-vectors \
-        -o ${WORK_DIR}/model \
-        -li ${WORK_DIR}/labelindex \
-        -ow $c
-
-      echo "Self testing on training set"
-
-      ./bin/mahout testnb \
-        -i ${WORK_DIR}/20news-train-vectors\
-        -m ${WORK_DIR}/model \
-        -l ${WORK_DIR}/labelindex \
-        -ow -o ${WORK_DIR}/20news-testing $c
-
-      echo "Testing on holdout set"
-
-      ./bin/mahout testnb \
-        -i ${WORK_DIR}/20news-test-vectors\
-        -m ${WORK_DIR}/model \
-        -l ${WORK_DIR}/labelindex \
-        -ow -o ${WORK_DIR}/20news-testing $c
-
-    elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
-
-      echo "Training Naive Bayes model"
-      ./bin/mahout spark-trainnb \
-        -i ${WORK_DIR}/20news-train-vectors \
-        -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
-
-      echo "Self testing on training set"
-      ./bin/mahout spark-testnb \
-        -i ${WORK_DIR}/20news-train-vectors\
-        -m ${WORK_DIR}/spark-model $c -ma $MASTER
-
-      echo "Testing on holdout set"
-      ./bin/mahout spark-testnb \
-        -i ${WORK_DIR}/20news-test-vectors\
-        -m ${WORK_DIR}/spark-model $c -ma $MASTER
-        
-    fi
-elif [ "x$alg" == "xsgd" ]; then
-  if [ ! -e "/tmp/news-group.model" ]; then
-    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
-    ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
-  fi
-  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
-  ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
-elif [ "x$alg" == "xclean" ]; then
-  rm -rf $WORK_DIR
-  rm -rf /tmp/news-group.model
-  $DFSRM $WORK_DIR
-fi
-# Remove the work directory
-#

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-wikipedia.sh b/community/mahout-mr/examples/bin/classify-wikipedia.sh
deleted file mode 100755
index 41dc0c9..0000000
--- a/community/mahout-mr/examples/bin/classify-wikipedia.sh
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads a (partial) wikipedia dump, trains and tests a classifier.
-#
-# To run:  change into the mahout directory and type:
-# examples/bin/classify-wikipedia.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
-  exit
-fi
-
-# ensure that MAHOUT_HOME is set
-if [[ -z "$MAHOUT_HOME" ]]; then
-  echo "Please set MAHOUT_HOME."
-  exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-wiki
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-algorithm=( CBayes BinaryCBayes clean)
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding task to run"
-  echo "1. ${algorithm[0]} (may require increased heap space on yarn)"
-  echo "2. ${algorithm[1]}"
-  echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
-  read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-if [ "x$alg" != "xclean" ]; then
-  echo "creating work directory at ${WORK_DIR}"
-
-  mkdir -p ${WORK_DIR}
-    if [ ! -e ${WORK_DIR}/wikixml ]; then
-        mkdir -p ${WORK_DIR}/wikixml
-    fi
-    if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then
-        echo "Downloading wikipedia XML dump"
-        ########################################################   
-        #  Datasets: uncomment and run "clean" to change dataset   
-        ########################################################
-        ########## partial small 42.5M zipped
-        # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
-        ########## partial larger 256M zipped
-        curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
-        ######### full wikipedia dump: 10G zipped
-        # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
-        ########################################################
-    fi
-    if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
-        echo "Extracting..."
-       
-        cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
-    fi
-
-echo $START_PATH
-
-set -e
-
-if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
-
-  set -x
-  echo "Preparing wikipedia data"
-  rm -rf ${WORK_DIR}/wiki
-  mkdir ${WORK_DIR}/wiki
-  
-  if [ "x$alg" == "xCBayes" ] ; then
-    # use a list of 10 countries as categories
-    cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt
-    chmod 666 ${WORK_DIR}/country.txt
-  fi
-  
-  if [ "x$alg" == "xBinaryCBayes" ] ; then
-    # use United States and United Kingdom as categories
-    cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt
-    chmod 666 ${WORK_DIR}/country.txt
-  fi
-
-  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-    echo "Copying wikipedia data to HDFS"
-    set +e
-    $DFSRM ${WORK_DIR}/wikixml
-    $DFS -mkdir -p ${WORK_DIR}
-    set -e
-    $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
-  fi
-
-  echo "Creating sequence files from wikiXML"
-  $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
-                                  -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
-                                  -o ${WORK_DIR}/wikipediainput
-   
-  # if using the 10 class problem use bigrams
-  if [ "x$alg" == "xCBayes" ] ; then
-    echo "Converting sequence files to vectors using bigrams"
-    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
-                                       -o ${WORK_DIR}/wikipediaVecs \
-                                       -wt tfidf \
-                                       -lnorm -nv \
-                                       -ow -ng 2
-  fi
-  
-  # if using the 2 class problem try different options
-  if [ "x$alg" == "xBinaryCBayes" ] ; then
-    echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
-    $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
-                                       -o ${WORK_DIR}/wikipediaVecs \
-                                       -wt tfidf \
-                                       -lnorm \
-                                       -nv \
-                                       -ow \
-                                       -ng 1 \
-                                       -x 30
-  fi
-  
-  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
-  $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
-                                --trainingOutput ${WORK_DIR}/training \
-                                --testOutput ${WORK_DIR}/testing \
-                                -rp 20 \
-                                -ow \
-                                -seq \
-                                -xm sequential
-
-  echo "Training Naive Bayes model"
-  $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
-                                  -o ${WORK_DIR}/model \
-                                  -li ${WORK_DIR}/labelindex \
-                                  -ow \
-                                  -c
-
-  echo "Self testing on training set"
-  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
-                                 -m ${WORK_DIR}/model \
-                                 -l ${WORK_DIR}/labelindex \
-                                 -ow \
-                                 -o ${WORK_DIR}/output \
-                                 -c
-
-  echo "Testing on holdout set: Bayes"
-  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
-                                 -m ${WORK_DIR}/model \
-                                 -l ${WORK_DIR}/labelindex \
-                                 -ow \
-                                 -o ${WORK_DIR}/output \
-                                 -seq
-
- echo "Testing on holdout set: CBayes"
-  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
-                                 -m ${WORK_DIR}/model -l \
-                                 ${WORK_DIR}/labelindex \
-                                 -ow \
-                                 -o ${WORK_DIR}/output  \
-                                 -c \
-                                 -seq
-fi
-
-elif [ "x$alg" == "xclean" ]; then
-  rm -rf $WORK_DIR
-  $DFSRM $WORK_DIR
-fi
-# Remove the work directory

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-reuters.sh b/community/mahout-mr/examples/bin/cluster-reuters.sh
deleted file mode 100755
index 49f6c94..0000000
--- a/community/mahout-mr/examples/bin/cluster-reuters.sh
+++ /dev/null
@@ -1,203 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the Reuters dataset and prepares it for clustering
-#
-# To run:  change into the mahout directory and type:
-#  examples/bin/cluster-reuters.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script clusters the Reuters data set using a variety of algorithms.  The data set is downloaded automatically."
-  exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then 
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-MAHOUT="../../bin/mahout"
-
-if [ ! -e $MAHOUT ]; then
-  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
-  exit 1
-fi
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding clustering algorithm"
-  echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)" 
-  echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
-  echo "3. ${algorithm[2]} clustering"
-  echo "4. ${algorithm[3]} clustering"
-  echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
-  read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]}
-
-if [ "x$clustertype" == "xclean" ]; then
-  rm -rf $WORK_DIR
-  $DFSRM $WORK_DIR
-  exit 1
-else
-  $DFS -mkdir -p $WORK_DIR
-  mkdir -p $WORK_DIR
-  echo "Creating work directory at ${WORK_DIR}"
-fi
-if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
-  if [ ! -e ${WORK_DIR}/reuters-out ]; then
-    if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
-      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
-	  if [ -n "$2" ]; then
-	      echo "Copying Reuters from local download"
-	      cp $2 ${WORK_DIR}/reuters21578.tar.gz
-	  else
-              echo "Downloading Reuters-21578"
-              curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
-	  fi
-      fi
-      #make sure it was actually downloaded
-      if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
-	  echo "Failed to download reuters"
-	  exit 1
-      fi
-      mkdir -p ${WORK_DIR}/reuters-sgm
-      echo "Extracting..."
-      tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
-    fi
-    echo "Extracting Reuters"
-    $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
-    if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-        echo "Copying Reuters data to Hadoop"
-        set +e
-        $DFSRM ${WORK_DIR}/reuters-sgm
-        $DFSRM ${WORK_DIR}/reuters-out
-        $DFS -mkdir -p ${WORK_DIR}/
-        $DFS -mkdir ${WORK_DIR}/reuters-sgm
-        $DFS -mkdir ${WORK_DIR}/reuters-out
-        $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
-        $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
-        set -e
-    fi
-  fi
-  echo "Converting to Sequence Files from Directory"
-  $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
-fi
-
-if [ "x$clustertype" == "xkmeans" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
-  && \
-  $MAHOUT kmeans \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
-    -c ${WORK_DIR}/reuters-kmeans-clusters \
-    -o ${WORK_DIR}/reuters-kmeans \
-    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
-    -x 10 -k 20 -ow --clustering \
-  && \
-  $MAHOUT clusterdump \
-    -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
-    -o ${WORK_DIR}/reuters-kmeans/clusterdump \
-    -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
-    -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
-    --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
-    && \
-  cat ${WORK_DIR}/reuters-kmeans/clusterdump
-elif [ "x$clustertype" == "xfuzzykmeans" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
-  && \
-  $MAHOUT fkmeans \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
-    -c ${WORK_DIR}/reuters-fkmeans-clusters \
-    -o ${WORK_DIR}/reuters-fkmeans \
-    -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
-    -x 10 -k 20 -ow -m 1.1 \
-  && \
-  $MAHOUT clusterdump \
-    -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
-    -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
-    -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
-    -dt sequencefile -b 100 -n 20 -sp 0 \
-    && \
-  cat ${WORK_DIR}/reuters-fkmeans/clusterdump
-elif [ "x$clustertype" == "xlda" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
-  && \
-  $MAHOUT rowid \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
-    -o ${WORK_DIR}/reuters-out-matrix \
-  && \
-  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
-  && \
-  $MAHOUT cvb \
-    -i ${WORK_DIR}/reuters-out-matrix/matrix \
-    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
-    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
-    -dt ${WORK_DIR}/reuters-lda-topics \
-    -mt ${WORK_DIR}/reuters-lda-model \
-  && \
-  $MAHOUT vectordump \
-    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
-    -o ${WORK_DIR}/reuters-lda/vectordump \
-    -vs 10 -p true \
-    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
-    -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
-    && \
-  cat ${WORK_DIR}/reuters-lda/vectordump
-elif [ "x$clustertype" == "xstreamingkmeans" ]; then
-  $MAHOUT seq2sparse \
-    -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
-  && \
-  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
-  && \
-  $MAHOUT streamingkmeans \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
-    --tempDir ${WORK_DIR}/tmp \
-    -o ${WORK_DIR}/reuters-streamingkmeans \
-    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
-    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
-    -k 10 -km 100 -ow \
-  && \
-  $MAHOUT qualcluster \
-    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
-    -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000   \
-    -o ${WORK_DIR}/reuters-cluster-distance.csv \
-    && \
-  cat ${WORK_DIR}/reuters-cluster-distance.csv
-fi

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
deleted file mode 100755
index 796da33..0000000
--- a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the Synthetic control dataset and prepares it for clustering
-#
-# To run:  change into the mahout directory and type:
-#  examples/bin/cluster-syntheticcontrol.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script clusters the Synthetic Control data set.  The data set is downloaded automatically."
-  exit
-fi
-
-algorithm=( kmeans fuzzykmeans )
-if [ -n "$1" ]; then
-  choice=$1
-else
-  echo "Please select a number to choose the corresponding clustering algorithm"
-  echo "1. ${algorithm[0]} clustering"
-  echo "2. ${algorithm[1]} clustering"
-  read -p "Enter your choice : " choice
-fi
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]}
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
-  cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-echo "creating work directory at ${WORK_DIR}"
-mkdir -p ${WORK_DIR}
-if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
-  if [ -n "$2" ]; then
-    cp $2 ${WORK_DIR}/.
-  else
-    echo "Downloading Synthetic control data"
-    curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data  -o ${WORK_DIR}/synthetic_control.data
-  fi
-fi
-if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
-  echo "Couldn't download synthetic control"
-  exit 1
-fi
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
-  echo "Checking the health of DFS..."
-  $DFS -ls /
-  if [ $? -eq 0 ];then 
-    echo "DFS is healthy... "
-    echo "Uploading Synthetic control data to HDFS"
-    $DFSRM ${WORK_DIR}/testdata
-    $DFS -mkdir -p ${WORK_DIR}/testdata
-    $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata
-    echo "Successfully Uploaded Synthetic control data to HDFS "
-
-    options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5"
-
-    if [ "${clustertype}" == "kmeans" ]; then
-      options="${options} --numClusters 6"
-      # t1 & t2 not used if --numClusters specified, but parser requires input
-      options="${options} --t1 1 --t2 2"
-      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
-    else
-      options="${options} --m 2.0f --t1 80 --t2 55"
-      ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
-    fi
-  else
-    echo " HADOOP is not running. Please make sure you hadoop is running. "
-  fi
-elif [ "$MAHOUT_LOCAL" != "" ]; then
-  echo "running MAHOUT_LOCAL"
-  cp ${WORK_DIR}/synthetic_control.data testdata
-  ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
-  rm testdata
-else
-  echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
-fi
-# Remove the work directory
-rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
deleted file mode 100755
index 29730e1..0000000
--- a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Instructions:
-#
-# Before using this script, you have to download and extract the Movielens 1M dataset
-# from http://www.grouplens.org/node/73
-#
-# To run:  change into the mahout directory and type:
-#  export MAHOUT_LOCAL=true
-# Then:
-#  examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)."
-  echo "Syntax: $0 /path/to/ratings.dat\n"
-  exit
-fi
-
-if [ $# -ne 1 ]
-then
-  echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before"
-  echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n"
-  echo -e "Syntax: $0 /path/to/ratings.dat\n"
-  exit -1
-fi
-
-export MAHOUT_LOCAL=true
-MAHOUT="$MAHOUT_HOME/bin/mahout"
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-echo "creating work directory at ${WORK_DIR}"
-mkdir -p ${WORK_DIR}/movielens
-
-echo "Converting ratings..."
-cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
-
-# create a 90% percent training set and a 10% probe set
-$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
-    --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
-
-# run distributed ALS-WR to factorize the rating matrix defined by the training set
-$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
-    --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
-
-# compute predictions against the probe set, measure the error
-$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
-    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
-
-# compute recommendations
-$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
-    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
-    --numRecommendations 6 --maxRating 5 --numThreads 2
-
-# print the error
-echo -e "\nRMSE is:\n"
-cat ${WORK_DIR}/als/rmse/rmse.txt
-echo -e "\n"
-
-echo -e "\nSample recommendations:\n"
-shuf ${WORK_DIR}/recommendations/part-m-00000 |head
-echo -e "\n\n"
-
-echo "removing work directory"
-rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-netflix.sh b/community/mahout-mr/examples/bin/factorize-netflix.sh
deleted file mode 100755
index 26faf66..0000000
--- a/community/mahout-mr/examples/bin/factorize-netflix.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Instructions:
-#
-# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the
-# following:
-#
-#   1) the path to the folder 'training_set' that contains all the movie rating files
-#   2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict
-#   3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for
-#
-# To run:
-#  ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt
-
-echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
-exit 1
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
-  echo "This script runs the ALS Recommender on the Netflix data set."
-  echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
-  exit
-fi
-
-if [ $# -ne 3 ]
-then
-  echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
-  exit -1
-fi
-
-MAHOUT="../../bin/mahout"
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
-  WORK_DIR=/tmp/mahout-work-${USER}
-else
-  WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-echo "Preparing data..."
-$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}
-
-# run distributed ALS-WR to factorize the rating matrix defined by the training set
-$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \
-    --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4
-
-# compute predictions against the probe set, measure the error
-$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \
-    --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
-
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-
-  # print the error, should be around 0.923
-  echo -e "\nRMSE is:\n"
-  $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
-  echo -e "\n"
-  echo "removing work directory"
-  set +e
-  $DFSRM ${WORK_DIR}
-
-else
-
-  # print the error, should be around 0.923
-  echo -e "\nRMSE is:\n"
-  cat ${WORK_DIR}/als/rmse/rmse.txt
-  echo -e "\n"
-  echo "removing work directory"
-  rm -rf ${WORK_DIR}
-
-fi
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/get-all-examples.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/get-all-examples.sh b/community/mahout-mr/examples/bin/get-all-examples.sh
deleted file mode 100755
index 4128e47..0000000
--- a/community/mahout-mr/examples/bin/get-all-examples.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Clones Mahout example code from remote repositories with their own 
-# build process.  Follow the README for each example for instructions.
-#
-# Usage:  change into the mahout directory and type:
-#  examples/bin/get-all-examples.sh
-
-# Solr-recommender
-echo " Solr-recommender example: "
-echo " 1) imports text 'log files' of some delimited form for user preferences"
-echo " 2) creates the correct Mahout files and stores distionaries to translate external Id to and from Mahout Ids"
-echo " 3) it implements a prototype two actions 'cross-recommender', which takes two actions made by the same user and creates recommendations"
-echo " 4) it creates output for user->preference history CSV and and item->similar items 'similarity' matrix for use in a Solr-recommender."
-echo "    To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result"
-echo "    from Solr will be an ordered list of recommendations returning the same item Ids as were input."
-echo " For further description see the README.md here https://github.com/pferrel/solr-recommender"
-echo " To build run 'cd solr-recommender; mvn install'"
-echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then "
-echo " run 'cd scripts; ./solr-recommender-example'"
-git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/lda.algorithm b/community/mahout-mr/examples/bin/lda.algorithm
deleted file mode 100644
index fb84ea0..0000000
--- a/community/mahout-mr/examples/bin/lda.algorithm
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-merge.policy=org.apache.lucene.index.LogDocMergePolicy
-merge.factor=mrg:10:20
-max.buffered=buf:100:1000
-compound=true
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.term.vector=true
-doc.tokenized=true
-log.step=600
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-content.source.forever=false
-doc.maker.forever=false
-query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=false
-# --------- alg
-{ "BuildReuters"
-  CreateIndex 
-  { "AddDocs" AddDoc > : *
-#  Optimize
-  CloseIndex
-}
-


[51/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
NO-JIRA Clean up MR refactor


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/410ed16a
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/410ed16a
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/410ed16a

Branch: refs/heads/branch-0.14.0
Commit: 410ed16af1fc587999868dd4990cebfa7d14633e
Parents: e0573de
Author: Trevor a.k.a @rawkintrevo <tr...@gmail.com>
Authored: Thu Jun 28 09:38:33 2018 -0500
Committer: Trevor a.k.a @rawkintrevo <tr...@gmail.com>
Committed: Thu Jun 28 09:38:33 2018 -0500

----------------------------------------------------------------------
 community/mahout-mr/conf/log4j.xml              |    17 +
 community/mahout-mr/examples/bin/README.txt     |    13 -
 .../examples/bin/classify-20newsgroups.sh       |   197 -
 .../examples/bin/classify-wikipedia.sh          |   196 -
 .../mahout-mr/examples/bin/cluster-reuters.sh   |   203 -
 .../examples/bin/cluster-syntheticcontrol.sh    |   105 -
 .../examples/bin/factorize-movielens-1M.sh      |    85 -
 .../mahout-mr/examples/bin/factorize-netflix.sh |    90 -
 .../mahout-mr/examples/bin/get-all-examples.sh  |    36 -
 community/mahout-mr/examples/bin/lda.algorithm  |    45 -
 .../examples/bin/resources/bank-full.csv        | 45212 ---------
 .../examples/bin/resources/country.txt          |   229 -
 .../examples/bin/resources/country10.txt        |    10 -
 .../examples/bin/resources/country2.txt         |     2 -
 .../examples/bin/resources/donut-test.csv       |    41 -
 .../mahout-mr/examples/bin/resources/donut.csv  |    41 -
 .../examples/bin/resources/test-data.csv        |    61 -
 .../mahout-mr/examples/bin/set-dfs-commands.sh  |    54 -
 community/mahout-mr/examples/pom.xml            |   199 -
 .../examples/src/main/assembly/job.xml          |    46 -
 .../cf/taste/example/TasteOptionParser.java     |    75 -
 .../BookCrossingBooleanRecommender.java         |   102 -
 .../BookCrossingBooleanRecommenderBuilder.java  |    32 -
 ...ossingBooleanRecommenderEvaluatorRunner.java |    59 -
 .../bookcrossing/BookCrossingDataModel.java     |    99 -
 .../BookCrossingDataModelBuilder.java           |    33 -
 .../bookcrossing/BookCrossingRecommender.java   |   101 -
 .../BookCrossingRecommenderBuilder.java         |    32 -
 .../BookCrossingRecommenderEvaluatorRunner.java |    54 -
 .../mahout/cf/taste/example/bookcrossing/README |     9 -
 .../cf/taste/example/email/EmailUtility.java    |   104 -
 .../email/FromEmailToDictionaryMapper.java      |    61 -
 .../example/email/MailToDictionaryReducer.java  |    43 -
 .../taste/example/email/MailToPrefsDriver.java  |   274 -
 .../cf/taste/example/email/MailToRecMapper.java |   101 -
 .../taste/example/email/MailToRecReducer.java   |    53 -
 .../example/email/MsgIdToDictionaryMapper.java  |    49 -
 .../taste/example/kddcup/DataFileIterable.java  |    44 -
 .../taste/example/kddcup/DataFileIterator.java  |   158 -
 .../taste/example/kddcup/KDDCupDataModel.java   |   231 -
 .../mahout/cf/taste/example/kddcup/ToCSV.java   |    77 -
 .../kddcup/track1/EstimateConverter.java        |    43 -
 .../example/kddcup/track1/Track1Callable.java   |    67 -
 .../kddcup/track1/Track1Recommender.java        |    94 -
 .../kddcup/track1/Track1RecommenderBuilder.java |    32 -
 .../track1/Track1RecommenderEvaluator.java      |   108 -
 .../Track1RecommenderEvaluatorRunner.java       |    56 -
 .../example/kddcup/track1/Track1Runner.java     |    95 -
 .../svd/DataModelFactorizablePreferences.java   |   107 -
 .../track1/svd/FactorizablePreferences.java     |    44 -
 .../svd/KDDCupFactorizablePreferences.java      |   123 -
 .../track1/svd/ParallelArraysSGDFactorizer.java |   265 -
 .../kddcup/track1/svd/Track1SVDRunner.java      |   141 -
 .../example/kddcup/track2/HybridSimilarity.java |    62 -
 .../example/kddcup/track2/Track2Callable.java   |   106 -
 .../kddcup/track2/Track2Recommender.java        |   100 -
 .../kddcup/track2/Track2RecommenderBuilder.java |    33 -
 .../example/kddcup/track2/Track2Runner.java     |   100 -
 .../taste/example/kddcup/track2/TrackData.java  |    71 -
 .../kddcup/track2/TrackItemSimilarity.java      |   106 -
 .../taste/example/kddcup/track2/UserResult.java |    54 -
 .../als/netflix/NetflixDatasetConverter.java    |   140 -
 .../example/BatchItemSimilaritiesGroupLens.java |    65 -
 .../precompute/example/GroupLensDataModel.java  |    96 -
 .../mahout/classifier/NewsgroupHelper.java      |   128 -
 .../classifier/email/PrepEmailMapper.java       |    65 -
 .../classifier/email/PrepEmailReducer.java      |    47 -
 .../email/PrepEmailVectorsDriver.java           |    76 -
 .../sequencelearning/hmm/PosTagger.java         |   277 -
 .../sgd/AdaptiveLogisticModelParameters.java    |   236 -
 .../classifier/sgd/LogisticModelParameters.java |   265 -
 .../classifier/sgd/PrintResourceOrFile.java     |    42 -
 .../classifier/sgd/RunAdaptiveLogistic.java     |   197 -
 .../mahout/classifier/sgd/RunLogistic.java      |   163 -
 .../apache/mahout/classifier/sgd/SGDHelper.java |   151 -
 .../apache/mahout/classifier/sgd/SGDInfo.java   |    59 -
 .../classifier/sgd/SimpleCsvExamples.java       |   283 -
 .../mahout/classifier/sgd/TestASFEmail.java     |   152 -
 .../mahout/classifier/sgd/TestNewsGroups.java   |   141 -
 .../mahout/classifier/sgd/TrainASFEmail.java    |   137 -
 .../classifier/sgd/TrainAdaptiveLogistic.java   |   377 -
 .../mahout/classifier/sgd/TrainLogistic.java    |   311 -
 .../mahout/classifier/sgd/TrainNewsGroups.java  |   154 -
 .../sgd/ValidateAdaptiveLogistic.java           |   218 -
 .../BankMarketingClassificationMain.java        |    70 -
 .../sgd/bankmarketing/TelephoneCall.java        |   104 -
 .../sgd/bankmarketing/TelephoneCallParser.java  |    66 -
 .../clustering/display/ClustersFilter.java      |    31 -
 .../clustering/display/DisplayCanopy.java       |    88 -
 .../clustering/display/DisplayClustering.java   |   374 -
 .../clustering/display/DisplayFuzzyKMeans.java  |   110 -
 .../clustering/display/DisplayKMeans.java       |   106 -
 .../display/DisplaySpectralKMeans.java          |    85 -
 .../apache/mahout/clustering/display/README.txt |    22 -
 .../tools/ClusterQualitySummarizer.java         |   279 -
 .../clustering/streaming/tools/IOUtils.java     |    80 -
 .../clustering/syntheticcontrol/canopy/Job.java |   125 -
 .../syntheticcontrol/fuzzykmeans/Job.java       |   144 -
 .../clustering/syntheticcontrol/kmeans/Job.java |   187 -
 .../fpm/pfpgrowth/DeliciousTagsExample.java     |    94 -
 .../dataset/KeyBasedStringTupleCombiner.java    |    40 -
 .../dataset/KeyBasedStringTupleGrouper.java     |    77 -
 .../dataset/KeyBasedStringTupleMapper.java      |    90 -
 .../dataset/KeyBasedStringTupleReducer.java     |    74 -
 .../examples/src/main/resources/bank-full.csv   | 45212 ---------
 .../src/main/resources/cf-data-purchase.txt     |     7 -
 .../src/main/resources/cf-data-view.txt         |    12 -
 .../examples/src/main/resources/donut-test.csv  |    41 -
 .../examples/src/main/resources/donut.csv       |    41 -
 .../examples/src/main/resources/test-data.csv   |    61 -
 .../sgd/LogisticModelParametersTest.java        |    43 -
 .../classifier/sgd/ModelDissectorTest.java      |    40 -
 .../classifier/sgd/TrainLogisticTest.java       |   167 -
 .../clustering/display/ClustersFilterTest.java  |    75 -
 .../apache/mahout/examples/MahoutTestCase.java  |    30 -
 .../examples/src/test/resources/country.txt     |   229 -
 .../examples/src/test/resources/country10.txt   |    10 -
 .../examples/src/test/resources/country2.txt    |     2 -
 .../examples/src/test/resources/subjects.txt    |     2 -
 .../examples/src/test/resources/wdbc.infos      |    32 -
 .../examples/src/test/resources/wdbc/wdbc.data  |   569 -
 community/mahout-mr/integration/pom.xml         |     2 +-
 community/mahout-mr/mr-examples/bin/README.txt  |    13 +
 .../mr-examples/bin/classify-20newsgroups.sh    |   197 +
 .../mr-examples/bin/classify-wikipedia.sh       |   196 +
 .../mr-examples/bin/cluster-reuters.sh          |   203 +
 .../mr-examples/bin/cluster-syntheticcontrol.sh |   105 +
 .../mr-examples/bin/factorize-movielens-1M.sh   |    85 +
 .../mr-examples/bin/factorize-netflix.sh        |    90 +
 .../mr-examples/bin/get-all-examples.sh         |    36 +
 .../mahout-mr/mr-examples/bin/lda.algorithm     |    45 +
 .../mr-examples/bin/resources/bank-full.csv     | 45212 +++++++++
 .../mr-examples/bin/resources/country.txt       |   229 +
 .../mr-examples/bin/resources/country10.txt     |    10 +
 .../mr-examples/bin/resources/country2.txt      |     2 +
 .../mr-examples/bin/resources/donut-test.csv    |    41 +
 .../mr-examples/bin/resources/donut.csv         |    41 +
 .../mr-examples/bin/resources/test-data.csv     |    61 +
 .../mr-examples/bin/set-dfs-commands.sh         |    54 +
 community/mahout-mr/mr-examples/pom.xml         |   121 +
 .../mr-examples/src/main/assembly/job.xml       |    46 +
 .../cf/taste/example/TasteOptionParser.java     |    75 +
 .../BookCrossingBooleanRecommender.java         |   102 +
 .../BookCrossingBooleanRecommenderBuilder.java  |    32 +
 ...ossingBooleanRecommenderEvaluatorRunner.java |    59 +
 .../bookcrossing/BookCrossingDataModel.java     |    99 +
 .../BookCrossingDataModelBuilder.java           |    33 +
 .../bookcrossing/BookCrossingRecommender.java   |   101 +
 .../BookCrossingRecommenderBuilder.java         |    32 +
 .../BookCrossingRecommenderEvaluatorRunner.java |    54 +
 .../mahout/cf/taste/example/bookcrossing/README |     9 +
 .../cf/taste/example/email/EmailUtility.java    |   104 +
 .../email/FromEmailToDictionaryMapper.java      |    61 +
 .../example/email/MailToDictionaryReducer.java  |    43 +
 .../taste/example/email/MailToPrefsDriver.java  |   274 +
 .../cf/taste/example/email/MailToRecMapper.java |   101 +
 .../taste/example/email/MailToRecReducer.java   |    53 +
 .../example/email/MsgIdToDictionaryMapper.java  |    49 +
 .../taste/example/kddcup/DataFileIterable.java  |    44 +
 .../taste/example/kddcup/DataFileIterator.java  |   158 +
 .../taste/example/kddcup/KDDCupDataModel.java   |   231 +
 .../mahout/cf/taste/example/kddcup/ToCSV.java   |    77 +
 .../kddcup/track1/EstimateConverter.java        |    43 +
 .../example/kddcup/track1/Track1Callable.java   |    67 +
 .../kddcup/track1/Track1Recommender.java        |    94 +
 .../kddcup/track1/Track1RecommenderBuilder.java |    32 +
 .../track1/Track1RecommenderEvaluator.java      |   108 +
 .../Track1RecommenderEvaluatorRunner.java       |    56 +
 .../example/kddcup/track1/Track1Runner.java     |    95 +
 .../svd/DataModelFactorizablePreferences.java   |   107 +
 .../track1/svd/FactorizablePreferences.java     |    44 +
 .../svd/KDDCupFactorizablePreferences.java      |   123 +
 .../track1/svd/ParallelArraysSGDFactorizer.java |   265 +
 .../kddcup/track1/svd/Track1SVDRunner.java      |   141 +
 .../example/kddcup/track2/HybridSimilarity.java |    62 +
 .../example/kddcup/track2/Track2Callable.java   |   106 +
 .../kddcup/track2/Track2Recommender.java        |   100 +
 .../kddcup/track2/Track2RecommenderBuilder.java |    33 +
 .../example/kddcup/track2/Track2Runner.java     |   100 +
 .../taste/example/kddcup/track2/TrackData.java  |    71 +
 .../kddcup/track2/TrackItemSimilarity.java      |   106 +
 .../taste/example/kddcup/track2/UserResult.java |    54 +
 .../als/netflix/NetflixDatasetConverter.java    |   140 +
 .../example/BatchItemSimilaritiesGroupLens.java |    65 +
 .../precompute/example/GroupLensDataModel.java  |    96 +
 .../mahout/classifier/NewsgroupHelper.java      |   128 +
 .../classifier/email/PrepEmailMapper.java       |    65 +
 .../classifier/email/PrepEmailReducer.java      |    47 +
 .../email/PrepEmailVectorsDriver.java           |    76 +
 .../sequencelearning/hmm/PosTagger.java         |   277 +
 .../sgd/AdaptiveLogisticModelParameters.java    |   236 +
 .../classifier/sgd/LogisticModelParameters.java |   265 +
 .../classifier/sgd/PrintResourceOrFile.java     |    42 +
 .../classifier/sgd/RunAdaptiveLogistic.java     |   197 +
 .../mahout/classifier/sgd/RunLogistic.java      |   163 +
 .../apache/mahout/classifier/sgd/SGDHelper.java |   151 +
 .../apache/mahout/classifier/sgd/SGDInfo.java   |    59 +
 .../classifier/sgd/SimpleCsvExamples.java       |   283 +
 .../mahout/classifier/sgd/TestASFEmail.java     |   152 +
 .../mahout/classifier/sgd/TestNewsGroups.java   |   141 +
 .../mahout/classifier/sgd/TrainASFEmail.java    |   137 +
 .../classifier/sgd/TrainAdaptiveLogistic.java   |   377 +
 .../mahout/classifier/sgd/TrainLogistic.java    |   311 +
 .../mahout/classifier/sgd/TrainNewsGroups.java  |   154 +
 .../sgd/ValidateAdaptiveLogistic.java           |   218 +
 .../BankMarketingClassificationMain.java        |    70 +
 .../sgd/bankmarketing/TelephoneCall.java        |   104 +
 .../sgd/bankmarketing/TelephoneCallParser.java  |    66 +
 .../clustering/display/ClustersFilter.java      |    31 +
 .../clustering/display/DisplayCanopy.java       |    88 +
 .../clustering/display/DisplayClustering.java   |   374 +
 .../clustering/display/DisplayFuzzyKMeans.java  |   110 +
 .../clustering/display/DisplayKMeans.java       |   106 +
 .../display/DisplaySpectralKMeans.java          |    85 +
 .../apache/mahout/clustering/display/README.txt |    22 +
 .../tools/ClusterQualitySummarizer.java         |   279 +
 .../clustering/streaming/tools/IOUtils.java     |    80 +
 .../clustering/syntheticcontrol/canopy/Job.java |   125 +
 .../syntheticcontrol/fuzzykmeans/Job.java       |   144 +
 .../clustering/syntheticcontrol/kmeans/Job.java |   187 +
 .../fpm/pfpgrowth/DeliciousTagsExample.java     |    94 +
 .../dataset/KeyBasedStringTupleCombiner.java    |    40 +
 .../dataset/KeyBasedStringTupleGrouper.java     |    77 +
 .../dataset/KeyBasedStringTupleMapper.java      |    90 +
 .../dataset/KeyBasedStringTupleReducer.java     |    74 +
 .../src/main/resources/bank-full.csv            | 45212 +++++++++
 .../src/main/resources/cf-data-purchase.txt     |     7 +
 .../src/main/resources/cf-data-view.txt         |    12 +
 .../src/main/resources/donut-test.csv           |    41 +
 .../mr-examples/src/main/resources/donut.csv    |    41 +
 .../src/main/resources/test-data.csv            |    61 +
 .../sgd/LogisticModelParametersTest.java        |    43 +
 .../classifier/sgd/ModelDissectorTest.java      |    40 +
 .../classifier/sgd/TrainLogisticTest.java       |   167 +
 .../clustering/display/ClustersFilterTest.java  |    75 +
 .../apache/mahout/examples/MahoutTestCase.java  |    30 +
 .../mr-examples/src/test/resources/country.txt  |   229 +
 .../src/test/resources/country10.txt            |    10 +
 .../mr-examples/src/test/resources/country2.txt |     2 +
 .../mr-examples/src/test/resources/subjects.txt |     2 +
 .../mr-examples/src/test/resources/wdbc.infos   |    32 +
 .../src/test/resources/wdbc/wdbc.data           |   569 +
 community/mahout-mr/mr/pom.xml                  |   295 +
 .../appended-resources/supplemental-models.xml  |   279 +
 .../mr/src/images/logos/ mahout-powered.svg     |   630 +
 .../mahout-mr/mr/src/images/logos/favicon.ico   |   Bin 0 -> 28838 bytes
 .../mr/src/images/logos/favicon128.png          |   Bin 0 -> 5259 bytes
 .../mahout-mr/mr/src/images/logos/favicon16.png |   Bin 0 -> 1009 bytes
 .../mahout-mr/mr/src/images/logos/favicon32.png |   Bin 0 -> 1847 bytes
 .../mahout-mr/mr/src/images/logos/favicon64.png |   Bin 0 -> 3148 bytes
 .../mr/src/images/logos/mahout-logo-100.png     |   Bin 0 -> 19477 bytes
 .../mr/src/images/logos/mahout-logo-200.png     |   Bin 0 -> 46360 bytes
 .../mr/src/images/logos/mahout-logo-300.png     |   Bin 0 -> 70139 bytes
 .../mr/src/images/logos/mahout-logo-400.png     |   Bin 0 -> 55468 bytes
 .../images/logos/mahout-logo-poweredby-100.png  |   Bin 0 -> 24623 bytes
 .../images/logos/mahout-logo-poweredby-55.png   |   Bin 0 -> 11684 bytes
 .../logos/mahout-logo-transparent-400.png       |   Bin 0 -> 61970 bytes
 .../mr/src/images/logos/mahout-logo.svg         |   627 +
 .../mahout-mr/mr/src/main/assembly/job.xml      |    61 +
 .../mahout-mr/mr/src/main/assembly/src.xml      |    64 +
 .../main/java/org/apache/mahout/Version.java    |    41 +
 .../cf/taste/common/NoSuchItemException.java    |    32 +
 .../cf/taste/common/NoSuchUserException.java    |    32 +
 .../mahout/cf/taste/common/Refreshable.java     |    53 +
 .../mahout/cf/taste/common/TasteException.java  |    41 +
 .../mahout/cf/taste/common/Weighting.java       |    31 +
 .../mahout/cf/taste/eval/DataModelBuilder.java  |    45 +
 .../mahout/cf/taste/eval/IRStatistics.java      |    80 +
 .../cf/taste/eval/RecommenderBuilder.java       |    45 +
 .../cf/taste/eval/RecommenderEvaluator.java     |   105 +
 .../taste/eval/RecommenderIRStatsEvaluator.java |    64 +
 .../taste/eval/RelevantItemsDataSplitter.java   |    62 +
 .../cf/taste/hadoop/EntityEntityWritable.java   |    98 +
 .../cf/taste/hadoop/EntityPrefWritable.java     |    89 +
 .../cf/taste/hadoop/MutableRecommendedItem.java |    81 +
 .../taste/hadoop/RecommendedItemsWritable.java  |    96 +
 .../cf/taste/hadoop/TasteHadoopUtils.java       |    84 +
 .../cf/taste/hadoop/ToEntityPrefsMapper.java    |    78 +
 .../cf/taste/hadoop/ToItemPrefsMapper.java      |    46 +
 .../mahout/cf/taste/hadoop/TopItemsQueue.java   |    60 +
 .../apache/mahout/cf/taste/hadoop/als/ALS.java  |   100 +
 .../cf/taste/hadoop/als/DatasetSplitter.java    |   158 +
 .../hadoop/als/FactorizationEvaluator.java      |   166 +
 .../hadoop/als/MultithreadedSharingMapper.java  |    62 +
 .../hadoop/als/ParallelALSFactorizationJob.java |   414 +
 .../cf/taste/hadoop/als/PredictionMapper.java   |   145 +
 .../cf/taste/hadoop/als/RecommenderJob.java     |   110 +
 .../cf/taste/hadoop/als/SharingMapper.java      |    59 +
 .../hadoop/als/SolveExplicitFeedbackMapper.java |    61 +
 .../hadoop/als/SolveImplicitFeedbackMapper.java |    58 +
 .../item/AggregateAndRecommendReducer.java      |   220 +
 .../mahout/cf/taste/hadoop/item/IDReader.java   |   244 +
 .../item/ItemFilterAsVectorAndPrefsReducer.java |    62 +
 .../cf/taste/hadoop/item/ItemFilterMapper.java  |    47 +
 .../cf/taste/hadoop/item/ItemIDIndexMapper.java |    56 +
 .../taste/hadoop/item/ItemIDIndexReducer.java   |    48 +
 .../hadoop/item/PartialMultiplyMapper.java      |    57 +
 .../item/PrefAndSimilarityColumnWritable.java   |    85 +
 .../cf/taste/hadoop/item/RecommenderJob.java    |   337 +
 .../item/SimilarityMatrixRowWrapperMapper.java  |    54 +
 .../taste/hadoop/item/ToUserVectorsReducer.java |    84 +
 .../hadoop/item/ToVectorAndPrefReducer.java     |    63 +
 .../hadoop/item/UserVectorSplitterMapper.java   |   116 +
 .../hadoop/item/VectorAndPrefsWritable.java     |    92 +
 .../taste/hadoop/item/VectorOrPrefWritable.java |   104 +
 .../preparation/PreparePreferenceMatrixJob.java |   115 +
 .../hadoop/preparation/ToItemVectorsMapper.java |    56 +
 .../preparation/ToItemVectorsReducer.java       |    38 +
 .../similarity/item/ItemSimilarityJob.java      |   233 +
 .../similarity/item/TopSimilarItemsQueue.java   |    60 +
 .../common/AbstractLongPrimitiveIterator.java   |    27 +
 .../mahout/cf/taste/impl/common/BitSet.java     |    93 +
 .../mahout/cf/taste/impl/common/Cache.java      |   178 +
 .../cf/taste/impl/common/FastByIDMap.java       |   661 +
 .../mahout/cf/taste/impl/common/FastIDSet.java  |   426 +
 .../mahout/cf/taste/impl/common/FastMap.java    |   729 +
 .../taste/impl/common/FixedRunningAverage.java  |    83 +
 .../common/FixedRunningAverageAndStdDev.java    |    51 +
 .../taste/impl/common/FullRunningAverage.java   |   109 +
 .../common/FullRunningAverageAndStdDev.java     |   107 +
 .../impl/common/InvertedRunningAverage.java     |    58 +
 .../common/InvertedRunningAverageAndStdDev.java |    63 +
 .../impl/common/LongPrimitiveArrayIterator.java |    93 +
 .../impl/common/LongPrimitiveIterator.java      |    39 +
 .../cf/taste/impl/common/RefreshHelper.java     |   122 +
 .../mahout/cf/taste/impl/common/Retriever.java  |    36 +
 .../cf/taste/impl/common/RunningAverage.java    |    67 +
 .../impl/common/RunningAverageAndStdDev.java    |    36 +
 .../common/SamplingLongPrimitiveIterator.java   |   111 +
 .../cf/taste/impl/common/SkippingIterator.java  |    35 +
 .../impl/common/WeightedRunningAverage.java     |   100 +
 .../common/WeightedRunningAverageAndStdDev.java |    89 +
 .../impl/common/jdbc/AbstractJDBCComponent.java |    88 +
 .../taste/impl/common/jdbc/EachRowIterator.java |    92 +
 .../impl/common/jdbc/ResultSetIterator.java     |    66 +
 .../AbstractDifferenceRecommenderEvaluator.java |   276 +
 ...eAbsoluteDifferenceRecommenderEvaluator.java |    59 +
 .../GenericRecommenderIRStatsEvaluator.java     |   237 +
 .../eval/GenericRelevantItemsDataSplitter.java  |    83 +
 .../cf/taste/impl/eval/IRStatisticsImpl.java    |    95 +
 .../mahout/cf/taste/impl/eval/LoadCallable.java |    40 +
 .../cf/taste/impl/eval/LoadEvaluator.java       |    61 +
 .../cf/taste/impl/eval/LoadStatistics.java      |    34 +
 .../eval/OrderBasedRecommenderEvaluator.java    |   431 +
 .../impl/eval/RMSRecommenderEvaluator.java      |    56 +
 .../cf/taste/impl/eval/StatsCallable.java       |    64 +
 .../cf/taste/impl/model/AbstractDataModel.java  |    53 +
 .../cf/taste/impl/model/AbstractIDMigrator.java |    66 +
 .../impl/model/AbstractJDBCIDMigrator.java      |   108 +
 .../impl/model/BooleanItemPreferenceArray.java  |   234 +
 .../cf/taste/impl/model/BooleanPreference.java  |    64 +
 .../impl/model/BooleanUserPreferenceArray.java  |   234 +
 .../impl/model/GenericBooleanPrefDataModel.java |   320 +
 .../cf/taste/impl/model/GenericDataModel.java   |   361 +
 .../impl/model/GenericItemPreferenceArray.java  |   301 +
 .../cf/taste/impl/model/GenericPreference.java  |    70 +
 .../impl/model/GenericUserPreferenceArray.java  |   307 +
 .../cf/taste/impl/model/MemoryIDMigrator.java   |    55 +
 .../taste/impl/model/MySQLJDBCIDMigrator.java   |    67 +
 .../PlusAnonymousConcurrentUserDataModel.java   |   352 +
 .../impl/model/PlusAnonymousUserDataModel.java  |   320 +
 .../PlusAnonymousUserLongPrimitiveIterator.java |    90 +
 .../cf/taste/impl/model/file/FileDataModel.java |   758 +
 .../taste/impl/model/file/FileIDMigrator.java   |   117 +
 .../neighborhood/AbstractUserNeighborhood.java  |    71 +
 .../neighborhood/CachingUserNeighborhood.java   |    69 +
 .../neighborhood/NearestNUserNeighborhood.java  |   122 +
 .../neighborhood/ThresholdUserNeighborhood.java |   104 +
 .../AbstractCandidateItemsStrategy.java         |    57 +
 .../impl/recommender/AbstractRecommender.java   |   140 +
 .../AllSimilarItemsCandidateItemsStrategy.java  |    50 +
 .../AllUnknownItemsCandidateItemsStrategy.java  |    41 +
 .../impl/recommender/ByRescoreComparator.java   |    65 +
 .../ByValueRecommendedItemComparator.java       |    43 +
 .../impl/recommender/CachingRecommender.java    |   251 +
 .../recommender/EstimatedPreferenceCapper.java  |    46 +
 .../GenericBooleanPrefItemBasedRecommender.java |    71 +
 .../GenericBooleanPrefUserBasedRecommender.java |    82 +
 .../GenericItemBasedRecommender.java            |   378 +
 .../recommender/GenericRecommendedItem.java     |    76 +
 .../GenericUserBasedRecommender.java            |   247 +
 .../recommender/ItemAverageRecommender.java     |   199 +
 .../recommender/ItemUserAverageRecommender.java |   240 +
 .../cf/taste/impl/recommender/NullRescorer.java |    86 +
 ...ItemsNeighborhoodCandidateItemsStrategy.java |    48 +
 .../impl/recommender/RandomRecommender.java     |    97 +
 .../SamplingCandidateItemsStrategy.java         |   165 +
 .../cf/taste/impl/recommender/SimilarUser.java  |    80 +
 .../cf/taste/impl/recommender/TopItems.java     |   211 +
 .../impl/recommender/svd/ALSWRFactorizer.java   |   312 +
 .../recommender/svd/AbstractFactorizer.java     |    94 +
 .../impl/recommender/svd/Factorization.java     |   137 +
 .../taste/impl/recommender/svd/Factorizer.java  |    30 +
 .../svd/FilePersistenceStrategy.java            |   139 +
 .../recommender/svd/NoPersistenceStrategy.java  |    37 +
 .../recommender/svd/ParallelSGDFactorizer.java  |   340 +
 .../recommender/svd/PersistenceStrategy.java    |    46 +
 .../recommender/svd/RatingSGDFactorizer.java    |   221 +
 .../recommender/svd/SVDPlusPlusFactorizer.java  |   178 +
 .../impl/recommender/svd/SVDPreference.java     |    41 +
 .../impl/recommender/svd/SVDRecommender.java    |   185 +
 .../impl/similarity/AbstractItemSimilarity.java |    64 +
 .../impl/similarity/AbstractSimilarity.java     |   343 +
 .../similarity/AveragingPreferenceInferrer.java |    85 +
 .../impl/similarity/CachingItemSimilarity.java  |   111 +
 .../impl/similarity/CachingUserSimilarity.java  |   104 +
 .../impl/similarity/CityBlockSimilarity.java    |    98 +
 .../similarity/EuclideanDistanceSimilarity.java |    67 +
 .../impl/similarity/GenericItemSimilarity.java  |   358 +
 .../impl/similarity/GenericUserSimilarity.java  |   238 +
 .../similarity/LogLikelihoodSimilarity.java     |   121 +
 .../impl/similarity/LongPairMatchPredicate.java |    40 +
 .../PearsonCorrelationSimilarity.java           |    93 +
 .../SpearmanCorrelationSimilarity.java          |   135 +
 .../TanimotoCoefficientSimilarity.java          |   126 +
 .../similarity/UncenteredCosineSimilarity.java  |    69 +
 .../file/FileItemItemSimilarityIterable.java    |    46 +
 .../file/FileItemItemSimilarityIterator.java    |    60 +
 .../similarity/file/FileItemSimilarity.java     |   137 +
 .../precompute/FileSimilarItemsWriter.java      |    67 +
 .../MultithreadedBatchItemSimilarities.java     |   230 +
 .../apache/mahout/cf/taste/model/DataModel.java |   199 +
 .../mahout/cf/taste/model/IDMigrator.java       |    63 +
 .../mahout/cf/taste/model/JDBCDataModel.java    |    43 +
 .../mahout/cf/taste/model/Preference.java       |    48 +
 .../mahout/cf/taste/model/PreferenceArray.java  |   143 +
 .../cf/taste/model/UpdatableIDMigrator.java     |    47 +
 .../cf/taste/neighborhood/UserNeighborhood.java |    40 +
 .../recommender/CandidateItemsStrategy.java     |    37 +
 .../mahout/cf/taste/recommender/IDRescorer.java |    47 +
 .../taste/recommender/ItemBasedRecommender.java |   145 +
 .../MostSimilarItemsCandidateItemsStrategy.java |    31 +
 .../cf/taste/recommender/RecommendedItem.java   |    41 +
 .../cf/taste/recommender/Recommender.java       |   132 +
 .../mahout/cf/taste/recommender/Rescorer.java   |    52 +
 .../taste/recommender/UserBasedRecommender.java |    54 +
 .../cf/taste/similarity/ItemSimilarity.java     |    64 +
 .../cf/taste/similarity/PreferenceInferrer.java |    47 +
 .../cf/taste/similarity/UserSimilarity.java     |    58 +
 .../precompute/BatchItemSimilarities.java       |    56 +
 .../similarity/precompute/SimilarItem.java      |    56 +
 .../similarity/precompute/SimilarItems.java     |    84 +
 .../precompute/SimilarItemsWriter.java          |    33 +
 .../classifier/AbstractVectorClassifier.java    |   248 +
 .../mahout/classifier/ClassifierResult.java     |    74 +
 .../mahout/classifier/ConfusionMatrix.java      |   444 +
 .../apache/mahout/classifier/OnlineLearner.java |    96 +
 .../classifier/RegressionResultAnalyzer.java    |   144 +
 .../mahout/classifier/ResultAnalyzer.java       |   132 +
 .../apache/mahout/classifier/df/Bagging.java    |    61 +
 .../apache/mahout/classifier/df/DFUtils.java    |   174 +
 .../mahout/classifier/df/DecisionForest.java    |   241 +
 .../mahout/classifier/df/ErrorEstimate.java     |    51 +
 .../df/builder/DecisionTreeBuilder.java         |   422 +
 .../df/builder/DefaultTreeBuilder.java          |   253 +
 .../classifier/df/builder/TreeBuilder.java      |    42 +
 .../apache/mahout/classifier/df/data/Data.java  |   281 +
 .../classifier/df/data/DataConverter.java       |    72 +
 .../mahout/classifier/df/data/DataLoader.java   |   255 +
 .../mahout/classifier/df/data/DataUtils.java    |    89 +
 .../mahout/classifier/df/data/Dataset.java      |   422 +
 .../classifier/df/data/DescriptorException.java |    28 +
 .../classifier/df/data/DescriptorUtils.java     |   110 +
 .../mahout/classifier/df/data/Instance.java     |    75 +
 .../df/data/conditions/Condition.java           |    57 +
 .../classifier/df/data/conditions/Equals.java   |    42 +
 .../df/data/conditions/GreaterOrEquals.java     |    42 +
 .../classifier/df/data/conditions/Lesser.java   |    42 +
 .../mahout/classifier/df/mapreduce/Builder.java |   333 +
 .../classifier/df/mapreduce/Classifier.java     |   238 +
 .../classifier/df/mapreduce/MapredMapper.java   |    75 +
 .../classifier/df/mapreduce/MapredOutput.java   |   120 +
 .../df/mapreduce/inmem/InMemBuilder.java        |   114 +
 .../df/mapreduce/inmem/InMemInputFormat.java    |   284 +
 .../df/mapreduce/inmem/InMemMapper.java         |   106 +
 .../df/mapreduce/inmem/package-info.java        |    22 +
 .../df/mapreduce/partial/PartialBuilder.java    |   158 +
 .../df/mapreduce/partial/Step1Mapper.java       |   168 +
 .../classifier/df/mapreduce/partial/TreeID.java |    58 +
 .../df/mapreduce/partial/package-info.java      |    16 +
 .../classifier/df/node/CategoricalNode.java     |   134 +
 .../apache/mahout/classifier/df/node/Leaf.java  |    95 +
 .../apache/mahout/classifier/df/node/Node.java  |    96 +
 .../classifier/df/node/NumericalNode.java       |   115 +
 .../classifier/df/ref/SequentialBuilder.java    |    78 +
 .../classifier/df/split/DefaultIgSplit.java     |   118 +
 .../mahout/classifier/df/split/IgSplit.java     |    35 +
 .../mahout/classifier/df/split/OptIgSplit.java  |   232 +
 .../classifier/df/split/RegressionSplit.java    |   177 +
 .../mahout/classifier/df/split/Split.java       |    68 +
 .../mahout/classifier/df/tools/Describe.java    |   166 +
 .../classifier/df/tools/ForestVisualizer.java   |   158 +
 .../mahout/classifier/df/tools/Frequencies.java |   122 +
 .../classifier/df/tools/FrequenciesJob.java     |   297 +
 .../classifier/df/tools/TreeVisualizer.java     |   264 +
 .../mahout/classifier/df/tools/UDistrib.java    |   212 +
 .../mahout/classifier/evaluation/Auc.java       |   233 +
 .../AbstractNaiveBayesClassifier.java           |    82 +
 .../classifier/naivebayes/BayesUtils.java       |   161 +
 .../ComplementaryNaiveBayesClassifier.java      |    43 +
 .../classifier/naivebayes/NaiveBayesModel.java  |   170 +
 .../StandardNaiveBayesClassifier.java           |    40 +
 .../naivebayes/test/BayesTestMapper.java        |    76 +
 .../naivebayes/test/TestNaiveBayesDriver.java   |   176 +
 .../training/ComplementaryThetaTrainer.java     |    83 +
 .../training/IndexInstancesMapper.java          |    53 +
 .../naivebayes/training/ThetaMapper.java        |    61 +
 .../naivebayes/training/TrainNaiveBayesJob.java |   177 +
 .../naivebayes/training/WeightsMapper.java      |    68 +
 .../sequencelearning/hmm/BaumWelchTrainer.java  |   161 +
 .../sequencelearning/hmm/HmmAlgorithms.java     |   306 +
 .../sequencelearning/hmm/HmmEvaluator.java      |   194 +
 .../sequencelearning/hmm/HmmModel.java          |   383 +
 .../sequencelearning/hmm/HmmTrainer.java        |   488 +
 .../sequencelearning/hmm/HmmUtils.java          |   360 +
 .../hmm/LossyHmmSerializer.java                 |    62 +
 .../hmm/RandomSequenceGenerator.java            |   102 +
 .../sequencelearning/hmm/ViterbiEvaluator.java  |   122 +
 .../sgd/AbstractOnlineLogisticRegression.java   |   317 +
 .../sgd/AdaptiveLogisticRegression.java         |   586 +
 .../mahout/classifier/sgd/CrossFoldLearner.java |   334 +
 .../mahout/classifier/sgd/CsvRecordFactory.java |   395 +
 .../mahout/classifier/sgd/DefaultGradient.java  |    49 +
 .../mahout/classifier/sgd/ElasticBandPrior.java |    76 +
 .../apache/mahout/classifier/sgd/Gradient.java  |    30 +
 .../mahout/classifier/sgd/GradientMachine.java  |   405 +
 .../org/apache/mahout/classifier/sgd/L1.java    |    59 +
 .../org/apache/mahout/classifier/sgd/L2.java    |    66 +
 .../mahout/classifier/sgd/MixedGradient.java    |    66 +
 .../mahout/classifier/sgd/ModelDissector.java   |   232 +
 .../mahout/classifier/sgd/ModelSerializer.java  |    67 +
 .../sgd/OnlineLogisticRegression.java           |   172 +
 .../classifier/sgd/PassiveAggressive.java       |   204 +
 .../classifier/sgd/PolymorphicWritable.java     |    46 +
 .../mahout/classifier/sgd/PriorFunction.java    |    45 +
 .../mahout/classifier/sgd/RankingGradient.java  |    85 +
 .../mahout/classifier/sgd/RecordFactory.java    |    47 +
 .../apache/mahout/classifier/sgd/TPrior.java    |    61 +
 .../mahout/classifier/sgd/UniformPrior.java     |    47 +
 .../mahout/classifier/sgd/package-info.java     |    23 +
 .../mahout/clustering/AbstractCluster.java      |   390 +
 .../org/apache/mahout/clustering/Cluster.java   |    90 +
 .../mahout/clustering/ClusteringUtils.java      |   306 +
 .../mahout/clustering/GaussianAccumulator.java  |    62 +
 .../org/apache/mahout/clustering/Model.java     |    93 +
 .../mahout/clustering/ModelDistribution.java    |    41 +
 .../clustering/OnlineGaussianAccumulator.java   |   107 +
 .../RunningSumsGaussianAccumulator.java         |    90 +
 .../clustering/UncommonDistributions.java       |   136 +
 .../apache/mahout/clustering/canopy/Canopy.java |    60 +
 .../clustering/canopy/CanopyClusterer.java      |   220 +
 .../clustering/canopy/CanopyConfigKeys.java     |    70 +
 .../mahout/clustering/canopy/CanopyDriver.java  |   379 +
 .../mahout/clustering/canopy/CanopyMapper.java  |    66 +
 .../mahout/clustering/canopy/CanopyReducer.java |    70 +
 .../ClusterClassificationConfigKeys.java        |    33 +
 .../classify/ClusterClassificationDriver.java   |   313 +
 .../classify/ClusterClassificationMapper.java   |   161 +
 .../clustering/classify/ClusterClassifier.java  |   231 +
 .../WeightedPropertyVectorWritable.java         |    95 +
 .../classify/WeightedVectorWritable.java        |    72 +
 .../fuzzykmeans/FuzzyKMeansClusterer.java       |    59 +
 .../fuzzykmeans/FuzzyKMeansDriver.java          |   324 +
 .../clustering/fuzzykmeans/FuzzyKMeansUtil.java |    76 +
 .../clustering/fuzzykmeans/SoftCluster.java     |    60 +
 .../iterator/AbstractClusteringPolicy.java      |    72 +
 .../mahout/clustering/iterator/CIMapper.java    |    71 +
 .../mahout/clustering/iterator/CIReducer.java   |    64 +
 .../iterator/CanopyClusteringPolicy.java        |    52 +
 .../clustering/iterator/ClusterIterator.java    |   219 +
 .../clustering/iterator/ClusterWritable.java    |    56 +
 .../clustering/iterator/ClusteringPolicy.java   |    66 +
 .../iterator/ClusteringPolicyWritable.java      |    55 +
 .../iterator/DistanceMeasureCluster.java        |    91 +
 .../iterator/FuzzyKMeansClusteringPolicy.java   |    90 +
 .../iterator/KMeansClusteringPolicy.java        |    64 +
 .../clustering/kernel/IKernelProfile.java       |    27 +
 .../kernel/TriangularKernelProfile.java         |    27 +
 .../mahout/clustering/kmeans/KMeansDriver.java  |   257 +
 .../mahout/clustering/kmeans/KMeansUtil.java    |    74 +
 .../mahout/clustering/kmeans/Kluster.java       |   117 +
 .../clustering/kmeans/RandomSeedGenerator.java  |   136 +
 .../mahout/clustering/kmeans/package-info.java  |     5 +
 .../lda/cvb/CVB0DocInferenceMapper.java         |    51 +
 .../mahout/clustering/lda/cvb/CVB0Driver.java   |   536 +
 .../CVB0TopicTermVectorNormalizerMapper.java    |    38 +
 .../clustering/lda/cvb/CachingCVB0Mapper.java   |   133 +
 .../lda/cvb/CachingCVB0PerplexityMapper.java    |   108 +
 .../cvb/InMemoryCollapsedVariationalBayes0.java |   492 +
 .../mahout/clustering/lda/cvb/ModelTrainer.java |   301 +
 .../mahout/clustering/lda/cvb/TopicModel.java   |   513 +
 .../apache/mahout/clustering/package-info.java  |    13 +
 .../spectral/AffinityMatrixInputJob.java        |    84 +
 .../spectral/AffinityMatrixInputMapper.java     |    78 +
 .../spectral/AffinityMatrixInputReducer.java    |    59 +
 .../spectral/IntDoublePairWritable.java         |    75 +
 .../apache/mahout/clustering/spectral/Keys.java |    31 +
 .../spectral/MatrixDiagonalizeJob.java          |   108 +
 .../clustering/spectral/UnitVectorizerJob.java  |    79 +
 .../mahout/clustering/spectral/VectorCache.java |   116 +
 .../spectral/VectorMatrixMultiplicationJob.java |   139 +
 .../clustering/spectral/VertexWritable.java     |   101 +
 .../spectral/kmeans/EigenSeedGenerator.java     |   120 +
 .../spectral/kmeans/SpectralKMeansDriver.java   |   243 +
 .../streaming/cluster/BallKMeans.java           |   456 +
 .../streaming/cluster/StreamingKMeans.java      |   368 +
 .../streaming/mapreduce/CentroidWritable.java   |    88 +
 .../mapreduce/StreamingKMeansDriver.java        |   493 +
 .../mapreduce/StreamingKMeansMapper.java        |   102 +
 .../mapreduce/StreamingKMeansReducer.java       |   109 +
 .../mapreduce/StreamingKMeansThread.java        |    92 +
 .../mapreduce/StreamingKMeansUtilsMR.java       |   154 +
 .../streaming/tools/ResplitSequenceFiles.java   |   149 +
 .../clustering/topdown/PathDirectory.java       |    94 +
 .../postprocessor/ClusterCountReader.java       |   103 +
 .../ClusterOutputPostProcessor.java             |   139 +
 .../ClusterOutputPostProcessorDriver.java       |   182 +
 .../ClusterOutputPostProcessorMapper.java       |    58 +
 .../ClusterOutputPostProcessorReducer.java      |    62 +
 .../org/apache/mahout/common/AbstractJob.java   |   648 +
 .../org/apache/mahout/common/ClassUtils.java    |    61 +
 .../apache/mahout/common/CommandLineUtil.java   |    68 +
 .../org/apache/mahout/common/HadoopUtil.java    |   435 +
 .../apache/mahout/common/IntPairWritable.java   |   270 +
 .../org/apache/mahout/common/IntegerTuple.java  |   176 +
 .../java/org/apache/mahout/common/LongPair.java |    80 +
 .../org/apache/mahout/common/MemoryUtil.java    |    99 +
 .../java/org/apache/mahout/common/Pair.java     |    99 +
 .../org/apache/mahout/common/Parameters.java    |    98 +
 .../org/apache/mahout/common/StringTuple.java   |   177 +
 .../org/apache/mahout/common/StringUtils.java   |    63 +
 .../apache/mahout/common/TimingStatistics.java  |   154 +
 .../commandline/DefaultOptionCreator.java       |   417 +
 .../distance/ChebyshevDistanceMeasure.java      |    63 +
 .../common/distance/CosineDistanceMeasure.java  |   119 +
 .../mahout/common/distance/DistanceMeasure.java |    48 +
 .../distance/EuclideanDistanceMeasure.java      |    41 +
 .../distance/MahalanobisDistanceMeasure.java    |   197 +
 .../distance/ManhattanDistanceMeasure.java      |    70 +
 .../distance/MinkowskiDistanceMeasure.java      |    93 +
 .../SquaredEuclideanDistanceMeasure.java        |    59 +
 .../distance/TanimotoDistanceMeasure.java       |    69 +
 .../distance/WeightedDistanceMeasure.java       |    93 +
 .../WeightedEuclideanDistanceMeasure.java       |    51 +
 .../WeightedManhattanDistanceMeasure.java       |    53 +
 .../iterator/CopyConstructorIterator.java       |    64 +
 .../common/iterator/CountingIterator.java       |    43 +
 .../common/iterator/FileLineIterable.java       |    88 +
 .../common/iterator/FileLineIterator.java       |   167 +
 .../iterator/FixedSizeSamplingIterator.java     |    59 +
 .../common/iterator/SamplingIterable.java       |    45 +
 .../common/iterator/SamplingIterator.java       |    73 +
 .../StableFixedSizeSamplingIterator.java        |    72 +
 .../common/iterator/StringRecordIterator.java   |    55 +
 .../iterator/sequencefile/PathFilters.java      |    81 +
 .../common/iterator/sequencefile/PathType.java  |    27 +
 .../sequencefile/SequenceFileDirIterable.java   |    84 +
 .../sequencefile/SequenceFileDirIterator.java   |   136 +
 .../SequenceFileDirValueIterable.java           |    83 +
 .../SequenceFileDirValueIterator.java           |   159 +
 .../sequencefile/SequenceFileIterable.java      |    68 +
 .../sequencefile/SequenceFileIterator.java      |   118 +
 .../sequencefile/SequenceFileValueIterable.java |    67 +
 .../sequencefile/SequenceFileValueIterator.java |    97 +
 .../mahout/common/lucene/AnalyzerUtils.java     |    61 +
 .../common/lucene/IteratorTokenStream.java      |    45 +
 .../common/lucene/TokenStreamIterator.java      |    57 +
 .../common/mapreduce/MergeVectorsCombiner.java  |    34 +
 .../common/mapreduce/MergeVectorsReducer.java   |    40 +
 .../common/mapreduce/TransposeMapper.java       |    49 +
 .../common/mapreduce/VectorSumCombiner.java     |    38 +
 .../common/mapreduce/VectorSumReducer.java      |    35 +
 .../org/apache/mahout/common/nlp/NGrams.java    |    94 +
 .../common/parameters/AbstractParameter.java    |   120 +
 .../common/parameters/ClassParameter.java       |    44 +
 .../common/parameters/DoubleParameter.java      |    33 +
 .../mahout/common/parameters/Parameter.java     |    62 +
 .../mahout/common/parameters/Parametered.java   |   206 +
 .../mahout/common/parameters/PathParameter.java |    33 +
 .../org/apache/mahout/driver/MahoutDriver.java  |   244 +
 .../apache/mahout/ep/EvolutionaryProcess.java   |   229 +
 .../main/java/org/apache/mahout/ep/Mapping.java |   206 +
 .../main/java/org/apache/mahout/ep/Payload.java |    36 +
 .../main/java/org/apache/mahout/ep/State.java   |   302 +
 .../java/org/apache/mahout/ep/package-info.java |    26 +
 .../mahout/math/DistributedRowMatrixWriter.java |    47 +
 .../org/apache/mahout/math/MatrixUtils.java     |   114 +
 .../mahout/math/MultiLabelVectorWritable.java   |    88 +
 .../math/als/AlternatingLeastSquaresSolver.java |   116 +
 ...itFeedbackAlternatingLeastSquaresSolver.java |   171 +
 .../math/decomposer/AsyncEigenVerifier.java     |    80 +
 .../mahout/math/decomposer/EigenStatus.java     |    50 +
 .../math/decomposer/SimpleEigenVerifier.java    |    41 +
 .../math/decomposer/SingularVectorVerifier.java |    25 +
 .../math/decomposer/hebbian/EigenUpdater.java   |    25 +
 .../math/decomposer/hebbian/HebbianSolver.java  |   342 +
 .../math/decomposer/hebbian/HebbianUpdater.java |    71 +
 .../math/decomposer/hebbian/TrainingState.java  |   143 +
 .../math/decomposer/lanczos/LanczosSolver.java  |   213 +
 .../math/decomposer/lanczos/LanczosState.java   |   107 +
 .../math/hadoop/DistributedRowMatrix.java       |   390 +
 .../math/hadoop/MatrixColumnMeansJob.java       |   236 +
 .../math/hadoop/MatrixMultiplicationJob.java    |   177 +
 .../mahout/math/hadoop/TimesSquaredJob.java     |   251 +
 .../apache/mahout/math/hadoop/TransposeJob.java |    85 +
 .../decomposer/DistributedLanczosSolver.java    |   299 +
 .../math/hadoop/decomposer/EigenVector.java     |    76 +
 .../hadoop/decomposer/EigenVerificationJob.java |   333 +
 .../decomposer/HdfsBackedLanczosState.java      |   237 +
 .../math/hadoop/similarity/SeedVectorUtil.java  |   104 +
 .../VectorDistanceInvertedMapper.java           |    71 +
 .../hadoop/similarity/VectorDistanceMapper.java |    80 +
 .../similarity/VectorDistanceSimilarityJob.java |   153 +
 .../similarity/cooccurrence/MutableElement.java |    50 +
 .../cooccurrence/RowSimilarityJob.java          |   562 +
 .../cooccurrence/TopElementsQueue.java          |    59 +
 .../hadoop/similarity/cooccurrence/Vectors.java |   199 +
 .../measures/CityBlockSimilarity.java           |    26 +
 .../measures/CooccurrenceCountSimilarity.java   |    32 +
 .../cooccurrence/measures/CosineSimilarity.java |    50 +
 .../measures/CountbasedMeasure.java             |    44 +
 .../measures/EuclideanDistanceSimilarity.java   |    57 +
 .../measures/LoglikelihoodSimilarity.java       |    34 +
 .../measures/PearsonCorrelationSimilarity.java  |    37 +
 .../measures/TanimotoCoefficientSimilarity.java |    34 +
 .../measures/VectorSimilarityMeasure.java       |    32 +
 .../measures/VectorSimilarityMeasures.java      |    46 +
 .../DistributedConjugateGradientSolver.java     |   172 +
 .../mahout/math/hadoop/stats/BasicStats.java    |   148 +
 .../StandardDeviationCalculatorMapper.java      |    55 +
 .../StandardDeviationCalculatorReducer.java     |    37 +
 .../math/hadoop/stats/VarianceTotals.java       |    68 +
 .../hadoop/stochasticsvd/ABtDenseOutJob.java    |   585 +
 .../math/hadoop/stochasticsvd/ABtJob.java       |   494 +
 .../mahout/math/hadoop/stochasticsvd/BtJob.java |   628 +
 .../stochasticsvd/DenseBlockWritable.java       |    83 +
 .../mahout/math/hadoop/stochasticsvd/Omega.java |   257 +
 .../mahout/math/hadoop/stochasticsvd/QJob.java  |   237 +
 .../math/hadoop/stochasticsvd/SSVDCli.java      |   201 +
 .../math/hadoop/stochasticsvd/SSVDHelper.java   |   322 +
 .../math/hadoop/stochasticsvd/SSVDSolver.java   |   662 +
 .../SparseRowBlockAccumulator.java              |    90 +
 .../stochasticsvd/SparseRowBlockWritable.java   |   159 +
 .../stochasticsvd/SplitPartitionedWritable.java |   151 +
 .../mahout/math/hadoop/stochasticsvd/UJob.java  |   170 +
 .../mahout/math/hadoop/stochasticsvd/VJob.java  |   224 +
 .../math/hadoop/stochasticsvd/YtYJob.java       |   220 +
 .../stochasticsvd/qr/GivensThinSolver.java      |   643 +
 .../hadoop/stochasticsvd/qr/GramSchmidt.java    |    52 +
 .../hadoop/stochasticsvd/qr/QRFirstStep.java    |   284 +
 .../hadoop/stochasticsvd/qr/QRLastStep.java     |   144 +
 .../mahout/math/neighborhood/BruteSearch.java   |   186 +
 .../math/neighborhood/FastProjectionSearch.java |   326 +
 .../mahout/math/neighborhood/HashedVector.java  |   103 +
 .../LocalitySensitiveHashSearch.java            |   295 +
 .../math/neighborhood/ProjectionSearch.java     |   233 +
 .../mahout/math/neighborhood/Searcher.java      |   155 +
 .../math/neighborhood/UpdatableSearcher.java    |    37 +
 .../math/random/AbstractSamplerFunction.java    |    39 +
 .../mahout/math/random/ChineseRestaurant.java   |   111 +
 .../apache/mahout/math/random/Empirical.java    |   124 +
 .../apache/mahout/math/random/IndianBuffet.java |   157 +
 .../org/apache/mahout/math/random/Missing.java  |    59 +
 .../apache/mahout/math/random/MultiNormal.java  |   118 +
 .../apache/mahout/math/random/Multinomial.java  |   202 +
 .../org/apache/mahout/math/random/Normal.java   |    40 +
 .../mahout/math/random/PoissonSampler.java      |    67 +
 .../mahout/math/random/RandomProjector.java     |   133 +
 .../org/apache/mahout/math/random/Sampler.java  |    25 +
 .../mahout/math/random/WeightedThing.java       |    71 +
 .../mahout/math/ssvd/SequentialBigSvd.java      |    69 +
 .../math/ssvd/SequentialOutOfCoreSvd.java       |   233 +
 .../mahout/math/stats/GlobalOnlineAuc.java      |   168 +
 .../mahout/math/stats/GroupedOnlineAuc.java     |   113 +
 .../org/apache/mahout/math/stats/OnlineAuc.java |    38 +
 .../mahout/math/stats/OnlineSummarizer.java     |    93 +
 .../org/apache/mahout/math/stats/Sampler.java   |    79 +
 .../mahout/vectorizer/DictionaryVectorizer.java |   422 +
 .../mahout/vectorizer/DocumentProcessor.java    |    99 +
 .../EncodedVectorsFromSequenceFiles.java        |   104 +
 .../mahout/vectorizer/EncodingMapper.java       |    92 +
 .../mahout/vectorizer/HighDFWordsPruner.java    |   147 +
 .../SimpleTextEncodingVectorizer.java           |    72 +
 .../SparseVectorsFromSequenceFiles.java         |   369 +
 .../java/org/apache/mahout/vectorizer/TF.java   |    30 +
 .../org/apache/mahout/vectorizer/TFIDF.java     |    31 +
 .../apache/mahout/vectorizer/Vectorizer.java    |    29 +
 .../mahout/vectorizer/VectorizerConfig.java     |   179 +
 .../org/apache/mahout/vectorizer/Weight.java    |    32 +
 .../collocations/llr/CollocCombiner.java        |    46 +
 .../collocations/llr/CollocDriver.java          |   284 +
 .../collocations/llr/CollocMapper.java          |   178 +
 .../collocations/llr/CollocReducer.java         |   176 +
 .../vectorizer/collocations/llr/Gram.java       |   239 +
 .../vectorizer/collocations/llr/GramKey.java    |   133 +
 .../llr/GramKeyGroupComparator.java             |    43 +
 .../collocations/llr/GramKeyPartitioner.java    |    40 +
 .../vectorizer/collocations/llr/LLRReducer.java |   170 +
 .../common/PartialVectorMergeReducer.java       |    89 +
 .../vectorizer/common/PartialVectorMerger.java  |   144 +
 .../document/SequenceFileTokenizerMapper.java   |    70 +
 .../encoders/AdaptiveWordValueEncoder.java      |    69 +
 .../encoders/CachingContinuousValueEncoder.java |    64 +
 .../encoders/CachingStaticWordValueEncoder.java |    66 +
 .../encoders/CachingTextValueEncoder.java       |    25 +
 .../encoders/CachingValueEncoder.java           |    64 +
 .../encoders/ConstantValueEncoder.java          |    57 +
 .../encoders/ContinuousValueEncoder.java        |    76 +
 .../mahout/vectorizer/encoders/Dictionary.java  |    54 +
 .../encoders/FeatureVectorEncoder.java          |   279 +
 .../encoders/InteractionValueEncoder.java       |   126 +
 .../encoders/LuceneTextValueEncoder.java        |   129 +
 .../encoders/StaticWordValueEncoder.java        |    80 +
 .../vectorizer/encoders/TextValueEncoder.java   |   142 +
 .../vectorizer/encoders/WordValueEncoder.java   |    81 +
 .../pruner/PrunedPartialVectorMergeReducer.java |    65 +
 .../vectorizer/pruner/WordsPrunerReducer.java   |    86 +
 .../vectorizer/term/TFPartialVectorReducer.java |   139 +
 .../vectorizer/term/TermCountCombiner.java      |    41 +
 .../mahout/vectorizer/term/TermCountMapper.java |    58 +
 .../vectorizer/term/TermCountReducer.java       |    55 +
 .../term/TermDocumentCountMapper.java           |    50 +
 .../term/TermDocumentCountReducer.java          |    41 +
 .../mahout/vectorizer/tfidf/TFIDFConverter.java |   361 +
 .../tfidf/TFIDFPartialVectorReducer.java        |   114 +
 .../src/main/resources/supplemental-models.xml  |   279 +
 .../mahout-mr/mr/src/main/resources/version     |     1 +
 .../mahout/cf/taste/common/CommonTest.java      |    60 +
 .../cf/taste/hadoop/TasteHadoopUtilsTest.java   |    40 +
 .../cf/taste/hadoop/TopItemsQueueTest.java      |    72 +
 .../als/ParallelALSFactorizationJobTest.java    |   379 +
 .../cf/taste/hadoop/item/IDReaderTest.java      |    66 +
 .../taste/hadoop/item/RecommenderJobTest.java   |   928 +
 .../hadoop/item/ToUserVectorsReducerTest.java   |    74 +
 .../similarity/item/ItemSimilarityJobTest.java  |   269 +
 .../mahout/cf/taste/impl/TasteTestCase.java     |    98 +
 .../mahout/cf/taste/impl/common/BitSetTest.java |    74 +
 .../mahout/cf/taste/impl/common/CacheTest.java  |    61 +
 .../cf/taste/impl/common/FastByIDMapTest.java   |   147 +
 .../cf/taste/impl/common/FastIDSetTest.java     |   162 +
 .../cf/taste/impl/common/FastMapTest.java       |   228 +
 .../impl/common/InvertedRunningAverageTest.java |    88 +
 .../common/LongPrimitiveArrayIteratorTest.java  |    56 +
 .../cf/taste/impl/common/MockRefreshable.java   |    45 +
 .../cf/taste/impl/common/RefreshHelperTest.java |    70 +
 .../common/RunningAverageAndStdDevTest.java     |   107 +
 .../taste/impl/common/RunningAverageTest.java   |    75 +
 .../SamplingLongPrimitiveIteratorTest.java      |    91 +
 .../impl/common/WeightedRunningAverageTest.java |    85 +
 ...ericRecommenderIRStatsEvaluatorImplTest.java |    73 +
 .../taste/impl/eval/LoadEvaluationRunner.java   |    68 +
 .../model/BooleanItemPreferenceArrayTest.java   |    89 +
 .../model/BooleanUserPreferenceArrayTest.java   |    89 +
 .../taste/impl/model/GenericDataModelTest.java  |    51 +
 .../model/GenericItemPreferenceArrayTest.java   |   110 +
 .../model/GenericUserPreferenceArrayTest.java   |   110 +
 .../taste/impl/model/MemoryIDMigratorTest.java  |    57 +
 ...lusAnonymousConcurrentUserDataModelTest.java |   313 +
 .../impl/model/file/FileDataModelTest.java      |   216 +
 .../impl/model/file/FileIDMigratorTest.java     |   103 +
 .../impl/neighborhood/DummySimilarity.java      |    68 +
 .../neighborhood/NearestNNeighborhoodTest.java  |    53 +
 .../neighborhood/ThresholdNeighborhoodTest.java |    51 +
 ...lUnknownItemsCandidateItemsStrategyTest.java |    65 +
 .../recommender/CachingRecommenderTest.java     |    78 +
 .../GenericItemBasedRecommenderTest.java        |   324 +
 .../GenericUserBasedRecommenderTest.java        |   174 +
 .../recommender/ItemAverageRecommenderTest.java |    43 +
 .../ItemUserAverageRecommenderTest.java         |    43 +
 .../taste/impl/recommender/MockRecommender.java |    89 +
 .../impl/recommender/NullRescorerTest.java      |    47 +
 ...sNeighborhoodCandidateItemsStrategyTest.java |    75 +
 .../impl/recommender/RandomRecommenderTest.java |    41 +
 .../impl/recommender/ReversingRescorer.java     |    46 +
 .../SamplingCandidateItemsStrategyTest.java     |    71 +
 .../cf/taste/impl/recommender/TopItemsTest.java |   158 +
 .../recommender/svd/ALSWRFactorizerTest.java    |   208 +
 .../svd/FilePersistenceStrategyTest.java        |    53 +
 .../svd/ParallelSGDFactorizerTest.java          |   355 +
 .../recommender/svd/SVDRecommenderTest.java     |    86 +
 .../AveragingPreferenceInferrerTest.java        |    37 +
 .../EuclideanDistanceSimilarityTest.java        |   236 +
 .../similarity/GenericItemSimilarityTest.java   |   104 +
 .../similarity/LogLikelihoodSimilarityTest.java |    80 +
 .../PearsonCorrelationSimilarityTest.java       |   265 +
 .../impl/similarity/SimilarityTestCase.java     |    35 +
 .../SpearmanCorrelationSimilarityTest.java      |    80 +
 .../TanimotoCoefficientSimilarityTest.java      |   121 +
 .../similarity/file/FileItemSimilarityTest.java |   142 +
 .../MultithreadedBatchItemSimilaritiesTest.java |    98 +
 .../similarity/precompute/SimilarItemsTest.java |    50 +
 .../mahout/classifier/ClassifierData.java       |   102 +
 .../mahout/classifier/ConfusionMatrixTest.java  |   119 +
 .../RegressionResultAnalyzerTest.java           |   128 +
 .../classifier/df/DecisionForestTest.java       |   206 +
 .../df/builder/DecisionTreeBuilderTest.java     |    78 +
 .../df/builder/DefaultTreeBuilderTest.java      |    74 +
 .../df/builder/InfiniteRecursionTest.java       |    60 +
 .../classifier/df/data/DataConverterTest.java   |    60 +
 .../classifier/df/data/DataLoaderTest.java      |   350 +
 .../mahout/classifier/df/data/DataTest.java     |   396 +
 .../mahout/classifier/df/data/DatasetTest.java  |    72 +
 .../classifier/df/data/DescriptorUtilsTest.java |    92 +
 .../apache/mahout/classifier/df/data/Utils.java |   284 +
 .../mapreduce/inmem/InMemInputFormatTest.java   |   109 +
 .../df/mapreduce/inmem/InMemInputSplitTest.java |    77 +
 .../mapreduce/partial/PartialBuilderTest.java   |   197 +
 .../df/mapreduce/partial/Step1MapperTest.java   |   160 +
 .../df/mapreduce/partial/TreeIDTest.java        |    48 +
 .../mahout/classifier/df/node/NodeTest.java     |   108 +
 .../classifier/df/split/DefaultIgSplitTest.java |    78 +
 .../df/split/RegressionSplitTest.java           |    87 +
 .../classifier/df/tools/VisualizerTest.java     |   211 +
 .../mahout/classifier/evaluation/AucTest.java   |    86 +
 .../ComplementaryNaiveBayesClassifierTest.java  |    47 +
 .../naivebayes/NaiveBayesModelTest.java         |    36 +
 .../classifier/naivebayes/NaiveBayesTest.java   |   135 +
 .../naivebayes/NaiveBayesTestBase.java          |   135 +
 .../StandardNaiveBayesClassifierTest.java       |    47 +
 .../training/IndexInstancesMapperTest.java      |    85 +
 .../naivebayes/training/ThetaMapperTest.java    |    61 +
 .../naivebayes/training/WeightsMapperTest.java  |    60 +
 .../sequencelearning/hmm/HMMAlgorithmsTest.java |   164 +
 .../sequencelearning/hmm/HMMEvaluatorTest.java  |    63 +
 .../sequencelearning/hmm/HMMModelTest.java      |    32 +
 .../sequencelearning/hmm/HMMTestBase.java       |    73 +
 .../sequencelearning/hmm/HMMTrainerTest.java    |   163 +
 .../sequencelearning/hmm/HMMUtilsTest.java      |   161 +
 .../sgd/AdaptiveLogisticRegressionTest.java     |   186 +
 .../classifier/sgd/CsvRecordFactoryTest.java    |    90 +
 .../classifier/sgd/GradientMachineTest.java     |    41 +
 .../classifier/sgd/ModelSerializerTest.java     |   162 +
 .../mahout/classifier/sgd/OnlineBaseTest.java   |   160 +
 .../sgd/OnlineLogisticRegressionTest.java       |   330 +
 .../classifier/sgd/PassiveAggressiveTest.java   |    35 +
 .../mahout/clustering/ClusteringTestUtils.java  |   152 +
 .../mahout/clustering/TestClusterInterface.java |    83 +
 .../clustering/TestGaussianAccumulators.java    |   186 +
 .../clustering/canopy/TestCanopyCreation.java   |   674 +
 .../ClusterClassificationDriverTest.java        |   255 +
 .../fuzzykmeans/TestFuzzyKmeansClustering.java  |   202 +
 .../iterator/TestClusterClassifier.java         |   238 +
 .../clustering/kmeans/TestKmeansClustering.java |   385 +
 .../kmeans/TestRandomSeedGenerator.java         |   169 +
 .../clustering/lda/cvb/TestCVBModelTrainer.java |   138 +
 .../spectral/TestAffinityMatrixInputJob.java    |   145 +
 .../spectral/TestMatrixDiagonalizeJob.java      |   116 +
 .../spectral/TestUnitVectorizerJob.java         |    65 +
 .../clustering/spectral/TestVectorCache.java    |   110 +
 .../TestVectorMatrixMultiplicationJob.java      |    75 +
 .../spectral/kmeans/TestEigenSeedGenerator.java |   100 +
 .../streaming/cluster/BallKMeansTest.java       |   196 +
 .../clustering/streaming/cluster/DataUtils.java |    92 +
 .../streaming/cluster/StreamingKMeansTest.java  |   169 +
 .../mapreduce/StreamingKMeansTestMR.java        |   282 +
 .../tools/ResplitSequenceFilesTest.java         |    80 +
 .../clustering/topdown/PathDirectoryTest.java   |    65 +
 .../postprocessor/ClusterCountReaderTest.java   |   121 +
 .../ClusterOutputPostProcessorTest.java         |   205 +
 .../apache/mahout/common/AbstractJobTest.java   |   240 +
 .../DistributedCacheFileLocationTest.java       |    46 +
 .../mahout/common/DummyOutputCollector.java     |    57 +
 .../apache/mahout/common/DummyRecordWriter.java |   223 +
 .../mahout/common/DummyRecordWriterTest.java    |    45 +
 .../mahout/common/DummyStatusReporter.java      |    76 +
 .../mahout/common/IntPairWritableTest.java      |   114 +
 .../apache/mahout/common/MahoutTestCase.java    |   148 +
 .../org/apache/mahout/common/MockIterator.java  |    51 +
 .../apache/mahout/common/StringUtilsTest.java   |    70 +
 .../distance/CosineDistanceMeasureTest.java     |    66 +
 .../distance/DefaultDistanceMeasureTest.java    |   103 +
 .../DefaultWeightedDistanceMeasureTest.java     |    56 +
 .../common/distance/TestChebyshevMeasure.java   |    55 +
 .../distance/TestEuclideanDistanceMeasure.java  |    26 +
 .../TestMahalanobisDistanceMeasure.java         |    56 +
 .../distance/TestManhattanDistanceMeasure.java  |    26 +
 .../common/distance/TestMinkowskiMeasure.java   |    64 +
 .../distance/TestTanimotoDistanceMeasure.java   |    25 +
 ...estWeightedEuclideanDistanceMeasureTest.java |    25 +
 .../TestWeightedManhattanDistanceMeasure.java   |    26 +
 .../common/iterator/CountingIteratorTest.java   |    44 +
 .../mahout/common/iterator/SamplerCase.java     |   101 +
 .../common/iterator/TestFixedSizeSampler.java   |    33 +
 .../common/iterator/TestSamplingIterator.java   |    77 +
 .../iterator/TestStableFixedSizeSampler.java    |    33 +
 .../mahout/common/lucene/AnalyzerUtilsTest.java |    38 +
 .../apache/mahout/driver/MahoutDriverTest.java  |    32 +
 .../mahout/ep/EvolutionaryProcessTest.java      |    81 +
 .../apache/mahout/math/MatrixWritableTest.java  |   148 +
 .../java/org/apache/mahout/math/VarintTest.java |   189 +
 .../apache/mahout/math/VectorWritableTest.java  |   123 +
 .../apache/mahout/math/hadoop/MathHelper.java   |   236 +
 .../math/hadoop/TestDistributedRowMatrix.java   |   395 +
 .../TestDistributedLanczosSolver.java           |   132 +
 .../TestDistributedLanczosSolverCLI.java        |   190 +
 .../TestVectorDistanceSimilarityJob.java        |   238 +
 .../cooccurrence/RowSimilarityJobTest.java      |   214 +
 .../measures/VectorSimilarityMeasuresTest.java  |   133 +
 .../TestDistributedConjugateGradientSolver.java |    59 +
 ...stDistributedConjugateGradientSolverCLI.java |   111 +
 .../math/hadoop/stats/BasicStatsTest.java       |   121 +
 .../stochasticsvd/LocalSSVDPCASparseTest.java   |   296 +
 .../stochasticsvd/LocalSSVDSolverDenseTest.java |   206 +
 .../LocalSSVDSolverSparseSequentialTest.java    |   209 +
 .../hadoop/stochasticsvd/SSVDCommonTest.java    |   105 +
 .../hadoop/stochasticsvd/SSVDTestsHelper.java   |   172 +
 .../LocalitySensitiveHashSearchTest.java        |   119 +
 .../mahout/math/neighborhood/LumpyData.java     |    77 +
 .../math/neighborhood/SearchQualityTest.java    |   178 +
 .../math/neighborhood/SearchSanityTest.java     |   244 +
 .../math/ssvd/SequentialOutOfCoreSvdTest.java   |   195 +
 .../apache/mahout/math/stats/OnlineAucTest.java |   127 +
 .../apache/mahout/math/stats/SamplerTest.java   |    45 +
 .../vectorizer/DictionaryVectorizerTest.java    |   220 +
 .../vectorizer/DocumentProcessorTest.java       |    81 +
 .../EncodedVectorsFromSequenceFilesTest.java    |   126 +
 .../vectorizer/HighDFWordsPrunerTest.java       |   154 +
 .../vectorizer/RandomDocumentGenerator.java     |    69 +
 .../SparseVectorsFromSequenceFilesTest.java     |   203 +
 .../collocations/llr/CollocMapperTest.java      |   180 +
 .../collocations/llr/CollocReducerTest.java     |    86 +
 .../llr/GramKeyGroupComparatorTest.java         |    45 +
 .../llr/GramKeyPartitionerTest.java             |    54 +
 .../collocations/llr/GramKeyTest.java           |   106 +
 .../vectorizer/collocations/llr/GramTest.java   |   215 +
 .../collocations/llr/LLRReducerTest.java        |   116 +
 .../vectorizer/encoders/CachingEncoderTest.java |    48 +
 .../encoders/ConstantValueEncoderTest.java      |    74 +
 .../encoders/ContinuousValueEncoderTest.java    |    88 +
 .../encoders/InteractionValueEncoderTest.java   |   103 +
 .../encoders/TextValueEncoderTest.java          |    99 +
 .../encoders/WordLikeValueEncoderTest.java      |    99 +
 .../mr/src/test/resources/FPGsynth.dat          |   193 +
 .../mahout-mr/mr/src/test/resources/cancer.csv  |   684 +
 .../mahout-mr/mr/src/test/resources/iris.csv    |   151 +
 .../mahout-mr/mr/src/test/resources/retail.dat  | 88162 +++++++++++++++++
 .../retail_results_with_min_sup_100.dat         |  6438 ++
 .../mahout-mr/mr/src/test/resources/sgd.csv     |    61 +
 .../mr/src/test/resources/word-list.txt         |   512 +
 community/mahout-mr/pom.xml                     |   259 +-
 .../appended-resources/supplemental-models.xml  |   279 -
 .../src/images/logos/ mahout-powered.svg        |   630 -
 .../mahout-mr/src/images/logos/favicon.ico      |   Bin 28838 -> 0 bytes
 .../mahout-mr/src/images/logos/favicon128.png   |   Bin 5259 -> 0 bytes
 .../mahout-mr/src/images/logos/favicon16.png    |   Bin 1009 -> 0 bytes
 .../mahout-mr/src/images/logos/favicon32.png    |   Bin 1847 -> 0 bytes
 .../mahout-mr/src/images/logos/favicon64.png    |   Bin 3148 -> 0 bytes
 .../src/images/logos/mahout-logo-100.png        |   Bin 19477 -> 0 bytes
 .../src/images/logos/mahout-logo-200.png        |   Bin 46360 -> 0 bytes
 .../src/images/logos/mahout-logo-300.png        |   Bin 70139 -> 0 bytes
 .../src/images/logos/mahout-logo-400.png        |   Bin 55468 -> 0 bytes
 .../images/logos/mahout-logo-poweredby-100.png  |   Bin 24623 -> 0 bytes
 .../images/logos/mahout-logo-poweredby-55.png   |   Bin 11684 -> 0 bytes
 .../logos/mahout-logo-transparent-400.png       |   Bin 61970 -> 0 bytes
 .../mahout-mr/src/images/logos/mahout-logo.svg  |   627 -
 community/mahout-mr/src/main/assembly/job.xml   |    61 -
 community/mahout-mr/src/main/assembly/src.xml   |    64 -
 .../main/java/org/apache/mahout/Version.java    |    41 -
 .../cf/taste/common/NoSuchItemException.java    |    32 -
 .../cf/taste/common/NoSuchUserException.java    |    32 -
 .../mahout/cf/taste/common/Refreshable.java     |    53 -
 .../mahout/cf/taste/common/TasteException.java  |    41 -
 .../mahout/cf/taste/common/Weighting.java       |    31 -
 .../mahout/cf/taste/eval/DataModelBuilder.java  |    45 -
 .../mahout/cf/taste/eval/IRStatistics.java      |    80 -
 .../cf/taste/eval/RecommenderBuilder.java       |    45 -
 .../cf/taste/eval/RecommenderEvaluator.java     |   105 -
 .../taste/eval/RecommenderIRStatsEvaluator.java |    64 -
 .../taste/eval/RelevantItemsDataSplitter.java   |    62 -
 .../cf/taste/hadoop/EntityEntityWritable.java   |    98 -
 .../cf/taste/hadoop/EntityPrefWritable.java     |    89 -
 .../cf/taste/hadoop/MutableRecommendedItem.java |    81 -
 .../taste/hadoop/RecommendedItemsWritable.java  |    96 -
 .../cf/taste/hadoop/TasteHadoopUtils.java       |    84 -
 .../cf/taste/hadoop/ToEntityPrefsMapper.java    |    78 -
 .../cf/taste/hadoop/ToItemPrefsMapper.java      |    46 -
 .../mahout/cf/taste/hadoop/TopItemsQueue.java   |    60 -
 .../apache/mahout/cf/taste/hadoop/als/ALS.java  |   100 -
 .../cf/taste/hadoop/als/DatasetSplitter.java    |   158 -
 .../hadoop/als/FactorizationEvaluator.java      |   166 -
 .../hadoop/als/MultithreadedSharingMapper.java  |    62 -
 .../hadoop/als/ParallelALSFactorizationJob.java |   414 -
 .../cf/taste/hadoop/als/PredictionMapper.java   |   145 -
 .../cf/taste/hadoop/als/RecommenderJob.java     |   110 -
 .../cf/taste/hadoop/als/SharingMapper.java      |    59 -
 .../hadoop/als/SolveExplicitFeedbackMapper.java |    61 -
 .../hadoop/als/SolveImplicitFeedbackMapper.java |    58 -
 .../item/AggregateAndRecommendReducer.java      |   220 -
 .../mahout/cf/taste/hadoop/item/IDReader.java   |   244 -
 .../item/ItemFilterAsVectorAndPrefsReducer.java |    62 -
 .../cf/taste/hadoop/item/ItemFilterMapper.java  |    47 -
 .../cf/taste/hadoop/item/ItemIDIndexMapper.java |    56 -
 .../taste/hadoop/item/ItemIDIndexReducer.java   |    48 -
 .../hadoop/item/PartialMultiplyMapper.java      |    57 -
 .../item/PrefAndSimilarityColumnWritable.java   |    85 -
 .../cf/taste/hadoop/item/RecommenderJob.java    |   337 -
 .../item/SimilarityMatrixRowWrapperMapper.java  |    54 -
 .../taste/hadoop/item/ToUserVectorsReducer.java |    84 -
 .../hadoop/item/ToVectorAndPrefReducer.java     |    63 -
 .../hadoop/item/UserVectorSplitterMapper.java   |   116 -
 .../hadoop/item/VectorAndPrefsWritable.java     |    92 -
 .../taste/hadoop/item/VectorOrPrefWritable.java |   104 -
 .../preparation/PreparePreferenceMatrixJob.java |   115 -
 .../hadoop/preparation/ToItemVectorsMapper.java |    56 -
 .../preparation/ToItemVectorsReducer.java       |    38 -
 .../similarity/item/ItemSimilarityJob.java      |   233 -
 .../similarity/item/TopSimilarItemsQueue.java   |    60 -
 .../common/AbstractLongPrimitiveIterator.java   |    27 -
 .../mahout/cf/taste/impl/common/BitSet.java     |    93 -
 .../mahout/cf/taste/impl/common/Cache.java      |   178 -
 .../cf/taste/impl/common/FastByIDMap.java       |   661 -
 .../mahout/cf/taste/impl/common/FastIDSet.java  |   426 -
 .../mahout/cf/taste/impl/common/FastMap.java    |   729 -
 .../taste/impl/common/FixedRunningAverage.java  |    83 -
 .../common/FixedRunningAverageAndStdDev.java    |    51 -
 .../taste/impl/common/FullRunningAverage.java   |   109 -
 .../common/FullRunningAverageAndStdDev.java     |   107 -
 .../impl/common/InvertedRunningAverage.java     |    58 -
 .../common/InvertedRunningAverageAndStdDev.java |    63 -
 .../impl/common/LongPrimitiveArrayIterator.java |    93 -
 .../impl/common/LongPrimitiveIterator.java      |    39 -
 .../cf/taste/impl/common/RefreshHelper.java     |   122 -
 .../mahout/cf/taste/impl/common/Retriever.java  |    36 -
 .../cf/taste/impl/common/RunningAverage.java    |    67 -
 .../impl/common/RunningAverageAndStdDev.java    |    36 -
 .../common/SamplingLongPrimitiveIterator.java   |   111 -
 .../cf/taste/impl/common/SkippingIterator.java  |    35 -
 .../impl/common/WeightedRunningAverage.java     |   100 -
 .../common/WeightedRunningAverageAndStdDev.java |    89 -
 .../impl/common/jdbc/AbstractJDBCComponent.java |    88 -
 .../taste/impl/common/jdbc/EachRowIterator.java |    92 -
 .../impl/common/jdbc/ResultSetIterator.java     |    66 -
 .../AbstractDifferenceRecommenderEvaluator.java |   276 -
 ...eAbsoluteDifferenceRecommenderEvaluator.java |    59 -
 .../GenericRecommenderIRStatsEvaluator.java     |   237 -
 .../eval/GenericRelevantItemsDataSplitter.java  |    83 -
 .../cf/taste/impl/eval/IRStatisticsImpl.java    |    95 -
 .../mahout/cf/taste/impl/eval/LoadCallable.java |    40 -
 .../cf/taste/impl/eval/LoadEvaluator.java       |    61 -
 .../cf/taste/impl/eval/LoadStatistics.java      |    34 -
 .../eval/OrderBasedRecommenderEvaluator.java    |   431 -
 .../impl/eval/RMSRecommenderEvaluator.java      |    56 -
 .../cf/taste/impl/eval/StatsCallable.java       |    64 -
 .../cf/taste/impl/model/AbstractDataModel.java  |    53 -
 .../cf/taste/impl/model/AbstractIDMigrator.java |    66 -
 .../impl/model/AbstractJDBCIDMigrator.java      |   108 -
 .../impl/model/BooleanItemPreferenceArray.java  |   234 -
 .../cf/taste/impl/model/BooleanPreference.java  |    64 -
 .../impl/model/BooleanUserPreferenceArray.java  |   234 -
 .../impl/model/GenericBooleanPrefDataModel.java |   320 -
 .../cf/taste/impl/model/GenericDataModel.java   |   361 -
 .../impl/model/GenericItemPreferenceArray.java  |   301 -
 .../cf/taste/impl/model/GenericPreference.java  |    70 -
 .../impl/model/GenericUserPreferenceArray.java  |   307 -
 .../cf/taste/impl/model/MemoryIDMigrator.java   |    55 -
 .../taste/impl/model/MySQLJDBCIDMigrator.java   |    67 -
 .../PlusAnonymousConcurrentUserDataModel.java   |   352 -
 .../impl/model/PlusAnonymousUserDataModel.java  |   320 -
 .../PlusAnonymousUserLongPrimitiveIterator.java |    90 -
 .../cf/taste/impl/model/file/FileDataModel.java |   758 -
 .../taste/impl/model/file/FileIDMigrator.java   |   117 -
 .../neighborhood/AbstractUserNeighborhood.java  |    71 -
 .../neighborhood/CachingUserNeighborhood.java   |    69 -
 .../neighborhood/NearestNUserNeighborhood.java  |   122 -
 .../neighborhood/ThresholdUserNeighborhood.java |   104 -
 .../AbstractCandidateItemsStrategy.java         |    57 -
 .../impl/recommender/AbstractRecommender.java   |   140 -
 .../AllSimilarItemsCandidateItemsStrategy.java  |    50 -
 .../AllUnknownItemsCandidateItemsStrategy.java  |    41 -
 .../impl/recommender/ByRescoreComparator.java   |    65 -
 .../ByValueRecommendedItemComparator.java       |    43 -
 .../impl/recommender/CachingRecommender.java    |   251 -
 .../recommender/EstimatedPreferenceCapper.java  |    46 -
 .../GenericBooleanPrefItemBasedRecommender.java |    71 -
 .../GenericBooleanPrefUserBasedRecommender.java |    82 -
 .../GenericItemBasedRecommender.java            |   378 -
 .../recommender/GenericRecommendedItem.java     |    76 -
 .../GenericUserBasedRecommender.java            |   247 -
 .../recommender/ItemAverageRecommender.java     |   199 -
 .../recommender/ItemUserAverageRecommender.java |   240 -
 .../cf/taste/impl/recommender/NullRescorer.java |    86 -
 ...ItemsNeighborhoodCandidateItemsStrategy.java |    48 -
 .../impl/recommender/RandomRecommender.java     |    97 -
 .../SamplingCandidateItemsStrategy.java         |   165 -
 .../cf/taste/impl/recommender/SimilarUser.java  |    80 -
 .../cf/taste/impl/recommender/TopItems.java     |   211 -
 .../impl/recommender/svd/ALSWRFactorizer.java   |   312 -
 .../recommender/svd/AbstractFactorizer.java     |    94 -
 .../impl/recommender/svd/Factorization.java     |   137 -
 .../taste/impl/recommender/svd/Factorizer.java  |    30 -
 .../svd/FilePersistenceStrategy.java            |   139 -
 .../recommender/svd/NoPersistenceStrategy.java  |    37 -
 .../recommender/svd/ParallelSGDFactorizer.java  |   340 -
 .../recommender/svd/PersistenceStrategy.java    |    46 -
 .../recommender/svd/RatingSGDFactorizer.java    |   221 -
 .../recommender/svd/SVDPlusPlusFactorizer.java  |   178 -
 .../impl/recommender/svd/SVDPreference.java     |    41 -
 .../impl/recommender/svd/SVDRecommender.java    |   185 -
 .../impl/similarity/AbstractItemSimilarity.java |    64 -
 .../impl/similarity/AbstractSimilarity.java     |   343 -
 .../similarity/AveragingPreferenceInferrer.java |    85 -
 .../impl/similarity/CachingItemSimilarity.java  |   111 -
 .../impl/similarity/CachingUserSimilarity.java  |   104 -
 .../impl/similarity/CityBlockSimilarity.java    |    98 -
 .../similarity/EuclideanDistanceSimilarity.java |    67 -
 .../impl/similarity/GenericItemSimilarity.java  |   358 -
 .../impl/similarity/GenericUserSimilarity.java  |   238 -
 .../similarity/LogLikelihoodSimilarity.java     |   121 -
 .../impl/similarity/LongPairMatchPredicate.java |    40 -
 .../PearsonCorrelationSimilarity.java           |    93 -
 .../SpearmanCorrelationSimilarity.java          |   135 -
 .../TanimotoCoefficientSimilarity.java          |   126 -
 .../similarity/UncenteredCosineSimilarity.java  |    69 -
 .../file/FileItemItemSimilarityIterable.java    |    46 -
 .../file/FileItemItemSimilarityIterator.java    |    60 -
 .../similarity/file/FileItemSimilarity.java     |   137 -
 .../precompute/FileSimilarItemsWriter.java      |    67 -
 .../MultithreadedBatchItemSimilarities.java     |   230 -
 .../apache/mahout/cf/taste/model/DataModel.java |   199 -
 .../mahout/cf/taste/model/IDMigrator.java       |    63 -
 .../mahout/cf/taste/model/JDBCDataModel.java    |    43 -
 .../mahout/cf/taste/model/Preference.java       |    48 -
 .../mahout/cf/taste/model/PreferenceArray.java  |   143 -
 .../cf/taste/model/UpdatableIDMigrator.java     |    47 -
 .../cf/taste/neighborhood/UserNeighborhood.java |    40 -
 .../recommender/CandidateItemsStrategy.java     |    37 -
 .../mahout/cf/taste/recommender/IDRescorer.java |    47 -
 .../taste/recommender/ItemBasedRecommender.java |   145 -
 .../MostSimilarItemsCandidateItemsStrategy.java |    31 -
 .../cf/taste/recommender/RecommendedItem.java   |    41 -
 .../cf/taste/recommender/Recommender.java       |   132 -
 .../mahout/cf/taste/recommender/Rescorer.java   |    52 -
 .../taste/recommender/UserBasedRecommender.java |    54 -
 .../cf/taste/similarity/ItemSimilarity.java     |    64 -
 .../cf/taste/similarity/PreferenceInferrer.java |    47 -
 .../cf/taste/similarity/UserSimilarity.java     |    58 -
 .../precompute/BatchItemSimilarities.java       |    56 -
 .../similarity/precompute/SimilarItem.java      |    56 -
 .../similarity/precompute/SimilarItems.java     |    84 -
 .../precompute/SimilarItemsWriter.java          |    33 -
 .../classifier/AbstractVectorClassifier.java    |   248 -
 .../mahout/classifier/ClassifierResult.java     |    74 -
 .../mahout/classifier/ConfusionMatrix.java      |   444 -
 .../apache/mahout/classifier/OnlineLearner.java |    96 -
 .../classifier/RegressionResultAnalyzer.java    |   144 -
 .../mahout/classifier/ResultAnalyzer.java       |   132 -
 .../apache/mahout/classifier/df/Bagging.java    |    61 -
 .../apache/mahout/classifier/df/DFUtils.java    |   174 -
 .../mahout/classifier/df/DecisionForest.java    |   241 -
 .../mahout/classifier/df/ErrorEstimate.java     |    51 -
 .../df/builder/DecisionTreeBuilder.java         |   422 -
 .../df/builder/DefaultTreeBuilder.java          |   253 -
 .../classifier/df/builder/TreeBuilder.java      |    42 -
 .../apache/mahout/classifier/df/data/Data.java  |   281 -
 .../classifier/df/data/DataConverter.java       |    72 -
 .../mahout/classifier/df/data/DataLoader.java   |   255 -
 .../mahout/classifier/df/data/DataUtils.java    |    89 -
 .../mahout/classifier/df/data/Dataset.java      |   422 -
 .../classifier/df/data/DescriptorException.java |    28 -
 .../classifier/df/data/DescriptorUtils.java     |   110 -
 .../mahout/classifier/df/data/Instance.java     |    75 -
 .../df/data/conditions/Condition.java           |    57 -
 .../classifier/df/data/conditions/Equals.java   |    42 -
 .../df/data/conditions/GreaterOrEquals.java     |    42 -
 .../classifier/df/data/conditions/Lesser.java   |    42 -
 .../mahout/classifier/df/mapreduce/Builder.java |   333 -
 .../classifier/df/mapreduce/Classifier.java     |   238 -
 .../classifier/df/mapreduce/MapredMapper.java   |    75 -
 .../classifier/df/mapreduce/MapredOutput.java   |   120 -
 .../df/mapreduce/inmem/InMemBuilder.java        |   114 -
 .../df/mapreduce/inmem/InMemInputFormat.java    |   284 -
 .../df/mapreduce/inmem/InMemMapper.java         |   106 -
 .../df/mapreduce/inmem/package-info.java        |    22 -
 .../df/mapreduce/partial/PartialBuilder.java    |   158 -
 .../df/mapreduce/partial/Step1Mapper.java       |   168 -
 .../classifier/df/mapreduce/partial/TreeID.java |    58 -
 .../df/mapreduce/partial/package-info.java      |    16 -
 .../classifier/df/node/CategoricalNode.java     |   134 -
 .../apache/mahout/classifier/df/node/Leaf.java  |    95 -
 .../apache/mahout/classifier/df/node/Node.java  |    96 -
 .../classifier/df/node/NumericalNode.java       |   115 -
 .../classifier/df/ref/SequentialBuilder.java    |    78 -
 .../classifier/df/split/DefaultIgSplit.java     |   118 -
 .../mahout/classifier/df/split/IgSplit.java     |    35 -
 .../mahout/classifier/df/split/OptIgSplit.java  |   232 -
 .../classifier/df/split/RegressionSplit.java    |   177 -
 .../mahout/classifier/df/split/Split.java       |    68 -
 .../mahout/classifier/df/tools/Describe.java    |   166 -
 .../classifier/df/tools/ForestVisualizer.java   |   158 -
 .../mahout/classifier/df/tools/Frequencies.java |   122 -
 .../classifier/df/tools/FrequenciesJob.java     |   297 -
 .../classifier/df/tools/TreeVisualizer.java     |   264 -
 .../mahout/classifier/df/tools/UDistrib.java    |   212 -
 .../mahout/classifier/evaluation/Auc.java       |   233 -
 .../AbstractNaiveBayesClassifier.java           |    82 -
 .../classifier/naivebayes/BayesUtils.java       |   161 -
 .../ComplementaryNaiveBayesClassifier.java      |    43 -
 .../classifier/naivebayes/NaiveBayesModel.java  |   170 -
 .../StandardNaiveBayesClassifier.java           |    40 -
 .../naivebayes/test/BayesTestMapper.java        |    76 -
 .../naivebayes/test/TestNaiveBayesDriver.java   |   176 -
 .../training/ComplementaryThetaTrainer.java     |    83 -
 .../training/IndexInstancesMapper.java          |    53 -
 .../naivebayes/training/ThetaMapper.java        |    61 -
 .../naivebayes/training/TrainNaiveBayesJob.java |   177 -
 .../naivebayes/training/WeightsMapper.java      |    68 -
 .../sequencelearning/hmm/BaumWelchTrainer.java  |   161 -
 .../sequencelearning/hmm/HmmAlgorithms.java     |   306 -
 .../sequencelearning/hmm/HmmEvaluator.java      |   194 -
 .../sequencelearning/hmm/HmmModel.java          |   383 -
 .../sequencelearning/hmm/HmmTrainer.java        |   488 -
 .../sequencelearning/hmm/HmmUtils.java          |   360 -
 .../hmm/LossyHmmSerializer.java                 |    62 -
 .../hmm/RandomSequenceGenerator.java            |   102 -
 .../sequencelearning/hmm/ViterbiEvaluator.java  |   122 -
 .../sgd/AbstractOnlineLogisticRegression.java   |   317 -
 .../sgd/AdaptiveLogisticRegression.java         |   586 -
 .../mahout/classifier/sgd/CrossFoldLearner.java |   334 -
 .../mahout/classifier/sgd/CsvRecordFactory.java |   395 -
 .../mahout/classifier/sgd/DefaultGradient.java  |    49 -
 .../mahout/classifier/sgd/ElasticBandPrior.java |    76 -
 .../apache/mahout/classifier/sgd/Gradient.java  |    30 -
 .../mahout/classifier/sgd/GradientMachine.java  |   405 -
 .../org/apache/mahout/classifier/sgd/L1.java    |    59 -
 .../org/apache/mahout/classifier/sgd/L2.java    |    66 -
 .../mahout/classifier/sgd/MixedGradient.java    |    66 -
 .../mahout/classifier/sgd/ModelDissector.java   |   232 -
 .../mahout/classifier/sgd/ModelSerializer.java  |    67 -
 .../sgd/OnlineLogisticRegression.java           |   172 -
 .../classifier/sgd/PassiveAggressive.java       |   204 -
 .../classifier/sgd/PolymorphicWritable.java     |    46 -
 .../mahout/classifier/sgd/PriorFunction.java    |    45 -
 .../mahout/classifier/sgd/RankingGradient.java  |    85 -
 .../mahout/classifier/sgd/RecordFactory.java    |    47 -
 .../apache/mahout/classifier/sgd/TPrior.java    |    61 -
 .../mahout/classifier/sgd/UniformPrior.java     |    47 -
 .../mahout/classifier/sgd/package-info.java     |    23 -
 .../mahout/clustering/AbstractCluster.java      |   390 -
 .../org/apache/mahout/clustering/Cluster.java   |    90 -
 .../mahout/clustering/ClusteringUtils.java      |   306 -
 .../mahout/clustering/GaussianAccumulator.java  |    62 -
 .../org/apache/mahout/clustering/Model.java     |    93 -
 .../mahout/clustering/ModelDistribution.java    |    41 -
 .../clustering/OnlineGaussianAccumulator.java   |   107 -
 .../RunningSumsGaussianAccumulator.java         |    90 -
 .../clustering/UncommonDistributions.java       |   136 -
 .../apache/mahout/clustering/canopy/Canopy.java |    60 -
 .../clustering/canopy/CanopyClusterer.java      |   220 -
 .../clustering/canopy/CanopyConfigKeys.java     |    70 -
 .../mahout/clustering/canopy/CanopyDriver.java  |   379 -
 .../mahout/clustering/canopy/CanopyMapper.java  |    66 -
 .../mahout/clustering/canopy/CanopyReducer.java |    70 -
 .../ClusterClassificationConfigKeys.java        |    33 -
 .../classify/ClusterClassificationDriver.java   |   313 -
 .../classify/ClusterClassificationMapper.java   |   161 -
 .../clustering/classify/ClusterClassifier.java  |   231 -
 .../WeightedPropertyVectorWritable.java         |    95 -
 .../classify/WeightedVectorWritable.java        |    72 -
 .../fuzzykmeans/FuzzyKMeansClusterer.java       |    59 -
 .../fuzzykmeans/FuzzyKMeansDriver.java          |   324 -
 .../clustering/fuzzykmeans/FuzzyKMeansUtil.java |    76 -
 .../clustering/fuzzykmeans/SoftCluster.java     |    60 -
 .../iterator/AbstractClusteringPolicy.java      |    72 -
 .../mahout/clustering/iterator/CIMapper.java    |    71 -
 .../mahout/clustering/iterator/CIReducer.java   |    64 -
 .../iterator/CanopyClusteringPolicy.java        |    52 -
 .../clustering/iterator/ClusterIterator.java    |   219 -
 .../clustering/iterator/ClusterWritable.java    |    56 -
 .../clustering/iterator/ClusteringPolicy.java   |    66 -
 .../iterator/ClusteringPolicyWritable.java      |    55 -
 .../iterator/DistanceMeasureCluster.java        |    91 -
 .../iterator/FuzzyKMeansClusteringPolicy.java   |    90 -
 .../iterator/KMeansClusteringPolicy.java        |    64 -
 .../clustering/kernel/IKernelProfile.java       |    27 -
 .../kernel/TriangularKernelProfile.java         |    27 -
 .../mahout/clustering/kmeans/KMeansDriver.java  |   257 -
 .../mahout/clustering/kmeans/KMeansUtil.java    |    74 -
 .../mahout/clustering/kmeans/Kluster.java       |   117 -
 .../clustering/kmeans/RandomSeedGenerator.java  |   136 -
 .../mahout/clustering/kmeans/package-info.java  |     5 -
 .../lda/cvb/CVB0DocInferenceMapper.java         |    51 -
 .../mahout/clustering/lda/cvb/CVB0Driver.java   |   536 -
 .../CVB0TopicTermVectorNormalizerMapper.java    |    38 -
 .../clustering/lda/cvb/CachingCVB0Mapper.java   |   133 -
 .../lda/cvb/CachingCVB0PerplexityMapper.java    |   108 -
 .../cvb/InMemoryCollapsedVariationalBayes0.java |   492 -
 .../mahout/clustering/lda/cvb/ModelTrainer.java |   301 -
 .../mahout/clustering/lda/cvb/TopicModel.java   |   513 -
 .../apache/mahout/clustering/package-info.java  |    13 -
 .../spectral/AffinityMatrixInputJob.java        |    84 -
 .../spectral/AffinityMatrixInputMapper.java     |    78 -
 .../spectral/AffinityMatrixInputReducer.java    |    59 -
 .../spectral/IntDoublePairWritable.java         |    75 -
 .../apache/mahout/clustering/spectral/Keys.java |    31 -
 .../spectral/MatrixDiagonalizeJob.java          |   108 -
 .../clustering/spectral/UnitVectorizerJob.java  |    79 -
 .../mahout/clustering/spectral/VectorCache.java |   116 -
 .../spectral/VectorMatrixMultiplicationJob.java |   139 -
 .../clustering/spectral/VertexWritable.java     |   101 -
 .../spectral/kmeans/EigenSeedGenerator.java     |   120 -
 .../spectral/kmeans/SpectralKMeansDriver.java   |   243 -
 .../streaming/cluster/BallKMeans.java           |   456 -
 .../streaming/cluster/StreamingKMeans.java      |   368 -
 .../streaming/mapreduce/CentroidWritable.java   |    88 -
 .../mapreduce/StreamingKMeansDriver.java        |   493 -
 .../mapreduce/StreamingKMeansMapper.java        |   102 -
 .../mapreduce/StreamingKMeansReducer.java       |   109 -
 .../mapreduce/StreamingKMeansThread.java        |    92 -
 .../mapreduce/StreamingKMeansUtilsMR.java       |   154 -
 .../streaming/tools/ResplitSequenceFiles.java   |   149 -
 .../clustering/topdown/PathDirectory.java       |    94 -
 .../postprocessor/ClusterCountReader.java       |   103 -
 .../ClusterOutputPostProcessor.java             |   139 -
 .../ClusterOutputPostProcessorDriver.java       |   182 -
 .../ClusterOutputPostProcessorMapper.java       |    58 -
 .../ClusterOutputPostProcessorReducer.java      |    62 -
 .../org/apache/mahout/common/AbstractJob.java   |   648 -
 .../org/apache/mahout/common/ClassUtils.java    |    61 -
 .../apache/mahout/common/CommandLineUtil.java   |    68 -
 .../org/apache/mahout/common/HadoopUtil.java    |   435 -
 .../apache/mahout/common/IntPairWritable.java   |   270 -
 .../org/apache/mahout/common/IntegerTuple.java  |   176 -
 .../java/org/apache/mahout/common/LongPair.java |    80 -
 .../org/apache/mahout/common/MemoryUtil.java    |    99 -
 .../java/org/apache/mahout/common/Pair.java     |    99 -
 .../org/apache/mahout/common/Parameters.java    |    98 -
 .../org/apache/mahout/common/StringTuple.java   |   177 -
 .../org/apache/mahout/common/StringUtils.java   |    63 -
 .../apache/mahout/common/TimingStatistics.java  |   154 -
 .../commandline/DefaultOptionCreator.java       |   417 -
 .../distance/ChebyshevDistanceMeasure.java      |    63 -
 .../common/distance/CosineDistanceMeasure.java  |   119 -
 .../mahout/common/distance/DistanceMeasure.java |    48 -
 .../distance/EuclideanDistanceMeasure.java      |    41 -
 .../distance/MahalanobisDistanceMeasure.java    |   197 -
 .../distance/ManhattanDistanceMeasure.java      |    70 -
 .../distance/MinkowskiDistanceMeasure.java      |    93 -
 .../SquaredEuclideanDistanceMeasure.java        |    59 -
 .../distance/TanimotoDistanceMeasure.java       |    69 -
 .../distance/WeightedDistanceMeasure.java       |    93 -
 .../WeightedEuclideanDistanceMeasure.java       |    51 -
 .../WeightedManhattanDistanceMeasure.java       |    53 -
 .../iterator/CopyConstructorIterator.java       |    64 -
 .../common/iterator/CountingIterator.java       |    43 -
 .../common/iterator/FileLineIterable.java       |    88 -
 .../common/iterator/FileLineIterator.java       |   167 -
 .../iterator/FixedSizeSamplingIterator.java     |    59 -
 .../common/iterator/SamplingIterable.java       |    45 -
 .../common/iterator/SamplingIterator.java       |    73 -
 .../StableFixedSizeSamplingIterator.java        |    72 -
 .../common/iterator/StringRecordIterator.java   |    55 -
 .../iterator/sequencefile/PathFilters.java      |    81 -
 .../common/iterator/sequencefile/PathType.java  |    27 -
 .../sequencefile/SequenceFileDirIterable.java   |    84 -
 .../sequencefile/SequenceFileDirIterator.java   |   136 -
 .../SequenceFileDirValueIterable.java           |    83 -
 .../SequenceFileDirValueIterator.java           |   159 -
 .../sequencefile/SequenceFileIterable.java      |    68 -
 .../sequencefile/SequenceFileIterator.java      |   118 -
 .../sequencefile/SequenceFileValueIterable.java |    67 -
 .../sequencefile/SequenceFileValueIterator.java |    97 -
 .../mahout/common/lucene/AnalyzerUtils.java     |    61 -
 .../common/lucene/IteratorTokenStream.java      |    45 -
 .../common/lucene/TokenStreamIterator.java      |    57 -
 .../common/mapreduce/MergeVectorsCombiner.java  |    34 -
 .../common/mapreduce/MergeVectorsReducer.java   |    40 -
 .../common/mapreduce/TransposeMapper.java       |    49 -
 .../common/mapreduce/VectorSumCombiner.java     |    38 -
 .../common/mapreduce/VectorSumReducer.java      |    35 -
 .../org/apache/mahout/common/nlp/NGrams.java    |    94 -
 .../common/parameters/AbstractParameter.java    |   120 -
 .../common/parameters/ClassParameter.java       |    44 -
 .../common/parameters/DoubleParameter.java      |    33 -
 .../mahout/common/parameters/Parameter.java     |    62 -
 .../mahout/common/parameters/Parametered.java   |   206 -
 .../mahout/common/parameters/PathParameter.java |    33 -
 .../org/apache/mahout/driver/MahoutDriver.java  |   244 -
 .../apache/mahout/ep/EvolutionaryProcess.java   |   229 -
 .../main/java/org/apache/mahout/ep/Mapping.java |   206 -
 .../main/java/org/apache/mahout/ep/Payload.java |    36 -
 .../main/java/org/apache/mahout/ep/State.java   |   302 -
 .../java/org/apache/mahout/ep/package-info.java |    26 -
 .../mahout/math/DistributedRowMatrixWriter.java |    47 -
 .../org/apache/mahout/math/MatrixUtils.java     |   114 -
 .../mahout/math/MultiLabelVectorWritable.java   |    88 -
 .../math/als/AlternatingLeastSquaresSolver.java |   116 -
 ...itFeedbackAlternatingLeastSquaresSolver.java |   171 -
 .../math/decomposer/AsyncEigenVerifier.java     |    80 -
 .../mahout/math/decomposer/EigenStatus.java     |    50 -
 .../math/decomposer/SimpleEigenVerifier.java    |    41 -
 .../math/decomposer/SingularVectorVerifier.java |    25 -
 .../math/decomposer/hebbian/EigenUpdater.java   |    25 -
 .../math/decomposer/hebbian/HebbianSolver.java  |   342 -
 .../math/decomposer/hebbian/HebbianUpdater.java |    71 -
 .../math/decomposer/hebbian/TrainingState.java  |   143 -
 .../math/decomposer/lanczos/LanczosSolver.java  |   213 -
 .../math/decomposer/lanczos/LanczosState.java   |   107 -
 .../math/hadoop/DistributedRowMatrix.java       |   390 -
 .../math/hadoop/MatrixColumnMeansJob.java       |   236 -
 .../math/hadoop/MatrixMultiplicationJob.java    |   177 -
 .../mahout/math/hadoop/TimesSquaredJob.java     |   251 -
 .../apache/mahout/math/hadoop/TransposeJob.java |    85 -
 .../decomposer/DistributedLanczosSolver.java    |   299 -
 .../math/hadoop/decomposer/EigenVector.java     |    76 -
 .../hadoop/decomposer/EigenVerificationJob.java |   333 -
 .../decomposer/HdfsBackedLanczosState.java      |   237 -
 .../math/hadoop/similarity/SeedVectorUtil.java  |   104 -
 .../VectorDistanceInvertedMapper.java           |    71 -
 .../hadoop/similarity/VectorDistanceMapper.java |    80 -
 .../similarity/VectorDistanceSimilarityJob.java |   153 -
 .../similarity/cooccurrence/MutableElement.java |    50 -
 .../cooccurrence/RowSimilarityJob.java          |   562 -
 .../cooccurrence/TopElementsQueue.java          |    59 -
 .../hadoop/similarity/cooccurrence/Vectors.java |   199 -
 .../measures/CityBlockSimilarity.java           |    26 -
 .../measures/CooccurrenceCountSimilarity.java   |    32 -
 .../cooccurrence/measures/CosineSimilarity.java |    50 -
 .../measures/CountbasedMeasure.java             |    44 -
 .../measures/EuclideanDistanceSimilarity.java   |    57 -
 .../measures/LoglikelihoodSimilarity.java       |    34 -
 .../measures/PearsonCorrelationSimilarity.java  |    37 -
 .../measures/TanimotoCoefficientSimilarity.java |    34 -
 .../measures/VectorSimilarityMeasure.java       |    32 -
 .../measures/VectorSimilarityMeasures.java      |    46 -
 .../DistributedConjugateGradientSolver.java     |   172 -
 .../mahout/math/hadoop/stats/BasicStats.java    |   148 -
 .../StandardDeviationCalculatorMapper.java      |    55 -
 .../StandardDeviationCalculatorReducer.java     |    37 -
 .../math/hadoop/stats/VarianceTotals.java       |    68 -
 .../hadoop/stochasticsvd/ABtDenseOutJob.java    |   585 -
 .../math/hadoop/stochasticsvd/ABtJob.java       |   494 -
 .../mahout/math/hadoop/stochasticsvd/BtJob.java |   628 -
 .../stochasticsvd/DenseBlockWritable.java       |    83 -
 .../mahout/math/hadoop/stochasticsvd/Omega.java |   257 -
 .../mahout/math/hadoop/stochasticsvd/QJob.java  |   237 -
 .../math/hadoop/stochasticsvd/SSVDCli.java      |   201 -
 .../math/hadoop/stochasticsvd/SSVDHelper.java   |   322 -
 .../math/hadoop/stochasticsvd/SSVDSolver.java   |   662 -
 .../SparseRowBlockAccumulator.java              |    90 -
 .../stochasticsvd/SparseRowBlockWritable.java   |   159 -
 .../stochasticsvd/SplitPartitionedWritable.java |   151 -
 .../mahout/math/hadoop/stochasticsvd/UJob.java  |   170 -
 .../mahout/math/hadoop/stochasticsvd/VJob.java  |   224 -
 .../math/hadoop/stochasticsvd/YtYJob.java       |   220 -
 .../stochasticsvd/qr/GivensThinSolver.java      |   643 -
 .../hadoop/stochasticsvd/qr/GramSchmidt.java    |    52 -
 .../hadoop/stochasticsvd/qr/QRFirstStep.java    |   284 -
 .../hadoop/stochasticsvd/qr/QRLastStep.java     |   144 -
 .../mahout/math/neighborhood/BruteSearch.java   |   186 -
 .../math/neighborhood/FastProjectionSearch.java |   326 -
 .../mahout/math/neighborhood/HashedVector.java  |   103 -
 .../LocalitySensitiveHashSearch.java            |   295 -
 .../math/neighborhood/ProjectionSearch.java     |   233 -
 .../mahout/math/neighborhood/Searcher.java      |   155 -
 .../math/neighborhood/UpdatableSearcher.java    |    37 -
 .../math/random/AbstractSamplerFunction.java    |    39 -
 .../mahout/math/random/ChineseRestaurant.java   |   111 -
 .../apache/mahout/math/random/Empirical.java    |   124 -
 .../apache/mahout/math/random/IndianBuffet.java |   157 -
 .../org/apache/mahout/math/random/Missing.java  |    59 -
 .../apache/mahout/math/random/MultiNormal.java  |   118 -
 .../apache/mahout/math/random/Multinomial.java  |   202 -
 .../org/apache/mahout/math/random/Normal.java   |    40 -
 .../mahout/math/random/PoissonSampler.java      |    67 -
 .../mahout/math/random/RandomProjector.java     |   133 -
 .../org/apache/mahout/math/random/Sampler.java  |    25 -
 .../mahout/math/random/WeightedThing.java       |    71 -
 .../mahout/math/ssvd/SequentialBigSvd.java      |    69 -
 .../math/ssvd/SequentialOutOfCoreSvd.java       |   233 -
 .../mahout/math/stats/GlobalOnlineAuc.java      |   168 -
 .../mahout/math/stats/GroupedOnlineAuc.java     |   113 -
 .../org/apache/mahout/math/stats/OnlineAuc.java |    38 -
 .../mahout/math/stats/OnlineSummarizer.java     |    93 -
 .../org/apache/mahout/math/stats/Sampler.java   |    79 -
 .../mahout/vectorizer/DictionaryVectorizer.java |   422 -
 .../mahout/vectorizer/DocumentProcessor.java    |    99 -
 .../EncodedVectorsFromSequenceFiles.java        |   104 -
 .../mahout/vectorizer/EncodingMapper.java       |    92 -
 .../mahout/vectorizer/HighDFWordsPruner.java    |   147 -
 .../SimpleTextEncodingVectorizer.java           |    72 -
 .../SparseVectorsFromSequenceFiles.java         |   369 -
 .../java/org/apache/mahout/vectorizer/TF.java   |    30 -
 .../org/apache/mahout/vectorizer/TFIDF.java     |    31 -
 .../apache/mahout/vectorizer/Vectorizer.java    |    29 -
 .../mahout/vectorizer/VectorizerConfig.java     |   179 -
 .../org/apache/mahout/vectorizer/Weight.java    |    32 -
 .../collocations/llr/CollocCombiner.java        |    46 -
 .../collocations/llr/CollocDriver.java          |   284 -
 .../collocations/llr/CollocMapper.java          |   178 -
 .../collocations/llr/CollocReducer.java         |   176 -
 .../vectorizer/collocations/llr/Gram.java       |   239 -
 .../vectorizer/collocations/llr/GramKey.java    |   133 -
 .../llr/GramKeyGroupComparator.java             |    43 -
 .../collocations/llr/GramKeyPartitioner.java    |    40 -
 .../vectorizer/collocations/llr/LLRReducer.java |   170 -
 .../common/PartialVectorMergeReducer.java       |    89 -
 .../vectorizer/common/PartialVectorMerger.java  |   144 -
 .../document/SequenceFileTokenizerMapper.java   |    70 -
 .../encoders/AdaptiveWordValueEncoder.java      |    69 -
 .../encoders/CachingContinuousValueEncoder.java |    64 -
 .../encoders/CachingStaticWordValueEncoder.java |    66 -
 .../encoders/CachingTextValueEncoder.java       |    25 -
 .../encoders/CachingValueEncoder.java           |    64 -
 .../encoders/ConstantValueEncoder.java          |    57 -
 .../encoders/ContinuousValueEncoder.java        |    76 -
 .../mahout/vectorizer/encoders/Dictionary.java  |    54 -
 .../encoders/FeatureVectorEncoder.java          |   279 -
 .../encoders/InteractionValueEncoder.java       |   126 -
 .../encoders/LuceneTextValueEncoder.java        |   129 -
 .../encoders/StaticWordValueEncoder.java        |    80 -
 .../vectorizer/encoders/TextValueEncoder.java   |   142 -
 .../vectorizer/encoders/WordValueEncoder.java   |    81 -
 .../pruner/PrunedPartialVectorMergeReducer.java |    65 -
 .../vectorizer/pruner/WordsPrunerReducer.java   |    86 -
 .../vectorizer/term/TFPartialVectorReducer.java |   139 -
 .../vectorizer/term/TermCountCombiner.java      |    41 -
 .../mahout/vectorizer/term/TermCountMapper.java |    58 -
 .../vectorizer/term/TermCountReducer.java       |    55 -
 .../term/TermDocumentCountMapper.java           |    50 -
 .../term/TermDocumentCountReducer.java          |    41 -
 .../mahout/vectorizer/tfidf/TFIDFConverter.java |   361 -
 .../tfidf/TFIDFPartialVectorReducer.java        |   114 -
 .../src/main/resources/supplemental-models.xml  |   279 -
 community/mahout-mr/src/main/resources/version  |     1 -
 .../mahout/cf/taste/common/CommonTest.java      |    60 -
 .../cf/taste/hadoop/TasteHadoopUtilsTest.java   |    40 -
 .../cf/taste/hadoop/TopItemsQueueTest.java      |    72 -
 .../als/ParallelALSFactorizationJobTest.java    |   379 -
 .../cf/taste/hadoop/item/IDReaderTest.java      |    66 -
 .../taste/hadoop/item/RecommenderJobTest.java   |   928 -
 .../hadoop/item/ToUserVectorsReducerTest.java   |    74 -
 .../similarity/item/ItemSimilarityJobTest.java  |   269 -
 .../mahout/cf/taste/impl/TasteTestCase.java     |    98 -
 .../mahout/cf/taste/impl/common/BitSetTest.java |    74 -
 .../mahout/cf/taste/impl/common/CacheTest.java  |    61 -
 .../cf/taste/impl/common/FastByIDMapTest.java   |   147 -
 .../cf/taste/impl/common/FastIDSetTest.java     |   162 -
 .../cf/taste/impl/common/FastMapTest.java       |   228 -
 .../impl/common/InvertedRunningAverageTest.java |    88 -
 .../common/LongPrimitiveArrayIteratorTest.java  |    56 -
 .../cf/taste/impl/common/MockRefreshable.java   |    45 -
 .../cf/taste/impl/common/RefreshHelperTest.java |    70 -
 .../common/RunningAverageAndStdDevTest.java     |   107 -
 .../taste/impl/common/RunningAverageTest.java   |    75 -
 .../SamplingLongPrimitiveIteratorTest.java      |    91 -
 .../impl/common/WeightedRunningAverageTest.java |    85 -
 ...ericRecommenderIRStatsEvaluatorImplTest.java |    73 -
 .../taste/impl/eval/LoadEvaluationRunner.java   |    68 -
 .../model/BooleanItemPreferenceArrayTest.java   |    89 -
 .../model/BooleanUserPreferenceArrayTest.java   |    89 -
 .../taste/impl/model/GenericDataModelTest.java  |    51 -
 .../model/GenericItemPreferenceArrayTest.java   |   110 -
 .../model/GenericUserPreferenceArrayTest.java   |   110 -
 .../taste/impl/model/MemoryIDMigratorTest.java  |    57 -
 ...lusAnonymousConcurrentUserDataModelTest.java |   313 -
 .../impl/model/file/FileDataModelTest.java      |   216 -
 .../impl/model/file/FileIDMigratorTest.java     |   103 -
 .../impl/neighborhood/DummySimilarity.java      |    68 -
 .../neighborhood/NearestNNeighborhoodTest.java  |    53 -
 .../neighborhood/ThresholdNeighborhoodTest.java |    51 -
 ...lUnknownItemsCandidateItemsStrategyTest.java |    65 -
 .../recommender/CachingRecommenderTest.java     |    78 -
 .../GenericItemBasedRecommenderTest.java        |   324 -
 .../GenericUserBasedRecommenderTest.java        |   174 -
 .../recommender/ItemAverageRecommenderTest.java |    43 -
 .../ItemUserAverageRecommenderTest.java         |    43 -
 .../taste/impl/recommender/MockRecommender.java |    89 -
 .../impl/recommender/NullRescorerTest.java      |    47 -
 ...sNeighborhoodCandidateItemsStrategyTest.java |    75 -
 .../impl/recommender/RandomRecommenderTest.java |    41 -
 .../impl/recommender/ReversingRescorer.java     |    46 -
 .../SamplingCandidateItemsStrategyTest.java     |    71 -
 .../cf/taste/impl/recommender/TopItemsTest.java |   158 -
 .../recommender/svd/ALSWRFactorizerTest.java    |   208 -
 .../svd/FilePersistenceStrategyTest.java        |    53 -
 .../svd/ParallelSGDFactorizerTest.java          |   355 -
 .../recommender/svd/SVDRecommenderTest.java     |    86 -
 .../AveragingPreferenceInferrerTest.java        |    37 -
 .../EuclideanDistanceSimilarityTest.java        |   236 -
 .../similarity/GenericItemSimilarityTest.java   |   104 -
 .../similarity/LogLikelihoodSimilarityTest.java |    80 -
 .../PearsonCorrelationSimilarityTest.java       |   265 -
 .../impl/similarity/SimilarityTestCase.java     |    35 -
 .../SpearmanCorrelationSimilarityTest.java      |    80 -
 .../TanimotoCoefficientSimilarityTest.java      |   121 -
 .../similarity/file/FileItemSimilarityTest.java |   142 -
 .../MultithreadedBatchItemSimilaritiesTest.java |    98 -
 .../similarity/precompute/SimilarItemsTest.java |    50 -
 .../mahout/classifier/ClassifierData.java       |   102 -
 .../mahout/classifier/ConfusionMatrixTest.java  |   119 -
 .../RegressionResultAnalyzerTest.java           |   128 -
 .../classifier/df/DecisionForestTest.java       |   206 -
 .../df/builder/DecisionTreeBuilderTest.java     |    78 -
 .../df/builder/DefaultTreeBuilderTest.java      |    74 -
 .../df/builder/InfiniteRecursionTest.java       |    60 -
 .../classifier/df/data/DataConverterTest.java   |    60 -
 .../classifier/df/data/DataLoaderTest.java      |   350 -
 .../mahout/classifier/df/data/DataTest.java     |   396 -
 .../mahout/classifier/df/data/DatasetTest.java  |    72 -
 .../classifier/df/data/DescriptorUtilsTest.java |    92 -
 .../apache/mahout/classifier/df/data/Utils.java |   284 -
 .../mapreduce/inmem/InMemInputFormatTest.java   |   109 -
 .../df/mapreduce/inmem/InMemInputSplitTest.java |    77 -
 .../mapreduce/partial/PartialBuilderTest.java   |   197 -
 .../df/mapreduce/partial/Step1MapperTest.java   |   160 -
 .../df/mapreduce/partial/TreeIDTest.java        |    48 -
 .../mahout/classifier/df/node/NodeTest.java     |   108 -
 .../classifier/df/split/DefaultIgSplitTest.java |    78 -
 .../df/split/RegressionSplitTest.java           |    87 -
 .../classifier/df/tools/VisualizerTest.java     |   211 -
 .../mahout/classifier/evaluation/AucTest.java   |    86 -
 .../ComplementaryNaiveBayesClassifierTest.java  |    47 -
 .../naivebayes/NaiveBayesModelTest.java         |    36 -
 .../classifier/naivebayes/NaiveBayesTest.java   |   135 -
 .../naivebayes/NaiveBayesTestBase.java          |   135 -
 .../StandardNaiveBayesClassifierTest.java       |    47 -
 .../training/IndexInstancesMapperTest.java      |    85 -
 .../naivebayes/training/ThetaMapperTest.java    |    61 -
 .../naivebayes/training/WeightsMapperTest.java  |    60 -
 .../sequencelearning/hmm/HMMAlgorithmsTest.java |   164 -
 .../sequencelearning/hmm/HMMEvaluatorTest.java  |    63 -
 .../sequencelearning/hmm/HMMModelTest.java      |    32 -
 .../sequencelearning/hmm/HMMTestBase.java       |    73 -
 .../sequencelearning/hmm/HMMTrainerTest.java    |   163 -
 .../sequencelearning/hmm/HMMUtilsTest.java      |   161 -
 .../sgd/AdaptiveLogisticRegressionTest.java     |   186 -
 .../classifier/sgd/CsvRecordFactoryTest.java    |    90 -
 .../classifier/sgd/GradientMachineTest.java     |    41 -
 .../classifier/sgd/ModelSerializerTest.java     |   162 -
 .../mahout/classifier/sgd/OnlineBaseTest.java   |   160 -
 .../sgd/OnlineLogisticRegressionTest.java       |   330 -
 .../classifier/sgd/PassiveAggressiveTest.java   |    35 -
 .../mahout/clustering/ClusteringTestUtils.java  |   152 -
 .../mahout/clustering/TestClusterInterface.java |    83 -
 .../clustering/TestGaussianAccumulators.java    |   186 -
 .../clustering/canopy/TestCanopyCreation.java   |   674 -
 .../ClusterClassificationDriverTest.java        |   255 -
 .../fuzzykmeans/TestFuzzyKmeansClustering.java  |   202 -
 .../iterator/TestClusterClassifier.java         |   238 -
 .../clustering/kmeans/TestKmeansClustering.java |   385 -
 .../kmeans/TestRandomSeedGenerator.java         |   169 -
 .../clustering/lda/cvb/TestCVBModelTrainer.java |   138 -
 .../spectral/TestAffinityMatrixInputJob.java    |   145 -
 .../spectral/TestMatrixDiagonalizeJob.java      |   116 -
 .../spectral/TestUnitVectorizerJob.java         |    65 -
 .../clustering/spectral/TestVectorCache.java    |   110 -
 .../TestVectorMatrixMultiplicationJob.java      |    75 -
 .../spectral/kmeans/TestEigenSeedGenerator.java |   100 -
 .../streaming/cluster/BallKMeansTest.java       |   196 -
 .../clustering/streaming/cluster/DataUtils.java |    92 -
 .../streaming/cluster/StreamingKMeansTest.java  |   169 -
 .../mapreduce/StreamingKMeansTestMR.java        |   282 -
 .../tools/ResplitSequenceFilesTest.java         |    80 -
 .../clustering/topdown/PathDirectoryTest.java   |    65 -
 .../postprocessor/ClusterCountReaderTest.java   |   121 -
 .../ClusterOutputPostProcessorTest.java         |   205 -
 .../apache/mahout/common/AbstractJobTest.java   |   240 -
 .../DistributedCacheFileLocationTest.java       |    46 -
 .../mahout/common/DummyOutputCollector.java     |    57 -
 .../apache/mahout/common/DummyRecordWriter.java |   223 -
 .../mahout/common/DummyRecordWriterTest.java    |    45 -
 .../mahout/common/DummyStatusReporter.java      |    76 -
 .../mahout/common/IntPairWritableTest.java      |   114 -
 .../apache/mahout/common/MahoutTestCase.java    |   148 -
 .../org/apache/mahout/common/MockIterator.java  |    51 -
 .../apache/mahout/common/StringUtilsTest.java   |    70 -
 .../distance/CosineDistanceMeasureTest.java     |    66 -
 .../distance/DefaultDistanceMeasureTest.java    |   103 -
 .../DefaultWeightedDistanceMeasureTest.java     |    56 -
 .../common/distance/TestChebyshevMeasure.java   |    55 -
 .../distance/TestEuclideanDistanceMeasure.java  |    26 -
 .../TestMahalanobisDistanceMeasure.java         |    56 -
 .../distance/TestManhattanDistanceMeasure.java  |    26 -
 .../common/distance/TestMinkowskiMeasure.java   |    64 -
 .../distance/TestTanimotoDistanceMeasure.java   |    25 -
 ...estWeightedEuclideanDistanceMeasureTest.java |    25 -
 .../TestWeightedManhattanDistanceMeasure.java   |    26 -
 .../common/iterator/CountingIteratorTest.java   |    44 -
 .../mahout/common/iterator/SamplerCase.java     |   101 -
 .../common/iterator/TestFixedSizeSampler.java   |    33 -
 .../common/iterator/TestSamplingIterator.java   |    77 -
 .../iterator/TestStableFixedSizeSampler.java    |    33 -
 .../mahout/common/lucene/AnalyzerUtilsTest.java |    38 -
 .../apache/mahout/driver/MahoutDriverTest.java  |    32 -
 .../mahout/ep/EvolutionaryProcessTest.java      |    81 -
 .../apache/mahout/math/MatrixWritableTest.java  |   148 -
 .../java/org/apache/mahout/math/VarintTest.java |   189 -
 .../apache/mahout/math/VectorWritableTest.java  |   123 -
 .../apache/mahout/math/hadoop/MathHelper.java   |   236 -
 .../math/hadoop/TestDistributedRowMatrix.java   |   395 -
 .../TestDistributedLanczosSolver.java           |   132 -
 .../TestDistributedLanczosSolverCLI.java        |   190 -
 .../TestVectorDistanceSimilarityJob.java        |   238 -
 .../cooccurrence/RowSimilarityJobTest.java      |   214 -
 .../measures/VectorSimilarityMeasuresTest.java  |   133 -
 .../TestDistributedConjugateGradientSolver.java |    59 -
 ...stDistributedConjugateGradientSolverCLI.java |   111 -
 .../math/hadoop/stats/BasicStatsTest.java       |   121 -
 .../stochasticsvd/LocalSSVDPCASparseTest.java   |   296 -
 .../stochasticsvd/LocalSSVDSolverDenseTest.java |   206 -
 .../LocalSSVDSolverSparseSequentialTest.java    |   209 -
 .../hadoop/stochasticsvd/SSVDCommonTest.java    |   105 -
 .../hadoop/stochasticsvd/SSVDTestsHelper.java   |   172 -
 .../LocalitySensitiveHashSearchTest.java        |   119 -
 .../mahout/math/neighborhood/LumpyData.java     |    77 -
 .../math/neighborhood/SearchQualityTest.java    |   178 -
 .../math/neighborhood/SearchSanityTest.java     |   244 -
 .../math/ssvd/SequentialOutOfCoreSvdTest.java   |   195 -
 .../apache/mahout/math/stats/OnlineAucTest.java |   127 -
 .../apache/mahout/math/stats/SamplerTest.java   |    45 -
 .../vectorizer/DictionaryVectorizerTest.java    |   220 -
 .../vectorizer/DocumentProcessorTest.java       |    81 -
 .../EncodedVectorsFromSequenceFilesTest.java    |   126 -
 .../vectorizer/HighDFWordsPrunerTest.java       |   154 -
 .../vectorizer/RandomDocumentGenerator.java     |    69 -
 .../SparseVectorsFromSequenceFilesTest.java     |   203 -
 .../collocations/llr/CollocMapperTest.java      |   180 -
 .../collocations/llr/CollocReducerTest.java     |    86 -
 .../llr/GramKeyGroupComparatorTest.java         |    45 -
 .../llr/GramKeyPartitionerTest.java             |    54 -
 .../collocations/llr/GramKeyTest.java           |   106 -
 .../vectorizer/collocations/llr/GramTest.java   |   215 -
 .../collocations/llr/LLRReducerTest.java        |   116 -
 .../vectorizer/encoders/CachingEncoderTest.java |    48 -
 .../encoders/ConstantValueEncoderTest.java      |    74 -
 .../encoders/ContinuousValueEncoderTest.java    |    88 -
 .../encoders/InteractionValueEncoderTest.java   |   103 -
 .../encoders/TextValueEncoderTest.java          |    99 -
 .../encoders/WordLikeValueEncoderTest.java      |    99 -
 .../mahout-mr/src/test/resources/FPGsynth.dat   |   193 -
 .../mahout-mr/src/test/resources/cancer.csv     |   684 -
 community/mahout-mr/src/test/resources/iris.csv |   151 -
 .../mahout-mr/src/test/resources/retail.dat     | 88162 -----------------
 .../retail_results_with_min_sup_100.dat         |  6438 --
 community/mahout-mr/src/test/resources/sgd.csv  |    61 -
 .../mahout-mr/src/test/resources/word-list.txt  |   512 -
 engine/hdfs/pom.xml                             |    26 +-
 pom.xml                                         |    22 +-
 1838 files changed, 304042 insertions(+), 304003 deletions(-)
----------------------------------------------------------------------



[30/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt b/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt
new file mode 100644
index 0000000..d87c031
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt
@@ -0,0 +1,7 @@
+u1,iphone
+u1,ipad
+u2,nexus
+u2,galaxy
+u3,surface
+u4,iphone
+u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt b/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt
new file mode 100644
index 0000000..09ad9b6
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt
@@ -0,0 +1,12 @@
+u1,ipad
+u1,nexus
+u1,galaxy
+u2,iphone
+u2,ipad
+u2,nexus
+u2,galaxy
+u3,surface
+u3,nexus
+u4,iphone
+u4,ipad
+u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv b/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv
new file mode 100644
index 0000000..46ea564
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","xx","xy","yy","c","a","b"
+0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
+0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
+0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
+0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
+0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
+0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
+0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
+0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
+0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
+0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
+0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
+0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
+0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
+0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
+0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
+0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
+0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
+0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
+0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
+0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
+0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
+0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
+0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
+0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
+0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
+0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
+0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
+0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
+0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
+0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
+0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
+0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
+0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
+0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
+0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
+0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
+0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
+0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
+0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
+0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/donut.csv b/community/mahout-mr/mr-examples/src/main/resources/donut.csv
new file mode 100644
index 0000000..33ba3b7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/donut.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
+0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
+0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
+0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
+0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
+0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
+0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
+0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
+0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
+0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
+0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
+0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
+0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
+0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
+0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
+0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
+0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
+0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
+0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
+0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
+0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
+0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
+0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
+0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
+0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
+0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
+0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
+0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
+0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
+0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
+0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
+0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
+0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
+0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
+0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
+0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
+0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
+0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
+0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
+0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
+0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/test-data.csv b/community/mahout-mr/mr-examples/src/main/resources/test-data.csv
new file mode 100644
index 0000000..ab683cd
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/test-data.csv
@@ -0,0 +1,61 @@
+"V1","V2","V3","V4","V5","V6","V7","V8","y"
+1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
+1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
+1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
+1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
+1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
+1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
+1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
+1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
+1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
+1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
+1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
+1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
+1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
+1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
+1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
+1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
+1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
+1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
+1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
+1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
+1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
+1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
+1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
+1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
+1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
+1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
+1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
+1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
+1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
+1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
+1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
+1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
+1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
+1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
+1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
+1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
+1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
+1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
+1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
+1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
+1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
+1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
+1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
+1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
+1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
+1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
+1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
+1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
+1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
+1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
+1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
+1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
+1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
+1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
+1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
+1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
+1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
+1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
+1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
+1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
new file mode 100644
index 0000000..e849011
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+public class LogisticModelParametersTest extends MahoutTestCase {
+
+  @Test
+  public void serializationWithoutCsv() throws IOException {
+    LogisticModelParameters params = new LogisticModelParameters();
+    params.setTargetVariable("foo");
+    params.setTypeMap(Collections.<String, String>emptyMap());
+    params.setTargetCategories(Arrays.asList("foo", "bar"));
+    params.setNumFeatures(1);
+    params.createRegression();
+
+    //MAHOUT-1196 should work without "csv" being set
+    params.saveTo(new ByteArrayOutputStream());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
new file mode 100644
index 0000000..c8e4879
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.examples.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.junit.Test;
+
+public class ModelDissectorTest extends MahoutTestCase {
+  @Test
+  public void testCategoryOrdering() {
+    ModelDissector.Weight w = new ModelDissector.Weight("a", new DenseVector(new double[]{-2, -5, 5, 2, 4, 1, 0}), 4);
+    assertEquals(1, w.getCategory(0), 0);
+    assertEquals(-5, w.getWeight(0), 0);
+
+    assertEquals(2, w.getCategory(1), 0);
+    assertEquals(5, w.getWeight(1), 0);
+
+    assertEquals(4, w.getCategory(2), 0);
+    assertEquals(4, w.getWeight(2), 0);
+
+    assertEquals(0, w.getCategory(3), 0);
+    assertEquals(-2, w.getWeight(3), 0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
new file mode 100644
index 0000000..4cde692
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Sets;
+import com.google.common.io.Resources;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.examples.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+public class TrainLogisticTest extends MahoutTestCase {
+
+  @Test
+  public void example131() throws Exception {
+    String outputFile = getTestTempFile("model").getAbsolutePath();
+
+    StringWriter sw = new StringWriter();
+    PrintWriter pw = new PrintWriter(sw, true);
+    TrainLogistic.mainToOutput(new String[]{
+        "--input", "donut.csv",
+        "--output", outputFile,
+        "--target", "color", "--categories", "2",
+        "--predictors", "x", "y",
+        "--types", "numeric",
+        "--features", "20",
+        "--passes", "100",
+        "--rate", "50"
+    }, pw);
+    String trainOut = sw.toString();
+    assertTrue(trainOut.contains("x -0.7"));
+    assertTrue(trainOut.contains("y -0.4"));
+
+    LogisticModelParameters lmp = TrainLogistic.getParameters();
+    assertEquals(1.0e-4, lmp.getLambda(), 1.0e-9);
+    assertEquals(20, lmp.getNumFeatures());
+    assertTrue(lmp.useBias());
+    assertEquals("color", lmp.getTargetVariable());
+    CsvRecordFactory csv = lmp.getCsvRecordFactory();
+    assertEquals("[1, 2]", new TreeSet<>(csv.getTargetCategories()).toString());
+    assertEquals("[Intercept Term, x, y]", Sets.newTreeSet(csv.getPredictors()).toString());
+
+    // verify model by building dissector
+    AbstractVectorClassifier model = TrainLogistic.getModel();
+    List<String> data = Resources.readLines(Resources.getResource("donut.csv"), Charsets.UTF_8);
+    Map<String, Double> expectedValues = ImmutableMap.of("x", -0.7, "y", -0.43, "Intercept Term", -0.15);
+    verifyModel(lmp, csv, data, model, expectedValues);
+
+    // test saved model
+    try (InputStream in = new FileInputStream(new File(outputFile))){
+      LogisticModelParameters lmpOut = LogisticModelParameters.loadFrom(in);
+      CsvRecordFactory csvOut = lmpOut.getCsvRecordFactory();
+      csvOut.firstLine(data.get(0));
+      OnlineLogisticRegression lrOut = lmpOut.createRegression();
+      verifyModel(lmpOut, csvOut, data, lrOut, expectedValues);
+    }
+
+    sw = new StringWriter();
+    pw = new PrintWriter(sw, true);
+    RunLogistic.mainToOutput(new String[]{
+        "--input", "donut.csv",
+        "--model", outputFile,
+        "--auc",
+        "--confusion"
+    }, pw);
+    trainOut = sw.toString();
+    assertTrue(trainOut.contains("AUC = 0.57"));
+    assertTrue(trainOut.contains("confusion: [[27.0, 13.0], [0.0, 0.0]]"));
+  }
+
+  @Test
+  public void example132() throws Exception {
+    String outputFile = getTestTempFile("model").getAbsolutePath();
+
+    StringWriter sw = new StringWriter();
+    PrintWriter pw = new PrintWriter(sw, true);
+    TrainLogistic.mainToOutput(new String[]{
+        "--input", "donut.csv",
+        "--output", outputFile,
+        "--target", "color",
+        "--categories", "2",
+        "--predictors", "x", "y", "a", "b", "c",
+        "--types", "numeric",
+        "--features", "20",
+        "--passes", "100",
+        "--rate", "50"
+    }, pw);
+
+    String trainOut = sw.toString();
+    assertTrue(trainOut.contains("a 0."));
+    assertTrue(trainOut.contains("b -1."));
+    assertTrue(trainOut.contains("c -25."));
+
+    sw = new StringWriter();
+    pw = new PrintWriter(sw, true);
+    RunLogistic.mainToOutput(new String[]{
+        "--input", "donut.csv",
+        "--model", outputFile,
+        "--auc",
+        "--confusion"
+    }, pw);
+    trainOut = sw.toString();
+    assertTrue(trainOut.contains("AUC = 1.00"));
+
+    sw = new StringWriter();
+    pw = new PrintWriter(sw, true);
+    RunLogistic.mainToOutput(new String[]{
+        "--input", "donut-test.csv",
+        "--model", outputFile,
+        "--auc",
+        "--confusion"
+    }, pw);
+    trainOut = sw.toString();
+    assertTrue(trainOut.contains("AUC = 0.9"));
+  }
+
+  private static void verifyModel(LogisticModelParameters lmp,
+                                  RecordFactory csv,
+                                  List<String> data,
+                                  AbstractVectorClassifier model,
+                                  Map<String, Double> expectedValues) {
+    ModelDissector md = new ModelDissector();
+    for (String line : data.subList(1, data.size())) {
+      Vector v = new DenseVector(lmp.getNumFeatures());
+      csv.getTraceDictionary().clear();
+      csv.processLine(line, v);
+      md.update(v, csv.getTraceDictionary(), model);
+    }
+
+    // check right variables are present
+    List<ModelDissector.Weight> weights = md.summary(10);
+    Set<String> expected = Sets.newHashSet(expectedValues.keySet());
+    for (ModelDissector.Weight weight : weights) {
+      assertTrue(expected.remove(weight.getFeature()));
+      assertEquals(expectedValues.get(weight.getFeature()), weight.getWeight(), 0.1);
+    }
+    assertEquals(0, expected.size());
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
new file mode 100644
index 0000000..6e43b97
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class ClustersFilterTest extends MahoutTestCase {
+
+  private Configuration configuration;
+  private Path output;
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    configuration = getConfiguration();
+    output = getTestTempDirPath();
+  }
+
+  @Test
+  public void testAcceptNotFinal() throws Exception {
+    Path path0 = new Path(output, "clusters-0");
+    Path path1 = new Path(output, "clusters-1");
+
+    path0.getFileSystem(configuration).createNewFile(path0);
+    path1.getFileSystem(configuration).createNewFile(path1);
+
+    PathFilter clustersFilter = new ClustersFilter();
+
+    assertTrue(clustersFilter.accept(path0));
+    assertTrue(clustersFilter.accept(path1));
+  }
+
+  @Test
+  public void testAcceptFinalPath() throws IOException {
+    Path path0 = new Path(output, "clusters-0");
+    Path path1 = new Path(output, "clusters-1");
+    Path path2 = new Path(output, "clusters-2");
+    Path path3Final = new Path(output, "clusters-3-final");
+
+    path0.getFileSystem(configuration).createNewFile(path0);
+    path1.getFileSystem(configuration).createNewFile(path1);
+    path2.getFileSystem(configuration).createNewFile(path2);
+    path3Final.getFileSystem(configuration).createNewFile(path3Final);
+
+    PathFilter clustersFilter = new ClustersFilter();
+
+    assertTrue(clustersFilter.accept(path0));
+    assertTrue(clustersFilter.accept(path1));
+    assertTrue(clustersFilter.accept(path2));
+    assertTrue(clustersFilter.accept(path3Final));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
new file mode 100644
index 0000000..4d81e3f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.examples;
+
+/**
+ * This class should not exist. It's here to work around some bizarre problem in Maven
+ * dependency management wherein it can see methods in {@link org.apache.mahout.common.MahoutTestCase}
+ * but not constants. Duplicated here to make it jive.
+ */
+public abstract class MahoutTestCase extends org.apache.mahout.common.MahoutTestCase {
+
+  /** "Close enough" value for floating-point comparisons. */
+  public static final double EPSILON = 0.000001;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/country.txt b/community/mahout-mr/mr-examples/src/test/resources/country.txt
new file mode 100644
index 0000000..6a22091
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/country.txt
@@ -0,0 +1,229 @@
+Afghanistan
+Albania
+Algeria
+American Samoa
+Andorra
+Angola
+Anguilla
+Antigua and Barbuda
+Argentina
+Armenia
+Aruba
+Australia
+Austria
+Azerbaijan
+Bahamas
+Bangladesh
+Barbados
+Belarus
+Belgium
+Belize
+Benin
+Bermuda
+Bhutan
+Bolivia
+Bosnia and Herzegovina
+Botswana
+Bouvet Island
+Brazil
+British Indian Ocean Territory
+Brunei Darussalam
+Bulgaria
+Burkina Faso
+Burundi
+Cambodia
+Cameroon
+Canada
+Cape Verde
+Cayman Islands
+Central African Republic
+Chad
+Chile
+China
+Christmas Island
+Cocos  Islands
+Colombia
+Comoros
+Congo
+Cook Islands
+Costa Rica
+Croatia
+Côte d'Ivoire
+Cuba
+Cyprus
+Czech Republic
+Djibouti
+Dominica
+Dominican Republic
+Ecuador
+Egypt
+El Salvador
+Equatorial Guinea
+Eritrea
+Estonia
+Ethiopia
+Falkland Islands 
+Faroe Islands
+Fiji
+Finland
+France
+French Guiana
+French Polynesia
+French Southern Territories
+Gabon
+Georgia
+Germany
+Ghana
+Gibraltar
+Greece
+Greenland
+Grenada
+Guadeloupe
+Guam
+Guatemala
+Guernsey
+Guinea
+Guinea-Bissau
+Guyana
+Haiti
+Honduras
+Hong Kong
+Hungary
+Iceland
+India
+Indonesia
+Iran
+Iraq
+Ireland
+Isle of Man
+Israel
+Italy
+Japan
+Jersey
+Jordan
+Kazakhstan
+Kenya
+Kiribati
+Korea
+Kuwait
+Kyrgyzstan
+Latvia
+Lebanon
+Lesotho
+Liberia
+Liechtenstein
+Lithuania
+Luxembourg
+Macedonia
+Madagascar
+Malawi
+Malaysia
+Maldives
+Mali
+Malta
+Marshall Islands
+Martinique
+Mauritania
+Mauritius
+Mayotte
+Mexico
+Micronesia
+Moldova
+Monaco
+Mongolia
+Montenegro
+Montserrat
+Morocco
+Mozambique
+Myanmar
+Namibia
+Nauru
+Nepal
+Netherlands
+Netherlands Antilles
+New Caledonia
+New Zealand
+Nicaragua
+Niger
+Nigeria
+Niue
+Norfolk Island
+Northern Mariana Islands
+Norway
+Oman
+Pakistan
+Palau
+Palestinian Territory
+Panama
+Papua New Guinea
+Paraguay
+Peru
+Philippines
+Pitcairn
+Poland
+Portugal
+Puerto Rico
+Qatar
+Réunion
+Russian Federation
+Rwanda
+Saint Barthélemy
+Saint Helena
+Saint Kitts and Nevis
+Saint Lucia
+Saint Martin 
+Saint Pierre and Miquelon
+Saint Vincent and the Grenadines
+Samoa
+San Marino
+Sao Tome and Principe
+Saudi Arabia
+Senegal
+Serbia
+Seychelles
+Sierra Leone
+Singapore
+Slovakia
+Slovenia
+Solomon Islands
+Somalia
+South Africa
+South Georgia and the South Sandwich Islands
+Spain
+Sri Lanka
+Sudan
+Suriname
+Svalbard and Jan Mayen
+Swaziland
+Sweden
+Switzerland
+Syrian Arab Republic
+Taiwan
+Tanzania
+Thailand
+Timor-Leste
+Togo
+Tokelau
+Tonga
+Trinidad and Tobago
+Tunisia
+Turkey
+Turkmenistan
+Turks and Caicos Islands
+Tuvalu
+Ukraine
+United Arab Emirates
+United Kingdom
+United States
+United States Minor Outlying Islands
+Uruguay
+Uzbekistan
+Vanuatu
+Vatican 
+Venezuela
+Vietnam
+Virgin Islands
+Wallis and Futuna
+Yemen
+Zambia
+Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/country10.txt b/community/mahout-mr/mr-examples/src/test/resources/country10.txt
new file mode 100644
index 0000000..97a63e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/country10.txt
@@ -0,0 +1,10 @@
+Australia
+Austria
+Bahamas
+Canada
+Colombia
+Cuba
+Panama
+Pakistan
+United Kingdom
+Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/country2.txt b/community/mahout-mr/mr-examples/src/test/resources/country2.txt
new file mode 100644
index 0000000..f4b4f61
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/country2.txt
@@ -0,0 +1,2 @@
+United States
+United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/subjects.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/subjects.txt b/community/mahout-mr/mr-examples/src/test/resources/subjects.txt
new file mode 100644
index 0000000..f52ae33
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/subjects.txt
@@ -0,0 +1,2 @@
+Science
+History

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos b/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos
new file mode 100644
index 0000000..94a63d6
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos
@@ -0,0 +1,32 @@
+IGNORED
+LABEL, B, M
+NUMERICAL, 6.9, 28.2
+NUMERICAL, 9.7, 39.3
+NUMERICAL, 43.7, 188.5
+NUMERICAL, 143.5, 2501.0
+NUMERICAL, 0.0, 0.2
+NUMERICAL, 0.0, 0.4
+NUMERICAL, 0.0, 0.5
+NUMERICAL, 0.0, 0.3
+NUMERICAL, 0.1, 0.4 
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.1, 2.9
+NUMERICAL, 0.3, 4.9
+NUMERICAL, 0.7, 22.0
+NUMERICAL, 6.8, 542.3
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.0, 0.2
+NUMERICAL, 0.0, 0.4
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 7.9, 36.1
+NUMERICAL, 12.0, 49.6
+NUMERICAL, 50.4, 251.2
+NUMERICAL, 185.2, 4254.0
+NUMERICAL, 0.0, 0.3
+NUMERICAL, 0.0, 1.1
+NUMERICAL, 0.0, 1.3
+NUMERICAL, 0.0, 0.3
+NUMERICAL, 0.1, 0.7
+NUMERICAL, 0.0, 0.3 


[45/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
deleted file mode 100644
index b2ce8b1..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.mahout.math.stats.GlobalOnlineAuc;
-import org.apache.mahout.math.stats.GroupedOnlineAuc;
-import org.apache.mahout.math.stats.OnlineAuc;
-
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-public class AdaptiveLogisticModelParameters extends LogisticModelParameters {
-
-  private AdaptiveLogisticRegression alr;
-  private int interval = 800;
-  private int averageWindow = 500;
-  private int threads = 4;
-  private String prior = "L1";
-  private double priorOption = Double.NaN;
-  private String auc = null;
-
-  public AdaptiveLogisticRegression createAdaptiveLogisticRegression() {
-
-    if (alr == null) {
-      alr = new AdaptiveLogisticRegression(getMaxTargetCategories(),
-                                           getNumFeatures(), createPrior(prior, priorOption));
-      alr.setInterval(interval);
-      alr.setAveragingWindow(averageWindow);
-      alr.setThreadCount(threads);
-      alr.setAucEvaluator(createAUC(auc));
-    }
-    return alr;
-  }
-
-  public void checkParameters() {
-    if (prior != null) {
-      String priorUppercase = prior.toUpperCase(Locale.ENGLISH).trim();
-      if (("TP".equals(priorUppercase) || "EBP".equals(priorUppercase)) && Double.isNaN(priorOption)) {
-        throw new IllegalArgumentException("You must specify a double value for TPrior and ElasticBandPrior.");
-      }
-    }
-  }
-
-  private static PriorFunction createPrior(String cmd, double priorOption) {
-    if (cmd == null) {
-      return null;
-    }
-    if ("L1".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
-      return new L1();
-    }
-    if ("L2".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
-      return new L2();
-    }
-    if ("UP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
-      return new UniformPrior();
-    }
-    if ("TP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
-      return new TPrior(priorOption);
-    }
-    if ("EBP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
-      return new ElasticBandPrior(priorOption);
-    }
-
-    return null;
-  }
-
-  private static OnlineAuc createAUC(String cmd) {
-    if (cmd == null) {
-      return null;
-    }
-    if ("GLOBAL".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
-      return new GlobalOnlineAuc();
-    }
-    if ("GROUPED".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
-      return new GroupedOnlineAuc();
-    }
-    return null;
-  }
-
-  @Override
-  public void saveTo(OutputStream out) throws IOException {
-    if (alr != null) {
-      alr.close();
-    }
-    setTargetCategories(getCsvRecordFactory().getTargetCategories());
-    write(new DataOutputStream(out));
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    out.writeUTF(getTargetVariable());
-    out.writeInt(getTypeMap().size());
-    for (Map.Entry<String, String> entry : getTypeMap().entrySet()) {
-      out.writeUTF(entry.getKey());
-      out.writeUTF(entry.getValue());
-    }
-    out.writeInt(getNumFeatures());
-    out.writeInt(getMaxTargetCategories());
-    out.writeInt(getTargetCategories().size());
-    for (String category : getTargetCategories()) {
-      out.writeUTF(category);
-    }
-
-    out.writeInt(interval);
-    out.writeInt(averageWindow);
-    out.writeInt(threads);
-    out.writeUTF(prior);
-    out.writeDouble(priorOption);
-    out.writeUTF(auc);
-
-    // skip csv
-    alr.write(out);
-  }
-
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    setTargetVariable(in.readUTF());
-    int typeMapSize = in.readInt();
-    Map<String, String> typeMap = new HashMap<>(typeMapSize);
-    for (int i = 0; i < typeMapSize; i++) {
-      String key = in.readUTF();
-      String value = in.readUTF();
-      typeMap.put(key, value);
-    }
-    setTypeMap(typeMap);
-
-    setNumFeatures(in.readInt());
-    setMaxTargetCategories(in.readInt());
-    int targetCategoriesSize = in.readInt();
-    List<String> targetCategories = new ArrayList<>(targetCategoriesSize);
-    for (int i = 0; i < targetCategoriesSize; i++) {
-      targetCategories.add(in.readUTF());
-    }
-    setTargetCategories(targetCategories);
-
-    interval = in.readInt();
-    averageWindow = in.readInt();
-    threads = in.readInt();
-    prior = in.readUTF();
-    priorOption = in.readDouble();
-    auc = in.readUTF();
-
-    alr = new AdaptiveLogisticRegression();
-    alr.readFields(in);
-  }
-
-
-  private static AdaptiveLogisticModelParameters loadFromStream(InputStream in) throws IOException {
-    AdaptiveLogisticModelParameters result = new AdaptiveLogisticModelParameters();
-    result.readFields(new DataInputStream(in));
-    return result;
-  }
-
-  public static AdaptiveLogisticModelParameters loadFromFile(File in) throws IOException {
-    try (InputStream input = new FileInputStream(in)) {
-      return loadFromStream(input);
-    }
-  }
-
-  public int getInterval() {
-    return interval;
-  }
-
-  public void setInterval(int interval) {
-    this.interval = interval;
-  }
-
-  public int getAverageWindow() {
-    return averageWindow;
-  }
-
-  public void setAverageWindow(int averageWindow) {
-    this.averageWindow = averageWindow;
-  }
-
-  public int getThreads() {
-    return threads;
-  }
-
-  public void setThreads(int threads) {
-    this.threads = threads;
-  }
-
-  public String getPrior() {
-    return prior;
-  }
-
-  public void setPrior(String prior) {
-    this.prior = prior;
-  }
-
-  public String getAuc() {
-    return auc;
-  }
-
-  public void setAuc(String auc) {
-    this.auc = auc;
-  }
-
-  public double getPriorOption() {
-    return priorOption;
-  }
-
-  public void setPriorOption(double priorOption) {
-    this.priorOption = priorOption;
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
deleted file mode 100644
index e762924..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Preconditions;
-import com.google.common.io.Closeables;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.hadoop.io.Writable;
-
-/**
- * Encapsulates everything we need to know about a model and how it reads and vectorizes its input.
- * This encapsulation allows us to coherently save and restore a model from a file.  This also
- * allows us to keep command line arguments that affect learning in a coherent way.
- */
-public class LogisticModelParameters implements Writable {
-  private String targetVariable;
-  private Map<String, String> typeMap;
-  private int numFeatures;
-  private boolean useBias;
-  private int maxTargetCategories;
-  private List<String> targetCategories;
-  private double lambda;
-  private double learningRate;
-  private CsvRecordFactory csv;
-  private OnlineLogisticRegression lr;
-
-  /**
-   * Returns a CsvRecordFactory compatible with this logistic model.  The reason that this is tied
-   * in here is so that we have access to the list of target categories when it comes time to save
-   * the model.  If the input isn't CSV, then calling setTargetCategories before calling saveTo will
-   * suffice.
-   *
-   * @return The CsvRecordFactory.
-   */
-  public CsvRecordFactory getCsvRecordFactory() {
-    if (csv == null) {
-      csv = new CsvRecordFactory(getTargetVariable(), getTypeMap())
-              .maxTargetValue(getMaxTargetCategories())
-              .includeBiasTerm(useBias());
-      if (targetCategories != null) {
-        csv.defineTargetCategories(targetCategories);
-      }
-    }
-    return csv;
-  }
-
-  /**
-   * Creates a logistic regression trainer using the parameters collected here.
-   *
-   * @return The newly allocated OnlineLogisticRegression object
-   */
-  public OnlineLogisticRegression createRegression() {
-    if (lr == null) {
-      lr = new OnlineLogisticRegression(getMaxTargetCategories(), getNumFeatures(), new L1())
-              .lambda(getLambda())
-              .learningRate(getLearningRate())
-              .alpha(1 - 1.0e-3);
-    }
-    return lr;
-  }
-
-  /**
-   * Saves a model to an output stream.
-   */
-  public void saveTo(OutputStream out) throws IOException {
-    Closeables.close(lr, false);
-    targetCategories = getCsvRecordFactory().getTargetCategories();
-    write(new DataOutputStream(out));
-  }
-
-  /**
-   * Reads a model from a stream.
-   */
-  public static LogisticModelParameters loadFrom(InputStream in) throws IOException {
-    LogisticModelParameters result = new LogisticModelParameters();
-    result.readFields(new DataInputStream(in));
-    return result;
-  }
-
-  /**
-   * Reads a model from a file.
-   * @throws IOException If there is an error opening or closing the file.
-   */
-  public static LogisticModelParameters loadFrom(File in) throws IOException {
-    try (InputStream input = new FileInputStream(in)) {
-      return loadFrom(input);
-    }
-  }
-
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    out.writeUTF(targetVariable);
-    out.writeInt(typeMap.size());
-    for (Map.Entry<String,String> entry : typeMap.entrySet()) {
-      out.writeUTF(entry.getKey());
-      out.writeUTF(entry.getValue());
-    }
-    out.writeInt(numFeatures);
-    out.writeBoolean(useBias);
-    out.writeInt(maxTargetCategories);
-
-    if (targetCategories == null) {
-      out.writeInt(0);
-    } else {
-      out.writeInt(targetCategories.size());
-      for (String category : targetCategories) {
-        out.writeUTF(category);
-      }
-    }
-    out.writeDouble(lambda);
-    out.writeDouble(learningRate);
-    // skip csv
-    lr.write(out);
-  }
-
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    targetVariable = in.readUTF();
-    int typeMapSize = in.readInt();
-    typeMap = new HashMap<>(typeMapSize);
-    for (int i = 0; i < typeMapSize; i++) {
-      String key = in.readUTF();
-      String value = in.readUTF();
-      typeMap.put(key, value);
-    }
-    numFeatures = in.readInt();
-    useBias = in.readBoolean();
-    maxTargetCategories = in.readInt();
-    int targetCategoriesSize = in.readInt();
-    targetCategories = new ArrayList<>(targetCategoriesSize);
-    for (int i = 0; i < targetCategoriesSize; i++) {
-      targetCategories.add(in.readUTF());
-    }
-    lambda = in.readDouble();
-    learningRate = in.readDouble();
-    csv = null;
-    lr = new OnlineLogisticRegression();
-    lr.readFields(in);
-  }
-
-  /**
-   * Sets the types of the predictors.  This will later be used when reading CSV data.  If you don't
-   * use the CSV data and convert to vectors on your own, you don't need to call this.
-   *
-   * @param predictorList The list of variable names.
-   * @param typeList      The list of types in the format preferred by CsvRecordFactory.
-   */
-  public void setTypeMap(Iterable<String> predictorList, List<String> typeList) {
-    Preconditions.checkArgument(!typeList.isEmpty(), "Must have at least one type specifier");
-    typeMap = new HashMap<>();
-    Iterator<String> iTypes = typeList.iterator();
-    String lastType = null;
-    for (Object x : predictorList) {
-      // type list can be short .. we just repeat last spec
-      if (iTypes.hasNext()) {
-        lastType = iTypes.next();
-      }
-      typeMap.put(x.toString(), lastType);
-    }
-  }
-
-  /**
-   * Sets the target variable.  If you don't use the CSV record factory, then this is irrelevant.
-   *
-   * @param targetVariable The name of the target variable.
-   */
-  public void setTargetVariable(String targetVariable) {
-    this.targetVariable = targetVariable;
-  }
-
-  /**
-   * Sets the number of target categories to be considered.
-   *
-   * @param maxTargetCategories The number of target categories.
-   */
-  public void setMaxTargetCategories(int maxTargetCategories) {
-    this.maxTargetCategories = maxTargetCategories;
-  }
-
-  public void setNumFeatures(int numFeatures) {
-    this.numFeatures = numFeatures;
-  }
-
-  public void setTargetCategories(List<String> targetCategories) {
-    this.targetCategories = targetCategories;
-    maxTargetCategories = targetCategories.size();
-  }
-
-  public List<String> getTargetCategories() {
-    return this.targetCategories;
-  }
-
-  public void setUseBias(boolean useBias) {
-    this.useBias = useBias;
-  }
-
-  public boolean useBias() {
-    return useBias;
-  }
-
-  public String getTargetVariable() {
-    return targetVariable;
-  }
-
-  public Map<String, String> getTypeMap() {
-    return typeMap;
-  }
-
-  public void setTypeMap(Map<String, String> map) {
-    this.typeMap = map;
-  }
-
-  public int getNumFeatures() {
-    return numFeatures;
-  }
-
-  public int getMaxTargetCategories() {
-    return maxTargetCategories;
-  }
-
-  public double getLambda() {
-    return lambda;
-  }
-
-  public void setLambda(double lambda) {
-    this.lambda = lambda;
-  }
-
-  public double getLearningRate() {
-    return learningRate;
-  }
-
-  public void setLearningRate(double learningRate) {
-    this.learningRate = learningRate;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
deleted file mode 100644
index 3ec6a06..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Preconditions;
-
-import java.io.BufferedReader;
-
-/**
- * Uses the same logic as TrainLogistic and RunLogistic for finding an input, but instead
- * of processing the input, this class just prints the input to standard out.
- */
-public final class PrintResourceOrFile {
-
-  private PrintResourceOrFile() {
-  }
-
-  public static void main(String[] args) throws Exception {
-    Preconditions.checkArgument(args.length == 1, "Must have a single argument that names a file or resource.");
-    try (BufferedReader in = TrainLogistic.open(args[0])){
-      String line;
-      while ((line = in.readLine()) != null) {
-        System.out.println(line);
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
deleted file mode 100644
index 678a8f5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.HashMap;
-import java.util.Map;
-
-public final class RunAdaptiveLogistic {
-
-  private static String inputFile;
-  private static String modelFile;
-  private static String outputFile;
-  private static String idColumn;
-  private static boolean maxScoreOnly;
-
-  private RunAdaptiveLogistic() {
-  }
-
-  public static void main(String[] args) throws Exception {
-    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-  }
-
-  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
-    if (!parseArgs(args)) {
-      return;
-    }
-    AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
-        .loadFromFile(new File(modelFile));
-
-    CsvRecordFactory csv = lmp.getCsvRecordFactory();
-    csv.setIdName(idColumn);
-
-    AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
-
-    State<Wrapper, CrossFoldLearner> best = lr.getBest();
-    if (best == null) {
-      output.println("AdaptiveLogisticRegression has not be trained probably.");
-      return;
-    }
-    CrossFoldLearner learner = best.getPayload().getLearner();
-
-    BufferedReader in = TrainAdaptiveLogistic.open(inputFile);
-    int k = 0;
-
-    try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile),
-        Charsets.UTF_8))) {
-      out.write(idColumn + ",target,score");
-      out.newLine();
-
-      String line = in.readLine();
-      csv.firstLine(line);
-      line = in.readLine();
-      Map<String, Double> results = new HashMap<>();
-      while (line != null) {
-        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
-        csv.processLine(line, v, false);
-        Vector scores = learner.classifyFull(v);
-        results.clear();
-        if (maxScoreOnly) {
-          results.put(csv.getTargetLabel(scores.maxValueIndex()),
-              scores.maxValue());
-        } else {
-          for (int i = 0; i < scores.size(); i++) {
-            results.put(csv.getTargetLabel(i), scores.get(i));
-          }
-        }
-
-        for (Map.Entry<String, Double> entry : results.entrySet()) {
-          out.write(csv.getIdString(line) + ',' + entry.getKey() + ',' + entry.getValue());
-          out.newLine();
-        }
-        k++;
-        if (k % 100 == 0) {
-          output.println(k + " records processed");
-        }
-        line = in.readLine();
-      }
-      out.flush();
-    }
-    output.println(k + " records processed totally.");
-  }
-
-  private static boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help")
-      .withDescription("print this list").create();
-
-    Option quiet = builder.withLongName("quiet")
-      .withDescription("be extra quiet").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder
-      .withLongName("input")
-      .withRequired(true)
-      .withArgument(
-          argumentBuilder.withName("input").withMaximum(1)
-            .create())
-      .withDescription("where to get training data").create();
-
-    Option modelFileOption = builder
-      .withLongName("model")
-      .withRequired(true)
-      .withArgument(
-          argumentBuilder.withName("model").withMaximum(1)
-            .create())
-      .withDescription("where to get the trained model").create();
-    
-    Option outputFileOption = builder
-      .withLongName("output")
-      .withRequired(true)
-      .withDescription("the file path to output scores")
-      .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
-      .create();
-    
-    Option idColumnOption = builder
-      .withLongName("idcolumn")
-      .withRequired(true)
-      .withDescription("the name of the id column for each record")
-      .withArgument(argumentBuilder.withName("idcolumn").withMaximum(1).create())
-      .create();
-    
-    Option maxScoreOnlyOption = builder
-      .withLongName("maxscoreonly")
-      .withDescription("only output the target label with max scores")
-      .create();
-
-    Group normalArgs = new GroupBuilder()
-      .withOption(help).withOption(quiet)
-      .withOption(inputFileOption).withOption(modelFileOption)
-      .withOption(outputFileOption).withOption(idColumnOption)
-      .withOption(maxScoreOnlyOption)
-      .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    inputFile = getStringArgument(cmdLine, inputFileOption);
-    modelFile = getStringArgument(cmdLine, modelFileOption);
-    outputFile = getStringArgument(cmdLine, outputFileOption);
-    idColumn = getStringArgument(cmdLine, idColumnOption);
-    maxScoreOnly = getBooleanArgument(cmdLine, maxScoreOnlyOption);    
-    return true;
-  }
-
-  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
-    return cmdLine.hasOption(option);
-  }
-
-  private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
-    return (String) cmdLine.getValue(inputFile);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
deleted file mode 100644
index 2d57016..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.Locale;
-
-public final class RunLogistic {
-
-  private static String inputFile;
-  private static String modelFile;
-  private static boolean showAuc;
-  private static boolean showScores;
-  private static boolean showConfusion;
-
-  private RunLogistic() {
-  }
-
-  public static void main(String[] args) throws Exception {
-    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-  }
-
-  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
-    if (parseArgs(args)) {
-      if (!showAuc && !showConfusion && !showScores) {
-        showAuc = true;
-        showConfusion = true;
-      }
-
-      Auc collector = new Auc();
-      LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile));
-
-      CsvRecordFactory csv = lmp.getCsvRecordFactory();
-      OnlineLogisticRegression lr = lmp.createRegression();
-      BufferedReader in = TrainLogistic.open(inputFile);
-      String line = in.readLine();
-      csv.firstLine(line);
-      line = in.readLine();
-      if (showScores) {
-        output.println("\"target\",\"model-output\",\"log-likelihood\"");
-      }
-      while (line != null) {
-        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
-        int target = csv.processLine(line, v);
-
-        double score = lr.classifyScalar(v);
-        if (showScores) {
-          output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v));
-        }
-        collector.add(target, score);
-        line = in.readLine();
-      }
-
-      if (showAuc) {
-        output.printf(Locale.ENGLISH, "AUC = %.2f%n", collector.auc());
-      }
-      if (showConfusion) {
-        Matrix m = collector.confusion();
-        output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n",
-          m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
-        m = collector.entropy();
-        output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n",
-          m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
-      }
-    }
-  }
-
-  private static boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help").withDescription("print this list").create();
-
-    Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
-
-    Option auc = builder.withLongName("auc").withDescription("print AUC").create();
-    Option confusion = builder.withLongName("confusion").withDescription("print confusion matrix").create();
-
-    Option scores = builder.withLongName("scores").withDescription("print scores").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder.withLongName("input")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
-            .withDescription("where to get training data")
-            .create();
-
-    Option modelFileOption = builder.withLongName("model")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
-            .withDescription("where to get a model")
-            .create();
-
-    Group normalArgs = new GroupBuilder()
-            .withOption(help)
-            .withOption(quiet)
-            .withOption(auc)
-            .withOption(scores)
-            .withOption(confusion)
-            .withOption(inputFileOption)
-            .withOption(modelFileOption)
-            .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    inputFile = getStringArgument(cmdLine, inputFileOption);
-    modelFile = getStringArgument(cmdLine, modelFileOption);
-    showAuc = getBooleanArgument(cmdLine, auc);
-    showScores = getBooleanArgument(cmdLine, scores);
-    showConfusion = getBooleanArgument(cmdLine, confusion);
-
-    return true;
-  }
-
-  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
-    return cmdLine.hasOption(option);
-  }
-
-  private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
-    return (String) cmdLine.getValue(inputFile);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
deleted file mode 100644
index c657803..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.Multiset;
-import org.apache.mahout.classifier.NewsgroupHelper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.TreeMap;
-
-public final class SGDHelper {
-
-  private static final String[] LEAK_LABELS = {"none", "month-year", "day-month-year"};
-
-  private SGDHelper() {
-  }
-
-  public static void dissect(int leakType,
-                             Dictionary dictionary,
-                             AdaptiveLogisticRegression learningAlgorithm,
-                             Iterable<File> files, Multiset<String> overallCounts) throws IOException {
-    CrossFoldLearner model = learningAlgorithm.getBest().getPayload().getLearner();
-    model.close();
-
-    Map<String, Set<Integer>> traceDictionary = new TreeMap<>();
-    ModelDissector md = new ModelDissector();
-
-    NewsgroupHelper helper = new NewsgroupHelper();
-    helper.getEncoder().setTraceDictionary(traceDictionary);
-    helper.getBias().setTraceDictionary(traceDictionary);
-
-    for (File file : permute(files, helper.getRandom()).subList(0, 500)) {
-      String ng = file.getParentFile().getName();
-      int actual = dictionary.intern(ng);
-
-      traceDictionary.clear();
-      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
-      md.update(v, traceDictionary, model);
-    }
-
-    List<String> ngNames = new ArrayList<>(dictionary.values());
-    List<ModelDissector.Weight> weights = md.summary(100);
-    System.out.println("============");
-    System.out.println("Model Dissection");
-    for (ModelDissector.Weight w : weights) {
-      System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s%n",
-                        w.getFeature(), w.getWeight(), ngNames.get(w.getMaxImpact() + 1),
-                        w.getCategory(1), w.getWeight(1), w.getCategory(2), w.getWeight(2));
-    }
-  }
-
-  public static List<File> permute(Iterable<File> files, Random rand) {
-    List<File> r = new ArrayList<>();
-    for (File file : files) {
-      int i = rand.nextInt(r.size() + 1);
-      if (i == r.size()) {
-        r.add(file);
-      } else {
-        r.add(r.get(i));
-        r.set(i, file);
-      }
-    }
-    return r;
-  }
-
-  static void analyzeState(SGDInfo info, int leakType, int k, State<AdaptiveLogisticRegression.Wrapper,
-      CrossFoldLearner> best) throws IOException {
-    int bump = info.getBumps()[(int) Math.floor(info.getStep()) % info.getBumps().length];
-    int scale = (int) Math.pow(10, Math.floor(info.getStep() / info.getBumps().length));
-    double maxBeta;
-    double nonZeros;
-    double positive;
-    double norm;
-
-    double lambda = 0;
-    double mu = 0;
-
-    if (best != null) {
-      CrossFoldLearner state = best.getPayload().getLearner();
-      info.setAverageCorrect(state.percentCorrect());
-      info.setAverageLL(state.logLikelihood());
-
-      OnlineLogisticRegression model = state.getModels().get(0);
-      // finish off pending regularization
-      model.close();
-
-      Matrix beta = model.getBeta();
-      maxBeta = beta.aggregate(Functions.MAX, Functions.ABS);
-      nonZeros = beta.aggregate(Functions.PLUS, new DoubleFunction() {
-        @Override
-        public double apply(double v) {
-          return Math.abs(v) > 1.0e-6 ? 1 : 0;
-        }
-      });
-      positive = beta.aggregate(Functions.PLUS, new DoubleFunction() {
-        @Override
-        public double apply(double v) {
-          return v > 0 ? 1 : 0;
-        }
-      });
-      norm = beta.aggregate(Functions.PLUS, Functions.ABS);
-
-      lambda = best.getMappedParams()[0];
-      mu = best.getMappedParams()[1];
-    } else {
-      maxBeta = 0;
-      nonZeros = 0;
-      positive = 0;
-      norm = 0;
-    }
-    if (k % (bump * scale) == 0) {
-      if (best != null) {
-        File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group-" + k + ".model");
-        ModelSerializer.writeBinary(modelFile.getAbsolutePath(), best.getPayload().getLearner().getModels().get(0));
-      }
-
-      info.setStep(info.getStep() + 0.25);
-      System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\t%.8g\t%.8g\t", maxBeta, nonZeros, positive, norm, lambda, mu);
-      System.out.printf("%d\t%.3f\t%.2f\t%s%n",
-        k, info.getAverageLL(), info.getAverageCorrect() * 100, LEAK_LABELS[leakType % 3]);
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
deleted file mode 100644
index be55d43..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-final class SGDInfo {
-
-  private double averageLL;
-  private double averageCorrect;
-  private double step;
-  private int[] bumps = {1, 2, 5};
-
-  double getAverageLL() {
-    return averageLL;
-  }
-
-  void setAverageLL(double averageLL) {
-    this.averageLL = averageLL;
-  }
-
-  double getAverageCorrect() {
-    return averageCorrect;
-  }
-
-  void setAverageCorrect(double averageCorrect) {
-    this.averageCorrect = averageCorrect;
-  }
-
-  double getStep() {
-    return step;
-  }
-
-  void setStep(double step) {
-    this.step = step;
-  }
-
-  int[] getBumps() {
-    return bumps;
-  }
-
-  void setBumps(int[] bumps) {
-    this.bumps = bumps;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
deleted file mode 100644
index b3da452..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.list.IntArrayList;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
-import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.Closeable;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-
-/**
- * Shows how different encoding choices can make big speed differences.
- * <p/>
- * Run with command line options --generate 1000000 test.csv to generate a million data lines in
- * test.csv.
- * <p/>
- * Run with command line options --parser test.csv to time how long it takes to parse and encode
- * those million data points
- * <p/>
- * Run with command line options --fast test.csv to time how long it takes to parse and encode those
- * million data points using byte-level parsing and direct value encoding.
- * <p/>
- * This doesn't demonstrate text encoding which is subject to somewhat different tricks.  The basic
- * idea of caching hash locations and byte level parsing still very much applies to text, however.
- */
-public final class SimpleCsvExamples {
-
-  public static final char SEPARATOR_CHAR = '\t';
-  private static final int FIELDS = 100;
-
-  private static final Logger log = LoggerFactory.getLogger(SimpleCsvExamples.class);
-
-  private SimpleCsvExamples() {}
-
-  public static void main(String[] args) throws IOException {
-    FeatureVectorEncoder[] encoder = new FeatureVectorEncoder[FIELDS];
-    for (int i = 0; i < FIELDS; i++) {
-      encoder[i] = new ConstantValueEncoder("v" + 1);
-    }
-
-    OnlineSummarizer[] s = new OnlineSummarizer[FIELDS];
-    for (int i = 0; i < FIELDS; i++) {
-      s[i] = new OnlineSummarizer();
-    }
-    long t0 = System.currentTimeMillis();
-    Vector v = new DenseVector(1000);
-    if ("--generate".equals(args[0])) {
-      try (PrintWriter out =
-               new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File(args[2])), Charsets.UTF_8))) {
-        int n = Integer.parseInt(args[1]);
-        for (int i = 0; i < n; i++) {
-          Line x = Line.generate();
-          out.println(x);
-        }
-      }
-    } else if ("--parse".equals(args[0])) {
-      try (BufferedReader in = Files.newReader(new File(args[1]), Charsets.UTF_8)){
-        String line = in.readLine();
-        while (line != null) {
-          v.assign(0);
-          Line x = new Line(line);
-          for (int i = 0; i < FIELDS; i++) {
-            s[i].add(x.getDouble(i));
-            encoder[i].addToVector(x.get(i), v);
-          }
-          line = in.readLine();
-        }
-      }
-      String separator = "";
-      for (int i = 0; i < FIELDS; i++) {
-        System.out.printf("%s%.3f", separator, s[i].getMean());
-        separator = ",";
-      }
-    } else if ("--fast".equals(args[0])) {
-      try (FastLineReader in = new FastLineReader(new FileInputStream(args[1]))){
-        FastLine line = in.read();
-        while (line != null) {
-          v.assign(0);
-          for (int i = 0; i < FIELDS; i++) {
-            double z = line.getDouble(i);
-            s[i].add(z);
-            encoder[i].addToVector((byte[]) null, z, v);
-          }
-          line = in.read();
-        }
-      }
-
-      String separator = "";
-      for (int i = 0; i < FIELDS; i++) {
-        System.out.printf("%s%.3f", separator, s[i].getMean());
-        separator = ",";
-      }
-    }
-    System.out.printf("\nElapsed time = %.3f%n", (System.currentTimeMillis() - t0) / 1000.0);
-  }
-
-
-  private static final class Line {
-    private static final Splitter ON_TABS = Splitter.on(SEPARATOR_CHAR).trimResults();
-    public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR_CHAR);
-
-    public static final Random RAND = RandomUtils.getRandom();
-
-    private final List<String> data;
-
-    private Line(CharSequence line) {
-      data = Lists.newArrayList(ON_TABS.split(line));
-    }
-
-    private Line() {
-      data = new ArrayList<>();
-    }
-
-    public double getDouble(int field) {
-      return Double.parseDouble(data.get(field));
-    }
-
-    /**
-     * Generate a random line with 20 fields each with integer values.
-     *
-     * @return A new line with data.
-     */
-    public static Line generate() {
-      Line r = new Line();
-      for (int i = 0; i < FIELDS; i++) {
-        double mean = ((i + 1) * 257) % 50 + 1;
-        r.data.add(Integer.toString(randomValue(mean)));
-      }
-      return r;
-    }
-
-    /**
-     * Returns a random exponentially distributed integer with a particular mean value.  This is
-     * just a way to create more small numbers than big numbers.
-     *
-     * @param mean mean of the distribution
-     * @return random exponentially distributed integer with the specific mean
-     */
-    private static int randomValue(double mean) {
-      return (int) (-mean * Math.log1p(-RAND.nextDouble()));
-    }
-
-    @Override
-    public String toString() {
-      return WITH_COMMAS.join(data);
-    }
-
-    public String get(int field) {
-      return data.get(field);
-    }
-  }
-
-  private static final class FastLine {
-
-    private final ByteBuffer base;
-    private final IntArrayList start = new IntArrayList();
-    private final IntArrayList length = new IntArrayList();
-
-    private FastLine(ByteBuffer base) {
-      this.base = base;
-    }
-
-    public static FastLine read(ByteBuffer buf) {
-      FastLine r = new FastLine(buf);
-      r.start.add(buf.position());
-      int offset = buf.position();
-      while (offset < buf.limit()) {
-        int ch = buf.get();
-        offset = buf.position();
-        switch (ch) {
-          case '\n':
-            r.length.add(offset - r.start.get(r.length.size()) - 1);
-            return r;
-          case SEPARATOR_CHAR:
-            r.length.add(offset - r.start.get(r.length.size()) - 1);
-            r.start.add(offset);
-            break;
-          default:
-            // nothing to do for now
-        }
-      }
-      throw new IllegalArgumentException("Not enough bytes in buffer");
-    }
-
-    public double getDouble(int field) {
-      int offset = start.get(field);
-      int size = length.get(field);
-      switch (size) {
-        case 1:
-          return base.get(offset) - '0';
-        case 2:
-          return (base.get(offset) - '0') * 10 + base.get(offset + 1) - '0';
-        default:
-          double r = 0;
-          for (int i = 0; i < size; i++) {
-            r = 10 * r + base.get(offset + i) - '0';
-          }
-          return r;
-      }
-    }
-  }
-
-  private static final class FastLineReader implements Closeable {
-    private final InputStream in;
-    private final ByteBuffer buf = ByteBuffer.allocate(100000);
-
-    private FastLineReader(InputStream in) throws IOException {
-      this.in = in;
-      buf.limit(0);
-      fillBuffer();
-    }
-
-    public FastLine read() throws IOException {
-      fillBuffer();
-      if (buf.remaining() > 0) {
-        return FastLine.read(buf);
-      } else {
-        return null;
-      }
-    }
-
-    private void fillBuffer() throws IOException {
-      if (buf.remaining() < 10000) {
-        buf.compact();
-        int n = in.read(buf.array(), buf.position(), buf.remaining());
-        if (n == -1) {
-          buf.flip();
-        } else {
-          buf.limit(buf.position() + n);
-          buf.position(0);
-        }
-      }
-    }
-
-    @Override
-    public void close() {
-      try {
-        Closeables.close(in, true);
-      } catch (IOException e) {
-        log.error(e.getMessage(), e);
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
deleted file mode 100644
index 074f774..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.classifier.ClassifierResult;
-import org.apache.mahout.classifier.ResultAnalyzer;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-
-/**
- * Run the ASF email, as trained by TrainASFEmail
- */
-public final class TestASFEmail {
-
-  private String inputFile;
-  private String modelFile;
-
-  private TestASFEmail() {}
-
-  public static void main(String[] args) throws IOException {
-    TestASFEmail runner = new TestASFEmail();
-    if (runner.parseArgs(args)) {
-      runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-    }
-  }
-
-  public void run(PrintWriter output) throws IOException {
-
-    File base = new File(inputFile);
-    //contains the best model
-    OnlineLogisticRegression classifier =
-        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
-
-
-    Dictionary asfDictionary = new Dictionary();
-    Configuration conf = new Configuration();
-    PathFilter testFilter = new PathFilter() {
-      @Override
-      public boolean accept(Path path) {
-        return path.getName().contains("test");
-      }
-    };
-    SequenceFileDirIterator<Text, VectorWritable> iter =
-        new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
-        null, true, conf);
-
-    long numItems = 0;
-    while (iter.hasNext()) {
-      Pair<Text, VectorWritable> next = iter.next();
-      asfDictionary.intern(next.getFirst().toString());
-      numItems++;
-    }
-
-    System.out.println(numItems + " test files");
-    ResultAnalyzer ra = new ResultAnalyzer(asfDictionary.values(), "DEFAULT");
-    iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
-            null, true, conf);
-    while (iter.hasNext()) {
-      Pair<Text, VectorWritable> next = iter.next();
-      String ng = next.getFirst().toString();
-
-      int actual = asfDictionary.intern(ng);
-      Vector result = classifier.classifyFull(next.getSecond().get());
-      int cat = result.maxValueIndex();
-      double score = result.maxValue();
-      double ll = classifier.logLikelihood(actual, next.getSecond().get());
-      ClassifierResult cr = new ClassifierResult(asfDictionary.values().get(cat), score, ll);
-      ra.addInstance(asfDictionary.values().get(actual), cr);
-
-    }
-    output.println(ra);
-  }
-
-  boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help").withDescription("print this list").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder.withLongName("input")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
-            .withDescription("where to get training data")
-            .create();
-
-    Option modelFileOption = builder.withLongName("model")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
-            .withDescription("where to get a model")
-            .create();
-
-    Group normalArgs = new GroupBuilder()
-            .withOption(help)
-            .withOption(inputFileOption)
-            .withOption(modelFileOption)
-            .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    inputFile = (String) cmdLine.getValue(inputFileOption);
-    modelFile = (String) cmdLine.getValue(modelFileOption);
-    return true;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
deleted file mode 100644
index f0316e9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.ClassifierResult;
-import org.apache.mahout.classifier.NewsgroupHelper;
-import org.apache.mahout.classifier.ResultAnalyzer;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Run the 20 news groups test data through SGD, as trained by {@link org.apache.mahout.classifier.sgd.TrainNewsGroups}.
- */
-public final class TestNewsGroups {
-
-  private String inputFile;
-  private String modelFile;
-
-  private TestNewsGroups() {
-  }
-
-  public static void main(String[] args) throws IOException {
-    TestNewsGroups runner = new TestNewsGroups();
-    if (runner.parseArgs(args)) {
-      runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-    }
-  }
-
-  public void run(PrintWriter output) throws IOException {
-
-    File base = new File(inputFile);
-    //contains the best model
-    OnlineLogisticRegression classifier =
-        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
-
-    Dictionary newsGroups = new Dictionary();
-    Multiset<String> overallCounts = HashMultiset.create();
-
-    List<File> files = new ArrayList<>();
-    for (File newsgroup : base.listFiles()) {
-      if (newsgroup.isDirectory()) {
-        newsGroups.intern(newsgroup.getName());
-        files.addAll(Arrays.asList(newsgroup.listFiles()));
-      }
-    }
-    System.out.println(files.size() + " test files");
-    ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
-    for (File file : files) {
-      String ng = file.getParentFile().getName();
-
-      int actual = newsGroups.intern(ng);
-      NewsgroupHelper helper = new NewsgroupHelper();
-      //no leak type ensures this is a normal vector
-      Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);
-      Vector result = classifier.classifyFull(input);
-      int cat = result.maxValueIndex();
-      double score = result.maxValue();
-      double ll = classifier.logLikelihood(actual, input);
-      ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
-      ra.addInstance(newsGroups.values().get(actual), cr);
-
-    }
-    output.println(ra);
-  }
-
-  boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help").withDescription("print this list").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder.withLongName("input")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
-            .withDescription("where to get training data")
-            .create();
-
-    Option modelFileOption = builder.withLongName("model")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
-            .withDescription("where to get a model")
-            .create();
-
-    Group normalArgs = new GroupBuilder()
-            .withOption(help)
-            .withOption(inputFileOption)
-            .withOption(modelFileOption)
-            .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    inputFile = (String) cmdLine.getValue(inputFileOption);
-    modelFile = (String) cmdLine.getValue(modelFileOption);
-    return true;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
deleted file mode 100644
index e681f92..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-public final class TrainASFEmail extends AbstractJob {
-
-  private TrainASFEmail() {
-  }
-
-  @Override
-  public int run(String[] args) throws Exception {
-    addInputOption();
-    addOutputOption();
-    addOption("categories", "nc", "The number of categories to train on", true);
-    addOption("cardinality", "c", "The size of the vectors to use", "100000");
-    addOption("threads", "t", "The number of threads to use in the learner", "20");
-    addOption("poolSize", "p", "The number of CrossFoldLearners to use in the AdaptiveLogisticRegression. "
-                               + "Higher values require more memory.", "5");
-    if (parseArguments(args) == null) {
-      return -1;
-    }
-
-    File base = new File(getInputPath().toString());
-
-    Multiset<String> overallCounts = HashMultiset.create();
-    File output = new File(getOutputPath().toString());
-    output.mkdirs();
-    int numCats = Integer.parseInt(getOption("categories"));
-    int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
-    int threadCount = Integer.parseInt(getOption("threads", "20"));
-    int poolSize = Integer.parseInt(getOption("poolSize", "5"));
-    Dictionary asfDictionary = new Dictionary();
-    AdaptiveLogisticRegression learningAlgorithm =
-        new AdaptiveLogisticRegression(numCats, cardinality, new L1(), threadCount, poolSize);
-    learningAlgorithm.setInterval(800);
-    learningAlgorithm.setAveragingWindow(500);
-
-    //We ran seq2encoded and split input already, so let's just build up the dictionary
-    Configuration conf = new Configuration();
-    PathFilter trainFilter = new PathFilter() {
-      @Override
-      public boolean accept(Path path) {
-        return path.getName().contains("training");
-      }
-    };
-    SequenceFileDirIterator<Text, VectorWritable> iter =
-        new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
-    long numItems = 0;
-    while (iter.hasNext()) {
-      Pair<Text, VectorWritable> next = iter.next();
-      asfDictionary.intern(next.getFirst().toString());
-      numItems++;
-    }
-
-    System.out.println(numItems + " training files");
-
-    SGDInfo info = new SGDInfo();
-
-    iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter,
-            null, true, conf);
-    int k = 0;
-    while (iter.hasNext()) {
-      Pair<Text, VectorWritable> next = iter.next();
-      String ng = next.getFirst().toString();
-      int actual = asfDictionary.intern(ng);
-      //we already have encoded
-      learningAlgorithm.train(actual, next.getSecond().get());
-      k++;
-      State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
-
-      SGDHelper.analyzeState(info, 0, k, best);
-    }
-    learningAlgorithm.close();
-    //TODO: how to dissection since we aren't processing the files here
-    //SGDHelper.dissect(leakType, asfDictionary, learningAlgorithm, files, overallCounts);
-    System.out.println("exiting main, writing model to " + output);
-
-    ModelSerializer.writeBinary(output + "/asf.model",
-            learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
-
-    List<Integer> counts = new ArrayList<>();
-    System.out.println("Word counts");
-    for (String count : overallCounts.elementSet()) {
-      counts.add(overallCounts.count(count));
-    }
-    Collections.sort(counts, Ordering.natural().reverse());
-    k = 0;
-    for (Integer count : counts) {
-      System.out.println(k + "\t" + count);
-      k++;
-      if (k > 1000) {
-        break;
-      }
-    }
-    return 0;
-  }
-
-  public static void main(String[] args) throws Exception {
-    TrainASFEmail trainer = new TrainASFEmail();
-    trainer.run(args);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
deleted file mode 100644
index defb5b9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
+++ /dev/null
@@ -1,377 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.io.Resources;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-public final class TrainAdaptiveLogistic {
-
-  private static String inputFile;
-  private static String outputFile;
-  private static AdaptiveLogisticModelParameters lmp;
-  private static int passes;
-  private static boolean showperf;
-  private static int skipperfnum = 99;
-  private static AdaptiveLogisticRegression model;
-
-  private TrainAdaptiveLogistic() {
-  }
-
-  public static void main(String[] args) throws Exception {
-    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-  }
-
-  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
-    if (parseArgs(args)) {
-
-      CsvRecordFactory csv = lmp.getCsvRecordFactory();
-      model = lmp.createAdaptiveLogisticRegression();
-      State<Wrapper, CrossFoldLearner> best;
-      CrossFoldLearner learner = null;
-
-      int k = 0;
-      for (int pass = 0; pass < passes; pass++) {
-        BufferedReader in = open(inputFile);
-
-        // read variable names
-        csv.firstLine(in.readLine());
-
-        String line = in.readLine();
-        while (line != null) {
-          // for each new line, get target and predictors
-          Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
-          int targetValue = csv.processLine(line, input);
-
-          // update model
-          model.train(targetValue, input);
-          k++;
-
-          if (showperf && (k % (skipperfnum + 1) == 0)) {
-
-            best = model.getBest();
-            if (best != null) {
-              learner = best.getPayload().getLearner();
-            }
-            if (learner != null) {
-              double averageCorrect = learner.percentCorrect();
-              double averageLL = learner.logLikelihood();
-              output.printf("%d\t%.3f\t%.2f%n",
-                            k, averageLL, averageCorrect * 100);
-            } else {
-              output.printf(Locale.ENGLISH,
-                            "%10d %2d %s%n", k, targetValue,
-                            "AdaptiveLogisticRegression has not found a good model ......");
-            }
-          }
-          line = in.readLine();
-        }
-        in.close();
-      }
-
-      best = model.getBest();
-      if (best != null) {
-        learner = best.getPayload().getLearner();
-      }
-      if (learner == null) {
-        output.println("AdaptiveLogisticRegression has failed to train a model.");
-        return;
-      }
-
-      try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
-        lmp.saveTo(modelOutput);
-      }
-
-      OnlineLogisticRegression lr = learner.getModels().get(0);
-      output.println(lmp.getNumFeatures());
-      output.println(lmp.getTargetVariable() + " ~ ");
-      String sep = "";
-      for (String v : csv.getTraceDictionary().keySet()) {
-        double weight = predictorWeight(lr, 0, csv, v);
-        if (weight != 0) {
-          output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
-          sep = " + ";
-        }
-      }
-      output.printf("%n");
-
-      for (int row = 0; row < lr.getBeta().numRows(); row++) {
-        for (String key : csv.getTraceDictionary().keySet()) {
-          double weight = predictorWeight(lr, row, csv, key);
-          if (weight != 0) {
-            output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
-          }
-        }
-        for (int column = 0; column < lr.getBeta().numCols(); column++) {
-          output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
-        }
-        output.println();
-      }
-    }
-
-  }
-
-  private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
-    double weight = 0;
-    for (Integer column : csv.getTraceDictionary().get(predictor)) {
-      weight += lr.getBeta().get(row, column);
-    }
-    return weight;
-  }
-
-  private static boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help")
-        .withDescription("print this list").create();
-
-    Option quiet = builder.withLongName("quiet")
-        .withDescription("be extra quiet").create();
-    
-   
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option showperf = builder
-      .withLongName("showperf")
-      .withDescription("output performance measures during training")
-      .create();
-
-    Option inputFile = builder
-        .withLongName("input")
-        .withRequired(true)
-        .withArgument(
-            argumentBuilder.withName("input").withMaximum(1)
-                .create())
-        .withDescription("where to get training data").create();
-
-    Option outputFile = builder
-        .withLongName("output")
-        .withRequired(true)
-        .withArgument(
-            argumentBuilder.withName("output").withMaximum(1)
-                .create())
-        .withDescription("where to write the model content").create();
-
-    Option threads = builder.withLongName("threads")
-        .withArgument(
-            argumentBuilder.withName("threads").withDefault("4").create())
-        .withDescription("the number of threads AdaptiveLogisticRegression uses")
-        .create();
-
-
-    Option predictors = builder.withLongName("predictors")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("predictors").create())
-        .withDescription("a list of predictor variables").create();
-
-    Option types = builder
-        .withLongName("types")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("types").create())
-        .withDescription(
-            "a list of predictor variable types (numeric, word, or text)")
-        .create();
-
-    Option target = builder
-        .withLongName("target")
-        .withDescription("the name of the target variable")    
-        .withRequired(true)    
-        .withArgument(
-            argumentBuilder.withName("target").withMaximum(1)
-                .create())
-         .create();
-    
-    Option targetCategories = builder
-      .withLongName("categories")
-      .withDescription("the number of target categories to be considered")
-      .withRequired(true)
-      .withArgument(argumentBuilder.withName("categories").withMaximum(1).create())
-      .create();
-    
-
-    Option features = builder
-        .withLongName("features")
-        .withDescription("the number of internal hashed features to use")
-        .withArgument(
-            argumentBuilder.withName("numFeatures")
-                .withDefault("1000").withMaximum(1).create())        
-        .create();
-
-    Option passes = builder
-        .withLongName("passes")
-        .withDescription("the number of times to pass over the input data")
-        .withArgument(
-            argumentBuilder.withName("passes").withDefault("2")
-                .withMaximum(1).create())        
-        .create();
-
-    Option interval = builder.withLongName("interval")
-        .withArgument(
-            argumentBuilder.withName("interval").withDefault("500").create())
-        .withDescription("the interval property of AdaptiveLogisticRegression")
-        .create();
-
-    Option window = builder.withLongName("window")
-        .withArgument(
-            argumentBuilder.withName("window").withDefault("800").create())
-        .withDescription("the average propery of AdaptiveLogisticRegression")
-        .create();
-
-    Option skipperfnum = builder.withLongName("skipperfnum")
-        .withArgument(
-            argumentBuilder.withName("skipperfnum").withDefault("99").create())
-        .withDescription("show performance measures every (skipperfnum + 1) rows")
-        .create();
-
-    Option prior = builder.withLongName("prior")
-        .withArgument(
-            argumentBuilder.withName("prior").withDefault("L1").create())
-        .withDescription("the prior algorithm to use: L1, L2, ebp, tp, up")
-        .create();
-
-    Option priorOption = builder.withLongName("prioroption")
-        .withArgument(
-            argumentBuilder.withName("prioroption").create())
-        .withDescription("constructor parameter for ElasticBandPrior and TPrior")
-        .create();
-
-    Option auc = builder.withLongName("auc")
-        .withArgument(
-            argumentBuilder.withName("auc").withDefault("global").create())
-        .withDescription("the auc to use: global or grouped")
-        .create();
-
-    
-
-    Group normalArgs = new GroupBuilder().withOption(help)
-        .withOption(quiet).withOption(inputFile).withOption(outputFile)
-        .withOption(target).withOption(targetCategories)
-        .withOption(predictors).withOption(types).withOption(passes)
-        .withOption(interval).withOption(window).withOption(threads)
-        .withOption(prior).withOption(features).withOption(showperf)
-        .withOption(skipperfnum).withOption(priorOption).withOption(auc)
-        .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    TrainAdaptiveLogistic.inputFile = getStringArgument(cmdLine, inputFile);
-    TrainAdaptiveLogistic.outputFile = getStringArgument(cmdLine,
-                                                         outputFile);
-
-    List<String> typeList = new ArrayList<>();
-    for (Object x : cmdLine.getValues(types)) {
-      typeList.add(x.toString());
-    }
-
-    List<String> predictorList = new ArrayList<>();
-    for (Object x : cmdLine.getValues(predictors)) {
-      predictorList.add(x.toString());
-    }
-
-    lmp = new AdaptiveLogisticModelParameters();
-    lmp.setTargetVariable(getStringArgument(cmdLine, target));
-    lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
-    lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
-    lmp.setInterval(getIntegerArgument(cmdLine, interval));
-    lmp.setAverageWindow(getIntegerArgument(cmdLine, window));
-    lmp.setThreads(getIntegerArgument(cmdLine, threads));
-    lmp.setAuc(getStringArgument(cmdLine, auc));
-    lmp.setPrior(getStringArgument(cmdLine, prior));
-    if (cmdLine.getValue(priorOption) != null) {
-      lmp.setPriorOption(getDoubleArgument(cmdLine, priorOption));
-    }
-    lmp.setTypeMap(predictorList, typeList);
-    TrainAdaptiveLogistic.showperf = getBooleanArgument(cmdLine, showperf);
-    TrainAdaptiveLogistic.skipperfnum = getIntegerArgument(cmdLine, skipperfnum);
-    TrainAdaptiveLogistic.passes = getIntegerArgument(cmdLine, passes);
-
-    lmp.checkParameters();
-
-    return true;
-  }
-
-  private static String getStringArgument(CommandLine cmdLine,
-                                          Option inputFile) {
-    return (String) cmdLine.getValue(inputFile);
-  }
-
-  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
-    return cmdLine.hasOption(option);
-  }
-
-  private static int getIntegerArgument(CommandLine cmdLine, Option features) {
-    return Integer.parseInt((String) cmdLine.getValue(features));
-  }
-
-  private static double getDoubleArgument(CommandLine cmdLine, Option op) {
-    return Double.parseDouble((String) cmdLine.getValue(op));
-  }
-
-  public static AdaptiveLogisticRegression getModel() {
-    return model;
-  }
-
-  public static LogisticModelParameters getParameters() {
-    return lmp;
-  }
-
-  static BufferedReader open(String inputFile) throws IOException {
-    InputStream in;
-    try {
-      in = Resources.getResource(inputFile).openStream();
-    } catch (IllegalArgumentException e) {
-      in = new FileInputStream(new File(inputFile));
-    }
-    return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
-  }
-   
-}


[15/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java
new file mode 100644
index 0000000..9f84e9c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java
@@ -0,0 +1,422 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.builder;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+import org.apache.mahout.classifier.df.node.CategoricalNode;
+import org.apache.mahout.classifier.df.node.Leaf;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.classifier.df.node.NumericalNode;
+import org.apache.mahout.classifier.df.split.IgSplit;
+import org.apache.mahout.classifier.df.split.OptIgSplit;
+import org.apache.mahout.classifier.df.split.RegressionSplit;
+import org.apache.mahout.classifier.df.split.Split;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+
+/**
+ * Builds a classification tree or regression tree<br>
+ * A classification tree is built when the criterion variable is the categorical attribute.<br>
+ * A regression tree is built when the criterion variable is the numerical attribute.
+ */
+@Deprecated
+public class DecisionTreeBuilder implements TreeBuilder {
+
+  private static final Logger log = LoggerFactory.getLogger(DecisionTreeBuilder.class);
+
+  private static final int[] NO_ATTRIBUTES = new int[0];
+  private static final double EPSILON = 1.0e-6;
+
+  /**
+   * indicates which CATEGORICAL attributes have already been selected in the parent nodes
+   */
+  private boolean[] selected;
+  /**
+   * number of attributes to select randomly at each node
+   */
+  private int m;
+  /**
+   * IgSplit implementation
+   */
+  private IgSplit igSplit;
+  /**
+   * tree is complemented
+   */
+  private boolean complemented = true;
+  /**
+   * minimum number for split
+   */
+  private double minSplitNum = 2.0;
+  /**
+   * minimum proportion of the total variance for split
+   */
+  private double minVarianceProportion = 1.0e-3;
+  /**
+   * full set data
+   */
+  private Data fullSet;
+  /**
+   * minimum variance for split
+   */
+  private double minVariance = Double.NaN;
+
+  public void setM(int m) {
+    this.m = m;
+  }
+
+  public void setIgSplit(IgSplit igSplit) {
+    this.igSplit = igSplit;
+  }
+
+  public void setComplemented(boolean complemented) {
+    this.complemented = complemented;
+  }
+
+  public void setMinSplitNum(int minSplitNum) {
+    this.minSplitNum = minSplitNum;
+  }
+
+  public void setMinVarianceProportion(double minVarianceProportion) {
+    this.minVarianceProportion = minVarianceProportion;
+  }
+
+  @Override
+  public Node build(Random rng, Data data) {
+    if (selected == null) {
+      selected = new boolean[data.getDataset().nbAttributes()];
+      selected[data.getDataset().getLabelId()] = true; // never select the label
+    }
+    if (m == 0) {
+      // set default m
+      double e = data.getDataset().nbAttributes() - 1;
+      if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+        // regression
+        m = (int) Math.ceil(e / 3.0);
+      } else {
+        // classification
+        m = (int) Math.ceil(Math.sqrt(e));
+      }
+    }
+
+    if (data.isEmpty()) {
+      return new Leaf(Double.NaN);
+    }
+
+    double sum = 0.0;
+    if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+      // regression
+      // sum and sum squared of a label is computed
+      double sumSquared = 0.0;
+      for (int i = 0; i < data.size(); i++) {
+        double label = data.getDataset().getLabel(data.get(i));
+        sum += label;
+        sumSquared += label * label;
+      }
+
+      // computes the variance
+      double var = sumSquared - (sum * sum) / data.size();
+
+      // computes the minimum variance
+      if (Double.compare(minVariance, Double.NaN) == 0) {
+        minVariance = var / data.size() * minVarianceProportion;
+        log.debug("minVariance:{}", minVariance);
+      }
+
+      // variance is compared with minimum variance
+      if ((var / data.size()) < minVariance) {
+        log.debug("variance({}) < minVariance({}) Leaf({})", var / data.size(), minVariance, sum / data.size());
+        return new Leaf(sum / data.size());
+      }
+    } else {
+      // classification
+      if (isIdentical(data)) {
+        return new Leaf(data.majorityLabel(rng));
+      }
+      if (data.identicalLabel()) {
+        return new Leaf(data.getDataset().getLabel(data.get(0)));
+      }
+    }
+
+    // store full set data
+    if (fullSet == null) {
+      fullSet = data;
+    }
+
+    int[] attributes = randomAttributes(rng, selected, m);
+    if (attributes == null || attributes.length == 0) {
+      // we tried all the attributes and could not split the data anymore
+      double label;
+      if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+        // regression
+        label = sum / data.size();
+      } else {
+        // classification
+        label = data.majorityLabel(rng);
+      }
+      log.warn("attribute which can be selected is not found Leaf({})", label);
+      return new Leaf(label);
+    }
+
+    if (igSplit == null) {
+      if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+        // regression
+        igSplit = new RegressionSplit();
+      } else {
+        // classification
+        igSplit = new OptIgSplit();
+      }
+    }
+
+    // find the best split
+    Split best = null;
+    for (int attr : attributes) {
+      Split split = igSplit.computeSplit(data, attr);
+      if (best == null || best.getIg() < split.getIg()) {
+        best = split;
+      }
+    }
+
+    // information gain is near to zero.
+    if (best.getIg() < EPSILON) {
+      double label;
+      if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+        label = sum / data.size();
+      } else {
+        label = data.majorityLabel(rng);
+      }
+      log.debug("ig is near to zero Leaf({})", label);
+      return new Leaf(label);
+    }
+
+    log.debug("best split attr:{}, split:{}, ig:{}", best.getAttr(), best.getSplit(), best.getIg());
+
+    boolean alreadySelected = selected[best.getAttr()];
+    if (alreadySelected) {
+      // attribute already selected
+      log.warn("attribute {} already selected in a parent node", best.getAttr());
+    }
+
+    Node childNode;
+    if (data.getDataset().isNumerical(best.getAttr())) {
+      boolean[] temp = null;
+
+      Data loSubset = data.subset(Condition.lesser(best.getAttr(), best.getSplit()));
+      Data hiSubset = data.subset(Condition.greaterOrEquals(best.getAttr(), best.getSplit()));
+
+      if (loSubset.isEmpty() || hiSubset.isEmpty()) {
+        // the selected attribute did not change the data, avoid using it in the child nodes
+        selected[best.getAttr()] = true;
+      } else {
+        // the data changed, so we can unselect all previously selected NUMERICAL attributes
+        temp = selected;
+        selected = cloneCategoricalAttributes(data.getDataset(), selected);
+      }
+
+      // size of the subset is less than the minSplitNum
+      if (loSubset.size() < minSplitNum || hiSubset.size() < minSplitNum) {
+        // branch is not split
+        double label;
+        if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+          label = sum / data.size();
+        } else {
+          label = data.majorityLabel(rng);
+        }
+        log.debug("branch is not split Leaf({})", label);
+        return new Leaf(label);
+      }
+
+      Node loChild = build(rng, loSubset);
+      Node hiChild = build(rng, hiSubset);
+
+      // restore the selection state of the attributes
+      if (temp != null) {
+        selected = temp;
+      } else {
+        selected[best.getAttr()] = alreadySelected;
+      }
+
+      childNode = new NumericalNode(best.getAttr(), best.getSplit(), loChild, hiChild);
+    } else { // CATEGORICAL attribute
+      double[] values = data.values(best.getAttr());
+
+      // tree is complemented
+      Collection<Double> subsetValues = null;
+      if (complemented) {
+        subsetValues = new HashSet<>();
+        for (double value : values) {
+          subsetValues.add(value);
+        }
+        values = fullSet.values(best.getAttr());
+      }
+
+      int cnt = 0;
+      Data[] subsets = new Data[values.length];
+      for (int index = 0; index < values.length; index++) {
+        if (complemented && !subsetValues.contains(values[index])) {
+          continue;
+        }
+        subsets[index] = data.subset(Condition.equals(best.getAttr(), values[index]));
+        if (subsets[index].size() >= minSplitNum) {
+          cnt++;
+        }
+      }
+
+      // size of the subset is less than the minSplitNum
+      if (cnt < 2) {
+        // branch is not split
+        double label;
+        if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+          label = sum / data.size();
+        } else {
+          label = data.majorityLabel(rng);
+        }
+        log.debug("branch is not split Leaf({})", label);
+        return new Leaf(label);
+      }
+
+      selected[best.getAttr()] = true;
+
+      Node[] children = new Node[values.length];
+      for (int index = 0; index < values.length; index++) {
+        if (complemented && (subsetValues == null || !subsetValues.contains(values[index]))) {
+          // tree is complemented
+          double label;
+          if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+            label = sum / data.size();
+          } else {
+            label = data.majorityLabel(rng);
+          }
+          log.debug("complemented Leaf({})", label);
+          children[index] = new Leaf(label);
+          continue;
+        }
+        children[index] = build(rng, subsets[index]);
+      }
+
+      selected[best.getAttr()] = alreadySelected;
+
+      childNode = new CategoricalNode(best.getAttr(), values, children);
+    }
+
+    return childNode;
+  }
+
+  /**
+   * checks if all the vectors have identical attribute values. Ignore selected attributes.
+   *
+   * @return true if all the vectors are identical or the data is empty<br>
+   *         false otherwise
+   */
+  private boolean isIdentical(Data data) {
+    if (data.isEmpty()) {
+      return true;
+    }
+
+    Instance instance = data.get(0);
+    for (int attr = 0; attr < selected.length; attr++) {
+      if (selected[attr]) {
+        continue;
+      }
+
+      for (int index = 1; index < data.size(); index++) {
+        if (data.get(index).get(attr) != instance.get(attr)) {
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Make a copy of the selection state of the attributes, unselect all numerical attributes
+   *
+   * @param selected selection state to clone
+   * @return cloned selection state
+   */
+  private static boolean[] cloneCategoricalAttributes(Dataset dataset, boolean[] selected) {
+    boolean[] cloned = new boolean[selected.length];
+
+    for (int i = 0; i < selected.length; i++) {
+      cloned[i] = !dataset.isNumerical(i) && selected[i];
+    }
+    cloned[dataset.getLabelId()] = true;
+
+    return cloned;
+  }
+
+  /**
+   * Randomly selects m attributes to consider for split, excludes IGNORED and LABEL attributes
+   *
+   * @param rng      random-numbers generator
+   * @param selected attributes' state (selected or not)
+   * @param m        number of attributes to choose
+   * @return list of selected attributes' indices, or null if all attributes have already been selected
+   */
+  private static int[] randomAttributes(Random rng, boolean[] selected, int m) {
+    int nbNonSelected = 0; // number of non selected attributes
+    for (boolean sel : selected) {
+      if (!sel) {
+        nbNonSelected++;
+      }
+    }
+
+    if (nbNonSelected == 0) {
+      log.warn("All attributes are selected !");
+      return NO_ATTRIBUTES;
+    }
+
+    int[] result;
+    if (nbNonSelected <= m) {
+      // return all non selected attributes
+      result = new int[nbNonSelected];
+      int index = 0;
+      for (int attr = 0; attr < selected.length; attr++) {
+        if (!selected[attr]) {
+          result[index++] = attr;
+        }
+      }
+    } else {
+      result = new int[m];
+      for (int index = 0; index < m; index++) {
+        // randomly choose a "non selected" attribute
+        int rind;
+        do {
+          rind = rng.nextInt(selected.length);
+        } while (selected[rind]);
+
+        result[index] = rind;
+        selected[rind] = true; // temporarily set the chosen attribute to be selected
+      }
+
+      // restore the chosen attributes to their non-selected state
+      for (int attr : result) {
+        selected[attr] = false;
+      }
+    }
+
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java
new file mode 100644
index 0000000..3392fb1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java
@@ -0,0 +1,253 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.builder;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+import org.apache.mahout.classifier.df.node.CategoricalNode;
+import org.apache.mahout.classifier.df.node.Leaf;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.classifier.df.node.NumericalNode;
+import org.apache.mahout.classifier.df.split.IgSplit;
+import org.apache.mahout.classifier.df.split.OptIgSplit;
+import org.apache.mahout.classifier.df.split.Split;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Random;
+
/**
 * Builds a Decision Tree <br>
 * Based on the algorithm described in the "Decision Trees" tutorials by Andrew W. Moore, available at:<br>
 * <br>
 * http://www.cs.cmu.edu/~awm/tutorials
 * <br><br>
 * This class can be used when the criterion variable is the categorical attribute.
 * <p>
 * NOTE(review): instances keep mutable selection state ({@code selected}) across the
 * recursive {@link #build(Random, Data)} calls, so an instance is not thread-safe and
 * should not be shared between concurrent tree builds — confirm against callers.
 */
@Deprecated
public class DefaultTreeBuilder implements TreeBuilder {

  private static final Logger log = LoggerFactory.getLogger(DefaultTreeBuilder.class);

  /** Sentinel returned by {@link #randomAttributes} when every attribute is already selected. */
  private static final int[] NO_ATTRIBUTES = new int[0];

  /**
   * indicates which CATEGORICAL attributes have already been selected in the parent nodes;
   * lazily initialized on the first call to {@link #build(Random, Data)}
   */
  private boolean[] selected;
  /**
   * number of attributes to select randomly at each node (default 1, see {@link #setM(int)})
   */
  private int m = 1;
  /**
   * IgSplit implementation used to score candidate splits
   */
  private final IgSplit igSplit;

  public DefaultTreeBuilder() {
    igSplit = new OptIgSplit();
  }

  /** Sets the number of attributes considered (at random) at each node. */
  public void setM(int m) {
    this.m = m;
  }

  /**
   * Recursively builds a decision tree over {@code data}.
   * Returns a {@link Leaf} when the data is empty (label -1), when all remaining
   * attribute values are identical, or when all labels are identical; otherwise
   * picks the best split among {@code m} randomly chosen attributes and recurses.
   */
  @Override
  public Node build(Random rng, Data data) {

    if (selected == null) {
      selected = new boolean[data.getDataset().nbAttributes()];
      selected[data.getDataset().getLabelId()] = true; // never select the label
    }

    if (data.isEmpty()) {
      return new Leaf(-1);
    }
    if (isIdentical(data)) {
      return new Leaf(data.majorityLabel(rng));
    }
    if (data.identicalLabel()) {
      return new Leaf(data.getDataset().getLabel(data.get(0)));
    }

    int[] attributes = randomAttributes(rng, selected, m);
    if (attributes == null || attributes.length == 0) {
      // we tried all the attributes and could not split the data anymore
      return new Leaf(data.majorityLabel(rng));
    }

    // find the best split (highest information gain) among the candidate attributes
    Split best = null;
    for (int attr : attributes) {
      Split split = igSplit.computeSplit(data, attr);
      if (best == null || best.getIg() < split.getIg()) {
        best = split;
      }
    }

    // remember the prior selection state so it can be restored after the recursion
    boolean alreadySelected = selected[best.getAttr()];
    if (alreadySelected) {
      // attribute already selected
      log.warn("attribute {} already selected in a parent node", best.getAttr());
    }

    Node childNode;
    if (data.getDataset().isNumerical(best.getAttr())) {
      boolean[] temp = null;

      Data loSubset = data.subset(Condition.lesser(best.getAttr(), best.getSplit()));
      Data hiSubset = data.subset(Condition.greaterOrEquals(best.getAttr(), best.getSplit()));

      if (loSubset.isEmpty() || hiSubset.isEmpty()) {
        // the selected attribute did not change the data, avoid using it in the child nodes
        selected[best.getAttr()] = true;
      } else {
        // the data changed, so we can unselect all previously selected NUMERICAL attributes
        temp = selected;
        selected = cloneCategoricalAttributes(data.getDataset(), selected);
      }

      Node loChild = build(rng, loSubset);
      Node hiChild = build(rng, hiSubset);

      // restore the selection state of the attributes
      if (temp != null) {
        selected = temp;
      } else {
        selected[best.getAttr()] = alreadySelected;
      }

      childNode = new NumericalNode(best.getAttr(), best.getSplit(), loChild, hiChild);
    } else { // CATEGORICAL attribute
      // mark the attribute selected so the children cannot split on it again
      selected[best.getAttr()] = true;

      double[] values = data.values(best.getAttr());
      Node[] children = new Node[values.length];

      for (int index = 0; index < values.length; index++) {
        Data subset = data.subset(Condition.equals(best.getAttr(), values[index]));
        children[index] = build(rng, subset);
      }

      selected[best.getAttr()] = alreadySelected;

      childNode = new CategoricalNode(best.getAttr(), values, children);
    }

    return childNode;
  }

  /**
   * checks if all the vectors have identical attribute values. Ignores selected attributes.
   *
   * @return true is all the vectors are identical or the data is empty<br>
   *         false otherwise
   */
  private boolean isIdentical(Data data) {
    if (data.isEmpty()) {
      return true;
    }

    Instance instance = data.get(0);
    for (int attr = 0; attr < selected.length; attr++) {
      if (selected[attr]) {
        continue;
      }

      for (int index = 1; index < data.size(); index++) {
        if (data.get(index).get(attr) != instance.get(attr)) {
          return false;
        }
      }
    }

    return true;
  }


  /**
   * Make a copy of the selection state of the attributes, unselect all numerical attributes
   *
   * @param selected selection state to clone
   * @return cloned selection state
   */
  private static boolean[] cloneCategoricalAttributes(Dataset dataset, boolean[] selected) {
    boolean[] cloned = new boolean[selected.length];

    for (int i = 0; i < selected.length; i++) {
      // numerical attributes become unselected; categorical attributes keep their state
      cloned[i] = !dataset.isNumerical(i) && selected[i];
    }

    return cloned;
  }

  /**
   * Randomly selects m attributes to consider for split, excludes IGNORED and LABEL attributes
   *
   * @param rng      random-numbers generator
   * @param selected attributes' state (selected or not); temporarily mutated but restored on return
   * @param m        number of attributes to choose
   * @return list of selected attributes' indices, or {@link #NO_ATTRIBUTES} (empty) if all
   *         attributes have already been selected
   */
  protected static int[] randomAttributes(Random rng, boolean[] selected, int m) {
    int nbNonSelected = 0; // number of non selected attributes
    for (boolean sel : selected) {
      if (!sel) {
        nbNonSelected++;
      }
    }

    if (nbNonSelected == 0) {
      log.warn("All attributes are selected !");
      return NO_ATTRIBUTES;
    }

    int[] result;
    if (nbNonSelected <= m) {
      // return all non selected attributes
      result = new int[nbNonSelected];
      int index = 0;
      for (int attr = 0; attr < selected.length; attr++) {
        if (!selected[attr]) {
          result[index++] = attr;
        }
      }
    } else {
      result = new int[m];
      for (int index = 0; index < m; index++) {
        // randomly choose a "non selected" attribute
        int rind;
        do {
          rind = rng.nextInt(selected.length);
        } while (selected[rind]);

        result[index] = rind;
        selected[rind] = true; // temporarily set the chosen attribute to be selected
      }

      // restore: the chosen attributes must be left unselected for the caller
      for (int attr : result) {
        selected[attr] = false;
      }
    }

    return result;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java
new file mode 100644
index 0000000..bf686a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.builder;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.node.Node;
+
+import java.util.Random;
+
/**
 * Contract for decision-tree builders: given training data and a source of
 * randomness, produce the root {@link Node} of a decision tree.
 * (Despite the historical wording, this is an interface, not an abstract class.)
 */
@Deprecated
public interface TreeBuilder {
  
  /**
   * Builds a Decision tree using the training data
   * 
   * @param rng
   *          random-numbers generator
   * @param data
   *          training data
   * @return root Node
   */
  Node build(Random rng, Data data);
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java
new file mode 100644
index 0000000..77e5ed5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java
@@ -0,0 +1,281 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Holds a list of vectors and their corresponding Dataset. contains various operations that deals with the
+ * vectors (subset, count,...)
+ * 
+ */
+@Deprecated
+public class Data implements Cloneable {
+  
+  private final List<Instance> instances;
+  
+  private final Dataset dataset;
+
+  public Data(Dataset dataset) {
+    this.dataset = dataset;
+    this.instances = new ArrayList<>();
+  }
+
+  public Data(Dataset dataset, List<Instance> instances) {
+    this.dataset = dataset;
+    this.instances = new ArrayList<>(instances);
+  }
+  
+  /**
+   * @return the number of elements
+   */
+  public int size() {
+    return instances.size();
+  }
+  
+  /**
+   * @return true if this data contains no element
+   */
+  public boolean isEmpty() {
+    return instances.isEmpty();
+  }
+  
+  /**
+   * @param v
+   *          element whose presence in this list if to be searched
+   * @return true is this data contains the specified element.
+   */
+  public boolean contains(Instance v) {
+    return instances.contains(v);
+  }
+
+    /**
+   * Returns the element at the specified position
+   * 
+   * @param index
+   *          index of element to return
+   * @return the element at the specified position
+   * @throws IndexOutOfBoundsException
+   *           if the index is out of range
+   */
+  public Instance get(int index) {
+    return instances.get(index);
+  }
+  
+  /**
+   * @return the subset from this data that matches the given condition
+   */
+  public Data subset(Condition condition) {
+    List<Instance> subset = new ArrayList<>();
+    
+    for (Instance instance : instances) {
+      if (condition.isTrueFor(instance)) {
+        subset.add(instance);
+      }
+    }
+    
+    return new Data(dataset, subset);
+  }
+
+    /**
+   * if data has N cases, sample N cases at random -but with replacement.
+   */
+  public Data bagging(Random rng) {
+    int datasize = size();
+    List<Instance> bag = new ArrayList<>(datasize);
+    
+    for (int i = 0; i < datasize; i++) {
+      bag.add(instances.get(rng.nextInt(datasize)));
+    }
+    
+    return new Data(dataset, bag);
+  }
+  
+  /**
+   * if data has N cases, sample N cases at random -but with replacement.
+   * 
+   * @param sampled
+   *          indicating which instance has been sampled
+   * 
+   * @return sampled data
+   */
+  public Data bagging(Random rng, boolean[] sampled) {
+    int datasize = size();
+    List<Instance> bag = new ArrayList<>(datasize);
+    
+    for (int i = 0; i < datasize; i++) {
+      int index = rng.nextInt(datasize);
+      bag.add(instances.get(index));
+      sampled[index] = true;
+    }
+    
+    return new Data(dataset, bag);
+  }
+  
+  /**
+   * Splits the data in two, returns one part, and this gets the rest of the data. <b>VERY SLOW!</b>
+   */
+  public Data rsplit(Random rng, int subsize) {
+    List<Instance> subset = new ArrayList<>(subsize);
+    
+    for (int i = 0; i < subsize; i++) {
+      subset.add(instances.remove(rng.nextInt(instances.size())));
+    }
+    
+    return new Data(dataset, subset);
+  }
+  
+  /**
+   * checks if all the vectors have identical attribute values
+   * 
+   * @return true is all the vectors are identical or the data is empty<br>
+   *         false otherwise
+   */
+  public boolean isIdentical() {
+    if (isEmpty()) {
+      return true;
+    }
+    
+    Instance instance = get(0);
+    for (int attr = 0; attr < dataset.nbAttributes(); attr++) {
+      for (int index = 1; index < size(); index++) {
+        if (get(index).get(attr) != instance.get(attr)) {
+          return false;
+        }
+      }
+    }
+    
+    return true;
+  }
+  
+  /**
+   * checks if all the vectors have identical label values
+   */
+  public boolean identicalLabel() {
+    if (isEmpty()) {
+      return true;
+    }
+    
+    double label = dataset.getLabel(get(0));
+    for (int index = 1; index < size(); index++) {
+      if (dataset.getLabel(get(index)) != label) {
+        return false;
+      }
+    }
+    
+    return true;
+  }
+  
+  /**
+   * finds all distinct values of a given attribute
+   */
+  public double[] values(int attr) {
+    Collection<Double> result = new HashSet<>();
+    
+    for (Instance instance : instances) {
+      result.add(instance.get(attr));
+    }
+    
+    double[] values = new double[result.size()];
+    
+    int index = 0;
+    for (Double value : result) {
+      values[index++] = value;
+    }
+    
+    return values;
+  }
+  
+  @Override
+  public Data clone() {
+    return new Data(dataset, new ArrayList<>(instances));
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof Data)) {
+      return false;
+    }
+    
+    Data data = (Data) obj;
+    
+    return instances.equals(data.instances) && dataset.equals(data.dataset);
+  }
+  
+  @Override
+  public int hashCode() {
+    return instances.hashCode() + dataset.hashCode();
+  }
+  
+  /**
+   * extract the labels of all instances
+   */
+  public double[] extractLabels() {
+    double[] labels = new double[size()];
+    
+    for (int index = 0; index < labels.length; index++) {
+      labels[index] = dataset.getLabel(get(index));
+    }
+    
+    return labels;
+  }
+
+    /**
+   * finds the majority label, breaking ties randomly<br>
+   * This method can be used when the criterion variable is the categorical attribute.
+   *
+   * @return the majority label value
+   */
+  public int majorityLabel(Random rng) {
+    // count the frequency of each label value
+    int[] counts = new int[dataset.nblabels()];
+    
+    for (int index = 0; index < size(); index++) {
+      counts[(int) dataset.getLabel(get(index))]++;
+    }
+    
+    // find the label values that appears the most
+    return DataUtils.maxindex(rng, counts);
+  }
+  
+  /**
+   * Counts the number of occurrences of each label value<br>
+   * This method can be used when the criterion variable is the categorical attribute.
+   * 
+   * @param counts
+   *          will contain the results, supposed to be initialized at 0
+   */
+  public void countLabels(int[] counts) {
+    for (int index = 0; index < size(); index++) {
+      counts[(int) dataset.getLabel(get(index))]++;
+    }
+  }
+  
+  public Dataset getDataset() {
+    return dataset;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java
new file mode 100644
index 0000000..f1bdc95
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.mahout.math.DenseVector;
+
+import java.util.regex.Pattern;
+
+/**
+ * Converts String to Instance using a Dataset
+ */
+@Deprecated
+public class DataConverter {
+
+  private static final Pattern COMMA_SPACE = Pattern.compile("[, ]");
+
+  private final Dataset dataset;
+
+  public DataConverter(Dataset dataset) {
+    this.dataset = dataset;
+  }
+
+  public Instance convert(CharSequence string) {
+    // all attributes (categorical, numerical, label), ignored
+    int nball = dataset.nbAttributes() + dataset.getIgnored().length;
+
+    String[] tokens = COMMA_SPACE.split(string);
+    Preconditions.checkArgument(tokens.length == nball,
+        "Wrong number of attributes in the string: " + tokens.length + ". Must be " + nball);
+
+    int nbattrs = dataset.nbAttributes();
+    DenseVector vector = new DenseVector(nbattrs);
+
+    int aId = 0;
+    for (int attr = 0; attr < nball; attr++) {
+      if (!ArrayUtils.contains(dataset.getIgnored(), attr)) {
+        String token = tokens[attr].trim();
+
+        if ("?".equals(token)) {
+          // missing value
+          return null;
+        }
+
+        if (dataset.isNumerical(aId)) {
+          vector.set(aId++, Double.parseDouble(token));
+        } else { // CATEGORICAL
+          vector.set(aId, dataset.valueOf(aId, token));
+          aId++;
+        }
+      }
+    }
+
+    return new Instance(vector);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
new file mode 100644
index 0000000..c62dcac
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
@@ -0,0 +1,255 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.df.data.Dataset.Attribute;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Converts the input data to a Vector Array using the information given by the Dataset.<br>
+ * Generates for each line a Vector that contains :<br>
+ * <ul>
+ * <li>double parsed value for NUMERICAL attributes</li>
+ * <li>int value for CATEGORICAL and LABEL attributes</li>
+ * </ul>
+ * <br>
+ * adds an IGNORED first attribute that will contain a unique id for each instance, which is the line number
+ * of the instance in the input data
+ */
+@Deprecated
+public final class DataLoader {
+
+  private static final Logger log = LoggerFactory.getLogger(DataLoader.class);
+
+  private static final Pattern SEPARATORS = Pattern.compile("[, ]");
+
+  private DataLoader() {}
+
+  /**
+   * Converts a comma-separated String to a Vector.
+   * 
+   * @param attrs
+   *          attributes description
+   * @param values
+   *          used to convert CATEGORICAL attribute values to Integer
+   * @return false if there are missing values '?' or NUMERICAL attribute values is not numeric
+   */
+  private static boolean parseString(Attribute[] attrs, Set<String>[] values, CharSequence string,
+    boolean regression) {
+    String[] tokens = SEPARATORS.split(string);
+    Preconditions.checkArgument(tokens.length == attrs.length,
+        "Wrong number of attributes in the string: " + tokens.length + ". Must be: " + attrs.length);
+
+    // extract tokens and check is there is any missing value
+    for (int attr = 0; attr < attrs.length; attr++) {
+      if (!attrs[attr].isIgnored() && "?".equals(tokens[attr])) {
+        return false; // missing value
+      }
+    }
+
+    for (int attr = 0; attr < attrs.length; attr++) {
+      if (!attrs[attr].isIgnored()) {
+        String token = tokens[attr];
+        if (attrs[attr].isCategorical() || (!regression && attrs[attr].isLabel())) {
+          // update values
+          if (values[attr] == null) {
+            values[attr] = new HashSet<>();
+          }
+          values[attr].add(token);
+        } else {
+          try {
+            Double.parseDouble(token);
+          } catch (NumberFormatException e) {
+            return false;
+          }
+        }
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Loads the data from a file
+   * 
+   * @param fs
+   *          file system
+   * @param fpath
+   *          data file path
+   * @throws IOException
+   *           if any problem is encountered
+   */
+
+  public static Data loadData(Dataset dataset, FileSystem fs, Path fpath) throws IOException {
+    FSDataInputStream input = fs.open(fpath);
+    Scanner scanner = new Scanner(input, "UTF-8");
+
+    List<Instance> instances = new ArrayList<>();
+
+    DataConverter converter = new DataConverter(dataset);
+
+    while (scanner.hasNextLine()) {
+      String line = scanner.nextLine();
+      if (!line.isEmpty()) {
+        Instance instance = converter.convert(line);
+        if (instance != null) {
+          instances.add(instance);
+        } else {
+          // missing values found
+          log.warn("{}: missing values", instances.size());
+        }
+      } else {
+        log.warn("{}: empty string", instances.size());
+      }
+    }
+
+    scanner.close();
+    return new Data(dataset, instances);
+  }
+
+
+  /** Loads the data from multiple paths specified by pathes */
+  public static Data loadData(Dataset dataset, FileSystem fs, Path[] pathes) throws IOException {
+    List<Instance> instances = new ArrayList<>();
+
+    for (Path path : pathes) {
+      Data loadedData = loadData(dataset, fs, path);
+      for (int index = 0; index <= loadedData.size(); index++) {
+        instances.add(loadedData.get(index));
+      }
+    }
+    return new Data(dataset, instances);
+  }
+
+  /** Loads the data from a String array */
+  public static Data loadData(Dataset dataset, String[] data) {
+    List<Instance> instances = new ArrayList<>();
+
+    DataConverter converter = new DataConverter(dataset);
+
+    for (String line : data) {
+      if (!line.isEmpty()) {
+        Instance instance = converter.convert(line);
+        if (instance != null) {
+          instances.add(instance);
+        } else {
+          // missing values found
+          log.warn("{}: missing values", instances.size());
+        }
+      } else {
+        log.warn("{}: empty string", instances.size());
+      }
+    }
+
+    return new Data(dataset, instances);
+  }
+
+  /**
+   * Generates the Dataset by parsing the entire data
+   * 
+   * @param descriptor  attributes description
+   * @param regression  if true, the label is numerical
+   * @param fs  file system
+   * @param path  data path
+   */
+  public static Dataset generateDataset(CharSequence descriptor,
+                                        boolean regression,
+                                        FileSystem fs,
+                                        Path path) throws DescriptorException, IOException {
+    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
+
+    FSDataInputStream input = fs.open(path);
+    Scanner scanner = new Scanner(input, "UTF-8");
+
+    // used to convert CATEGORICAL attribute to Integer
+    @SuppressWarnings("unchecked")
+    Set<String>[] valsets = new Set[attrs.length];
+
+    int size = 0;
+    while (scanner.hasNextLine()) {
+      String line = scanner.nextLine();
+      if (!line.isEmpty()) {
+        if (parseString(attrs, valsets, line, regression)) {
+          size++;
+        }
+      }
+    }
+
+    scanner.close();
+
+    @SuppressWarnings("unchecked")
+    List<String>[] values = new List[attrs.length];
+    for (int i = 0; i < valsets.length; i++) {
+      if (valsets[i] != null) {
+        values[i] = Lists.newArrayList(valsets[i]);
+      }
+    }
+
+    return new Dataset(attrs, values, size, regression);
+  }
+
+  /**
+   * Generates the Dataset by parsing the entire data
+   * 
+   * @param descriptor
+   *          attributes description
+   */
+  public static Dataset generateDataset(CharSequence descriptor,
+                                        boolean regression,
+                                        String[] data) throws DescriptorException {
+    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
+
+    // used to convert CATEGORICAL attributes to Integer
+    @SuppressWarnings("unchecked")
+    Set<String>[] valsets = new Set[attrs.length];
+
+    int size = 0;
+    for (String aData : data) {
+      if (!aData.isEmpty()) {
+        if (parseString(attrs, valsets, aData, regression)) {
+          size++;
+        }
+      }
+    }
+
+    @SuppressWarnings("unchecked")
+    List<String>[] values = new List[attrs.length];
+    for (int i = 0; i < valsets.length; i++) {
+      if (valsets[i] != null) {
+        values[i] = Lists.newArrayList(valsets[i]);
+      }
+    }
+
+    return new Dataset(attrs, values, size, regression);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java
new file mode 100644
index 0000000..0889370
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Helper methods that deals with data lists and arrays of values
+ */
+@Deprecated
+public final class DataUtils {
+  private DataUtils() { }
+  
+  /**
+   * Computes the sum of the values
+   * 
+   */
+  public static int sum(int[] values) {
+    int sum = 0;
+    for (int value : values) {
+      sum += value;
+    }
+    
+    return sum;
+  }
+  
+  /**
+   * foreach i : array1[i] += array2[i]
+   */
+  public static void add(int[] array1, int[] array2) {
+    Preconditions.checkArgument(array1.length == array2.length, "array1.length != array2.length");
+    for (int index = 0; index < array1.length; index++) {
+      array1[index] += array2[index];
+    }
+  }
+  
+  /**
+   * foreach i : array1[i] -= array2[i]
+   */
+  public static void dec(int[] array1, int[] array2) {
+    Preconditions.checkArgument(array1.length == array2.length, "array1.length != array2.length");
+    for (int index = 0; index < array1.length; index++) {
+      array1[index] -= array2[index];
+    }
+  }
+  
+  /**
+   * return the index of the maximum of the array, breaking ties randomly
+   * 
+   * @param rng
+   *          used to break ties
+   * @return index of the maximum
+   */
+  public static int maxindex(Random rng, int[] values) {
+    int max = 0;
+    List<Integer> maxindices = new ArrayList<>();
+    
+    for (int index = 0; index < values.length; index++) {
+      if (values[index] > max) {
+        max = values[index];
+        maxindices.clear();
+        maxindices.add(index);
+      } else if (values[index] == max) {
+        maxindices.add(index);
+      }
+    }
+
+    return maxindices.size() > 1 ? maxindices.get(rng.nextInt(maxindices.size())) : maxindices.get(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
new file mode 100644
index 0000000..a392669
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
@@ -0,0 +1,422 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.type.TypeReference;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
/**
 * Contains information about the attributes of a dataset: the type of each
 * column, the distinct values of categorical columns, which columns are
 * ignored, and which column is the label. Built by a DataLoader or restored
 * from the JSON form produced by {@link #toJSON()}.
 */
@Deprecated
public class Dataset {

  /**
   * Attributes type
   */
  public enum Attribute {
    IGNORED,
    NUMERICAL,
    CATEGORICAL,
    LABEL;

    public boolean isNumerical() {
      return this == NUMERICAL;
    }

    public boolean isCategorical() {
      return this == CATEGORICAL;
    }

    public boolean isLabel() {
      return this == LABEL;
    }

    public boolean isIgnored() {
      return this == IGNORED;
    }
    
    /**
     * Case-insensitive parse of an attribute type name.
     * NOTE(review): any unrecognized token silently maps to LABEL — confirm
     * this fallback is intended before relying on it.
     */
    private static Attribute fromString(String from) {
      Attribute toReturn = LABEL;
      if (NUMERICAL.toString().equalsIgnoreCase(from)) {
        toReturn = NUMERICAL;
      } else if (CATEGORICAL.toString().equalsIgnoreCase(from)) {
        toReturn = CATEGORICAL;
      } else if (IGNORED.toString().equalsIgnoreCase(from)) {
        toReturn = IGNORED;
      }
      return toReturn;
    }
  }

  /** type of each loaded attribute (IGNORED columns excluded, the label included) */
  private Attribute[] attributes;

  /**
   * list of ignored attributes (original column indices)
   */
  private int[] ignored;

  /**
   * distinct values (CATEGORICAL attributes only; rows are null for numerical attributes)
   */
  private String[][] values;

  /**
   * index of the label attribute in the loaded data (without ignored attributes)
   */
  private int labelId;

  /**
   * number of instances in the dataset
   */
  private int nbInstances;
  
  /** JSON serial/de-serial-izer (shared; ObjectMapper is thread-safe) */
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  // Some literals for JSON representation
  static final String TYPE = "type";
  static final String VALUES = "values";
  static final String LABEL = "label";

  /** No-arg constructor reserved for {@link #fromJSON(String)}. */
  protected Dataset() {}

  /**
   * Should only be called by a DataLoader
   *
   * @param attrs  attributes description
   * @param values distinct values for all CATEGORICAL attributes
   * @param nbInstances number of instances in the dataset
   * @param regression if true the label is stored as NUMERICAL, otherwise
   *          as CATEGORICAL
   * @throws IllegalStateException if no LABEL attribute is found, or more than one
   */
  Dataset(Attribute[] attrs, List<String>[] values, int nbInstances, boolean regression) {
    validateValues(attrs, values);

    int nbattrs = countAttributes(attrs);

    // the label values are set apart
    attributes = new Attribute[nbattrs];
    this.values = new String[nbattrs][];
    ignored = new int[attrs.length - nbattrs]; // nbignored = total - nbattrs

    labelId = -1;
    int ignoredId = 0;
    int ind = 0; // write position in the compacted (non-ignored) arrays
    for (int attr = 0; attr < attrs.length; attr++) {
      if (attrs[attr].isIgnored()) {
        ignored[ignoredId++] = attr;
        continue;
      }

      if (attrs[attr].isLabel()) {
        if (labelId != -1) {
          throw new IllegalStateException("Label found more than once");
        }
        labelId = ind;
        // NOTE(review): this rewrites the caller's attrs array in place,
        // replacing LABEL with a concrete type — confirm no caller depends
        // on the original array contents.
        if (regression) {
          attrs[attr] = Attribute.NUMERICAL;
        } else {
          attrs[attr] = Attribute.CATEGORICAL;
        }
      }

      // keep the distinct-value table for categorical columns (the label
      // included, unless this is a regression problem)
      if (attrs[attr].isCategorical() || (!regression && attrs[attr].isLabel())) {
        this.values[ind] = new String[values[attr].size()];
        values[attr].toArray(this.values[ind]);
      }

      attributes[ind++] = attrs[attr];
    }

    if (labelId == -1) {
      throw new IllegalStateException("Label not found");
    }

    this.nbInstances = nbInstances;
  }

  /**
   * @param attr attribute index (in the compacted, non-ignored numbering)
   * @return number of distinct values of the given CATEGORICAL attribute
   */
  public int nbValues(int attr) {
    return values[attr].length;
  }

  /** @return a defensive copy of the distinct label values */
  public String[] labels() {
    return Arrays.copyOf(values[labelId], nblabels());
  }

  /** @return number of distinct label values */
  public int nblabels() {
    return values[labelId].length;
  }

  /** @return index of the label attribute (ignored attributes excluded) */
  public int getLabelId() {
    return labelId;
  }

  /** @return the encoded label value stored in the given instance */
  public double getLabel(Instance instance) {
    return instance.get(getLabelId());
  }
  
  /** @return type of the attribute at the given (non-ignored) index */
  public Attribute getAttribute(int attr) {
    return attributes[attr];
  }

  /**
   * Returns the code used to represent the label value in the data
   *
   * @param label label's value to code
   * @return label's code, or -1 if the label value is unknown
   */
  public int labelCode(String label) {
    return ArrayUtils.indexOf(values[labelId], label);
  }

  /**
   * Returns the label value in the data
   * This method can be used when the criterion variable is the categorical attribute.
   *
   * @param code label's code
   * @return label's value, or "unknown" when the code is NaN
   */
  public String getLabelString(double code) {
    // handle the case (prediction is NaN)
    if (Double.isNaN(code)) {
      return "unknown";
    }
    return values[labelId][(int) code];
  }
  
  @Override
  public String toString() {
    return "attributes=" + Arrays.toString(attributes);
  }

  /**
   * Converts a token to its corresponding integer code for a given attribute
   *
   * @param attr attribute index
   * @param token value to look up among the attribute's distinct values
   * @return index of the token, or -1 if absent
   */
  public int valueOf(int attr, String token) {
    Preconditions.checkArgument(!isNumerical(attr), "Only for CATEGORICAL attributes");
    // NOTE(review): this checks the whole values table for null, not
    // values[attr] — a null row for this attribute would still pass here.
    Preconditions.checkArgument(values != null, "Values not found (equals null)");
    return ArrayUtils.indexOf(values[attr], token);
  }

  /** @return original column indices of the ignored attributes */
  public int[] getIgnored() {
    return ignored;
  }

  /**
   * @return number of attributes that are not IGNORED
   */
  private static int countAttributes(Attribute[] attrs) {
    int nbattrs = 0;
    for (Attribute attr : attrs) {
      if (!attr.isIgnored()) {
        nbattrs++;
      }
    }
    return nbattrs;
  }

  /** Ensures every CATEGORICAL attribute comes with its list of distinct values. */
  private static void validateValues(Attribute[] attrs, List<String>[] values) {
    Preconditions.checkArgument(attrs.length == values.length, "attrs.length != values.length");
    for (int attr = 0; attr < attrs.length; attr++) {
      Preconditions.checkArgument(!attrs[attr].isCategorical() || values[attr] != null,
          "values not found for attribute " + attr);
    }
  }

  /**
   * @return number of attributes
   */
  public int nbAttributes() {
    return attributes.length;
  }

  /**
   * Is this a numerical attribute ?
   *
   * @param attr index of the attribute to check
   * @return true if the attribute is numerical
   */
  public boolean isNumerical(int attr) {
    return attributes[attr].isNumerical();
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof Dataset)) {
      return false;
    }

    Dataset dataset = (Dataset) obj;

    if (!Arrays.equals(attributes, dataset.attributes)) {
      return false;
    }

    // compare the distinct-value tables row by row (rows may be null;
    // Arrays.equals(null, null) is true)
    for (int attr = 0; attr < nbAttributes(); attr++) {
      if (!Arrays.equals(values[attr], dataset.values[attr])) {
        return false;
      }
    }

    return labelId == dataset.labelId && nbInstances == dataset.nbInstances;
  }

  @Override
  public int hashCode() {
    int hashCode = labelId + 31 * nbInstances;
    for (Attribute attr : attributes) {
      hashCode = 31 * hashCode + attr.hashCode();
    }
    for (String[] valueRow : values) {
      // rows are null for numerical attributes
      if (valueRow == null) {
        continue;
      }
      for (String value : valueRow) {
        hashCode = 31 * hashCode + value.hashCode();
      }
    }
    return hashCode;
  }

  /**
   * Loads the dataset from a file
   *
   * @param conf Hadoop configuration used to resolve the file system
   * @param path location of the JSON representation written by {@link #toJSON()}
   * @throws java.io.IOException
   */
  public static Dataset load(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    long bytesToRead = fs.getFileStatus(path).getLen();
    byte[] buff = new byte[Long.valueOf(bytesToRead).intValue()];
    FSDataInputStream input = fs.open(path);
    try {
      input.readFully(buff);
    } finally {
      Closeables.close(input, true);
    }
    // NOTE(review): decodes with the platform default charset — confirm the
    // writer used the same encoding; UTF-8 would be the safer explicit choice.
    String json = new String(buff, Charset.defaultCharset());
    return fromJSON(json);
  }
  

  /**
   * Serialize this instance to JSON
   * @return some JSON
   */
  public String toJSON() {
    List<Map<String, Object>> toWrite = new LinkedList<>();
    // attributes does not include ignored columns and it does include the class label
    int ignoredCount = 0;
    for (int i = 0; i < attributes.length + ignored.length; i++) {
      Map<String, Object> attribute;
      // map the full column index i back into the compacted attributes array
      int attributesIndex = i - ignoredCount;
      if (ignoredCount < ignored.length && i == ignored[ignoredCount]) {
        // fill in ignored attribute
        attribute = getMap(Attribute.IGNORED, null, false);
        ignoredCount++;
      } else if (attributesIndex == labelId) {
        // fill in the label
        attribute = getMap(attributes[attributesIndex], values[attributesIndex], true);
      } else  {
        // normal attribute
        attribute = getMap(attributes[attributesIndex], values[attributesIndex], false);
      }
      toWrite.add(attribute);
    }
    try {
      return OBJECT_MAPPER.writeValueAsString(toWrite);
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * De-serialize an instance from a string
   * @param json From which an instance is created
   * @return A shiny new Dataset
   */
  public static Dataset fromJSON(String json) {
    List<Map<String, Object>> fromJSON;
    try {
      fromJSON = OBJECT_MAPPER.readValue(json, new TypeReference<List<Map<String, Object>>>() {});
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
    List<Attribute> attributes = new LinkedList<>();
    List<Integer> ignored = new LinkedList<>();
    // sized on the full column count; trailing slots stay null because
    // ignored columns get no row in the compacted numbering
    String[][] nominalValues = new String[fromJSON.size()][];
    Dataset dataset = new Dataset();
    for (int i = 0; i < fromJSON.size(); i++) {
      Map<String, Object> attribute = fromJSON.get(i);
      if (Attribute.fromString((String) attribute.get(TYPE)) == Attribute.IGNORED) {
        ignored.add(i);
      } else {
        Attribute asAttribute = Attribute.fromString((String) attribute.get(TYPE));
        attributes.add(asAttribute);
        if ((Boolean) attribute.get(LABEL)) {
          // i - ignored.size() = index in the compacted numbering
          dataset.labelId = i - ignored.size();
        }
        if (attribute.get(VALUES) != null) {
          // unchecked cast: toJSON() always writes VALUES as a list of strings
          List<String> get = (List<String>) attribute.get(VALUES);
          String[] array = get.toArray(new String[get.size()]);
          nominalValues[i - ignored.size()] = array;
        }
      }
    }
    dataset.attributes = attributes.toArray(new Attribute[attributes.size()]);
    dataset.ignored = new int[ignored.size()];
    dataset.values = nominalValues;
    for (int i = 0; i < dataset.ignored.length; i++) {
      dataset.ignored[i] = ignored.get(i);
    }
    return dataset;
  }
  
  /**
   * Generate a map to describe an attribute
   * @param type The type
   * @param values - values (null for non-categorical attributes)
   * @param isLabel - is a label
   * @return map of (AttributeTypes, Values)
   */
  private Map<String, Object> getMap(Attribute type, String[] values, boolean isLabel) {
    Map<String, Object> attribute = new HashMap<>();
    attribute.put(TYPE, type.toString().toLowerCase(Locale.getDefault()));
    attribute.put(VALUES, values);
    attribute.put(LABEL, isLabel);
    return attribute;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java
new file mode 100644
index 0000000..e7a10ff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
/**
 * Exception thrown when parsing a descriptor
 */
@Deprecated
public class DescriptorException extends Exception {

  // Exception is Serializable; pin the serial form explicitly instead of
  // relying on the compiler-generated value.
  private static final long serialVersionUID = 1L;

  public DescriptorException(String msg) {
    super(msg);
  }

  /**
   * Variant that preserves the underlying parse failure for diagnostics
   * (e.g. a NumberFormatException) instead of dropping the cause.
   *
   * @param msg detail message
   * @param cause underlying failure
   */
  public DescriptorException(String msg, Throwable cause) {
    super(msg, cause);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java
new file mode 100644
index 0000000..aadedbd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Splitter;
+import org.apache.mahout.classifier.df.data.Dataset.Attribute;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
/**
 * Contains various methods that deal with descriptor strings
 */
@Deprecated
public final class DescriptorUtils {

  /** splits descriptors on single spaces, dropping empty tokens */
  private static final Splitter SPACE = Splitter.on(' ').omitEmptyStrings();

  private DescriptorUtils() { }
  
  /**
   * Parses a descriptor string and generates the corresponding array of Attributes.
   * Recognized tokens, case-insensitive: "I" (ignored), "N" (numerical),
   * "C" (categorical), "L" (label).
   * 
   * @throws DescriptorException
   *           if a bad token is encountered
   */
  public static Attribute[] parseDescriptor(CharSequence descriptor) throws DescriptorException {
    List<Attribute> attributes = new ArrayList<>();
    for (String token : SPACE.split(descriptor)) {
      token = token.toUpperCase(Locale.ENGLISH);
      if ("I".equals(token)) {
        attributes.add(Attribute.IGNORED);
      } else if ("N".equals(token)) {
        attributes.add(Attribute.NUMERICAL);
      } else if ("C".equals(token)) {
        attributes.add(Attribute.CATEGORICAL);
      } else if ("L".equals(token)) {
        attributes.add(Attribute.LABEL);
      } else {
        throw new DescriptorException("Bad Token : " + token);
      }
    }
    return attributes.toArray(new Attribute[attributes.size()]);
  }
  
  /**
   * Generates a valid descriptor string from a user-friendly representation.<br>
   * for example "3 N I N N 2 C L 5 I" generates "N N N I N N C C L I I I I I".<br>
   * this is useful when describing datasets with a large number of attributes
   * @throws DescriptorException if the compact form is invalid
   */
  public static String generateDescriptor(CharSequence description) throws DescriptorException {
    return generateDescriptor(SPACE.split(description));
  }
  
  /**
   * Generates a valid descriptor string from a list of tokens.
   * A numeric token acts as a multiplicator for the single token that follows it.
   * NOTE(review): a trailing multiplicator with nothing after it ("N 3") is
   * silently discarded — confirm that is acceptable to callers.
   *
   * @throws DescriptorException if a multiplicator is not strictly positive,
   *           or if two multiplicators follow each other
   */
  public static String generateDescriptor(Iterable<String> tokens) throws DescriptorException {
    StringBuilder descriptor = new StringBuilder();
    
    // pending multiplicator; 0 means "none seen since the last attribute token"
    int multiplicator = 0;
    
    for (String token : tokens) {
      try {
        // try to parse an integer
        int number = Integer.parseInt(token);
        
        if (number <= 0) {
          throw new DescriptorException("Multiplicator (" + number + ") must be > 0");
        }
        if (multiplicator > 0) {
          throw new DescriptorException("A multiplicator cannot be followed by another multiplicator");
        }
        
        multiplicator = number;
      } catch (NumberFormatException e) {
        // token is not a number: it is an attribute token, so emit it
        // multiplicator times (once when no multiplicator is pending)
        if (multiplicator == 0) {
          multiplicator = 1;
        }
        
        for (int index = 0; index < multiplicator; index++) {
          descriptor.append(token).append(' ');
        }
        
        multiplicator = 0;
      }
    }
    
    // trim the trailing separator appended after the last token
    return descriptor.toString().trim();
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java
new file mode 100644
index 0000000..6a23cb8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * Represents one data instance.
+ */
+@Deprecated
+public class Instance {
+  
+  /** attributes, except LABEL and IGNORED */
+  private final Vector attrs;
+  
+  public Instance(Vector attrs) {
+    this.attrs = attrs;
+  }
+  
+  /**
+   * Return the attribute at the specified position
+   * 
+   * @param index
+   *          position of the attribute to retrieve
+   * @return value of the attribute
+   */
+  public double get(int index) {
+    return attrs.getQuick(index);
+  }
+  
+  /**
+   * Set the value at the given index
+   * 
+   * @param value
+   *          a double value to set
+   */
+  public void set(int index, double value) {
+    attrs.set(index, value);
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof Instance)) {
+      return false;
+    }
+    
+    Instance instance = (Instance) obj;
+    
+    return /*id == instance.id &&*/ attrs.equals(instance.attrs);
+    
+  }
+  
+  @Override
+  public int hashCode() {
+    return /*id +*/ attrs.hashCode();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java
new file mode 100644
index 0000000..c16ca3f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+/**
+ * Condition on Instance
+ */
+@Deprecated
+public abstract class Condition {
+  
+  /**
+   * Returns true is the checked instance matches the condition
+   * 
+   * @param instance
+   *          checked instance
+   * @return true is the checked instance matches the condition
+   */
+  public abstract boolean isTrueFor(Instance instance);
+  
+  /**
+   * Condition that checks if the given attribute has a value "equal" to the given value
+   */
+  public static Condition equals(int attr, double value) {
+    return new Equals(attr, value);
+  }
+  
+  /**
+   * Condition that checks if the given attribute has a value "lesser" than the given value
+   */
+  public static Condition lesser(int attr, double value) {
+    return new Lesser(attr, value);
+  }
+  
+  /**
+   * Condition that checks if the given attribute has a value "greater or equal" than the given value
+   */
+  public static Condition greaterOrEquals(int attr, double value) {
+    return new GreaterOrEquals(attr, value);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java
new file mode 100644
index 0000000..c51082b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+/**
+ * True if a given attribute has a given value
+ */
+@Deprecated
+public class Equals extends Condition {
+  
+  private final int attr;
+  
+  private final double value;
+  
+  public Equals(int attr, double value) {
+    this.attr = attr;
+    this.value = value;
+  }
+  
+  @Override
+  public boolean isTrueFor(Instance instance) {
+    return instance.get(attr) == value;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java
new file mode 100644
index 0000000..3e3d1a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+/**
+ * True if a given attribute has a value "greater or equal" than a given value
+ */
+@Deprecated
+public class GreaterOrEquals extends Condition {
+  
+  private final int attr;
+  
+  private final double value;
+  
+  public GreaterOrEquals(int attr, double value) {
+    this.attr = attr;
+    this.value = value;
+  }
+  
+  @Override
+  public boolean isTrueFor(Instance v) {
+    return v.get(attr) >= value;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java
new file mode 100644
index 0000000..577cb24
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+/**
+ * True if a given attribute has a value "lesser" than a given value
+ */
+@Deprecated
+public class Lesser extends Condition {
+  
+  private final int attr;
+  
+  private final double value;
+  
+  public Lesser(int attr, double value) {
+    this.attr = attr;
+    this.value = value;
+  }
+  
+  @Override
+  public boolean isTrueFor(Instance instance) {
+    return instance.get(attr) < value;
+  }
+  
+}


[43/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
deleted file mode 100644
index bd1149b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.streaming.tools;
-
-import com.google.common.base.Function;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterables;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
-import org.apache.mahout.math.Centroid;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-public class IOUtils {
-
-  private IOUtils() {}
-
-  /**
-   * Converts CentroidWritable values in a sequence file into Centroids lazily.
-   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
-   * @return an Iterable<Centroid> with the converted vectors.
-   */
-  public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable(
-      Iterable<CentroidWritable>  dirIterable) {
-    return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() {
-      @Override
-      public Centroid apply(CentroidWritable input) {
-        Preconditions.checkNotNull(input);
-        return input.getCentroid().clone();
-      }
-    });
-  }
-
-  /**
-   * Converts CentroidWritable values in a sequence file into Centroids lazily.
-   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
-   * @return an Iterable<Centroid> with the converted vectors.
-   */
-  public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable>  dirIterable) {
-    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
-      int numClusters = 0;
-      @Override
-      public Centroid apply(ClusterWritable input) {
-        Preconditions.checkNotNull(input);
-        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
-            input.getValue().getTotalObservations());
-      }
-    });
-  }
-
-  /**
-   * Converts VectorWritable values in a sequence file into Vectors lazily.
-   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
-   * @return an Iterable<Vector> with the converted vectors.
-   */
-  public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) {
-    return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() {
-      @Override
-      public Vector apply(VectorWritable input) {
-        Preconditions.checkNotNull(input);
-        return input.get().clone();
-      }
-    });
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
deleted file mode 100644
index 083cd8c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.canopy;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@Deprecated
-public final class Job extends AbstractJob {
-
-  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
-
-  private Job() {
-  }
-
-  private static final Logger log = LoggerFactory.getLogger(Job.class);
-
-  public static void main(String[] args) throws Exception {
-    if (args.length > 0) {
-      log.info("Running with only user-supplied arguments");
-      ToolRunner.run(new Configuration(), new Job(), args);
-    } else {
-      log.info("Running with default arguments");
-      Path output = new Path("output");
-      HadoopUtil.delete(new Configuration(), output);
-      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
-    }
-  }
-
-  /**
-   * Run the canopy clustering job on an input dataset using the given distance
-   * measure, t1 and t2 parameters. All output data will be written to the
-   * output directory, which will be initially deleted if it exists. The
-   * clustered points will reside in the path <output>/clustered-points. By
-   * default, the job expects the a file containing synthetic_control.data as
-   * obtained from
-   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
-   * resides in a directory named "testdata", and writes output to a directory
-   * named "output".
-   * 
-   * @param input
-   *          the String denoting the input directory path
-   * @param output
-   *          the String denoting the output directory path
-   * @param measure
-   *          the DistanceMeasure to use
-   * @param t1
-   *          the canopy T1 threshold
-   * @param t2
-   *          the canopy T2 threshold
-   */
-  private static void run(Path input, Path output, DistanceMeasure measure,
-      double t1, double t2) throws Exception {
-    Path directoryContainingConvertedInput = new Path(output,
-        DIRECTORY_CONTAINING_CONVERTED_INPUT);
-    InputDriver.runJob(input, directoryContainingConvertedInput,
-        "org.apache.mahout.math.RandomAccessSparseVector");
-    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
-        output, measure, t1, t2, true, 0.0, false);
-    // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
-        "clusters-0-final"), new Path(output, "clusteredPoints"));
-    clusterDumper.printClusters(null);
-  }
-
-  @Override
-  public int run(String[] args) throws Exception {
-
-    addInputOption();
-    addOutputOption();
-    addOption(DefaultOptionCreator.distanceMeasureOption().create());
-    addOption(DefaultOptionCreator.t1Option().create());
-    addOption(DefaultOptionCreator.t2Option().create());
-    addOption(DefaultOptionCreator.overwriteOption().create());
-
-    Map<String, List<String>> argMap = parseArguments(args);
-    if (argMap == null) {
-      return -1;
-    }
-
-    Path input = getInputPath();
-    Path output = getOutputPath();
-    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
-      HadoopUtil.delete(new Configuration(), output);
-    }
-    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
-    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
-    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
-
-    run(input, output, measure, t1, t2);
-    return 0;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
deleted file mode 100644
index 43beb78..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.fuzzykmeans;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Job extends AbstractJob {
-  
-  private static final Logger log = LoggerFactory.getLogger(Job.class);
-  
-  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
-  
-  private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
-  
-  private Job() {
-  }
-  
-  public static void main(String[] args) throws Exception {
-    if (args.length > 0) {
-      log.info("Running with only user-supplied arguments");
-      ToolRunner.run(new Configuration(), new Job(), args);
-    } else {
-      log.info("Running with default arguments");
-      Path output = new Path("output");
-      Configuration conf = new Configuration();
-      HadoopUtil.delete(conf, output);
-      run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5);
-    }
-  }
-  
-  @Override
-  public int run(String[] args) throws Exception {
-    addInputOption();
-    addOutputOption();
-    addOption(DefaultOptionCreator.distanceMeasureOption().create());
-    addOption(DefaultOptionCreator.convergenceOption().create());
-    addOption(DefaultOptionCreator.maxIterationsOption().create());
-    addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption(DefaultOptionCreator.t1Option().create());
-    addOption(DefaultOptionCreator.t2Option().create());
-    addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true);
-    
-    Map<String,List<String>> argMap = parseArguments(args);
-    if (argMap == null) {
-      return -1;
-    }
-    
-    Path input = getInputPath();
-    Path output = getOutputPath();
-    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
-    if (measureClass == null) {
-      measureClass = SquaredEuclideanDistanceMeasure.class.getName();
-    }
-    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
-    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
-    float fuzziness = Float.parseFloat(getOption(M_OPTION));
-    
-    addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
-        .withArgument(new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
-        .withDescription("coefficient normalization factor, must be greater than 1").withShortName(M_OPTION).create());
-    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
-      HadoopUtil.delete(getConf(), output);
-    }
-    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
-    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
-    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, convergenceDelta);
-    return 0;
-  }
-  
-  /**
-   * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
-   * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
-   * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
-   * containing synthetic_control.data as obtained from
-   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
-   * and writes output to a directory named "output".
-   * 
-   * @param input
-   *          the String denoting the input directory path
-   * @param output
-   *          the String denoting the output directory path
-   * @param t1
-   *          the canopy T1 threshold
-   * @param t2
-   *          the canopy T2 threshold
-   * @param maxIterations
-   *          the int maximum number of iterations
-   * @param fuzziness
-   *          the float "m" fuzziness coefficient
-   * @param convergenceDelta
-   *          the double convergence criteria for iterations
-   */
-  public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
-      int maxIterations, float fuzziness, double convergenceDelta) throws Exception {
-    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
-    log.info("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
-    log.info("Running Canopy to get initial clusters");
-    Path canopyOutput = new Path(output, "canopies");
-    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false);
-    log.info("Running FuzzyKMeans");
-    FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(canopyOutput, "clusters-0-final"), output,
-        convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false);
-    // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints"));
-    clusterDumper.printClusters(null);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
deleted file mode 100644
index 70c41fe..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
+++ /dev/null
@@ -1,187 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.kmeans;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Job extends AbstractJob {
-  
-  private static final Logger log = LoggerFactory.getLogger(Job.class);
-  
-  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
-  
-  private Job() {
-  }
-  
-  public static void main(String[] args) throws Exception {
-    if (args.length > 0) {
-      log.info("Running with only user-supplied arguments");
-      ToolRunner.run(new Configuration(), new Job(), args);
-    } else {
-      log.info("Running with default arguments");
-      Path output = new Path("output");
-      Configuration conf = new Configuration();
-      HadoopUtil.delete(conf, output);
-      run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
-    }
-  }
-  
-  @Override
-  public int run(String[] args) throws Exception {
-    addInputOption();
-    addOutputOption();
-    addOption(DefaultOptionCreator.distanceMeasureOption().create());
-    addOption(DefaultOptionCreator.numClustersOption().create());
-    addOption(DefaultOptionCreator.t1Option().create());
-    addOption(DefaultOptionCreator.t2Option().create());
-    addOption(DefaultOptionCreator.convergenceOption().create());
-    addOption(DefaultOptionCreator.maxIterationsOption().create());
-    addOption(DefaultOptionCreator.overwriteOption().create());
-    
-    Map<String,List<String>> argMap = parseArguments(args);
-    if (argMap == null) {
-      return -1;
-    }
-    
-    Path input = getInputPath();
-    Path output = getOutputPath();
-    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
-    if (measureClass == null) {
-      measureClass = SquaredEuclideanDistanceMeasure.class.getName();
-    }
-    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
-    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
-    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
-      HadoopUtil.delete(getConf(), output);
-    }
-    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
-    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
-      int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
-      run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
-    } else {
-      double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
-      double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-      run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
-    }
-    return 0;
-  }
-  
-  /**
-   * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
-   * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
-   * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
-   * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
-   * directory named "output".
-   * 
-   * @param conf
-   *          the Configuration to use
-   * @param input
-   *          the String denoting the input directory path
-   * @param output
-   *          the String denoting the output directory path
-   * @param measure
-   *          the DistanceMeasure to use
-   * @param k
-   *          the number of clusters in Kmeans
-   * @param convergenceDelta
-   *          the double convergence criteria for iterations
-   * @param maxIterations
-   *          the int maximum number of iterations
-   */
-  public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
-      double convergenceDelta, int maxIterations) throws Exception {
-    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
-    log.info("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
-    log.info("Running random seed to get initial clusters");
-    Path clusters = new Path(output, "random-seeds");
-    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
-    log.info("Running KMeans with k = {}", k);
-    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta,
-        maxIterations, true, 0.0, false);
-    // run ClusterDumper
-    Path outGlob = new Path(output, "clusters-*-final");
-    Path clusteredPoints = new Path(output,"clusteredPoints");
-    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
-    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
-    clusterDumper.printClusters(null);
-  }
-  
-  /**
-   * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
-   * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
-   * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
-   * containing synthetic_control.data as obtained from
-   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
-   * and writes output to a directory named "output".
-   * 
-   * @param conf
-   *          the Configuration to use
-   * @param input
-   *          the String denoting the input directory path
-   * @param output
-   *          the String denoting the output directory path
-   * @param measure
-   *          the DistanceMeasure to use
-   * @param t1
-   *          the canopy T1 threshold
-   * @param t2
-   *          the canopy T2 threshold
-   * @param convergenceDelta
-   *          the double convergence criteria for iterations
-   * @param maxIterations
-   *          the int maximum number of iterations
-   */
-  public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
-      double convergenceDelta, int maxIterations) throws Exception {
-    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
-    log.info("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
-    log.info("Running Canopy to get initial clusters");
-    Path canopyOutput = new Path(output, "canopies");
-    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0,
-        false);
-    log.info("Running KMeans");
-    KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR
-        + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false);
-    // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output,
-        "clusteredPoints"));
-    clusterDumper.printClusters(null);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
deleted file mode 100644
index 92363e5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth;
-
-import java.io.IOException;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper;
-
-public final class DeliciousTagsExample {
-  private DeliciousTagsExample() { }
-  
-  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
-    
-    Option outputOpt = DefaultOptionCreator.outputOption().create();
-    
-    Option helpOpt = DefaultOptionCreator.helpOption();
-    Option recordSplitterOpt = obuilder.withLongName("splitterPattern").withArgument(
-      abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()).withDescription(
-      "Regular Expression pattern used to split given line into fields."
-          + " Default value splits comma or tab separated fields."
-          + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ").withShortName("regex").create();
-    Option encodingOpt = obuilder.withLongName("encoding").withArgument(
-      abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) The file encoding.  Default value: UTF-8").withShortName("e").create();
-    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(outputOpt).withOption(
-      helpOpt).withOption(recordSplitterOpt).withOption(encodingOpt).create();
-    
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-      
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelp(group);
-        return;
-      }
-      Parameters params = new Parameters();
-      if (cmdLine.hasOption(recordSplitterOpt)) {
-        params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
-      }
-      
-      String encoding = "UTF-8";
-      if (cmdLine.hasOption(encodingOpt)) {
-        encoding = (String) cmdLine.getValue(encodingOpt);
-      }
-      params.set("encoding", encoding);
-      String inputDir = (String) cmdLine.getValue(inputDirOpt);
-      String outputDir = (String) cmdLine.getValue(outputOpt);
-      params.set("input", inputDir);
-      params.set("output", outputDir);
-      params.set("groupingFieldCount", "2");
-      params.set("gfield0", "1");
-      params.set("gfield1", "2");
-      params.set("selectedFieldCount", "1");
-      params.set("field0", "3");
-      params.set("maxTransactionLength", "100");
-      KeyBasedStringTupleGrouper.startJob(params);
-      
-    } catch (OptionException ex) {
-      CommandLineUtil.printHelp(group);
-    }
-    
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
deleted file mode 100644
index 4c80a31..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.StringTuple;
-
-public class KeyBasedStringTupleCombiner extends Reducer<Text,StringTuple,Text,StringTuple> {
-  
-  @Override
-  protected void reduce(Text key,
-                        Iterable<StringTuple> values,
-                        Context context) throws IOException, InterruptedException {
-    Set<String> outputValues = new HashSet<>();
-    for (StringTuple value : values) {
-      outputValues.addAll(value.getEntries());
-    }
-    context.write(key, new StringTuple(outputValues));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
deleted file mode 100644
index cd17770..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.StringTuple;
-
-public final class KeyBasedStringTupleGrouper {
-  
-  private KeyBasedStringTupleGrouper() { }
-  
-  public static void startJob(Parameters params) throws IOException,
-                                                InterruptedException,
-                                                ClassNotFoundException {
-    Configuration conf = new Configuration();
-    
-    conf.set("job.parameters", params.toString());
-    conf.set("mapred.compress.map.output", "true");
-    conf.set("mapred.output.compression.type", "BLOCK");
-    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
-    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
-                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
-    
-    String input = params.get("input");
-    Job job = new Job(conf, "Generating dataset based from input" + input);
-    job.setJarByClass(KeyBasedStringTupleGrouper.class);
-    
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(StringTuple.class);
-    
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(Text.class);
-    
-    FileInputFormat.addInputPath(job, new Path(input));
-    Path outPath = new Path(params.get("output"));
-    FileOutputFormat.setOutputPath(job, outPath);
-    
-    HadoopUtil.delete(conf, outPath);
-
-    job.setInputFormatClass(TextInputFormat.class);
-    job.setMapperClass(KeyBasedStringTupleMapper.class);
-    job.setCombinerClass(KeyBasedStringTupleCombiner.class);
-    job.setReducerClass(KeyBasedStringTupleReducer.class);
-    job.setOutputFormatClass(TextOutputFormat.class);
-    
-    boolean succeeded = job.waitForCompletion(true);
-    if (!succeeded) {
-      throw new IllegalStateException("Job failed!");
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
deleted file mode 100644
index 362d1ce..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.StringTuple;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Splits the line using a {@link Pattern} and outputs key as given by the groupingFields
- * 
- */
-public class KeyBasedStringTupleMapper extends Mapper<LongWritable,Text,Text,StringTuple> {
-  
-  private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class);
-  
-  private Pattern splitter;
-  
-  private int[] selectedFields;
-  
-  private int[] groupingFields;
-  
-  @Override
-  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
-    String[] fields = splitter.split(value.toString());
-    if (fields.length != 4) {
-      log.info("{} {}", fields.length, value.toString());
-      context.getCounter("Map", "ERROR").increment(1);
-      return;
-    }
-    Collection<String> oKey = new ArrayList<>();
-    for (int groupingField : groupingFields) {
-      oKey.add(fields[groupingField]);
-      context.setStatus(fields[groupingField]);
-    }
-    
-    List<String> oValue = new ArrayList<>();
-    for (int selectedField : selectedFields) {
-      oValue.add(fields[selectedField]);
-    }
-    
-    context.write(new Text(oKey.toString()), new StringTuple(oValue));
-    
-  }
-  
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    super.setup(context);
-    Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
-    splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*"));
-    
-    int selectedFieldCount = Integer.valueOf(params.get("selectedFieldCount", "0"));
-    selectedFields = new int[selectedFieldCount];
-    for (int i = 0; i < selectedFieldCount; i++) {
-      selectedFields[i] = Integer.valueOf(params.get("field" + i, "0"));
-    }
-    
-    int groupingFieldCount = Integer.valueOf(params.get("groupingFieldCount", "0"));
-    groupingFields = new int[groupingFieldCount];
-    for (int i = 0; i < groupingFieldCount; i++) {
-      groupingFields[i] = Integer.valueOf(params.get("gfield" + i, "0"));
-    }
-    
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
deleted file mode 100644
index a7ef762..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.fpm.pfpgrowth.dataset;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.HashSet;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.Parameters;
-import org.apache.mahout.common.StringTuple;
-
-public class KeyBasedStringTupleReducer extends Reducer<Text,StringTuple,Text,Text> {
-  
-  private int maxTransactionLength = 100;
-  
-  @Override
-  protected void reduce(Text key, Iterable<StringTuple> values, Context context)
-    throws IOException, InterruptedException {
-    Collection<String> items = new HashSet<>();
-    
-    for (StringTuple value : values) {
-      for (String field : value.getEntries()) {
-        items.add(field);
-      }
-    }
-    if (items.size() > 1) {
-      int i = 0;
-      StringBuilder sb = new StringBuilder();
-      String sep = "";
-      for (String field : items) {
-        if (i % maxTransactionLength == 0) {
-          if (i != 0) {
-            context.write(null, new Text(sb.toString()));
-          }
-          sb.replace(0, sb.length(), "");
-          sep = "";
-        }
-        
-        sb.append(sep).append(field);
-        sep = "\t";
-        
-        i++;
-        
-      }
-      if (sb.length() > 0) {
-        context.write(null, new Text(sb.toString()));
-      }
-    }
-  }
-  
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    super.setup(context);
-    Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
-    maxTransactionLength = Integer.valueOf(params.get("maxTransactionLength", "100"));
-  }
-}


[25/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java
new file mode 100644
index 0000000..fd6657f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.als.ImplicitFeedbackAlternatingLeastSquaresSolver;
+
+import java.io.IOException;
+
+/** Solving mapper that can be safely executed using multiple threads */
+public class SolveImplicitFeedbackMapper
+    extends SharingMapper<IntWritable,VectorWritable,IntWritable,VectorWritable,
+    ImplicitFeedbackAlternatingLeastSquaresSolver> {
+
+  private final VectorWritable uiOrmj = new VectorWritable();
+
+  @Override
+  ImplicitFeedbackAlternatingLeastSquaresSolver createSharedInstance(Context ctx) throws IOException {
+    Configuration conf = ctx.getConfiguration();
+
+    double lambda = Double.parseDouble(conf.get(ParallelALSFactorizationJob.LAMBDA));
+    double alpha = Double.parseDouble(conf.get(ParallelALSFactorizationJob.ALPHA));
+    int numFeatures = conf.getInt(ParallelALSFactorizationJob.NUM_FEATURES, -1);
+    int numEntities = Integer.parseInt(conf.get(ParallelALSFactorizationJob.NUM_ENTITIES));
+
+    Preconditions.checkArgument(numFeatures > 0, "numFeatures must be greater then 0!");
+
+    return new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha,
+        ALS.readMatrixByRowsFromDistributedCache(numEntities, conf), 1);
+  }
+
+  @Override
+  protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
+    throws IOException, InterruptedException {
+    ImplicitFeedbackAlternatingLeastSquaresSolver solver = getSharedInstance();
+    uiOrmj.set(solver.solve(ratingsWritable.get()));
+    ctx.write(userOrItemID, uiOrmj);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java
new file mode 100644
index 0000000..b44fd5b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.MutableRecommendedItem;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.TopItemsQueue;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>computes prediction values for each user</p>
+ *
+ * <pre>
+ * u = a user
+ * i = an item not yet rated by u
+ * N = all items similar to i (where similarity is usually computed by pairwisely comparing the item-vectors
+ * of the user-item matrix)
+ *
+ * Prediction(u,i) = sum(all n from N: similarity(i,n) * rating(u,n)) / sum(all n from N: abs(similarity(i,n)))
+ * </pre>
+ */
+public final class AggregateAndRecommendReducer extends
+    Reducer<VarLongWritable,PrefAndSimilarityColumnWritable,VarLongWritable,RecommendedItemsWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(AggregateAndRecommendReducer.class);
+
+  static final String ITEMID_INDEX_PATH = "itemIDIndexPath";
+  static final String NUM_RECOMMENDATIONS = "numRecommendations";
+  static final int DEFAULT_NUM_RECOMMENDATIONS = 10;
+  static final String ITEMS_FILE = "itemsFile";
+
+  private boolean booleanData;
+  private int recommendationsPerUser;
+  private IDReader idReader;
+  private FastIDSet itemsToRecommendFor;
+  private OpenIntLongHashMap indexItemIDMap;
+
+  private final RecommendedItemsWritable recommendedItems = new RecommendedItemsWritable();
+
+  private static final float BOOLEAN_PREF_VALUE = 1.0f;
+
+  @Override
+  protected void setup(Context context) throws IOException {
+    Configuration conf = context.getConfiguration();
+    recommendationsPerUser = conf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS);
+    booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
+    indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEMID_INDEX_PATH), conf);
+
+    idReader = new IDReader(conf);
+    idReader.readIDs();
+    itemsToRecommendFor = idReader.getItemIds();
+  }
+
+  @Override
+  protected void reduce(VarLongWritable userID,
+                        Iterable<PrefAndSimilarityColumnWritable> values,
+                        Context context) throws IOException, InterruptedException {
+    if (booleanData) {
+      reduceBooleanData(userID, values, context);
+    } else {
+      reduceNonBooleanData(userID, values, context);
+    }
+  }
+
+  private void reduceBooleanData(VarLongWritable userID,
+                                 Iterable<PrefAndSimilarityColumnWritable> values,
+                                 Context context) throws IOException, InterruptedException {
+    /* having boolean data, each estimated preference can only be 1,
+     * however we can't use this to rank the recommended items,
+     * so we use the sum of similarities for that. */
+    Iterator<PrefAndSimilarityColumnWritable> columns = values.iterator();
+    Vector predictions = columns.next().getSimilarityColumn();
+    while (columns.hasNext()) {
+      predictions.assign(columns.next().getSimilarityColumn(), Functions.PLUS);
+    }
+    writeRecommendedItems(userID, predictions, context);
+  }
+
+  private void reduceNonBooleanData(VarLongWritable userID,
+                                    Iterable<PrefAndSimilarityColumnWritable> values,
+                                    Context context) throws IOException, InterruptedException {
+    /* each entry here is the sum in the numerator of the prediction formula */
+    Vector numerators = null;
+    /* each entry here is the sum in the denominator of the prediction formula */
+    Vector denominators = null;
+    /* each entry here is the number of similar items used in the prediction formula */
+    Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
+
+    for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
+      Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
+      float prefValue = prefAndSimilarityColumn.getPrefValue();
+      /* count the number of items used for each prediction */
+      for (Element e : simColumn.nonZeroes()) {
+        int itemIDIndex = e.index();
+        numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
+      }
+
+      if (denominators == null) {
+        denominators = simColumn.clone();
+      } else {
+        denominators.assign(simColumn, Functions.PLUS_ABS);
+      }
+
+      if (numerators == null) {
+        numerators = simColumn.clone();
+        if (prefValue != BOOLEAN_PREF_VALUE) {
+          numerators.assign(Functions.MULT, prefValue);
+        }
+      } else {
+        if (prefValue != BOOLEAN_PREF_VALUE) {
+          simColumn.assign(Functions.MULT, prefValue);
+        }
+        numerators.assign(simColumn, Functions.PLUS);
+      }
+
+    }
+
+    if (numerators == null) {
+      return;
+    }
+
+    Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
+    for (Element element : numerators.nonZeroes()) {
+      int itemIDIndex = element.index();
+      /* preference estimations must be based on at least 2 datapoints */
+      if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
+        /* compute normalized prediction */
+        double prediction = element.get() / denominators.getQuick(itemIDIndex);
+        recommendationVector.setQuick(itemIDIndex, prediction);
+      }
+    }
+    writeRecommendedItems(userID, recommendationVector, context);
+  }
+
+  /**
+   * find the top entries in recommendationVector, map them to the real itemIDs and write back the result
+   */
+  private void writeRecommendedItems(VarLongWritable userID, Vector recommendationVector, Context context)
+    throws IOException, InterruptedException {
+    TopItemsQueue topKItems = new TopItemsQueue(recommendationsPerUser);
+    FastIDSet itemsForUser = null;
+
+    if (idReader != null && idReader.isUserItemFilterSpecified()) {
+      itemsForUser = idReader.getItemsToRecommendForUser(userID.get());
+    }
+
+    for (Element element : recommendationVector.nonZeroes()) {
+      int index = element.index();
+      long itemID;
+      if (indexItemIDMap != null && !indexItemIDMap.isEmpty()) {
+        itemID = indexItemIDMap.get(index);
+      } else { // we don't have any mappings, so just use the original
+        itemID = index;
+      }
+
+      if (shouldIncludeItemIntoRecommendations(itemID, itemsToRecommendFor, itemsForUser)) {
+
+        float value = (float) element.get();
+        if (!Float.isNaN(value)) {
+
+          MutableRecommendedItem topItem = topKItems.top();
+          if (value > topItem.getValue()) {
+            topItem.set(itemID, value);
+            topKItems.updateTop();
+          }
+        }
+      }
+    }
+
+    List<RecommendedItem> topItems = topKItems.getTopItems();
+    if (!topItems.isEmpty()) {
+      recommendedItems.set(topItems);
+      context.write(userID, recommendedItems);
+    }
+  }
+
+  private boolean shouldIncludeItemIntoRecommendations(long itemID, FastIDSet allItemsToRecommendFor,
+                                                       FastIDSet itemsForUser) {
+    if (allItemsToRecommendFor == null && itemsForUser == null) {
+      return true;
+    } else if (itemsForUser != null) {
+      return itemsForUser.contains(itemID);
+    } else {
+      return allItemsToRecommendFor.contains(itemID);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java
new file mode 100644
index 0000000..7797fe9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java
@@ -0,0 +1,244 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reads user ids and item ids from files specified in usersFile, itemsFile or userItemFile options in item-based
+ *  recommender. Composes a list of users and a list of items which can be used by
+ * {@link org.apache.mahout.cf.taste.hadoop.item.UserVectorSplitterMapper} and
+ * {@link org.apache.mahout.cf.taste.hadoop.item.AggregateAndRecommendReducer}.
+ */
+public class IDReader {
+
+  static final String USER_ITEM_FILE = "userItemFile";
+
+  private static final Logger log = LoggerFactory.getLogger(IDReader.class);
+  private static final Pattern SEPARATOR = Pattern.compile("[\t,]");
+
+  private Configuration conf;
+
+  private String usersFile;
+  private String itemsFile;
+  private String userItemFile;
+
+  private FastIDSet userIds;
+  private FastIDSet itemIds;
+
+  private FastIDSet emptySet;
+
+  /* Key - user id, value - a set of item ids to include into recommendations for this user */
+  private Map<Long, FastIDSet> userItemFilter;
+
+  /**
+   * Creates a new IDReader
+   * 
+   * @param conf Job configuration
+   */
+  public IDReader(Configuration conf) {
+    this.conf = conf;
+    emptySet = new FastIDSet();
+
+    usersFile = conf.get(UserVectorSplitterMapper.USERS_FILE);
+    itemsFile = conf.get(AggregateAndRecommendReducer.ITEMS_FILE);
+    userItemFile = conf.get(USER_ITEM_FILE);
+  }
+
+  /**
+   * Reads user ids and item ids from files specified in a job configuration
+   * 
+   * @throws IOException if an error occurs during file read operation
+   * 
+   * @throws IllegalStateException if userItemFile option is specified together with usersFile or itemsFile
+   */
+  public void readIDs() throws IOException, IllegalStateException {
+    if (isUserItemFileSpecified()) {
+      readUserItemFilterIfNeeded();
+    }
+
+    if (isUsersFileSpecified() || isUserItemFilterSpecified()) {
+      readUserIds();
+    }
+
+    if (isItemsFileSpecified() || isUserItemFilterSpecified()) {
+      readItemIds();
+    }
+  }
+
+  /**
+   * Gets a collection of items which should be recommended for a user
+   * 
+   * @param userId ID of a user we are interested in
+   * @return if a userItemFile option is specified, and that file contains at least one item ID for the user,
+   *         then this method returns a {@link FastIDSet} object populated with item IDs. Otherwise, this
+   *         method returns an empty set.
+   */
+  public FastIDSet getItemsToRecommendForUser(Long userId) {
+    if (isUserItemFilterSpecified() && userItemFilter.containsKey(userId)) {
+      return userItemFilter.get(userId);
+    } else {
+      return emptySet;
+    }
+  }
+
+  private void readUserIds() throws IOException, IllegalStateException {
+    if (isUsersFileSpecified() && !isUserItemFileSpecified()) {
+      userIds = readIDList(usersFile);
+    } else if (isUserItemFileSpecified() && !isUsersFileSpecified()) {
+      readUserItemFilterIfNeeded();
+      userIds = extractAllUserIdsFromUserItemFilter(userItemFilter);
+    } else if (!isUsersFileSpecified()) {
+      throw new IllegalStateException("Neither usersFile nor userItemFile options are specified");
+    } else {
+      throw new IllegalStateException("usersFile and userItemFile options cannot be used simultaneously");
+    }
+  }
+
+  private void readItemIds() throws IOException, IllegalStateException {
+    if (isItemsFileSpecified() && !isUserItemFileSpecified()) {
+      itemIds = readIDList(itemsFile);
+    } else if (isUserItemFileSpecified() && !isItemsFileSpecified()) {
+      readUserItemFilterIfNeeded();
+      itemIds = extractAllItemIdsFromUserItemFilter(userItemFilter);
+    } else if (!isItemsFileSpecified()) {
+      throw new IllegalStateException("Neither itemsFile nor userItemFile options are specified");
+    } else {
+      throw new IllegalStateException("itemsFile and userItemFile options cannot be specified simultaneously");
+    }
+  }
+
+  private void readUserItemFilterIfNeeded() throws IOException {
+    if (!isUserItemFilterSpecified() && isUserItemFileSpecified()) {
+      userItemFilter = readUserItemFilter(userItemFile);
+    }
+  }
+
+  private Map<Long, FastIDSet> readUserItemFilter(String pathString) throws IOException {
+    Map<Long, FastIDSet> result = new HashMap<>();
+
+    try (InputStream in = openFile(pathString)) {
+      for (String line : new FileLineIterable(in)) {
+        try {
+          String[] tokens = SEPARATOR.split(line);
+          Long userId = Long.parseLong(tokens[0]);
+          Long itemId = Long.parseLong(tokens[1]);
+
+          addUserAndItemIdToUserItemFilter(result, userId, itemId);
+        } catch (NumberFormatException nfe) {
+          log.warn("userItemFile line ignored: {}", line);
+        }
+      }
+    }
+
+    return result;
+  }
+
+  void addUserAndItemIdToUserItemFilter(Map<Long, FastIDSet> filter, Long userId, Long itemId) {
+    FastIDSet itemIds;
+
+    if (filter.containsKey(userId)) {
+      itemIds = filter.get(userId);
+    } else {
+      itemIds = new FastIDSet();
+      filter.put(userId, itemIds);
+    }
+
+    itemIds.add(itemId);
+  }
+
+  static FastIDSet extractAllUserIdsFromUserItemFilter(Map<Long, FastIDSet> filter) {
+    FastIDSet result = new FastIDSet();
+
+    for (Long userId : filter.keySet()) {
+      result.add(userId);
+    }
+
+    return result;
+  }
+
+  private FastIDSet extractAllItemIdsFromUserItemFilter(Map<Long, FastIDSet> filter) {
+    FastIDSet result = new FastIDSet();
+
+    for (FastIDSet itemIds : filter.values()) {
+      result.addAll(itemIds);
+    }
+
+    return result;
+  }
+
+  private FastIDSet readIDList(String pathString) throws IOException {
+    FastIDSet result = null;
+
+    if (pathString != null) {
+      result = new FastIDSet();
+
+      try (InputStream in = openFile(pathString)){
+        for (String line : new FileLineIterable(in)) {
+          try {
+            result.add(Long.parseLong(line));
+          } catch (NumberFormatException nfe) {
+            log.warn("line ignored: {}", line);
+          }
+        }
+      }
+    }
+
+    return result;
+  }
+
+  private InputStream openFile(String pathString) throws IOException {
+    return HadoopUtil.openStream(new Path(pathString), conf);
+  }
+
+  public boolean isUsersFileSpecified () {
+    return usersFile != null;
+  }
+  
+  public boolean isItemsFileSpecified () {
+    return itemsFile != null;
+  }
+  
+  public boolean isUserItemFileSpecified () {
+    return userItemFile != null;
+  }
+
+  public boolean isUserItemFilterSpecified() {
+    return userItemFilter != null;
+  }
+
+  public FastIDSet getUserIds() {
+    return userIds;
+  }
+
+  public FastIDSet getItemIds() {
+    return itemIds;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java
new file mode 100644
index 0000000..4415a55
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+
+/**
+ * We use a neat little trick to explicitly filter items for some users: we inject a NaN summand into the preference
+ * estimation for those items, which makes {@link org.apache.mahout.cf.taste.hadoop.item.AggregateAndRecommendReducer}
+ * automatically exclude them.
+ */
+public class ItemFilterAsVectorAndPrefsReducer
+    extends Reducer<VarLongWritable,VarLongWritable,VarIntWritable,VectorAndPrefsWritable> {
+
+  private final VarIntWritable itemIDIndexWritable = new VarIntWritable();
+  private final VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();
+
+  @Override
+  protected void reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx)
+    throws IOException, InterruptedException {
+
+    int index = TasteHadoopUtils.idToIndex(itemID.get());
+
+    // sparse vector carrying a single NaN entry for this item; the NaN poisons the
+    // downstream preference estimate so the item gets excluded for these users
+    Vector poisonColumn = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+    poisonColumn.set(index, Double.NaN);
+
+    // collect every user that wants this item filtered, paired with a placeholder
+    // 1.0f preference value for each of them
+    List<Long> filteredUsers = new ArrayList<>();
+    List<Float> placeholderPrefs = new ArrayList<>();
+    for (VarLongWritable user : values) {
+      filteredUsers.add(user.get());
+      placeholderPrefs.add(1.0f);
+    }
+
+    itemIDIndexWritable.set(index);
+    vectorAndPrefs.set(poisonColumn, filteredUsers, placeholderPrefs);
+    ctx.write(itemIDIndexWritable, vectorAndPrefs);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java
new file mode 100644
index 0000000..cdc1ddf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarLongWritable;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+/**
+ * Maps out all user/item pairs to filter, keyed by the itemID.
+ */
+public class ItemFilterMapper extends Mapper<LongWritable,Text,VarLongWritable,VarLongWritable> {
+
+  private static final Pattern SEPARATOR = Pattern.compile("[\t,]");
+
+  private final VarLongWritable itemIDWritable = new VarLongWritable();
+  private final VarLongWritable userIDWritable = new VarLongWritable();
+
+  @Override
+  protected void map(LongWritable key, Text line, Context ctx) throws IOException, InterruptedException {
+    // each input line holds a userID and an itemID, separated by tab or comma
+    String[] fields = SEPARATOR.split(line.toString());
+    userIDWritable.set(Long.parseLong(fields[0]));
+    itemIDWritable.set(Long.parseLong(fields[1]));
+    // keyed by item so all users filtering the same item meet in a single reduce call
+    ctx.write(itemIDWritable, userIDWritable);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java
new file mode 100644
index 0000000..ac8597e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+
+/** Emits (int index, item ID) pairs, mapping each item ID via {@code TasteHadoopUtils.idToIndex}. */
+public final class ItemIDIndexMapper extends
+    Mapper<LongWritable,Text, VarIntWritable, VarLongWritable> {
+
+  // when true, the user and item columns of the input swap roles
+  private boolean transpose;
+
+  private final VarIntWritable indexWritable = new VarIntWritable();
+  private final VarLongWritable itemIDWritable = new VarLongWritable();
+
+  @Override
+  protected void setup(Context context) {
+    transpose = context.getConfiguration().getBoolean(ToEntityPrefsMapper.TRANSPOSE_USER_ITEM, false);
+  }
+
+  @Override
+  protected void map(LongWritable key, Text value, Context context)
+      throws IOException, InterruptedException {
+    String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
+    // item ID sits in column 1 normally, column 0 when transposed
+    long itemID = Long.parseLong(tokens[transpose ? 0 : 1]);
+    indexWritable.set(TasteHadoopUtils.idToIndex(itemID));
+    itemIDWritable.set(itemID);
+    context.write(indexWritable, itemIDWritable);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java
new file mode 100644
index 0000000..d9ecf5e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+
+public final class ItemIDIndexReducer extends
+    Reducer<VarIntWritable, VarLongWritable, VarIntWritable,VarLongWritable> {
+
+  private final VarLongWritable minimumItemIDWritable = new VarLongWritable();
+
+  /**
+   * Emits, for each index, the smallest item ID that mapped to it, so every
+   * index resolves to a single deterministic ID.
+   */
+  @Override
+  protected void reduce(VarIntWritable index,
+                        Iterable<VarLongWritable> possibleItemIDs,
+                        Context context) throws IOException, InterruptedException {
+    long minimum = Long.MAX_VALUE;
+    for (VarLongWritable candidate : possibleItemIDs) {
+      minimum = Math.min(minimum, candidate.get());
+    }
+    // nothing is written when no ID below Long.MAX_VALUE was seen
+    if (minimum != Long.MAX_VALUE) {
+      minimumItemIDWritable.set(minimum);
+      context.write(index, minimumItemIDWritable);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java
new file mode 100644
index 0000000..0e818f3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Maps similar items and their preference values per user.
+ */
+public final class PartialMultiplyMapper extends
+    Mapper<VarIntWritable,VectorAndPrefsWritable,VarLongWritable,PrefAndSimilarityColumnWritable> {
+
+  private final VarLongWritable userIDWritable = new VarLongWritable();
+  private final PrefAndSimilarityColumnWritable prefAndSimilarityColumn = new PrefAndSimilarityColumnWritable();
+
+  @Override
+  protected void map(VarIntWritable key,
+                     VectorAndPrefsWritable vectorAndPrefsWritable,
+                     Context context) throws IOException, InterruptedException {
+
+    Vector similarityColumn = vectorAndPrefsWritable.getVector();
+    List<Long> users = vectorAndPrefsWritable.getUserIDs();
+    List<Float> prefs = vectorAndPrefsWritable.getValues();
+
+    // emit one (user, preference + similarity column) record per user;
+    // entries whose preference is NaN are skipped
+    int numUsers = users.size();
+    for (int i = 0; i < numUsers; i++) {
+      float pref = prefs.get(i);
+      if (Float.isNaN(pref)) {
+        continue;
+      }
+      userIDWritable.set(users.get(i));
+      prefAndSimilarityColumn.set(pref, similarityColumn);
+      context.write(userIDWritable, prefAndSimilarityColumn);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java
new file mode 100644
index 0000000..704c74a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Writable pairing a single preference value with the corresponding column
+ * of the item-item similarity matrix.
+ */
+public final class PrefAndSimilarityColumnWritable implements Writable {
+
+  private float prefValue;
+  private Vector similarityColumn;
+
+  /** No-arg constructor required for Hadoop deserialization. */
+  public PrefAndSimilarityColumnWritable() {
+  }
+
+  public PrefAndSimilarityColumnWritable(float prefValue, Vector similarityColumn) {
+    set(prefValue, similarityColumn);
+  }
+
+  /** Replaces both fields so instances can be reused across records. */
+  public void set(float prefValue, Vector similarityColumn) {
+    this.prefValue = prefValue;
+    this.similarityColumn = similarityColumn;
+  }
+
+  public float getPrefValue() {
+    return prefValue;
+  }
+
+  public Vector getSimilarityColumn() {
+    return similarityColumn;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    prefValue = in.readFloat();
+    VectorWritable vw = new VectorWritable();
+    vw.readFields(in);
+    similarityColumn = vw.get();
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeFloat(prefValue);
+    VectorWritable vw = new VectorWritable(similarityColumn);
+    // lax precision — presumably a more compact on-disk encoding; see VectorWritable
+    vw.setWritesLaxPrecision(true);
+    vw.write(out);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj instanceof PrefAndSimilarityColumnWritable) {
+      PrefAndSimilarityColumnWritable other = (PrefAndSimilarityColumnWritable) obj;
+      // Float.compare instead of ==: keeps equals() reflexive even when prefValue
+      // is NaN (NaN == NaN is false, which would violate the equals contract)
+      return Float.compare(prefValue, other.prefValue) == 0
+          && similarityColumn.equals(other.similarityColumn);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return RandomUtils.hashFloat(prefValue) + 31 * similarityColumn.hashCode();
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
new file mode 100644
index 0000000..129db1d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
+import org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
+
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * <p>Runs a completely distributed recommender job as a series of mapreduces.</p>
+ * <p/>
+ * <p>Preferences in the input file should look like {@code userID, itemID[, preferenceValue]}</p>
+ * <p/>
+ * <p>
+ * Preference value is optional to accommodate applications that have no notion of a preference value (that is, the user
+ * simply expresses a preference for an item, but no degree of preference).
+ * </p>
+ * <p/>
+ * <p>
+ * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
+ * parsed as {@code long}s.
+ * </p>
+ * <p/>
+ * <p>Command line arguments specific to this class are:</p>
+ * <p/>
+ * <ol>
+ * <li>--input(path): Directory containing one or more text files with the preference data</li>
+ * <li>--output(path): output path where recommender output should go</li>
+ * <li>--similarityClassname (classname): Name of vector similarity class to instantiate or a predefined similarity
+ * from {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}</li>
+ * <li>--usersFile (path): only compute recommendations for user IDs contained in this file (optional)</li>
+ * <li>--itemsFile (path): only include item IDs from this file in the recommendations (optional)</li>
+ * <li>--filterFile (path): file containing comma-separated userID,itemID pairs. Used to exclude the item from the
+ * recommendations for that user (optional)</li>
+ * <li>--numRecommendations (integer): Number of recommendations to compute per user (10)</li>
+ * <li>--booleanData (boolean): Treat input data as having no pref values (false)</li>
+ * <li>--maxPrefsPerUser (integer): Maximum number of preferences considered per user in final
+ *   recommendation phase (10)</li>
+ * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)</li>
+ * <li>--minPrefsPerUser (integer): ignore users with less preferences than this in the similarity computation (1)</li>
+ * <li>--maxPrefsPerUserInItemSimilarity (integer): max number of preferences to consider per user in
+ *   the item similarity computation phase,
+ * users with more preferences will be sampled down (1000)</li>
+ * <li>--threshold (double): discard item pairs with a similarity value below this</li>
+ * </ol>
+ * <p/>
+ * <p>General command line options are documented in {@link AbstractJob}.</p>
+ * <p/>
+ * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
+ * arguments.</p>
+ */
+public final class RecommenderJob extends AbstractJob {
+
+  /** configuration key under which the boolean-data flag is handed to the final reduce phase */
+  public static final String BOOLEAN_DATA = "booleanData";
+  /** name (under the temp dir) of the preference-matrix preparation output */
+  public static final String DEFAULT_PREPARE_PATH = "preparePreferenceMatrix";
+
+  private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100;
+  private static final int DEFAULT_MAX_PREFS = 500;
+  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
+
+  /**
+   * Runs the recommender pipeline as a series of (individually skippable) phases:
+   * preference-matrix preparation, item-item similarity computation, partial
+   * matrix multiplication, and aggregation into per-user recommendations.
+   *
+   * @return 0 on success, -1 when argument parsing fails or any phase fails
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOutputOption();
+    addOption("numRecommendations", "n", "Number of recommendations per user",
+            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
+    addOption("usersFile", null, "File of users to recommend for", null);
+    addOption("itemsFile", null, "File of items to recommend for", null);
+    addOption("filterFile", "f", "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
+            + "the recommendations for that user (optional)", null);
+    addOption("userItemFile", "uif", "File containing comma-separated userID,itemID pairs (optional). "
+            + "Used to include only these items into recommendations. "
+            + "Cannot be used together with usersFile or itemsFile", null);
+    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
+    addOption("maxPrefsPerUser", "mxp",
+            "Maximum number of preferences considered per user in final recommendation phase",
+            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
+    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this in the similarity computation "
+            + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
+    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
+            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
+    addOption("maxPrefsInItemSimilarity", "mpiis", "max number of preferences to consider per user or item in the "
+            + "item similarity computation phase, users or items with more preferences will be sampled down (default: "
+        + DEFAULT_MAX_PREFS + ')', String.valueOf(DEFAULT_MAX_PREFS));
+    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " 
+            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')', true);
+    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
+    addOption("outputPathForSimilarityMatrix", "opfsm", "write the item similarity matrix to this path (optional)",
+        false);
+    addOption("randomSeed", null, "use this seed for sampling", false);
+    addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file");
+
+    Map<String, List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+
+    Path outputPath = getOutputPath();
+    int numRecommendations = Integer.parseInt(getOption("numRecommendations"));
+    String usersFile = getOption("usersFile");
+    String itemsFile = getOption("itemsFile");
+    String filterFile = getOption("filterFile");
+    String userItemFile = getOption("userItemFile");
+    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
+    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
+    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
+    int maxPrefsInItemSimilarity = Integer.parseInt(getOption("maxPrefsInItemSimilarity"));
+    int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
+    String similarityClassname = getOption("similarityClassname");
+    double threshold = hasOption("threshold")
+        ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
+    long randomSeed = hasOption("randomSeed")
+        ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED;
+
+
+    Path prepPath = getTempPath(DEFAULT_PREPARE_PATH);
+    Path similarityMatrixPath = getTempPath("similarityMatrix");
+    Path explicitFilterPath = getTempPath("explicitFilterPath");
+    Path partialMultiplyPath = getTempPath("partialMultiply");
+
+    AtomicInteger currentPhase = new AtomicInteger();
+
+    // -1 marks "unknown"; recomputed below if phase 1 is skipped
+    int numberOfUsers = -1;
+
+    // phase 1: convert the raw preference input into user vectors and the rating matrix
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
+        "--input", getInputPath().toString(),
+        "--output", prepPath.toString(),
+        "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
+        "--booleanData", String.valueOf(booleanData),
+        "--tempDir", getTempPath().toString(),
+      });
+
+      numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
+    }
+
+
+    // phase 2: compute the item-item similarity matrix from the rating matrix
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+
+      /* special behavior if phase 1 is skipped */
+      if (numberOfUsers == -1) {
+        numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
+                PathType.LIST, null, getConf());
+      }
+
+      //calculate the co-occurrence matrix
+      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
+        "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
+        "--output", similarityMatrixPath.toString(),
+        "--numberOfColumns", String.valueOf(numberOfUsers),
+        "--similarityClassname", similarityClassname,
+        "--maxObservationsPerRow", String.valueOf(maxPrefsInItemSimilarity),
+        "--maxObservationsPerColumn", String.valueOf(maxPrefsInItemSimilarity),
+        "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
+        "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
+        "--threshold", String.valueOf(threshold),
+        "--randomSeed", String.valueOf(randomSeed),
+        "--tempDir", getTempPath().toString(),
+      });
+
+      // write out the similarity matrix if the user specified that behavior
+      if (hasOption("outputPathForSimilarityMatrix")) {
+        Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));
+
+        Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
+            SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
+            EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class,
+            EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);
+
+        Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
+        mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
+            new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
+        mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
+        outputSimilarityMatrix.waitForCompletion(true);
+      }
+    }
+
+    //start the multiplication of the co-occurrence matrix by the user vectors
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Job partialMultiply = Job.getInstance(getConf(), "partialMultiply");
+      Configuration partialMultiplyConf = partialMultiply.getConfiguration();
+
+      MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
+                                  SimilarityMatrixRowWrapperMapper.class);
+      MultipleInputs.addInputPath(partialMultiply, new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
+          SequenceFileInputFormat.class, UserVectorSplitterMapper.class);
+      partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
+      partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
+      partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
+      partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
+      partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
+      partialMultiply.setOutputKeyClass(VarIntWritable.class);
+      partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
+      partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
+      partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());
+
+      if (usersFile != null) {
+        partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
+      }
+      
+      if (userItemFile != null) {
+        partialMultiplyConf.set(IDReader.USER_ITEM_FILE, userItemFile);
+      }
+      
+      partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);
+
+      boolean succeeded = partialMultiply.waitForCompletion(true);
+      if (!succeeded) {
+        return -1;
+      }
+    }
+
+    // phase 4: aggregate the partial products into top-N recommendations per user
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      //filter out any users we don't care about
+      /* convert the user/item pairs to filter if a filterfile has been specified */
+      if (filterFile != null) {
+        Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
+                ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
+                ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
+                SequenceFileOutputFormat.class);
+        boolean succeeded = itemFiltering.waitForCompletion(true);
+        if (!succeeded) {
+          return -1;
+        }
+      }
+
+      String aggregateAndRecommendInput = partialMultiplyPath.toString();
+      if (filterFile != null) {
+        aggregateAndRecommendInput += "," + explicitFilterPath;
+      }
+
+      Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput")
+          ? SequenceFileOutputFormat.class : TextOutputFormat.class;
+
+      //extract out the recommendations
+      Job aggregateAndRecommend = prepareJob(
+              new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
+              PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
+              AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
+              outputFormat);
+      Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
+      if (itemsFile != null) {
+        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
+      }
+      
+      if (userItemFile != null) {
+        aggregateAndRecommendConf.set(IDReader.USER_ITEM_FILE, userItemFile);
+      }
+
+      if (filterFile != null) {
+        setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath);
+      }
+      setIOSort(aggregateAndRecommend);
+      aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
+              new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
+      aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
+      aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
+      boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
+      if (!succeeded) {
+        return -1;
+      }
+    }
+
+    return 0;
+  }
+
+  /**
+   * Sizes the sort buffer ("io.sort.mb") to half the map JVM heap parsed from the
+   * child java opts (capped at 1024MB), and raises the task timeout for the merge phase.
+   */
+  private static void setIOSort(JobContext job) {
+    Configuration conf = job.getConfiguration();
+    conf.setInt("io.sort.factor", 100);
+    String javaOpts = conf.get("mapred.map.child.java.opts"); // new arg name
+    if (javaOpts == null) {
+      javaOpts = conf.get("mapred.child.java.opts"); // old arg name
+    }
+    int assumedHeapSize = 512;
+    if (javaOpts != null) {
+      Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts);
+      if (m.find()) {
+        assumedHeapSize = Integer.parseInt(m.group(1));
+        String megabyteOrGigabyte = m.group(2);
+        if ("g".equalsIgnoreCase(megabyteOrGigabyte)) {
+          assumedHeapSize *= 1024;
+        }
+      }
+    }
+    // Cap this at 1024MB now; see https://issues.apache.org/jira/browse/MAPREDUCE-2308
+    conf.setInt("io.sort.mb", Math.min(assumedHeapSize / 2, 1024));
+    // For some reason the Merger doesn't report status for a long time; increase
+    // timeout when running these jobs
+    conf.setInt("mapred.task.timeout", 60 * 60 * 1000);
+  }
+
+  /** Command-line entry point; delegates to {@link ToolRunner}. */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new RecommenderJob(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java
new file mode 100644
index 0000000..8ae8215
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * maps a row of the similarity matrix to a {@link VectorOrPrefWritable}
+ * 
+ * actually a column from that matrix has to be used but as the similarity matrix is symmetric, 
+ * we can use a row instead of having to transpose it
+ */
+public final class SimilarityMatrixRowWrapperMapper extends
+    Mapper<IntWritable,VectorWritable,VarIntWritable,VectorOrPrefWritable> {
+
+  private final VarIntWritable index = new VarIntWritable();
+  private final VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();
+
+  @Override
+  protected void map(IntWritable key,
+                     VectorWritable value,
+                     Context context) throws IOException, InterruptedException {
+    Vector similarityMatrixRow = value.get();
+    /* remove self similarity */
+    similarityMatrixRow.set(key.get(), Double.NaN);
+
+    index.set(key.get());
+    vectorOrPref.set(similarityMatrixRow);
+
+    context.write(index, vectorOrPref);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java
new file mode 100644
index 0000000..e6e47fd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * <h1>Input</h1>
+ * 
+ * <p>
+ * Takes user IDs as {@link VarLongWritable} mapped to all associated item IDs and preference values, as
+ * {@link EntityPrefWritable}s.
+ * </p>
+ * 
+ * <h1>Output</h1>
+ * 
+ * <p>
+ * The same user ID mapped to a {@link RandomAccessSparseVector} representation of the same item IDs and
+ * preference values. Item IDs are used as vector indexes; they are hashed into ints to work as indexes with
+ * {@link TasteHadoopUtils#idToIndex(long)}. The mapping is remembered for later with a combination of
+ * {@link ItemIDIndexMapper} and {@link ItemIDIndexReducer}.
+ * </p>
+ */
+public final class ToUserVectorsReducer extends
+    Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable> {
+
+  public static final String MIN_PREFERENCES_PER_USER = ToUserVectorsReducer.class.getName() 
+      + ".minPreferencesPerUser";
+
+  private int minPreferences;
+
+  public enum Counters { USERS }
+
+  private final VectorWritable userVectorWritable = new VectorWritable();
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER, 1);
+  }
+
+  @Override
+  protected void reduce(VarLongWritable userID,
+                        Iterable<VarLongWritable> itemPrefs,
+                        Context context) throws IOException, InterruptedException {
+    Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
+    for (VarLongWritable itemPref : itemPrefs) {
+      int index = TasteHadoopUtils.idToIndex(itemPref.get());
+      float value = itemPref instanceof EntityPrefWritable ? ((EntityPrefWritable) itemPref).getPrefValue() : 1.0f;
+      userVector.set(index, value);
+    }
+
+    if (userVector.getNumNondefaultElements() >= minPreferences) {
+      userVectorWritable.set(userVector);
+      userVectorWritable.setWritesLaxPrecision(true);
+      context.getCounter(Counters.USERS).increment(1);
+      context.write(userID, userVectorWritable);
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java
new file mode 100644
index 0000000..9167437
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.Vector;
+
+public final class ToVectorAndPrefReducer extends
+    Reducer<VarIntWritable,VectorOrPrefWritable,VarIntWritable,VectorAndPrefsWritable> {
+
+  private final VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();
+
+  @Override
+  protected void reduce(VarIntWritable key,
+                        Iterable<VectorOrPrefWritable> values,
+                        Context context) throws IOException, InterruptedException {
+
+    List<Long> userIDs = new ArrayList<>();
+    List<Float> prefValues = new ArrayList<>();
+    Vector similarityMatrixColumn = null;
+    for (VectorOrPrefWritable value : values) {
+      if (value.getVector() == null) {
+        // Then this is a user-pref value
+        userIDs.add(value.getUserID());
+        prefValues.add(value.getValue());
+      } else {
+        // Then this is the column vector
+        if (similarityMatrixColumn != null) {
+          throw new IllegalStateException("Found two similarity-matrix columns for item index " + key.get());
+        }
+        similarityMatrixColumn = value.getVector();
+      }
+    }
+
+    if (similarityMatrixColumn == null) {
+      return;
+    }
+
+    vectorAndPrefs.set(similarityMatrixColumn, userIDs, prefValues);
+    context.write(key, vectorAndPrefs);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java
new file mode 100644
index 0000000..2290d06
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class UserVectorSplitterMapper extends
+    Mapper<VarLongWritable,VectorWritable, VarIntWritable,VectorOrPrefWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(UserVectorSplitterMapper.class);
+
+  static final String USERS_FILE = "usersFile";
+  static final String MAX_PREFS_PER_USER_CONSIDERED = "maxPrefsPerUserConsidered";
+  static final int DEFAULT_MAX_PREFS_PER_USER_CONSIDERED = 10;
+
+  private int maxPrefsPerUserConsidered;
+  private FastIDSet usersToRecommendFor;
+
+  private final VarIntWritable itemIndexWritable = new VarIntWritable();
+  private final VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();
+
+  @Override
+  protected void setup(Context context) throws IOException {
+    Configuration jobConf = context.getConfiguration();
+    maxPrefsPerUserConsidered = jobConf.getInt(MAX_PREFS_PER_USER_CONSIDERED, DEFAULT_MAX_PREFS_PER_USER_CONSIDERED);
+    
+    IDReader idReader = new IDReader (jobConf);
+    idReader.readIDs();
+    usersToRecommendFor = idReader.getUserIds();    
+  }
+
+  @Override
+  protected void map(VarLongWritable key,
+                     VectorWritable value,
+                     Context context) throws IOException, InterruptedException {
+    long userID = key.get();
+
+    log.info("UserID = {}", userID);
+
+    if (usersToRecommendFor != null && !usersToRecommendFor.contains(userID)) {
+      return;
+    }
+    Vector userVector = maybePruneUserVector(value.get());
+
+    for (Element e : userVector.nonZeroes()) {
+      itemIndexWritable.set(e.index());
+      vectorOrPref.set(userID, (float) e.get());
+      context.write(itemIndexWritable, vectorOrPref);
+    }
+  }
+
+  private Vector maybePruneUserVector(Vector userVector) {
+    if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) {
+      return userVector;
+    }
+
+    float smallestLargeValue = findSmallestLargeValue(userVector);
+
+    // "Blank out" small-sized prefs to reduce the amount of partial products
+    // generated later. They're not zeroed, but NaN-ed, so they come through
+    // and can be used to exclude these items from prefs.
+    for (Element e : userVector.nonZeroes()) {
+      float absValue = Math.abs((float) e.get());
+      if (absValue < smallestLargeValue) {
+        e.set(Float.NaN);
+      }
+    }
+
+    return userVector;
+  }
+
+  private float findSmallestLargeValue(Vector userVector) {
+
+    PriorityQueue<Float> topPrefValues = new PriorityQueue<Float>(maxPrefsPerUserConsidered) {
+      @Override
+      protected boolean lessThan(Float f1, Float f2) {
+        return f1 < f2;
+      }
+    };
+
+    for (Element e : userVector.nonZeroes()) {
+      float absValue = Math.abs((float) e.get());
+      topPrefValues.insertWithOverflow(absValue);
+    }
+    return topPrefValues.top();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java
new file mode 100644
index 0000000..11d496f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.Varint;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public final class VectorAndPrefsWritable implements Writable {
+
+  private Vector vector;
+  private List<Long> userIDs;
+  private List<Float> values;
+
+  public VectorAndPrefsWritable() {
+  }
+
+  public VectorAndPrefsWritable(Vector vector, List<Long> userIDs, List<Float> values) {
+    set(vector, userIDs, values);
+  }
+
+  public void set(Vector vector, List<Long> userIDs, List<Float> values) {
+    this.vector = vector;
+    this.userIDs = userIDs;
+    this.values = values;
+  }
+
+  public Vector getVector() {
+    return vector;
+  }
+
+  public List<Long> getUserIDs() {
+    return userIDs;
+  }
+
+  public List<Float> getValues() {
+    return values;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    VectorWritable vw = new VectorWritable(vector);
+    vw.setWritesLaxPrecision(true);
+    vw.write(out);
+    Varint.writeUnsignedVarInt(userIDs.size(), out);
+    for (int i = 0; i < userIDs.size(); i++) {
+      Varint.writeSignedVarLong(userIDs.get(i), out);
+      out.writeFloat(values.get(i));
+    }
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    VectorWritable writable = new VectorWritable();
+    writable.readFields(in);
+    vector = writable.get();
+    int size = Varint.readUnsignedVarInt(in);
+    userIDs = new ArrayList<>(size);
+    values = new ArrayList<>(size);
+    for (int i = 0; i < size; i++) {
+      userIDs.add(Varint.readSignedVarLong(in));
+      values.add(in.readFloat());
+    }
+  }
+
+  @Override
+  public String toString() {
+    return vector + "\t" + userIDs + '\t' + values;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java
new file mode 100644
index 0000000..515d7ea
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.Varint;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public final class VectorOrPrefWritable implements Writable {
+
+  private Vector vector;
+  private long userID;
+  private float value;
+
+  public VectorOrPrefWritable() {
+  }
+
+  public VectorOrPrefWritable(Vector vector) {
+    this.vector = vector;
+  }
+
+  public VectorOrPrefWritable(long userID, float value) {
+    this.userID = userID;
+    this.value = value;
+  }
+
+  public Vector getVector() {
+    return vector;
+  }
+
+  public long getUserID() {
+    return userID;
+  }
+
+  public float getValue() {
+    return value;
+  }
+
+  void set(Vector vector) {
+    this.vector = vector;
+    this.userID = Long.MIN_VALUE;
+    this.value = Float.NaN;
+  }
+
+  public void set(long userID, float value) {
+    this.vector = null;
+    this.userID = userID;
+    this.value = value;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    if (vector == null) {
+      out.writeBoolean(false);
+      Varint.writeSignedVarLong(userID, out);
+      out.writeFloat(value);
+    } else {
+      out.writeBoolean(true);
+      VectorWritable vw = new VectorWritable(vector);
+      vw.setWritesLaxPrecision(true);
+      vw.write(out);
+    }
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    boolean hasVector = in.readBoolean();
+    if (hasVector) {
+      VectorWritable writable = new VectorWritable();
+      writable.readFields(in);
+      set(writable.get());
+    } else {
+      long theUserID = Varint.readSignedVarLong(in);
+      float theValue = in.readFloat();
+      set(theUserID, theValue);
+    }
+  }
+
+  @Override
+  public String toString() {
+    return vector == null ? userID + ":" + value : vector.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
new file mode 100644
index 0000000..c64ee38
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.preparation;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
+import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper;
+import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
+import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexMapper;
+import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexReducer;
+import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
+import org.apache.mahout.cf.taste.hadoop.item.ToUserVectorsReducer;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
+
+import java.util.List;
+import java.util.Map;
+
/**
 * Preprocessing driver for the item-based recommender: turns raw textual
 * preference data into the intermediate artifacts later stages consume, by
 * chaining three MapReduce jobs:
 * <ol>
 *   <li>itemIDIndex — maps long item IDs to int indexes;</li>
 *   <li>toUserVectors — one sparse preference vector per user (also counts
 *       users via a counter, persisted to {@link #NUM_USERS});</li>
 *   <li>toItemVectors — transposes user vectors into the rating matrix.</li>
 * </ol>
 * Each job's failure aborts the pipeline with exit code -1.
 */
public class PreparePreferenceMatrixJob extends AbstractJob {

  // Names of the outputs written under the job's output directory.
  public static final String NUM_USERS = "numUsers.bin";
  public static final String ITEMID_INDEX = "itemIDIndex";
  public static final String USER_VECTORS = "userVectors";
  public static final String RATING_MATRIX = "ratingMatrix";

  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new PreparePreferenceMatrixJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
            + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");

    // parseArguments returns null when parsing failed or help was requested.
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));
    //convert items to an internal index
    Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
            ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
            VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    boolean succeeded = itemIDIndex.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
    //convert user preferences into a vector per user
    Job toUserVectors = prepareJob(getInputPath(),
                                   getOutputPath(USER_VECTORS),
                                   TextInputFormat.class,
                                   ToItemPrefsMapper.class,
                                   VarLongWritable.class,
                                   // boolean data has no pref values, so the mapper emits bare item IDs
                                   booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                                   ToUserVectorsReducer.class,
                                   VarLongWritable.class,
                                   VectorWritable.class,
                                   SequenceFileOutputFormat.class);
    toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
    succeeded = toUserVectors.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
    //we need the number of users later
    int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS).getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());
    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    succeeded = toItemVectors.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    return 0;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java
new file mode 100644
index 0000000..5a4144c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.preparation;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+public class ToItemVectorsMapper
+    extends Mapper<VarLongWritable,VectorWritable,IntWritable,VectorWritable> {
+
+  private final IntWritable itemID = new IntWritable();
+  private final VectorWritable itemVectorWritable = new VectorWritable();
+
+  @Override
+  protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
+    throws IOException, InterruptedException {
+    Vector userRatings = vectorWritable.get();
+
+    int column = TasteHadoopUtils.idToIndex(rowIndex.get());
+
+    itemVectorWritable.setWritesLaxPrecision(true);
+
+    Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+    for (Vector.Element elem : userRatings.nonZeroes()) {
+      itemID.set(elem.index());
+      itemVector.setQuick(column, elem.get());
+      itemVectorWritable.set(itemVector);
+      ctx.write(itemID, itemVectorWritable);
+      // reset vector for reuse
+      itemVector.setQuick(elem.index(), 0.0);
+    }
+  }
+
+}


[12/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java
new file mode 100644
index 0000000..9f85aab
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java
@@ -0,0 +1,170 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.base.Preconditions;
+
+/** NaiveBayesModel holds the weight matrix, the feature and label sums and the weight normalizer vectors.*/
+public class NaiveBayesModel {
+
+  private final Vector weightsPerLabel;
+  private final Vector perlabelThetaNormalizer;
+  private final Vector weightsPerFeature;
+  private final Matrix weightsPerLabelAndFeature;
+  // smoothing parameter; validate() requires it to be > 0
+  private final float alphaI;
+  // number of features with a non-default weight
+  private final double numFeatures;
+  // sum of all label weights
+  private final double totalWeightSum;
+  private final boolean isComplementary;  
+   
+  public final static String COMPLEMENTARY_MODEL = "COMPLEMENTARY_MODEL";
+
+  /**
+   * @param weightMatrix      per-label / per-feature weight matrix
+   * @param weightsPerFeature summed weight of each feature
+   * @param weightsPerLabel   summed weight of each label
+   * @param thetaNormalizer   per-label theta normalizers; may be null when
+   *                          {@code isComplementary} is false
+   * @param alphaI            smoothing parameter
+   * @param isComplementary   true if the model was trained in complementary mode
+   */
+  public NaiveBayesModel(Matrix weightMatrix, Vector weightsPerFeature, Vector weightsPerLabel, Vector thetaNormalizer,
+                         float alphaI, boolean isComplementary) {
+    this.weightsPerLabelAndFeature = weightMatrix;
+    this.weightsPerFeature = weightsPerFeature;
+    this.weightsPerLabel = weightsPerLabel;
+    this.perlabelThetaNormalizer = thetaNormalizer;
+    this.numFeatures = weightsPerFeature.getNumNondefaultElements();
+    this.totalWeightSum = weightsPerLabel.zSum();
+    this.alphaI = alphaI;
+    this.isComplementary=isComplementary;
+  }
+
+  /** @return summed weight of the given label */
+  public double labelWeight(int label) {
+    return weightsPerLabel.getQuick(label);
+  }
+
+  /** @return theta normalizer of the given label (only populated for complementary models) */
+  public double thetaNormalizer(int label) {
+    return perlabelThetaNormalizer.get(label); 
+  }
+
+  /** @return summed weight of the given feature */
+  public double featureWeight(int feature) {
+    return weightsPerFeature.getQuick(feature);
+  }
+
+  /** @return weight of the given (label, feature) pair */
+  public double weight(int label, int feature) {
+    return weightsPerLabelAndFeature.getQuick(label, feature);
+  }
+
+  public float alphaI() {
+    return alphaI;
+  }
+
+  public double numFeatures() {
+    return numFeatures;
+  }
+
+  public double totalWeightSum() {
+    return totalWeightSum;
+  }
+  
+  public int numLabels() {
+    return weightsPerLabel.size();
+  }
+
+  /** @return an empty vector with one slot per label, suitable for accumulating scores */
+  public Vector createScoringVector() {
+    return weightsPerLabel.like();
+  }
+  
+  // NOTE(review): method name misspells "complementary"; kept as-is because it is
+  // public API and callers (e.g. BayesTestMapper, TestNaiveBayesDriver) use this spelling.
+  public boolean isComplemtary(){
+      return isComplementary;
+  }
+  
+  /**
+   * Reads a model previously written by {@link #serialize} from
+   * {@code <output>/naiveBayesModel.bin} and validates it.
+   * Binary layout: alphaI (float), isComplementary (boolean), feature weights,
+   * label weights, [theta normalizers if complementary], one weight row per label.
+   *
+   * @throws IOException if the model file cannot be read
+   */
+  public static NaiveBayesModel materialize(Path output, Configuration conf) throws IOException {
+    FileSystem fs = output.getFileSystem(conf);
+
+    Vector weightsPerLabel;
+    Vector perLabelThetaNormalizer = null;
+    Vector weightsPerFeature;
+    Matrix weightsPerLabelAndFeature;
+    float alphaI;
+    boolean isComplementary;
+
+    try (FSDataInputStream in = fs.open(new Path(output, "naiveBayesModel.bin"))) {
+      alphaI = in.readFloat();
+      isComplementary = in.readBoolean();
+      weightsPerFeature = VectorWritable.readVector(in);
+      weightsPerLabel = new DenseVector(VectorWritable.readVector(in));
+      // theta normalizers are only persisted for complementary models
+      if (isComplementary){
+        perLabelThetaNormalizer = new DenseVector(VectorWritable.readVector(in));
+      }
+      weightsPerLabelAndFeature = new SparseRowMatrix(weightsPerLabel.size(), weightsPerFeature.size());
+      for (int label = 0; label < weightsPerLabelAndFeature.numRows(); label++) {
+        weightsPerLabelAndFeature.assignRow(label, VectorWritable.readVector(in));
+      }
+    }
+
+    NaiveBayesModel model = new NaiveBayesModel(weightsPerLabelAndFeature, weightsPerFeature, weightsPerLabel,
+        perLabelThetaNormalizer, alphaI, isComplementary);
+    model.validate();
+    return model;
+  }
+
+  /**
+   * Writes this model to {@code <output>/naiveBayesModel.bin} in the binary layout
+   * expected by {@link #materialize}.
+   *
+   * @throws IOException if the model file cannot be written
+   */
+  public void serialize(Path output, Configuration conf) throws IOException {
+    FileSystem fs = output.getFileSystem(conf);
+    try (FSDataOutputStream out = fs.create(new Path(output, "naiveBayesModel.bin"))) {
+      out.writeFloat(alphaI);
+      out.writeBoolean(isComplementary);
+      VectorWritable.writeVector(out, weightsPerFeature);
+      VectorWritable.writeVector(out, weightsPerLabel); 
+      if (isComplementary){
+        VectorWritable.writeVector(out, perlabelThetaNormalizer);
+      }
+      for (int row = 0; row < weightsPerLabelAndFeature.numRows(); row++) {
+        VectorWritable.writeVector(out, weightsPerLabelAndFeature.viewRow(row));
+      }
+    }
+  }
+  
+  /**
+   * Checks internal consistency: positive smoothing parameter, non-empty weight
+   * vectors, and — for complementary models — well-formed theta normalizers
+   * (all non-zero and sharing the same sign).
+   *
+   * @throws IllegalStateException or IllegalArgumentException if a constraint is violated
+   */
+  public void validate() {
+    Preconditions.checkState(alphaI > 0, "alphaI has to be greater than 0!");
+    Preconditions.checkArgument(numFeatures > 0, "the vocab count has to be greater than 0!");
+    Preconditions.checkArgument(totalWeightSum > 0, "the totalWeightSum has to be greater than 0!");
+    Preconditions.checkNotNull(weightsPerLabel, "the number of labels has to be defined!");
+    Preconditions.checkArgument(weightsPerLabel.getNumNondefaultElements() > 0,
+        "the number of labels has to be greater than 0!");
+    Preconditions.checkNotNull(weightsPerFeature, "the feature sums have to be defined");
+    Preconditions.checkArgument(weightsPerFeature.getNumNondefaultElements() > 0,
+        "the feature sums have to be greater than 0!");
+    if (isComplementary){
+        Preconditions.checkArgument(perlabelThetaNormalizer != null, "the theta normalizers have to be defined");
+        Preconditions.checkArgument(perlabelThetaNormalizer.getNumNondefaultElements() > 0,
+            "the number of theta normalizers has to be greater than 0!");    
+        Preconditions.checkArgument(Math.signum(perlabelThetaNormalizer.minValue()) 
+                == Math.signum(perlabelThetaNormalizer.maxValue()), 
+           "Theta normalizers do not all have the same sign");            
+        Preconditions.checkArgument(perlabelThetaNormalizer.getNumNonZeroElements() 
+                == perlabelThetaNormalizer.size(), 
+           "Theta normalizers can not have zero value.");
+    }
+    
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java
new file mode 100644
index 0000000..e4ce8aa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+
+/** Implementation of the Naive Bayes Classifier Algorithm */
+public class StandardNaiveBayesClassifier extends AbstractNaiveBayesClassifier { 
+ 
+  public StandardNaiveBayesClassifier(NaiveBayesModel model) {
+    super(model);
+  }
+
+  @Override
+  public double getScoreForLabelFeature(int label, int feature) {
+    NaiveBayesModel model = getModel();
+    // Standard Naive Bayes does not use weight normalization
+    return computeWeight(model.weight(label, feature), model.labelWeight(label), model.alphaI(), model.numFeatures());
+  }
+
+  /**
+   * Computes the smoothed log-likelihood of a feature given a label:
+   * {@code log((featureLabelWeight + alphaI) / (labelWeight + alphaI * numFeatures))}.
+   *
+   * @param featureLabelWeight weight of the (label, feature) pair
+   * @param labelWeight        total weight of the label
+   * @param alphaI             smoothing parameter
+   * @param numFeatures        size of the feature vocabulary
+   */
+  public static double computeWeight(double featureLabelWeight, double labelWeight, double alphaI, double numFeatures) {
+    double numerator = featureLabelWeight + alphaI;
+    double denominator = labelWeight + alphaI * numFeatures;
+    return Math.log(numerator / denominator);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java
new file mode 100644
index 0000000..37a3b71
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.test;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.naivebayes.AbstractNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
+import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+/**
+ * Run the input through the model and see if it matches.
+ * <p/>
+ * The output key is the expected label (parsed from the input key) and the output value is the vector of scores for all labels.
+ */
+public class BayesTestMapper extends Mapper<Text, VectorWritable, Text, VectorWritable> {
+
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  private AbstractNaiveBayesClassifier classifier;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Configuration conf = context.getConfiguration();
+    // the model is shipped to each mapper via the distributed cache
+    Path modelPath = HadoopUtil.getSingleCachedFile(conf);
+    NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, conf);
+    boolean isComplementary = Boolean.parseBoolean(conf.get(TestNaiveBayesDriver.COMPLEMENTARY));
+    
+    // ensure that if we are testing in complementary mode, the model has been
+    // trained complementary. a complementary model will work for standard classification,
+    // but a standard model will not work for complementary classification
+    if (isComplementary) {
+      Preconditions.checkArgument((model.isComplemtary()),
+          "Complementary mode in model is different than test mode");
+    }
+    
+    if (isComplementary) {
+      classifier = new ComplementaryNaiveBayesClassifier(model);
+    } else {
+      classifier = new StandardNaiveBayesClassifier(model);
+    }
+  }
+
+  /**
+   * Scores one test instance. The input key is expected to look like
+   * {@code /label/docId}; the emitted key is the expected label and the value
+   * holds the per-label classification scores.
+   */
+  @Override
+  protected void map(Text key, VectorWritable value, Context context) throws IOException, InterruptedException {
+    Vector result = classifier.classifyFull(value.get());
+    //the key is the expected value
+    context.write(new Text(SLASH.split(key.toString())[1]), new VectorWritable(result));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
new file mode 100644
index 0000000..d9eedcf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
@@ -0,0 +1,176 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.test;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.classifier.naivebayes.AbstractNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
+import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test the (Complementary) Naive Bayes model that was built during training
+ * by iterating over the test set and comparing the predictions to the expected labels
+ */
+public class TestNaiveBayesDriver extends AbstractJob {
+
+  private static final Logger log = LoggerFactory.getLogger(TestNaiveBayesDriver.class);
+
+  public static final String COMPLEMENTARY = "class"; //b for bayes, c for complementary
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new TestNaiveBayesDriver(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    // NOTE(review): the nested call registers the overwrite option twice, since
+    // addOption(Option) returns its argument; presumably a single
+    // addOption(DefaultOptionCreator.overwriteOption().create()) was intended.
+    addOption(addOption(DefaultOptionCreator.overwriteOption().create()));
+    addOption("model", "m", "The path to the model built during training", true);
+    addOption(buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
+    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
+    addOption("labelIndex", "l", "The path to the location of the label index", true);
+    Map<String, List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), getOutputPath());
+    }
+
+    boolean sequential = hasOption("runSequential");
+    boolean succeeded;
+    if (sequential) {
+       // the sequential path signals failure by throwing rather than returning a status
+       runSequential();
+    } else {
+      succeeded = runMapReduce();
+      if (!succeeded) {
+        return -1;
+      }
+    }
+
+    //load the labels
+    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));
+
+    //loop over the results and create the confusion matrix
+    SequenceFileDirIterable<Text, VectorWritable> dirIterable =
+        new SequenceFileDirIterable<>(getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
+    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
+    analyzeResults(labelMap, dirIterable, analyzer);
+
+    log.info("{} Results: {}", hasOption("testComplementary") ? "Complementary" : "Standard NB", analyzer);
+    return 0;
+  }
+
+  /**
+   * Classifies the whole test set in-process (no MapReduce job) and writes the
+   * scores to a single part-r-00000 sequence file so the downstream analysis can
+   * treat the output the same way as the MapReduce path.
+   */
+  private void runSequential() throws IOException {
+    boolean complementary = hasOption("testComplementary");
+    FileSystem fs = FileSystem.get(getConf());
+    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
+    
+    // Ensure that if we are testing in complementary mode, the model has been
+    // trained complementary. a complementary model will work for standard classification,
+    // but a standard model will not work for complementary classification
+    if (complementary){
+        Preconditions.checkArgument((model.isComplemtary()),
+            "Complementary mode in model is different from test mode");
+    }
+    
+    AbstractNaiveBayesClassifier classifier;
+    if (complementary) {
+      classifier = new ComplementaryNaiveBayesClassifier(model);
+    } else {
+      classifier = new StandardNaiveBayesClassifier(model);
+    }
+
+    try (SequenceFile.Writer writer =
+             SequenceFile.createWriter(fs, getConf(), new Path(getOutputPath(), "part-r-00000"),
+                 Text.class, VectorWritable.class)) {
+      SequenceFileDirIterable<Text, VectorWritable> dirIterable =
+          new SequenceFileDirIterable<>(getInputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
+      // loop through the part-r-* files in getInputPath() and get classification scores for all entries
+      for (Pair<Text, VectorWritable> pair : dirIterable) {
+        // input keys look like "/label/docId"; emit the expected label with the scores
+        writer.append(new Text(SLASH.split(pair.getFirst().toString())[1]),
+            new VectorWritable(classifier.classifyFull(pair.getSecond().get())));
+      }
+    }
+  }
+
+  /**
+   * Submits the MapReduce test job; the model is distributed to the mappers via
+   * the distributed cache.
+   *
+   * @return true if the job completed successfully
+   */
+  private boolean runMapReduce() throws IOException,
+      InterruptedException, ClassNotFoundException {
+    Path model = new Path(getOption("model"));
+    HadoopUtil.cacheFiles(model, getConf());
+    //the output key is the expected value, the output value are the scores for all the labels
+    Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, BayesTestMapper.class,
+        Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+    //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
+
+
+    boolean complementary = hasOption("testComplementary");
+    testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
+    return testJob.waitForCompletion(true);
+  }
+
+  /**
+   * Feeds every (expected label, score vector) pair into the {@link ResultAnalyzer},
+   * predicting for each instance the label with the highest score.
+   */
+  private static void analyzeResults(Map<Integer, String> labelMap,
+                                     SequenceFileDirIterable<Text, VectorWritable> dirIterable,
+                                     ResultAnalyzer analyzer) {
+    for (Pair<Text, VectorWritable> pair : dirIterable) {
+      int bestIdx = Integer.MIN_VALUE;
+      // NOTE(review): Long.MIN_VALUE is used as a "smaller than any score" sentinel;
+      // Double.NEGATIVE_INFINITY would be the safer choice if scores can be that small.
+      double bestScore = Long.MIN_VALUE;
+      for (Vector.Element element : pair.getSecond().get().all()) {
+        if (element.get() > bestScore) {
+          bestScore = element.get();
+          bestIdx = element.index();
+        }
+      }
+      if (bestIdx != Integer.MIN_VALUE) {
+        ClassifierResult classifierResult = new ClassifierResult(labelMap.get(bestIdx), bestScore);
+        analyzer.addInstance(pair.getFirst().toString(), classifierResult);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java
new file mode 100644
index 0000000..2b8ee1e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Accumulates the per-label theta normalizers used by Complementary Naive Bayes
+ * (Rennie et al., "Tackling the Poor Assumptions of Naive Bayes Text Classifiers",
+ * ICML 2003, section 3.2).
+ */
+public class ComplementaryThetaTrainer {
+
+  private final Vector weightsPerFeature;
+  private final Vector weightsPerLabel;
+  // running per-label sums of absolute complementary weights
+  private final Vector perLabelThetaNormalizer;
+  // smoothing parameter
+  private final double alphaI;
+  private final double totalWeightSum;
+  private final double numFeatures;
+
+  public ComplementaryThetaTrainer(Vector weightsPerFeature, Vector weightsPerLabel, double alphaI) {
+    Preconditions.checkNotNull(weightsPerFeature);
+    Preconditions.checkNotNull(weightsPerLabel);
+    this.weightsPerFeature = weightsPerFeature;
+    this.weightsPerLabel = weightsPerLabel;
+    this.alphaI = alphaI;
+    perLabelThetaNormalizer = weightsPerLabel.like();
+    totalWeightSum = weightsPerLabel.zSum();
+    numFeatures = weightsPerFeature.getNumNondefaultElements();
+  }
+
+  /**
+   * Folds the complementary weight of every feature of {@code perLabelWeight}
+   * into the theta normalizer of {@code label}.
+   */
+  public void train(int label, Vector perLabelWeight) {
+    double labelWeight = labelWeight(label);
+    // sum weights for each label including those with zero word counts
+    for(int i = 0; i < perLabelWeight.size(); i++){
+      Vector.Element perLabelWeightElement = perLabelWeight.getElement(i);
+      updatePerLabelThetaNormalizer(label,
+          ComplementaryNaiveBayesClassifier.computeWeight(featureWeight(perLabelWeightElement.index()),
+              perLabelWeightElement.get(), totalWeightSum(), labelWeight, alphaI(), numFeatures()));
+    }
+  }
+
+  protected double alphaI() {
+    return alphaI;
+  }
+
+  protected double numFeatures() {
+    return numFeatures;
+  }
+
+  protected double labelWeight(int label) {
+    return weightsPerLabel.get(label);
+  }
+
+  protected double totalWeightSum() {
+    return totalWeightSum;
+  }
+
+  protected double featureWeight(int feature) {
+    return weightsPerFeature.get(feature);
+  }
+
+  // http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
+  protected void updatePerLabelThetaNormalizer(int label, double weight) {
+    perLabelThetaNormalizer.set(label, perLabelThetaNormalizer.get(label) + Math.abs(weight));
+  }
+
+  /** @return a defensive copy of the accumulated per-label theta normalizers */
+  public Vector retrievePerLabelThetaNormalizer() {
+    return perLabelThetaNormalizer.clone();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java
new file mode 100644
index 0000000..4df869e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+/**
+ * Maps a (label-path, instance-vector) pair to (label-index, instance-vector),
+ * using the label index loaded from the distributed cache. Instances whose label
+ * is not in the index are skipped and counted.
+ */
+public class IndexInstancesMapper extends Mapper<Text, VectorWritable, IntWritable, VectorWritable> {
+
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  enum Counter { SKIPPED_INSTANCES }
+
+  private OpenObjectIntHashMap<String> labelIndex;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    // label -> int index mapping, shipped via the distributed cache
+    labelIndex = BayesUtils.readIndexFromCache(ctx.getConfiguration());
+  }
+
+  @Override
+  protected void map(Text labelText, VectorWritable instance, Context ctx) throws IOException, InterruptedException {
+    // input keys look like "/label/docId"; the second path segment is the label
+    String label = SLASH.split(labelText.toString())[1];
+    if (labelIndex.containsKey(label)) {
+      ctx.write(new IntWritable(labelIndex.get(label)), instance);
+    } else {
+      ctx.getCounter(Counter.SKIPPED_INSTANCES).increment(1);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java
new file mode 100644
index 0000000..ff2ea40
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Trains the per-label theta normalizers for Complementary Naive Bayes.
+ * Consumes (label-index, per-label weight vector) pairs and emits a single
+ * normalizer vector during cleanup.
+ */
+public class ThetaMapper extends Mapper<IntWritable, VectorWritable, Text, VectorWritable> {
+
+  public static final String ALPHA_I = ThetaMapper.class.getName() + ".alphaI";
+  static final String TRAIN_COMPLEMENTARY = ThetaMapper.class.getName() + ".trainComplementary";
+
+  private ComplementaryThetaTrainer trainer;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    Configuration conf = ctx.getConfiguration();
+
+    // smoothing parameter; defaults to 1.0 (Laplace smoothing) when unset
+    float alphaI = conf.getFloat(ALPHA_I, 1.0f);
+    // per-feature and per-label weight sums computed by the preceding jobs
+    Map<String, Vector> scores = BayesUtils.readScoresFromCache(conf);    
+    
+    trainer = new ComplementaryThetaTrainer(scores.get(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE),
+                                            scores.get(TrainNaiveBayesJob.WEIGHTS_PER_LABEL), alphaI);
+  }
+
+  @Override
+  protected void map(IntWritable key, VectorWritable value, Context ctx) throws IOException, InterruptedException {
+    trainer.train(key.get(), value.get());
+  }
+
+  @Override
+  protected void cleanup(Context ctx) throws IOException, InterruptedException {
+    // emit the accumulated normalizers once, after all input has been seen
+    ctx.write(new Text(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER),
+        new VectorWritable(trainer.retrievePerLabelThetaNormalizer()));
+    super.cleanup(ctx);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
new file mode 100644
index 0000000..cd18d28
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.mapreduce.VectorSumReducer;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.base.Splitter;
+
+/** Trains a Naive Bayes Classifier (parameters for both Naive Bayes and Complementary Naive Bayes) */
+public final class TrainNaiveBayesJob extends AbstractJob {
+  private static final String TRAIN_COMPLEMENTARY = "trainComplementary";
+  private static final String ALPHA_I = "alphaI";
+  private static final String LABEL_INDEX = "labelIndex";
+  public static final String WEIGHTS_PER_FEATURE = "__SPF";
+  public static final String WEIGHTS_PER_LABEL = "__SPL";
+  public static final String LABEL_THETA_NORMALIZER = "_LTN";
+  public static final String SUMMED_OBSERVATIONS = "summedObservations";
+  public static final String WEIGHTS = "weights";
+  public static final String THETAS = "thetas";
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new TrainNaiveBayesJob(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOutputOption();
+
+    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
+    addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
+    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
+    addOption(DefaultOptionCreator.overwriteOption().create());
+
+    Map<String, List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), getOutputPath());
+      HadoopUtil.delete(getConf(), getTempPath());
+    }
+    Path labPath;
+    String labPathStr = getOption(LABEL_INDEX);
+    if (labPathStr != null) {
+      labPath = new Path(labPathStr);
+    } else {
+      labPath = getTempPath(LABEL_INDEX);
+    }
+    long labelSize = createLabelIndex(labPath);
+    float alphaI = Float.parseFloat(getOption(ALPHA_I));
+    boolean trainComplementary = hasOption(TRAIN_COMPLEMENTARY);
+
+    HadoopUtil.setSerializations(getConf());
+    HadoopUtil.cacheFiles(labPath, getConf());
+
+    // Add up all the vectors with the same labels, while mapping the labels into our index
+    Job indexInstances = prepareJob(getInputPath(),
+                                    getTempPath(SUMMED_OBSERVATIONS),
+                                    SequenceFileInputFormat.class,
+                                    IndexInstancesMapper.class,
+                                    IntWritable.class,
+                                    VectorWritable.class,
+                                    VectorSumReducer.class,
+                                    IntWritable.class,
+                                    VectorWritable.class,
+                                    SequenceFileOutputFormat.class);
+    indexInstances.setCombinerClass(VectorSumReducer.class);
+    boolean succeeded = indexInstances.waitForCompletion(true);
+    if (!succeeded) {
+      return -1;
+    }
+    // Sum up all the weights from the previous step, per label and per feature
+    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
+                                  getTempPath(WEIGHTS),
+                                  SequenceFileInputFormat.class,
+                                  WeightsMapper.class,
+                                  Text.class,
+                                  VectorWritable.class,
+                                  VectorSumReducer.class,
+                                  Text.class,
+                                  VectorWritable.class,
+                                  SequenceFileOutputFormat.class);
+    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
+    weightSummer.setCombinerClass(VectorSumReducer.class);
+    succeeded = weightSummer.waitForCompletion(true);
+    if (!succeeded) {
+      return -1;
+    }
+
+    // Put the per label and per feature vectors into the cache
+    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());
+
+    if (trainComplementary){
+      // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
+      // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
+      Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
+                                   getTempPath(THETAS),
+                                   SequenceFileInputFormat.class,
+                                   ThetaMapper.class,
+                                   Text.class,
+                                   VectorWritable.class,
+                                   VectorSumReducer.class,
+                                   Text.class,
+                                   VectorWritable.class,
+                                   SequenceFileOutputFormat.class);
+      thetaSummer.setCombinerClass(VectorSumReducer.class);
+      thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
+      thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
+      succeeded = thetaSummer.waitForCompletion(true);
+      if (!succeeded) {
+        return -1;
+      }
+    }
+    
+    // Put the per label theta normalizers into the cache
+    HadoopUtil.cacheFiles(getTempPath(THETAS), getConf());
+    
+    // Validate our model and then write it out to the official output
+    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
+    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
+    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
+    naiveBayesModel.validate();
+    naiveBayesModel.serialize(getOutputPath(), getConf());
+
+    return 0;
+  }
+
+  private long createLabelIndex(Path labPath) throws IOException {
+    long labelSize = 0;
+    Iterable<Pair<Text,IntWritable>> iterable =
+      new SequenceFileDirIterable<>(getInputPath(),
+                                                     PathType.LIST,
+                                                     PathFilters.logsCRCFilter(),
+                                                     getConf());
+    labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
+    return labelSize;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java
new file mode 100644
index 0000000..5563057
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+
+import com.google.common.base.Preconditions;
+
+public class WeightsMapper extends Mapper<IntWritable, VectorWritable, Text, VectorWritable> {
+
+  static final String NUM_LABELS = WeightsMapper.class.getName() + ".numLabels";
+
+  private Vector weightsPerFeature;
+  private Vector weightsPerLabel;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    int numLabels = Integer.parseInt(ctx.getConfiguration().get(NUM_LABELS));
+    Preconditions.checkArgument(numLabels > 0, "Wrong numLabels: " + numLabels + ". Must be > 0!");
+    weightsPerLabel = new DenseVector(numLabels);
+  }
+
+  @Override
+  protected void map(IntWritable index, VectorWritable value, Context ctx) throws IOException, InterruptedException {
+    Vector instance = value.get();
+    if (weightsPerFeature == null) {
+      weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements());
+    }
+
+    int label = index.get();
+    weightsPerFeature.assign(instance, Functions.PLUS);
+    weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
+  }
+
+  @Override
+  protected void cleanup(Context ctx) throws IOException, InterruptedException {
+    if (weightsPerFeature != null) {
+      ctx.write(new Text(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature));
+      ctx.write(new Text(TrainNaiveBayesJob.WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel));
+    }
+    super.cleanup(ctx);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java
new file mode 100644
index 0000000..6d4e2b0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.io.DataOutputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+
+/**
+ * A class for EM training of HMM from console
+ */
+public final class BaumWelchTrainer {
+
+  private BaumWelchTrainer() {
+  }
+
+  public static void main(String[] args) throws IOException {
+    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+
+    Option inputOption = DefaultOptionCreator.inputOption().create();
+
+    Option outputOption = DefaultOptionCreator.outputOption().create();
+
+    Option stateNumberOption = optionBuilder.withLongName("nrOfHiddenStates").
+      withDescription("Number of hidden states").
+      withShortName("nh").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Option observedStateNumberOption = optionBuilder.withLongName("nrOfObservedStates").
+      withDescription("Number of observed states").
+      withShortName("no").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Option epsilonOption = optionBuilder.withLongName("epsilon").
+      withDescription("Convergence threshold").
+      withShortName("e").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Option iterationsOption = optionBuilder.withLongName("max-iterations").
+      withDescription("Maximum iterations number").
+      withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Group optionGroup = new GroupBuilder().withOption(inputOption).
+      withOption(outputOption).withOption(stateNumberOption).withOption(observedStateNumberOption).
+      withOption(epsilonOption).withOption(iterationsOption).
+      withName("Options").create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(optionGroup);
+      CommandLine commandLine = parser.parse(args);
+
+      String input = (String) commandLine.getValue(inputOption);
+      String output = (String) commandLine.getValue(outputOption);
+
+      int nrOfHiddenStates = Integer.parseInt((String) commandLine.getValue(stateNumberOption));
+      int nrOfObservedStates = Integer.parseInt((String) commandLine.getValue(observedStateNumberOption));
+
+      double epsilon = Double.parseDouble((String) commandLine.getValue(epsilonOption));
+      int maxIterations = Integer.parseInt((String) commandLine.getValue(iterationsOption));
+
+      //constructing random-generated HMM
+      HmmModel model = new HmmModel(nrOfHiddenStates, nrOfObservedStates, new Date().getTime());
+      List<Integer> observations = new ArrayList<>();
+
+      //reading observations
+      try (Scanner scanner = new Scanner(new FileInputStream(input), "UTF-8")) {
+        while (scanner.hasNextInt()) {
+          observations.add(scanner.nextInt());
+        }
+      }
+
+      int[] observationsArray = new int[observations.size()];
+      for (int i = 0; i < observations.size(); ++i) {
+        observationsArray[i] = observations.get(i);
+      }
+
+      //training
+      HmmModel trainedModel = HmmTrainer.trainBaumWelch(model,
+        observationsArray, epsilon, maxIterations, true);
+
+      //serializing trained model
+      try (DataOutputStream stream = new DataOutputStream(new FileOutputStream(output))){
+        LossyHmmSerializer.serialize(trainedModel, stream);
+      }
+
+      //printing tranied model
+      System.out.println("Initial probabilities: ");
+      for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+        System.out.print(i + " ");
+      }
+      System.out.println();
+      for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+        System.out.print(trainedModel.getInitialProbabilities().get(i) + " ");
+      }
+      System.out.println();
+
+      System.out.println("Transition matrix:");
+      System.out.print("  ");
+      for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+        System.out.print(i + " ");
+      }
+      System.out.println();
+      for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+        System.out.print(i + " ");
+        for (int j = 0; j < trainedModel.getNrOfHiddenStates(); ++j) {
+          System.out.print(trainedModel.getTransitionMatrix().get(i, j) + " ");
+        }
+        System.out.println();
+      }
+      System.out.println("Emission matrix: ");
+      System.out.print("  ");
+      for (int i = 0; i < trainedModel.getNrOfOutputStates(); ++i) {
+        System.out.print(i + " ");
+      }
+      System.out.println();
+      for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+        System.out.print(i + " ");
+        for (int j = 0; j < trainedModel.getNrOfOutputStates(); ++j) {
+          System.out.print(trainedModel.getEmissionMatrix().get(i, j) + " ");
+        }
+        System.out.println();
+      }
+    } catch (OptionException e) {
+      CommandLineUtil.printHelp(optionGroup);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java
new file mode 100644
index 0000000..c1d328e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java
@@ -0,0 +1,306 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
/**
 * Class containing implementations of the three major HMM algorithms: forward,
 * backward and Viterbi. All methods are static; an {@code HmmModel} supplies
 * the initial-probability vector, transition matrix and emission matrix.
 */
public final class HmmAlgorithms {


  /**
   * No public constructors for utility classes.
   */
  private HmmAlgorithms() {
    // nothing to do here really
  }

  /**
   * External function to compute a matrix of alpha factors
   *
   * @param model        model to run forward algorithm for.
   * @param observations observation sequence to train on.
   * @param scaled       Should log-scaled alpha factors be computed?
   * @return matrix of alpha factors: entry (t, i) is the (log-scaled, if requested)
   *         probability of seeing observations[0..t] and ending in hidden state i.
   */
  public static Matrix forwardAlgorithm(HmmModel model, int[] observations, boolean scaled) {
    Matrix alpha = new DenseMatrix(observations.length, model.getNrOfHiddenStates());
    forwardAlgorithm(alpha, model, observations, scaled);

    return alpha;
  }

  /**
   * Internal function to compute the alpha factors
   *
   * @param alpha        matrix to store alpha factors in (observations.length x nrOfHiddenStates).
   * @param model        model to use for alpha factor computation.
   * @param observations observation sequence seen.
   * @param scaled       set to true if log-scaled alpha factors should be computed.
   */
  static void forwardAlgorithm(Matrix alpha, HmmModel model, int[] observations, boolean scaled) {

    // fetch references to the model parameters
    Vector ip = model.getInitialProbabilities();
    Matrix b = model.getEmissionMatrix();
    Matrix a = model.getTransitionMatrix();

    if (scaled) { // compute log scaled alpha values
      // Initialization: alpha(0, i) = log(ip(i) * b(i, o_0))
      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
        alpha.setQuick(0, i, Math.log(ip.getQuick(i) * b.getQuick(i, observations[0])));
      }

      // Induction
      for (int t = 1; t < observations.length; t++) {
        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
          double sum = Double.NEGATIVE_INFINITY; // log(0)
          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
            double tmp = alpha.getQuick(t - 1, j) + Math.log(a.getQuick(j, i));
            if (tmp > Double.NEGATIVE_INFINITY) {
              // make sure we handle log(0) correctly
              // log-sum-exp: sum = log(exp(sum) + exp(tmp)), computed without leaving log space
              sum = tmp + Math.log1p(Math.exp(sum - tmp));
            }
          }
          alpha.setQuick(t, i, sum + Math.log(b.getQuick(i, observations[t])));
        }
      }
    } else {

      // Initialization: alpha(0, i) = ip(i) * b(i, o_0)
      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
        alpha.setQuick(0, i, ip.getQuick(i) * b.getQuick(i, observations[0]));
      }

      // Induction: alpha(t, i) = (sum_j alpha(t-1, j) * a(j, i)) * b(i, o_t)
      for (int t = 1; t < observations.length; t++) {
        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
          double sum = 0.0;
          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
            sum += alpha.getQuick(t - 1, j) * a.getQuick(j, i);
          }
          alpha.setQuick(t, i, sum * b.getQuick(i, observations[t]));
        }
      }
    }
  }

  /**
   * External function to compute a matrix of beta factors
   *
   * @param model        model to use for estimation.
   * @param observations observation sequence seen.
   * @param scaled       Set to true if log-scaled beta factors should be computed.
   * @return beta factors based on the model and observation sequence.
   */
  public static Matrix backwardAlgorithm(HmmModel model, int[] observations, boolean scaled) {
    // initialize the matrix
    Matrix beta = new DenseMatrix(observations.length, model.getNrOfHiddenStates());
    // compute the beta factors
    backwardAlgorithm(beta, model, observations, scaled);

    return beta;
  }

  /**
   * Internal function to compute the beta factors
   *
   * @param beta         Matrix to store resulting factors in (observations.length x nrOfHiddenStates).
   * @param model        model to use for factor estimation.
   * @param observations sequence of observations to estimate.
   * @param scaled       set to true to compute log-scaled parameters.
   */
  static void backwardAlgorithm(Matrix beta, HmmModel model, int[] observations, boolean scaled) {
    // fetch references to the model parameters
    Matrix b = model.getEmissionMatrix();
    Matrix a = model.getTransitionMatrix();

    if (scaled) { // compute log-scaled factors
      // initialization: beta(T-1, i) = 0 == log(1)
      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
        beta.setQuick(observations.length - 1, i, 0);
      }

      // induction, iterating backwards over the observation sequence
      for (int t = observations.length - 2; t >= 0; t--) {
        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
          double sum = Double.NEGATIVE_INFINITY; // log(0)
          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
            double tmp = beta.getQuick(t + 1, j) + Math.log(a.getQuick(i, j))
                + Math.log(b.getQuick(j, observations[t + 1]));
            if (tmp > Double.NEGATIVE_INFINITY) {
              // handle log(0)
              // log-sum-exp accumulation, same scheme as in the forward algorithm
              sum = tmp + Math.log1p(Math.exp(sum - tmp));
            }
          }
          beta.setQuick(t, i, sum);
        }
      }
    } else {
      // initialization: beta(T-1, i) = 1
      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
        beta.setQuick(observations.length - 1, i, 1);
      }
      // induction: beta(t, i) = sum_j beta(t+1, j) * a(i, j) * b(j, o_{t+1})
      for (int t = observations.length - 2; t >= 0; t--) {
        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
          double sum = 0;
          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
            sum += beta.getQuick(t + 1, j) * a.getQuick(i, j) * b.getQuick(j, observations[t + 1]);
          }
          beta.setQuick(t, i, sum);
        }
      }
    }
  }

  /**
   * Viterbi algorithm to compute the most likely hidden sequence for a given
   * model and observed sequence
   *
   * @param model        HmmModel for which the Viterbi path should be computed
   * @param observations Sequence of observations
   * @param scaled       Use log-scaled computations, this requires higher computational
   *                     effort but is numerically more stable for large observation
   *                     sequences
   * @return nrOfObservations 1D int array containing the most likely hidden
   *         sequence
   */
  public static int[] viterbiAlgorithm(HmmModel model, int[] observations, boolean scaled) {

    // probability that the most probable hidden states ends at state i at
    // time t
    double[][] delta = new double[observations.length][model
        .getNrOfHiddenStates()];

    // previous hidden state in the most probable state leading up to state
    // i at time t
    int[][] phi = new int[observations.length - 1][model.getNrOfHiddenStates()];

    // initialize the return array
    int[] sequence = new int[observations.length];

    viterbiAlgorithm(sequence, delta, phi, model, observations, scaled);

    return sequence;
  }

  /**
   * Internal version of the viterbi algorithm, allowing to reuse existing
   * arrays instead of allocating new ones
   *
   * @param sequence     NrOfObservations 1D int array for storing the viterbi sequence
   * @param delta        NrOfObservations x NrHiddenStates 2D double array for storing the
   *                     delta factors
   * @param phi          NrOfObservations-1 x NrHiddenStates 2D int array for storing the
   *                     phi values
   * @param model        HmmModel for which the viterbi path should be computed
   * @param observations Sequence of observations
   * @param scaled       Use log-scaled computations, this requires higher computational
   *                     effort but is numerically more stable for large observation
   *                     sequences
   */
  static void viterbiAlgorithm(int[] sequence, double[][] delta, int[][] phi, HmmModel model, int[] observations,
      boolean scaled) {
    // fetch references to the model parameters
    Vector ip = model.getInitialProbabilities();
    Matrix b = model.getEmissionMatrix();
    Matrix a = model.getTransitionMatrix();

    // Initialization: delta[0][i] is the (log-)probability of starting in state i
    // and emitting the first observation
    if (scaled) {
      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
        delta[0][i] = Math.log(ip.getQuick(i) * b.getQuick(i, observations[0]));
      }
    } else {

      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
        delta[0][i] = ip.getQuick(i) * b.getQuick(i, observations[0]);
      }
    }

    // Induction
    // iterate over the time; in log space products become sums, otherwise identical
    if (scaled) {
      for (int t = 1; t < observations.length; t++) {
        // iterate over the hidden states
        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
          // find the maximum probability and most likely state
          // leading up
          // to this
          int maxState = 0;
          double maxProb = delta[t - 1][0] + Math.log(a.getQuick(0, i));
          for (int j = 1; j < model.getNrOfHiddenStates(); j++) {
            double prob = delta[t - 1][j] + Math.log(a.getQuick(j, i));
            if (prob > maxProb) {
              maxProb = prob;
              maxState = j;
            }
          }
          delta[t][i] = maxProb + Math.log(b.getQuick(i, observations[t]));
          phi[t - 1][i] = maxState;
        }
      }
    } else {
      for (int t = 1; t < observations.length; t++) {
        // iterate over the hidden states
        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
          // find the maximum probability and most likely state
          // leading up
          // to this
          int maxState = 0;
          double maxProb = delta[t - 1][0] * a.getQuick(0, i);
          for (int j = 1; j < model.getNrOfHiddenStates(); j++) {
            double prob = delta[t - 1][j] * a.getQuick(j, i);
            if (prob > maxProb) {
              maxProb = prob;
              maxState = j;
            }
          }
          delta[t][i] = maxProb * b.getQuick(i, observations[t]);
          phi[t - 1][i] = maxState;
        }
      }
    }

    // find the most likely end state for initialization
    double maxProb;
    if (scaled) {
      maxProb = Double.NEGATIVE_INFINITY;
    } else {
      maxProb = 0.0;
    }
    // NOTE(review): in the unscaled case, if every final delta value is 0.0 the
    // strict comparison below never fires and the end state silently defaults to 0.
    for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
      if (delta[observations.length - 1][i] > maxProb) {
        maxProb = delta[observations.length - 1][i];
        sequence[observations.length - 1] = i;
      }
    }

    // now backtrack to find the most likely hidden sequence
    for (int t = observations.length - 2; t >= 0; t--) {
      sequence[t] = phi[t][sequence[t + 1]];
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java
new file mode 100644
index 0000000..6e2def6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java
@@ -0,0 +1,194 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.Random;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * The HMMEvaluator class offers several methods to evaluate an HMM Model. The
+ * following use-cases are covered: 1) Generate a sequence of output states from
+ * a given model (prediction). 2) Compute the likelihood that a given model
+ * generated a given sequence of output states (model likelihood). 3) Compute
+ * the most likely hidden sequence for a given model and a given observed
+ * sequence (decoding).
+ */
+public final class HmmEvaluator {
+
+  /**
+   * No constructor for utility classes.
+   */
+  private HmmEvaluator() {}
+
+  /**
+   * Predict a sequence of steps output states for the given HMM model,
+   * using a freshly obtained (unseeded) random number generator, so each
+   * call may produce a different sequence.
+   *
+   * @param model The Hidden Markov model used to generate the output sequence
+   * @param steps Size of the generated output sequence
+   * @return integer array containing a sequence of steps output state IDs,
+   *         generated by the specified model
+   */
+  public static int[] predict(HmmModel model, int steps) {
+    return predict(model, steps, RandomUtils.getRandom());
+  }
+
+  /**
+   * Predict a sequence of steps output states for the given HMM model,
+   * using the given seed for the random number generator so that runs
+   * are reproducible.
+   *
+   * @param model The Hidden Markov model used to generate the output sequence
+   * @param steps Size of the generated output sequence
+   * @param seed  seed to use for the RNG
+   * @return integer array containing a sequence of steps output state IDs,
+   *         generated by the specified model
+   */
+  public static int[] predict(HmmModel model, int steps, long seed) {
+    return predict(model, steps, RandomUtils.getRandom(seed));
+  }
+  /**
+   * Predict a sequence of steps output states for the given HMM model using
+   * the given RNG. Each state is drawn by inverse-transform sampling: a
+   * uniform random number in [0,1) is compared against the cumulative
+   * probabilities precomputed by {@link HmmUtils}, scanning forward until
+   * the cumulative value covers the drawn number.
+   *
+   * @param model The Hidden Markov model used to generate the output sequence
+   * @param steps Size of the generated output sequence
+   * @param rand  RNG to use
+   * @return integer array containing a sequence of steps output state IDs,
+   *         generated by the specified model
+   */
+  private static int[] predict(HmmModel model, int steps, Random rand) {
+    // fetch the cumulative distributions: initial probabilities, transition
+    // matrix rows and output (emission) matrix rows, each summing to 1
+    Vector cip = HmmUtils.getCumulativeInitialProbabilities(model);
+    Matrix ctm = HmmUtils.getCumulativeTransitionMatrix(model);
+    Matrix com = HmmUtils.getCumulativeOutputMatrix(model);
+    // allocate the result array (one output state ID per step)
+    int[] result = new int[steps];
+    // choose the initial hidden state by sampling the cumulative
+    // initial-probability vector
+    int hiddenState = 0;
+
+    double randnr = rand.nextDouble();
+    while (cip.get(hiddenState) < randnr) {
+      hiddenState++;
+    }
+
+    // now draw steps output states according to the cumulative
+    // distributions
+    for (int step = 0; step < steps; ++step) {
+      // choose output state to given hidden state
+      randnr = rand.nextDouble();
+      int outputState = 0;
+      while (com.get(hiddenState, outputState) < randnr) {
+        outputState++;
+      }
+      result[step] = outputState;
+      // choose the next hidden state
+      randnr = rand.nextDouble();
+      int nextHiddenState = 0;
+      while (ctm.get(hiddenState, nextHiddenState) < randnr) {
+        nextHiddenState++;
+      }
+      hiddenState = nextHiddenState;
+    }
+    return result;
+  }
+
+  /**
+   * Returns the likelihood that a given output sequence was produced by the
+   * given model. Internally, this function calls the forward algorithm to
+   * compute the alpha values and then uses the overloaded function to compute
+   * the actual model likelihood.
+   *
+   * @param model          Model to base the likelihood on.
+   * @param outputSequence Sequence to compute likelihood for.
+   * @param scaled         Use log-scaled parameters for computation. This is computationally
+   *                       more expensive, but offers better numerically stability in case of
+   *                       long output sequences
+   * @return Likelihood that the given model produced the given sequence
+   */
+  public static double modelLikelihood(HmmModel model, int[] outputSequence, boolean scaled) {
+    return modelLikelihood(HmmAlgorithms.forwardAlgorithm(model, outputSequence, scaled), scaled);
+  }
+
+  /**
+   * Computes the likelihood that a given output sequence was produced by a
+   * given model, using the alpha values computed by the forward algorithm.
+   * The output sequence itself is not needed here because it is already
+   * implicitly encoded in the alpha values: by the forward-algorithm
+   * contract, row t of alpha holds the probabilities of the first t
+   * observations jointly with each hidden state, so summing the last row
+   * yields the likelihood of the whole sequence.
+   *
+   * @param alpha  Matrix of alpha values
+   * @param scaled Set to true if the alpha values are log-scaled.
+   * @return model likelihood.
+   */
+  public static double modelLikelihood(Matrix alpha, boolean scaled) {
+    double likelihood = 0;
+    if (scaled) {
+      // alphas are log-scaled: exponentiate back to probability space first
+      for (int i = 0; i < alpha.numCols(); ++i) {
+        likelihood += Math.exp(alpha.getQuick(alpha.numRows() - 1, i));
+      }
+    } else {
+      for (int i = 0; i < alpha.numCols(); ++i) {
+        likelihood += alpha.getQuick(alpha.numRows() - 1, i);
+      }
+    }
+    return likelihood;
+  }
+
+  /**
+   * Computes the likelihood that a given output sequence was computed by a
+   * given model, using the beta values from the backward algorithm:
+   * sum over i of pi(i) * beta(0,i) * emission(i, firstOutput).
+   *
+   * @param model model to compute sequence likelihood for.
+   * @param outputSequence sequence to base computation on.
+   * @param beta beta parameters.
+   * @param scaled     set to true if betas are log-scaled.
+   * @return likelihood of the outputSequence given the model.
+   */
+  public static double modelLikelihood(HmmModel model, int[] outputSequence, Matrix beta, boolean scaled) {
+    double likelihood = 0;
+    // fetch the emission probabilities
+    Matrix e = model.getEmissionMatrix();
+    Vector pi = model.getInitialProbabilities();
+    int firstOutput = outputSequence[0];
+    if (scaled) {
+      // betas are log-scaled: exponentiate before combining with pi and e
+      for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+        likelihood += pi.getQuick(i) * Math.exp(beta.getQuick(0, i)) * e.getQuick(i, firstOutput);
+      }
+    } else {
+      for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+        likelihood += pi.getQuick(i) * beta.getQuick(0, i) * e.getQuick(i, firstOutput);
+      }
+    }
+    return likelihood;
+  }
+
+  /**
+   * Returns the most likely sequence of hidden states for the given model and
+   * observation, by delegating to the Viterbi algorithm.
+   *
+   * @param model model to use for decoding.
+   * @param observations integer Array containing a sequence of observed state IDs
+   * @param scaled       Use log-scaled computations, this requires higher computational
+   *                     effort but is numerically more stable for large observation
+   *                     sequences
+   * @return integer array containing the most likely sequence of hidden state
+   * IDs
+   */
+  public static int[] decode(HmmModel model, int[] observations, boolean scaled) {
+    return HmmAlgorithms.viterbiAlgorithm(model, observations, scaled);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java
new file mode 100644
index 0000000..bc24884
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java
@@ -0,0 +1,383 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.Map;
+import java.util.Random;
+
+import com.google.common.collect.BiMap;
+import com.google.common.collect.HashBiMap;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Main class defining a Hidden Markov Model
+ */
+public class HmmModel implements Cloneable {
+
+  /** Bi-directional Map for storing the observed state names */
+  private BiMap<String,Integer> outputStateNames;
+
+  /** Bi-Directional Map for storing the hidden state names */
+  private BiMap<String,Integer> hiddenStateNames;
+
+  /** Number of hidden states */
+  private int nrOfHiddenStates;
+
+  /** Number of output states */
+  private int nrOfOutputStates;
+
+  /**
+   * Transition matrix containing the transition probabilities between hidden
+   * states. TransitionMatrix(i,j) is the probability that we change from hidden
+   * state i to hidden state j In general: P(h(t+1)=h_j | h(t) = h_i) =
+   * transitionMatrix(i,j) Since we have to make sure that each hidden state can
+   * be "left", the following normalization condition has to hold:
+   * sum(transitionMatrix(i,j),j=1..hiddenStates) = 1
+   */
+  private Matrix transitionMatrix;
+
+  /**
+   * Output matrix containing the probabilities that we observe a given output
+   * state given a hidden state. outputMatrix(i,j) is the probability that we
+   * observe output state j if we are in hidden state i Formally: P(o(t)=o_j |
+   * h(t)=h_i) = outputMatrix(i,j) Since we always have an observation for each
+   * hidden state, the following normalization condition has to hold:
+   * sum(outputMatrix(i,j),j=1..outputStates) = 1
+   */
+  private Matrix emissionMatrix;
+
+  /**
+   * Vector containing the initial hidden state probabilities. That is
+   * P(h(0)=h_i) = initialProbabilities(i). Since we are dealing with
+   * probabilities the following normalization condition has to hold:
+   * sum(initialProbabilities(i),i=1..hiddenStates) = 1
+   */
+  private Vector initialProbabilities;
+
+
+  /**
+   * Get a deep copy of this model: the transition matrix, emission matrix and
+   * initial probabilities are cloned, and the state name maps (if present) are
+   * copied into fresh BiMaps.
+   *
+   * NOTE(review): this is implemented via the copy constructor rather than
+   * super.clone(), so a subclass calling clone() would get an HmmModel, not
+   * an instance of its own runtime type.
+   */
+  @Override
+  public HmmModel clone() {
+    HmmModel model = new HmmModel(transitionMatrix.clone(), emissionMatrix.clone(), initialProbabilities.clone());
+    if (hiddenStateNames != null) {
+      model.hiddenStateNames = HashBiMap.create(hiddenStateNames);
+    }
+    if (outputStateNames != null) {
+      model.outputStateNames = HashBiMap.create(outputStateNames);
+    }
+    return model;
+  }
+
+  /**
+   * Assign the content of another HMM model to this one. The matrices and the
+   * initial-probability vector are cloned, but the state name maps are shared
+   * by reference with the source model.
+   *
+   * @param model The HmmModel that will be assigned to this one
+   */
+  public void assign(HmmModel model) {
+    this.nrOfHiddenStates = model.nrOfHiddenStates;
+    this.nrOfOutputStates = model.nrOfOutputStates;
+    this.hiddenStateNames = model.hiddenStateNames;
+    this.outputStateNames = model.outputStateNames;
+    // for now clone the matrix/vectors
+    this.initialProbabilities = model.initialProbabilities.clone();
+    this.emissionMatrix = model.emissionMatrix.clone();
+    this.transitionMatrix = model.transitionMatrix.clone();
+  }
+
+  /**
+   * Construct a valid random Hidden-Markov parameter set with the given number
+   * of hidden and output states using a given seed.
+   *
+   * @param nrOfHiddenStates Number of hidden states
+   * @param nrOfOutputStates Number of output states
+   * @param seed             Seed for the random initialization, if set to 0 the current time
+   *                         is used
+   */
+  public HmmModel(int nrOfHiddenStates, int nrOfOutputStates, long seed) {
+    this.nrOfHiddenStates = nrOfHiddenStates;
+    this.nrOfOutputStates = nrOfOutputStates;
+    this.transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
+    this.emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
+    this.initialProbabilities = new DenseVector(nrOfHiddenStates);
+    // initialize a random, valid parameter set
+    initRandomParameters(seed);
+  }
+
+  /**
+   * Construct a valid random Hidden-Markov parameter set with the given number
+   * of hidden and output states.
+   *
+   * @param nrOfHiddenStates Number of hidden states
+   * @param nrOfOutputStates Number of output states
+   */
+  public HmmModel(int nrOfHiddenStates, int nrOfOutputStates) {
+    this(nrOfHiddenStates, nrOfOutputStates, 0);
+  }
+
+  /**
+   * Generates a Hidden Markov model using the specified parameters. The given
+   * matrices and vector are stored by reference, not copied; the number of
+   * hidden states is taken from the size of initialProbabilities and the
+   * number of output states from the column count of emissionMatrix.
+   *
+   * NOTE(review): no validation is performed by this constructor, so an
+   * invalid parameter set is accepted silently here — presumably callers are
+   * expected to validate separately (e.g. via HmmUtils); confirm.
+   *
+   * @param transitionMatrix     transition probabilities.
+   * @param emissionMatrix       emission probabilities.
+   * @param initialProbabilities initial start probabilities.
+   */
+  public HmmModel(Matrix transitionMatrix, Matrix emissionMatrix, Vector initialProbabilities) {
+    this.nrOfHiddenStates = initialProbabilities.size();
+    this.nrOfOutputStates = emissionMatrix.numCols();
+    this.transitionMatrix = transitionMatrix;
+    this.emissionMatrix = emissionMatrix;
+    this.initialProbabilities = initialProbabilities;
+  }
+
+  /**
+   * Initialize a valid random set of HMM parameters: each row of the
+   * transition and emission matrices and the initial-probability vector is
+   * filled with uniform random values and then normalized to sum to 1.
+   *
+   * @param seed seed to use for Random initialization. Use 0 to use Java-built-in-version.
+   */
+  private void initRandomParameters(long seed) {
+    Random rand;
+    // initialize the random number generator
+    if (seed == 0) {
+      rand = RandomUtils.getRandom();
+    } else {
+      rand = RandomUtils.getRandom(seed);
+    }
+    // initialize the initial Probabilities
+    double sum = 0; // used for normalization
+    for (int i = 0; i < nrOfHiddenStates; i++) {
+      double nextRand = rand.nextDouble();
+      initialProbabilities.set(i, nextRand);
+      sum += nextRand;
+    }
+    // "normalize" the vector to generate probabilities
+    initialProbabilities = initialProbabilities.divide(sum);
+
+    // initialize the transition matrix
+    double[] values = new double[nrOfHiddenStates];
+    for (int i = 0; i < nrOfHiddenStates; i++) {
+      sum = 0;
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        values[j] = rand.nextDouble();
+        sum += values[j];
+      }
+      // normalize the random values to obtain probabilities
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        values[j] /= sum;
+      }
+      // set this row of the transition matrix
+      transitionMatrix.set(i, values);
+    }
+
+    // initialize the output matrix
+    values = new double[nrOfOutputStates];
+    for (int i = 0; i < nrOfHiddenStates; i++) {
+      sum = 0;
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        values[j] = rand.nextDouble();
+        sum += values[j];
+      }
+      // normalize the random values to obtain probabilities
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        values[j] /= sum;
+      }
+      // set this row of the output matrix
+      emissionMatrix.set(i, values);
+    }
+  }
+
+  /**
+   * Getter Method for the number of hidden states
+   *
+   * @return Number of hidden states
+   */
+  public int getNrOfHiddenStates() {
+    return nrOfHiddenStates;
+  }
+
+  /**
+   * Getter Method for the number of output states
+   *
+   * @return Number of output states
+   */
+  public int getNrOfOutputStates() {
+    return nrOfOutputStates;
+  }
+
+  /**
+   * Getter function to get the hidden state transition matrix
+   *
+   * @return returns the model's transition matrix (live reference, not a copy).
+   */
+  public Matrix getTransitionMatrix() {
+    return transitionMatrix;
+  }
+
+  /**
+   * Getter function to get the output state probability matrix
+   *
+   * @return returns the model's emission matrix (live reference, not a copy).
+   */
+  public Matrix getEmissionMatrix() {
+    return emissionMatrix;
+  }
+
+  /**
+   * Getter function to return the vector of initial hidden state probabilities
+   *
+   * @return returns the model's init probabilities (live reference, not a copy).
+   */
+  public Vector getInitialProbabilities() {
+    return initialProbabilities;
+  }
+
+  /**
+   * Getter method for the hidden state Names map
+   *
+   * @return hidden state names, or null if none were registered.
+   */
+  public Map<String, Integer> getHiddenStateNames() {
+    return hiddenStateNames;
+  }
+
+  /**
+   * Register an array of hidden state Names. We assume that the state name at
+   * position i has the ID i
+   *
+   * @param stateNames names of hidden states.
+   */
+  public void registerHiddenStateNames(String[] stateNames) {
+    if (stateNames != null) {
+      hiddenStateNames = HashBiMap.create();
+      for (int i = 0; i < stateNames.length; ++i) {
+        hiddenStateNames.put(stateNames[i], i);
+      }
+    }
+  }
+
+  /**
+   * Register a map of hidden state Names/state IDs
+   *
+   * @param stateNames <String,Integer> Map that assigns each state name an integer ID
+   */
+  public void registerHiddenStateNames(Map<String, Integer> stateNames) {
+    if (stateNames != null) {
+      hiddenStateNames = HashBiMap.create(stateNames);
+    }
+  }
+
+  /**
+   * Lookup the name for the given hidden state ID
+   *
+   * @param id Integer id of the hidden state
+   * @return String containing the name for the given ID, null if this ID is not
+   *         known or no hidden state names were specified
+   */
+  public String getHiddenStateName(int id) {
+    if (hiddenStateNames == null) {
+      return null;
+    }
+    return hiddenStateNames.inverse().get(id);
+  }
+
+  /**
+   * Lookup the ID for the given hidden state name
+   *
+   * @param name Name of the hidden state
+   * @return int containing the ID for the given name, -1 if this name is not
+   *         known or no hidden state names were specified
+   */
+  public int getHiddenStateID(String name) {
+    if (hiddenStateNames == null) {
+      return -1;
+    }
+    Integer tmp = hiddenStateNames.get(name);
+    return tmp == null ? -1 : tmp;
+  }
+
+  /**
+   * Getter method for the output state Names map
+   *
+   * @return names of output states, or null if none were registered.
+   */
+  public Map<String, Integer> getOutputStateNames() {
+    return outputStateNames;
+  }
+
+  /**
+   * Register an array of output state names. We assume that the state name at
+   * position i has the ID i
+   *
+   * @param stateNames output state names to register.
+   */
+  public void registerOutputStateNames(String[] stateNames) {
+    if (stateNames != null) {
+      outputStateNames = HashBiMap.create();
+      for (int i = 0; i < stateNames.length; ++i) {
+        outputStateNames.put(stateNames[i], i);
+      }
+    }
+  }
+
+  /**
+   * Register a map of output state Names/state IDs
+   *
+   * @param stateNames <String,Integer> Map that assigns each state name an integer ID
+   */
+  public void registerOutputStateNames(Map<String, Integer> stateNames) {
+    if (stateNames != null) {
+      outputStateNames = HashBiMap.create(stateNames);
+    }
+  }
+
+  /**
+   * Lookup the name for the given output state id
+   *
+   * @param id Integer id of the output state
+   * @return String containing the name for the given id, null if this id is not
+   *         known or no output state names were specified
+   */
+  public String getOutputStateName(int id) {
+    if (outputStateNames == null) {
+      return null;
+    }
+    return outputStateNames.inverse().get(id);
+  }
+
+  /**
+   * Lookup the ID for the given output state name
+   *
+   * @param name Name of the output state
+   * @return int containing the ID for the given name, -1 if this name is not
+   *         known or no output state names were specified
+   */
+  public int getOutputStateID(String name) {
+    if (outputStateNames == null) {
+      return -1;
+    }
+    Integer tmp = outputStateNames.get(name);
+    return tmp == null ? -1 : tmp;
+  }
+
+}


[03/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java
new file mode 100644
index 0000000..8072466
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java
@@ -0,0 +1,648 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.lucene.AnalyzerUtils;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>Superclass of many Mahout Hadoop "jobs". A job drives configuration and launch of one or
+ * more maps and reduces in order to accomplish some task.</p>
+ *
+ * <p>Command line arguments available to all subclasses are:</p>
+ *
+ * <ul>
+ *  <li>--tempDir (path): Specifies a directory where the job may place temp files
+ *   (default "temp")</li>
+ *  <li>--help: Show help message</li>
+ * </ul>
+ *
+ * <p>In addition, note some key command line parameters that are parsed by Hadoop, which jobs
+ * may need to set:</p>
+ *
+ * <ul>
+ *  <li>-Dmapred.job.name=(name): Sets the Hadoop task names. It will be suffixed by
+ *    the mapper and reducer class names</li>
+ *  <li>-Dmapred.output.compress={true,false}: Compress final output (default true)</li>
+ *  <li>-Dmapred.input.dir=(path): input file, or directory containing input files (required)</li>
+ *  <li>-Dmapred.output.dir=(path): path to write output files (required)</li>
+ * </ul>
+ *
+ * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
+ * arguments.</p>
+ */
+public abstract class AbstractJob extends Configured implements Tool {
+
+  private static final Logger log = LoggerFactory.getLogger(AbstractJob.class);
+
+  /** option used to specify the input path */
+  private Option inputOption;
+
+  /** option used to specify the output path */
+  private Option outputOption;
+
+  /** input path, populated by {@link #parseArguments(String[])} */
+  protected Path inputPath;
+  protected File inputFile; //the input represented as a file
+
+  /** output path, populated by {@link #parseArguments(String[])} */
+  protected Path outputPath;
+  protected File outputFile; //the output represented as a file
+
+  /** temp path, populated by {@link #parseArguments(String[])} */
+  protected Path tempPath;
+
+  protected Map<String, List<String>> argMap;
+
+  /** internal list of options that have been added */
+  private final List<Option> options;
+  private Group group;
+
+  protected AbstractJob() {
+    options = new LinkedList<>();
+  }
+
+  /** Returns the input path established by a call to {@link #parseArguments(String[])}.
+   *  The source of the path may be an input option added using {@link #addInputOption()}
+   *  or it may be the value of the {@code mapred.input.dir} configuration
+   *  property. 
+   */
+  protected Path getInputPath() {
+    return inputPath;
+  }
+
+  /** Returns the output path established by a call to {@link #parseArguments(String[])}.
+   *  The source of the path may be an output option added using {@link #addOutputOption()}
+   *  or it may be the value of the {@code mapred.input.dir} configuration
+   *  property. 
+   */
+  protected Path getOutputPath() {
+    return outputPath;
+  }
+
+  protected Path getOutputPath(String path) {
+    return new Path(outputPath, path);
+  }
+
+  protected File getInputFile() {
+    return inputFile;
+  }
+
+  protected File getOutputFile() {
+    return outputFile;
+  }
+
+
+  protected Path getTempPath() {
+    return tempPath;
+  }
+
+  protected Path getTempPath(String directory) {
+    return new Path(tempPath, directory);
+  }
+  
+  @Override
+  public Configuration getConf() {
+    Configuration result = super.getConf();
+    if (result == null) {
+      return new Configuration();
+    }
+    return result;
+  }
+
+  /** Add an option with no argument whose presence can be checked for using
+   *  {@code containsKey} method on the map returned by {@link #parseArguments(String[])};
+   */
+  protected void addFlag(String name, String shortName, String description) {
+    options.add(buildOption(name, shortName, description, false, false, null));
+  }
+
+  /** Add an option to the the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called. This options has an argument
+   *  with null as its default value.
+   */
+  protected void addOption(String name, String shortName, String description) {
+    options.add(buildOption(name, shortName, description, true, false, null));
+  }
+
+  /** Add an option to the the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called.
+   *
+   * @param required if true the {@link #parseArguments(String[])} will throw
+   *    fail with an error and usage message if this option is not specified
+   *    on the command line.
+   */
+  protected void addOption(String name, String shortName, String description, boolean required) {
+    options.add(buildOption(name, shortName, description, true, required, null));
+  }
+
+  /** Add an option to the the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called. If this option is not 
+   *  specified on the command line the default value will be 
+   *  used.
+   *
+   * @param defaultValue the default argument value if this argument is not
+   *   found on the command-line. null is allowed.
+   */
+  protected void addOption(String name, String shortName, String description, String defaultValue) {
+    options.add(buildOption(name, shortName, description, true, false, defaultValue));
+  }
+
+  /** Add an arbitrary option to the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called. If this option has no
+   *  argument, use {@code containsKey} on the map returned by
+   *  {@code parseArguments} to check for its presence. Otherwise, the
+   *  string value of the option will be placed in the map using a key
+   *  equal to this options long name preceded by '--'.
+   * @return the option added.
+   */
+  protected Option addOption(Option option) {
+    options.add(option);
+    return option;
+  }
+
+  protected Group getGroup() {
+    return group;
+  }
+
+  /** Add the default input directory option, '-i' which takes a directory
+   *  name as an argument. When {@link #parseArguments(String[])} is 
+   *  called, the inputPath will be set based upon the value for this option.
+   *  If this method is called, the input is required.
+   */
+  protected void addInputOption() {
+    this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
+  }
+
+  /** Add the default output directory option, '-o' which takes a directory
+   *  name as an argument. When {@link #parseArguments(String[])} is 
+   *  called, the outputPath will be set based upon the value for this option.
+   *  If this method is called, the output is required. 
+   */
+  protected void addOutputOption() {
+    this.outputOption = addOption(DefaultOptionCreator.outputOption().create());
+  }
+
+  /** Build an option with the given parameters. Name and description are
+   *  required. Delegates to the full overload with a single argument value
+   *  (minimum = maximum = 1).
+   *
+   * @param name the long name of the option prefixed with '--' on the command-line
+   * @param shortName the short name of the option, prefixed with '-' on the command-line
+   * @param description description of the option displayed in help method
+   * @param hasArg true if the option has an argument.
+   * @param required true if the option is required.
+   * @param defaultValue default argument value, can be null.
+   * @return the option.
+   */
+  protected static Option buildOption(String name,
+                                      String shortName,
+                                      String description,
+                                      boolean hasArg,
+                                      boolean required,
+                                      String defaultValue) {
+
+    return buildOption(name, shortName, description, hasArg, 1, 1, required, defaultValue);
+  }
+
+  /**
+   * Build an option with the given parameters, accepting between {@code min}
+   * and {@code max} argument values when {@code hasArg} is true.
+   */
+  protected static Option buildOption(String name,
+                                      String shortName,
+                                      String description,
+                                      boolean hasArg, int min, int max,
+                                      boolean required,
+                                      String defaultValue) {
+
+    DefaultOptionBuilder builder = new DefaultOptionBuilder()
+        .withLongName(name)
+        .withDescription(description)
+        .withRequired(required);
+
+    // The short name is optional.
+    if (shortName != null) {
+      builder.withShortName(shortName);
+    }
+
+    if (hasArg) {
+      ArgumentBuilder argument = new ArgumentBuilder().withName(name).withMinimum(min).withMaximum(max);
+      if (defaultValue != null) {
+        argument = argument.withDefault(defaultValue);
+      }
+      builder.withArgument(argument.create());
+    }
+
+    return builder.create();
+  }
+
+  /**
+   * @param name The name of the option
+   * @return the {@link org.apache.commons.cli2.Option} registered under that preferred name, else null
+   */
+  protected Option getCLIOption(String name) {
+    // Linear scan is fine: jobs register only a handful of options.
+    for (Option candidate : options) {
+      if (candidate.getPreferredName().equals(name)) {
+        return candidate;
+      }
+    }
+    return null;
+  }
+
+  /** Parse the arguments specified based on the options defined using the
+   *  various {@code addOption} methods. If -h is specified or an
+   *  exception is encountered, print help and return null. Has the
+   *  side effect of setting inputPath and outputPath
+   *  if {@code addInputOption} or {@code addOutputOption}
+   *  or {@code mapred.input.dir} or {@code mapred.output.dir}
+   *  are present in the Configuration.
+   *
+   * @return a {@code Map<String,List<String>>} containing options and their argument values.
+   *  The presence of a flag can be tested using {@code containsKey}, while
+   *  argument values can be retrieved using {@code get(optionName)}. The
+   *  names used for keys are the option name parameter prefixed by '--'.
+   *
+   * @see #parseArguments(String[], boolean, boolean)  -- passes in false, false for the optional args,
+   *  i.e. both input and output are required when their options were added.
+   */
+  public Map<String, List<String>> parseArguments(String[] args) throws IOException {
+    return parseArguments(args, false, false);
+  }
+
+  /**
+   * Parse {@code args} against the options registered via the {@code addOption} methods.
+   *
+   * @param args the command-line arguments to parse
+   * @param inputOptional if true, the input option (when added via {@link #addInputOption()}) may be
+   *                      absent; if false, a missing input (neither on the command line nor in
+   *                      {@code mapred.input.dir}) is an error
+   * @param outputOptional if true, the output option (when added via {@link #addOutputOption()}) may
+   *                       be absent; if false, a missing output is an error
+   * @return the args parsed into a map of "--name" keys to argument-value lists, or null when help
+   *         was requested or parsing failed (help is printed in both cases).
+   */
+  public Map<String, List<String>> parseArguments(String[] args, boolean inputOptional, boolean outputOptional)
+    throws IOException {
+    // Options every job gets for free.
+    Option helpOpt = addOption(DefaultOptionCreator.helpOption());
+    addOption("tempDir", null, "Intermediate output directory", "temp");
+    addOption("startPhase", null, "First phase to run", "0");
+    addOption("endPhase", null, "Last phase to run", String.valueOf(Integer.MAX_VALUE));
+
+    GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific Options:");
+
+    for (Option opt : options) {
+      gBuilder = gBuilder.withOption(opt);
+    }
+
+    group = gBuilder.create();
+
+    CommandLine cmdLine;
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      parser.setHelpOption(helpOpt);
+      cmdLine = parser.parse(args);
+
+    } catch (OptionException e) {
+      // Bad arguments: report, print usage, and signal failure with null.
+      log.error(e.getMessage());
+      CommandLineUtil.printHelpWithGenericOptions(group, e);
+      return null;
+    }
+
+    if (cmdLine.hasOption(helpOpt)) {
+      CommandLineUtil.printHelpWithGenericOptions(group);
+      return null;
+    }
+
+    try {
+      // Sets inputPath/outputPath from the command line or hadoop properties.
+      parseDirectories(cmdLine, inputOptional, outputOptional);
+    } catch (IllegalArgumentException e) {
+      log.error(e.getMessage());
+      CommandLineUtil.printHelpWithGenericOptions(group);
+      return null;
+    }
+
+    argMap = new TreeMap<>();
+    maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));
+
+    this.tempPath = new Path(getOption("tempDir"));
+
+    if (!hasOption("quiet")) {
+      log.info("Command line arguments: {}", argMap);
+    }
+    return argMap;
+  }
+  
+  /**
+   * Build the option key (--name) from the option name. These keys are used
+   * for lookups in the argument map produced by {@link #parseArguments(String[])}.
+   */
+  public static String keyFor(String optionName) {
+    return "--" + optionName;
+  }
+
+  /**
+   * @param optionName the unadorned option name (no "--" prefix)
+   * @return the first value of the requested option, or null if it has not been specified
+   */
+  public String getOption(String optionName) {
+    List<String> list = argMap.get(keyFor(optionName));
+    if (list != null && !list.isEmpty()) {
+      return list.get(0);
+    }
+    return null;
+  }
+
+  /**
+   * Get the option, else the default
+   * @param optionName The name of the option to look up, without the --
+   * @param defaultVal The default value.
+   * @return The requested option, else the default value if it doesn't exist
+   */
+  public String getOption(String optionName, String defaultVal) {
+    String value = getOption(optionName);
+    return value == null ? defaultVal : value;
+  }
+
+  /** @return the option value parsed as an int; throws NumberFormatException when absent or malformed. */
+  public int getInt(String optionName) {
+    return Integer.parseInt(getOption(optionName));
+  }
+
+  /** @return the option value parsed as an int, or {@code defaultVal} when the option is absent. */
+  public int getInt(String optionName, int defaultVal) {
+    return Integer.parseInt(getOption(optionName, String.valueOf(defaultVal)));
+  }
+
+  /** @return the option value parsed as a float; throws NumberFormatException when absent or malformed. */
+  public float getFloat(String optionName) {
+    return Float.parseFloat(getOption(optionName));
+  }
+
+  /** @return the option value parsed as a float, or {@code defaultVal} when the option is absent. */
+  public float getFloat(String optionName, float defaultVal) {
+    return Float.parseFloat(getOption(optionName, String.valueOf(defaultVal)));
+  }
+
+  /**
+   * Options can occur multiple times, so return the list.
+   * Lookup uses {@link #keyFor(String)}, i.e. the "--"-prefixed key.
+   * @param optionName The unadorned (no "--" prefixing it) option name
+   * @return The values, else null.  If the option is present, but has no values, then the result will be an
+   * empty list (Collections.emptyList())
+   */
+  public List<String> getOptions(String optionName) {
+    return argMap.get(keyFor(optionName));
+  }
+
+  /**
+   * @param optionName the unadorned option name (no "--" prefix)
+   * @return true if the requested option has been specified (flags map to a null value, so
+   *   containsKey rather than get is used)
+   */
+  public boolean hasOption(String optionName) {
+    return argMap.containsKey(keyFor(optionName));
+  }
+
+
+  /**
+   * Get the cardinality of the input vectors.
+   *
+   * @param matrix path to a sequence file whose values are {@link VectorWritable}s
+   * @return the cardinality of the first vector (the remaining rows are assumed to match)
+   * @throws IOException if the file cannot be opened or read
+   * @throws IllegalArgumentException if the value type is not VectorWritable
+   * @throws IllegalStateException if the file contains no rows
+   */
+  public int getDimensions(Path matrix) throws IOException {
+    // Resolve the file system from the path itself so paths on a non-default
+    // file system (e.g. file:// while HDFS is the default) also work.
+    try (SequenceFile.Reader reader = new SequenceFile.Reader(matrix.getFileSystem(getConf()), matrix, getConf())) {
+      Writable row = ClassUtils.instantiateAs(reader.getKeyClass().asSubclass(Writable.class), Writable.class);
+      Preconditions.checkArgument(reader.getValueClass().equals(VectorWritable.class),
+          "value type of sequencefile must be a VectorWritable");
+
+      VectorWritable vectorWritable = new VectorWritable();
+      boolean hasAtLeastOneRow = reader.next(row, vectorWritable);
+      Preconditions.checkState(hasAtLeastOneRow, "matrix must have at least one row");
+      return vectorWritable.get().size();
+    }
+  }
+
+  /**
+   * Obtain input and output directories from command-line options or hadoop
+   *  properties. If {@code addInputOption} or {@code addOutputOption}
+   *  has been called, this method will throw an {@code IllegalArgumentException} if
+   *  no source (command-line or property) for that value is present.
+   *  Otherwise, {@code inputPath} or {@code outputPath} will be
+   *  non-null only if specified as a hadoop property. Command-line options
+   *  take precedence over hadoop properties.
+   *
+   * @throws IllegalArgumentException if either inputOption is present,
+   *   and neither {@code --input} nor {@code -Dmapred.input.dir} are
+   *   specified or outputOption is present and neither {@code --output}
+   *   nor {@code -Dmapred.output.dir} are specified.
+   */
+  protected void parseDirectories(CommandLine cmdLine, boolean inputOptional, boolean outputOptional) {
+
+    Configuration conf = getConf();
+
+    // A command-line value wins over the hadoop property.
+    if (inputOption != null && cmdLine.hasOption(inputOption)) {
+      this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
+      this.inputFile = new File(cmdLine.getValue(inputOption).toString());
+    }
+    if (inputPath == null && conf.get("mapred.input.dir") != null) {
+      this.inputPath = new Path(conf.get("mapred.input.dir"));
+    }
+
+    if (outputOption != null && cmdLine.hasOption(outputOption)) {
+      this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
+      this.outputFile = new File(cmdLine.getValue(outputOption).toString());
+    }
+    if (outputPath == null && conf.get("mapred.output.dir") != null) {
+      this.outputPath = new Path(conf.get("mapred.output.dir"));
+    }
+
+    Preconditions.checkArgument(inputOptional || inputOption == null || inputPath != null,
+        "No input specified or -Dmapred.input.dir must be provided to specify input directory");
+    Preconditions.checkArgument(outputOptional || outputOption == null || outputPath != null,
+        "No output specified or -Dmapred.output.dir must be provided to specify output directory");
+  }
+
+  /**
+   * Copy the given options' values from {@code cmdLine} into {@code args},
+   * keyed by preferred ("--"-prefixed) name. Flag options (no argument values)
+   * are stored with a null value so {@code containsKey} still reports them.
+   */
+  protected static void maybePut(Map<String, List<String>> args, CommandLine cmdLine, Option... opt) {
+    for (Option o : opt) {
+
+      // Fetch once instead of re-evaluating getValues(o) in each condition.
+      List<?> vo = cmdLine.getValues(o);
+
+      // the option appeared on the command-line, or it has a value
+      // (which is likely a default value).
+      if (cmdLine.hasOption(o) || cmdLine.getValue(o) != null || (vo != null && !vo.isEmpty())) {
+
+        // nulls are ok, for cases where options are simple flags.
+        if (vo != null && !vo.isEmpty()) {
+          List<String> vals = new ArrayList<>(vo.size());
+          for (Object value : vo) {
+            vals.add(value.toString());
+          }
+          args.put(o.getPreferredName(), vals);
+        } else {
+          args.put(o.getPreferredName(), null);
+        }
+      }
+    }
+  }
+
+  /**
+   *
+   * @param args The input argument map
+   * @param optName The adorned (including "--") option name
+   * @return The first value in the match, else null
+   */
+  public static String getOption(Map<String, List<String>> args, String optName) {
+    List<String> values = args.get(optName);
+    return values == null || values.isEmpty() ? null : values.get(0);
+  }
+
+
+  /**
+   * Decide whether the phase numbered by {@code currentPhase} falls inside the
+   * [--startPhase, --endPhase] window; increments the counter as a side effect.
+   */
+  protected static boolean shouldRunNextPhase(Map<String, List<String>> args, AtomicInteger currentPhase) {
+    int phase = currentPhase.getAndIncrement();
+    String first = getOption(args, "--startPhase");
+    String last = getOption(args, "--endPhase");
+    boolean beforeWindow = first != null && phase < Integer.parseInt(first);
+    boolean afterWindow = last != null && phase > Integer.parseInt(last);
+    if (beforeWindow || afterWindow) {
+      log.info("Skipping phase {}", phase);
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * Convenience overload that derives the job name automatically.
+   * @see #prepareJob(Path, Path, Class, Class, Class, Class, Class, String)
+   */
+  protected Job prepareJob(Path inputPath,
+                           Path outputPath,
+                           Class<? extends InputFormat> inputFormat,
+                           Class<? extends Mapper> mapper,
+                           Class<? extends Writable> mapperKey,
+                           Class<? extends Writable> mapperValue,
+                           Class<? extends OutputFormat> outputFormat) throws IOException {
+    return prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue, outputFormat, null);
+
+  }
+  /**
+   * Create a map-only job via {@link HadoopUtil#prepareJob}; when {@code jobname} is null,
+   * a name is derived from this class, the mapper and the base {@link Reducer}.
+   */
+  protected Job prepareJob(Path inputPath,
+                           Path outputPath,
+                           Class<? extends InputFormat> inputFormat,
+                           Class<? extends Mapper> mapper,
+                           Class<? extends Writable> mapperKey,
+                           Class<? extends Writable> mapperValue,
+                           Class<? extends OutputFormat> outputFormat,
+                           String jobname) throws IOException {
+
+    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+            inputFormat, mapper, mapperKey, mapperValue, outputFormat, getConf());
+
+    String name =
+        jobname != null ? jobname : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);
+
+    job.setJobName(name);
+    return job;
+
+  }
+
+  /** Map/reduce job over sequence files: delegates with SequenceFile input/output formats. */
+  protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends Mapper> mapper,
+      Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
+      Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue) throws IOException {
+    return prepareJob(inputPath, outputPath, SequenceFileInputFormat.class, mapper, mapperKey, mapperValue, reducer,
+        reducerKey, reducerValue, SequenceFileOutputFormat.class);
+  }
+
+  /**
+   * Create a full map/reduce job using this job's configuration and a job name
+   * derived from this class, the mapper and the base {@link Reducer}.
+   */
+  protected Job prepareJob(Path inputPath,
+                           Path outputPath,
+                           Class<? extends InputFormat> inputFormat,
+                           Class<? extends Mapper> mapper,
+                           Class<? extends Writable> mapperKey,
+                           Class<? extends Writable> mapperValue,
+                           Class<? extends Reducer> reducer,
+                           Class<? extends Writable> reducerKey,
+                           Class<? extends Writable> reducerValue,
+                           Class<? extends OutputFormat> outputFormat) throws IOException {
+    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+            inputFormat, mapper, mapperKey, mapperValue, reducer, reducerKey, reducerValue, outputFormat, getConf());
+    job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class));
+    return job;
+  }
+
+  /**
+   * necessary to make this job (having a combined input path) work on Amazon S3, hopefully this is
+   * obsolete when MultipleInputs is available again.
+   * Both input paths are qualified against the file system of {@code referencePath}.
+   */
+  public static void setS3SafeCombinedInputPath(Job job, Path referencePath, Path inputPathOne, Path inputPathTwo)
+    throws IOException {
+    FileSystem fs = FileSystem.get(referencePath.toUri(), job.getConfiguration());
+    FileInputFormat.setInputPaths(job, inputPathOne.makeQualified(fs), inputPathTwo.makeQualified(fs));
+  }
+
+  /**
+   * Resolve the analyzer class from the {@code --analyzerName} option, defaulting to
+   * {@link StandardAnalyzer} when the option is absent. The class is instantiated once
+   * up front so a bad class name fails fast rather than later inside a job.
+   *
+   * @throws ClassNotFoundException if the named class cannot be loaded
+   */
+  protected Class<? extends Analyzer> getAnalyzerClassFromOption() throws ClassNotFoundException {
+    Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+    if (hasOption(DefaultOptionCreator.ANALYZER_NAME_OPTION)) {
+      String className = getOption(DefaultOptionCreator.ANALYZER_NAME_OPTION);
+      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+      // try instantiating it, b/c there isn't any point in setting it if
+      // you can't instantiate it
+      //ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
+      AnalyzerUtils.createAnalyzer(analyzerClass);
+    }
+    return analyzerClass;
+  }
+  
+  /**
+   * Overrides the base implementation to install the Oozie action configuration resource
+   * into the provided Configuration object; note that ToolRunner calls setConf on the Tool
+   * before it invokes run.
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+
+    // If running in an Oozie workflow as a Java action, need to add the
+    // Configuration resource provided by Oozie to this job's config.
+    String oozieActionConfXml = System.getProperty("oozie.action.conf.xml");
+    if (oozieActionConfXml != null && conf != null) {
+      conf.addResource(new Path("file:///", oozieActionConfXml));
+      log.info("Added Oozie action Configuration resource {} to the Hadoop Configuration", oozieActionConfXml);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java
new file mode 100644
index 0000000..8052ef1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.lang.reflect.InvocationTargetException;
+
+/**
+ * Reflection helpers that create instances by class or class name while
+ * enforcing that the result is a subtype of a requested class. Checked
+ * reflection failures are rethrown as {@link IllegalStateException}.
+ */
+public final class ClassUtils {
+
+  private ClassUtils() {}
+
+  /** Instantiates {@code classname} via its public no-arg constructor. */
+  public static <T> T instantiateAs(String classname, Class<T> asSubclassOfClass) {
+    return instantiateAs(loadAs(classname, asSubclassOfClass), asSubclassOfClass);
+  }
+
+  /** Instantiates {@code classname} using the constructor matching {@code params}, passing {@code args}. */
+  public static <T> T instantiateAs(String classname, Class<T> asSubclassOfClass, Class<?>[] params, Object[] args) {
+    return instantiateAs(loadAs(classname, asSubclassOfClass), asSubclassOfClass, params, args);
+  }
+
+  /** Instantiates {@code clazz} using the constructor matching {@code params}, passing {@code args}. */
+  public static <T> T instantiateAs(Class<? extends T> clazz,
+                                    Class<T> asSubclassOfClass,
+                                    Class<?>[] params,
+                                    Object[] args) {
+    try {
+      return clazz.asSubclass(asSubclassOfClass).getConstructor(params).newInstance(args);
+    } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /** Instantiates {@code clazz} via its public no-arg constructor. */
+  public static <T> T instantiateAs(Class<? extends T> clazz, Class<T> asSubclassOfClass) {
+    try {
+      return clazz.asSubclass(asSubclassOfClass).getConstructor().newInstance();
+    } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /** Loads {@code classname} checked as a subtype of {@code superType}; wraps ClassNotFoundException. */
+  private static <T> Class<? extends T> loadAs(String classname, Class<T> superType) {
+    try {
+      return Class.forName(classname).asSubclass(superType);
+    } catch (ClassNotFoundException cnfe) {
+      throw new IllegalStateException(cnfe);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java
new file mode 100644
index 0000000..ac4ab88
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.GenericOptionsParser;
+
+/** Helpers for printing command-line help for jobs built on commons-cli2 option groups. */
+public final class CommandLineUtil {
+
+  private CommandLineUtil() { }
+
+  /** Prints the help for the given option group to stdout. */
+  public static void printHelp(Group group) {
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.setGroup(group);
+    formatter.print();
+  }
+
+  /**
+   * Print the options supported by {@code GenericOptionsParser}.
+   * In addition to the options supported by the job, passed in as the
+   * group parameter.
+   *
+   * @param group job-specific command-line options.
+   */
+  public static void printHelpWithGenericOptions(Group group) throws IOException {
+    // NOTE(review): constructed apparently for its side effects only; verify it is still needed.
+    new GenericOptionsParser(new Configuration(), new org.apache.commons.cli.Options(), new String[0]);
+    // JDK StandardCharsets replaces the deprecated commons-io Charsets.
+    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, StandardCharsets.UTF_8), true);
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.setGroup(group);
+    formatter.setPrintWriter(pw);
+    formatter.setFooter("Specify HDFS directories while running on hadoop; else specify local file system directories");
+    formatter.print();
+  }
+
+  /** Same as {@link #printHelpWithGenericOptions(Group)}, but also reports the parse error. */
+  public static void printHelpWithGenericOptions(Group group, OptionException oe) throws IOException {
+    // NOTE(review): constructed apparently for its side effects only; verify it is still needed.
+    new GenericOptionsParser(new Configuration(), new org.apache.commons.cli.Options(), new String[0]);
+    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, StandardCharsets.UTF_8), true);
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.setGroup(group);
+    formatter.setPrintWriter(pw);
+    formatter.setException(oe);
+    formatter.print();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java
new file mode 100644
index 0000000..34515aa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java
@@ -0,0 +1,435 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class HadoopUtil {
+
+  private static final Logger log = LoggerFactory.getLogger(HadoopUtil.class);
+
+  private HadoopUtil() { }
+
+  /**
+   * Create a map-only Hadoop Job out of the passed in parameters.  Does not set the
+   * Job name.
+   *
+   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
+   */
+  public static Job prepareJob(Path inputPath,
+                           Path outputPath,
+                           Class<? extends InputFormat> inputFormat,
+                           Class<? extends Mapper> mapper,
+                           Class<? extends Writable> mapperKey,
+                           Class<? extends Writable> mapperValue,
+                           Class<? extends OutputFormat> outputFormat, Configuration conf) throws IOException {
+
+    // Copy the configuration so later job mutations don't leak into the caller's conf.
+    Job job = new Job(new Configuration(conf));
+    Configuration jobConf = job.getConfiguration();
+
+    // The mapper class locates the job jar; the base Mapper lives in the
+    // hadoop jar and therefore cannot identify the user's jar.
+    if (mapper.equals(Mapper.class)) {
+      throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
+    }
+    job.setJarByClass(mapper);
+
+    job.setInputFormatClass(inputFormat);
+    jobConf.set("mapred.input.dir", inputPath.toString());
+
+    job.setMapperClass(mapper);
+    job.setMapOutputKeyClass(mapperKey);
+    job.setMapOutputValueClass(mapperValue);
+    // Map-only job: the map output types are also the job output types.
+    job.setOutputKeyClass(mapperKey);
+    job.setOutputValueClass(mapperValue);
+    jobConf.setBoolean("mapred.compress.map.output", true);
+    job.setNumReduceTasks(0);
+
+    job.setOutputFormatClass(outputFormat);
+    jobConf.set("mapred.output.dir", outputPath.toString());
+
+    return job;
+  }
+
+  /**
+   * Create a map and reduce Hadoop job.  Does not set the name on the job.
+   * @param inputPath The input {@link org.apache.hadoop.fs.Path}
+   * @param outputPath The output {@link org.apache.hadoop.fs.Path}
+   * @param inputFormat The {@link org.apache.hadoop.mapreduce.InputFormat}
+   * @param mapper The {@link org.apache.hadoop.mapreduce.Mapper} class to use
+   * @param mapperKey The {@link org.apache.hadoop.io.Writable} key class.  If the Mapper is a no-op,
+   *                  this value may be null
+   * @param mapperValue The {@link org.apache.hadoop.io.Writable} value class.  If the Mapper is a no-op,
+   *                    this value may be null
+   * @param reducer The {@link org.apache.hadoop.mapreduce.Reducer} to use
+   * @param reducerKey The reducer key class.
+   * @param reducerValue The reducer value class.
+   * @param outputFormat The {@link org.apache.hadoop.mapreduce.OutputFormat}.
+   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use.
+   * @return The {@link org.apache.hadoop.mapreduce.Job}.
+   * @throws IOException if there is a problem with the IO.
+   *
+   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
+   * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class,
+   * org.apache.hadoop.conf.Configuration)
+   */
+  public static Job prepareJob(Path inputPath,
+                           Path outputPath,
+                           Class<? extends InputFormat> inputFormat,
+                           Class<? extends Mapper> mapper,
+                           Class<? extends Writable> mapperKey,
+                           Class<? extends Writable> mapperValue,
+                           Class<? extends Reducer> reducer,
+                           Class<? extends Writable> reducerKey,
+                           Class<? extends Writable> reducerValue,
+                           Class<? extends OutputFormat> outputFormat,
+                           Configuration conf) throws IOException {
+
+    // Copy the configuration so later job mutations don't leak into the caller's conf.
+    Job job = new Job(new Configuration(conf));
+    Configuration jobConf = job.getConfiguration();
+
+    // Locate the job jar from whichever of mapper/reducer is user-supplied; the
+    // base Mapper/Reducer classes live in the hadoop jar and cannot identify it.
+    if (reducer.equals(Reducer.class)) {
+      if (mapper.equals(Mapper.class)) {
+        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
+      }
+      job.setJarByClass(mapper);
+    } else {
+      job.setJarByClass(reducer);
+    }
+
+    job.setInputFormatClass(inputFormat);
+    jobConf.set("mapred.input.dir", inputPath.toString());
+
+    job.setMapperClass(mapper);
+    // Null key/value classes are allowed for no-op mappers (hadoop defaults apply).
+    if (mapperKey != null) {
+      job.setMapOutputKeyClass(mapperKey);
+    }
+    if (mapperValue != null) {
+      job.setMapOutputValueClass(mapperValue);
+    }
+
+    jobConf.setBoolean("mapred.compress.map.output", true);
+
+    job.setReducerClass(reducer);
+    job.setOutputKeyClass(reducerKey);
+    job.setOutputValueClass(reducerValue);
+
+    job.setOutputFormatClass(outputFormat);
+    jobConf.set("mapred.output.dir", outputPath.toString());
+
+    return job;
+  }
+
+
+  /**
+   * Derive a job name of the form {@code <base>-<MapperSimpleName>-<ReducerSimpleName>},
+   * where base is the job's existing name, or {@code className} when the job has none.
+   */
+  public static String getCustomJobName(String className, JobContext job,
+                                  Class<? extends Mapper> mapper,
+                                  Class<? extends Reducer> reducer) {
+    String base = job.getJobName();
+    if (base == null || base.trim().isEmpty()) {
+      base = className;
+    }
+    return base + '-' + mapper.getSimpleName() + '-' + reducer.getSimpleName();
+  }
+
+
+  /**
+   * Recursively delete each of the given paths that exists.
+   * A null {@code conf} falls back to a freshly created default Configuration.
+   */
+  public static void delete(Configuration conf, Iterable<Path> paths) throws IOException {
+    if (conf == null) {
+      conf = new Configuration();
+    }
+    for (Path path : paths) {
+      FileSystem fs = path.getFileSystem(conf);
+      if (fs.exists(path)) {
+        log.info("Deleting {}", path);
+        fs.delete(path, true);
+      }
+    }
+  }
+
+  /** Varargs convenience for {@link #delete(Configuration, Iterable)}. */
+  public static void delete(Configuration conf, Path... paths) throws IOException {
+    delete(conf, Arrays.asList(paths));
+  }
+
+  /**
+   * Count the records in a single sequence file by iterating over its values.
+   *
+   * @param path the sequence file to count
+   * @param conf hadoop configuration
+   * @return the number of key/value records in the file
+   */
+  public static long countRecords(Path path, Configuration conf) throws IOException {
+    long count = 0;
+    Iterator<?> iterator = new SequenceFileValueIterator<>(path, true, conf);
+    while (iterator.hasNext()) {
+      iterator.next();
+      count++;
+    }
+    return count;
+  }
+
+  /**
+   * Count all the records in a directory using a
+   * {@link org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator}
+   *
+   * @param path The {@link org.apache.hadoop.fs.Path} to count
+   * @param pt The {@link org.apache.mahout.common.iterator.sequencefile.PathType}
+   * @param filter Apply the {@link org.apache.hadoop.fs.PathFilter}.  May be null
+   * @param conf The Hadoop {@link org.apache.hadoop.conf.Configuration}
+   * @return The number of records
+   * @throws IOException if there was an IO error
+   */
+  public static long countRecords(Path path, PathType pt, PathFilter filter, Configuration conf) throws IOException {
+    long count = 0;
+    // Null ordering: files are visited in whatever order the iterator lists them.
+    Iterator<?> iterator = new SequenceFileDirValueIterator<>(path, pt, filter, null, true, conf);
+    while (iterator.hasNext()) {
+      iterator.next();
+      count++;
+    }
+    return count;
+  }
+
+  /**
+   * Opens an {@link InputStream} for the given path on its file system.
+   * The caller is responsible for closing the stream.
+   *
+   * @param path the file to open
+   * @param conf the Hadoop configuration used to resolve the file system
+   * @return an open stream positioned at the start of the file
+   * @throws IOException if the file cannot be opened
+   */
+  public static InputStream openStream(Path path, Configuration conf) throws IOException {
+    FileSystem fs = FileSystem.get(path.toUri(), conf);
+    // NOTE(review): makeQualified(URI, Path) is given 'path' itself as the working
+    // directory argument; looks intentional but worth confirming against the
+    // FileSystem-based overload.
+    return fs.open(path.makeQualified(path.toUri(), path));
+  }
+
+  /**
+   * Lists the file statuses under a path, optionally filtered and sorted.
+   *
+   * @param path the path (or glob pattern when {@code pathType} is {@code GLOB}) to list
+   * @param pathType whether {@code path} is a literal directory or a glob pattern
+   * @param filter optional filter applied to matches; may be null
+   * @param ordering optional sort order for the result; may be null (unsorted)
+   * @param conf the Hadoop configuration used to resolve the file system
+   * @return the matching statuses; empty when the path does not exist
+   * @throws IOException if the listing fails
+   */
+  public static FileStatus[] getFileStatus(Path path, PathType pathType, PathFilter filter,
+      Comparator<FileStatus> ordering, Configuration conf) throws IOException {
+    FileStatus[] statuses;
+    FileSystem fs = path.getFileSystem(conf);
+    if (filter == null) {
+      statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
+    } else {
+      statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
+    }
+    if (ordering != null) {
+      Arrays.sort(statuses, ordering);
+    }
+    return statuses;
+  }
+
+  /**
+   * Like {@link FileSystem#listStatus(Path)}, but returns an empty array instead of
+   * throwing when the path does not exist.
+   *
+   * @param fs the file system to list on
+   * @param path the directory to list
+   * @return the statuses under {@code path}, or an empty array if it is missing
+   * @throws IOException on listing errors other than a missing path
+   */
+  public static FileStatus[] listStatus(FileSystem fs, Path path) throws IOException {
+    try {
+      return fs.listStatus(path);
+    } catch (FileNotFoundException e) {
+      return new FileStatus[0];
+    }
+  }
+
+  /**
+   * Like {@link FileSystem#listStatus(Path, PathFilter)}, but returns an empty array
+   * instead of throwing when the path does not exist.
+   *
+   * @param fs the file system to list on
+   * @param path the directory to list
+   * @param filter filter applied to the children
+   * @return the matching statuses, or an empty array if the path is missing
+   * @throws IOException on listing errors other than a missing path
+   */
+  public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter) throws IOException {
+    try {
+      return fs.listStatus(path, filter);
+    } catch (FileNotFoundException e) {
+      return new FileStatus[0];
+    }
+  }
+
+  /**
+   * Registers a single file in the DistributedCache.
+   * Note: setCacheFiles replaces any previously registered cache files in {@code conf}.
+   *
+   * @param fileToCache the file to distribute to task nodes
+   * @param conf the job configuration to record the cache entry in
+   */
+  public static void cacheFiles(Path fileToCache, Configuration conf) {
+    DistributedCache.setCacheFiles(new URI[]{fileToCache.toUri()}, conf);
+  }
+
+  /**
+   * Return the first cached file in the list, else null if there are no cached files.
+   *
+   * @param conf - MapReduce Configuration
+   * @return Path of Cached file
+   * @throws IOException - IO Exception
+   * @throws IllegalStateException if no cache files are found (propagated from
+   *         {@link #getCachedFiles(Configuration)})
+   */
+  public static Path getSingleCachedFile(Configuration conf) throws IOException {
+    return getCachedFiles(conf)[0];
+  }
+
+  /**
+   * Retrieves paths to cached files, preferring localized copies and falling back to
+   * the original cache URIs when running without a localized cache (local execution).
+   *
+   * @param conf - MapReduce Configuration
+   * @return Path[] of Cached Files
+   * @throws IOException - IO Exception
+   * @throws IllegalStateException if no cache files are found
+   */
+  public static Path[] getCachedFiles(Configuration conf) throws IOException {
+    LocalFileSystem localFs = FileSystem.getLocal(conf);
+    Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
+
+    URI[] fallbackFiles = DistributedCache.getCacheFiles(conf);
+
+    // fallback for local execution
+    if (cacheFiles == null) {
+
+      Preconditions.checkState(fallbackFiles != null, "Unable to find cached files!");
+
+      cacheFiles = new Path[fallbackFiles.length];
+      for (int n = 0; n < fallbackFiles.length; n++) {
+        cacheFiles[n] = new Path(fallbackFiles[n].getPath());
+      }
+    } else {
+
+      for (int n = 0; n < cacheFiles.length; n++) {
+        cacheFiles[n] = localFs.makeQualified(cacheFiles[n]);
+        // fallback for local execution
+        if (!localFs.exists(cacheFiles[n])) {
+          // BUGFIX: guard the fallback lookup. Previously fallbackFiles was dereferenced
+          // unconditionally here, throwing NullPointerException (fallback list absent) or
+          // ArrayIndexOutOfBoundsException (fallback list shorter than the local list)
+          // instead of the intended IllegalStateException.
+          Preconditions.checkState(fallbackFiles != null && n < fallbackFiles.length,
+              "Unable to find cached files!");
+          cacheFiles[n] = new Path(fallbackFiles[n].getPath());
+        }
+      }
+    }
+
+    Preconditions.checkState(cacheFiles.length > 0, "Unable to find cached files!");
+
+    return cacheFiles;
+  }
+
+  /**
+   * Enables both Java and Writable serialization for the given configuration.
+   * Note: this overwrites (does not append to) any existing "io.serializations" value.
+   *
+   * @param configuration the configuration to modify in place
+   */
+  public static void setSerializations(Configuration configuration) {
+    configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+        + "org.apache.hadoop.io.serializer.WritableSerialization");
+  }
+
+  /**
+   * Writes a single big-endian int to a new file at the given path, overwriting any
+   * existing file. Counterpart of {@link #readInt(Path, Configuration)}.
+   *
+   * @param value the value to persist
+   * @param path the destination file
+   * @param configuration used to resolve the file system
+   * @throws IOException if the file cannot be created or written
+   */
+  public static void writeInt(int value, Path path, Configuration configuration) throws IOException {
+    FileSystem fs = FileSystem.get(path.toUri(), configuration);
+    try (FSDataOutputStream out = fs.create(path)) {
+      out.writeInt(value);
+    }
+  }
+
+  /**
+   * Reads a single big-endian int from the start of the given file.
+   * Counterpart of {@link #writeInt(int, Path, Configuration)}.
+   *
+   * @param path the file to read
+   * @param configuration used to resolve the file system
+   * @return the int stored at the start of the file
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static int readInt(Path path, Configuration configuration) throws IOException {
+    FileSystem fs = FileSystem.get(path.toUri(), configuration);
+    try (FSDataInputStream in = fs.open(path)) {
+      return in.readInt();
+    }
+  }
+
+  /**
+   * Builds a comma-separated list of input splits by recursively collecting every
+   * directory under {@code fileStatus} that directly contains at least one file.
+   *
+   * @param fs - File System
+   * @param fileStatus - File Status of the directory to walk
+   * @return list of directories as a comma-separated String
+   * @throws IOException - IO Exception
+   */
+  public static String buildDirList(FileSystem fs, FileStatus fileStatus) throws IOException {
+    boolean containsFiles = false;
+    List<String> directoriesList = new ArrayList<>();
+    for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath())) {
+      if (childFileStatus.isDir()) {
+        // Recurse; the child contributes its own comma-separated sub-list.
+        String subDirectoryList = buildDirList(fs, childFileStatus);
+        directoriesList.add(subDirectoryList);
+      } else {
+        containsFiles = true;
+      }
+    }
+
+    // Only directories that directly contain files become splits themselves.
+    if (containsFiles) {
+      directoriesList.add(fileStatus.getPath().toUri().getPath());
+    }
+    return Joiner.on(',').skipNulls().join(directoriesList.iterator());
+  }
+
+  /**
+   * Builds a comma-separated list of input splits, applying {@code pathFilter} at every
+   * level of the directory tree.
+   *
+   * @param fs - File System
+   * @param fileStatus - File Status of the directory to walk
+   * @param pathFilter - path filter applied to children at each level
+   * @return list of directories as a comma-separated String
+   * @throws IOException - IO Exception
+   */
+  public static String buildDirList(FileSystem fs, FileStatus fileStatus, PathFilter pathFilter) throws IOException {
+    boolean containsFiles = false;
+    List<String> directoriesList = new ArrayList<>();
+    for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath(), pathFilter)) {
+      if (childFileStatus.isDir()) {
+        // BUGFIX: propagate pathFilter into the recursion. Previously the filtered
+        // overload recursed into the unfiltered buildDirList(fs, child), so the filter
+        // was only honored at the top level of the tree.
+        String subDirectoryList = buildDirList(fs, childFileStatus, pathFilter);
+        directoriesList.add(subDirectoryList);
+      } else {
+        containsFiles = true;
+      }
+    }
+
+    // Only directories that directly contain (matching) files become splits themselves.
+    if (containsFiles) {
+      directoriesList.add(fileStatus.getPath().toUri().getPath());
+    }
+    return Joiner.on(',').skipNulls().join(directoriesList.iterator());
+  }
+
+  /**
+   * Computes the path of {@code filePath} relative to the directory configured under
+   * the {@code "baseinputpath"} key.
+   *
+   * @param configuration  -  configuration; must contain a "baseinputpath" entry
+   * @param filePath - Input File Path
+   * @return relative file Path
+   * @throws IOException - IO Exception
+   */
+  public static String calcRelativeFilePath(Configuration configuration, Path filePath) throws IOException {
+    FileSystem fs = filePath.getFileSystem(configuration);
+    FileStatus fst = fs.getFileStatus(filePath);
+    String currentPath = fst.getPath().toString().replaceFirst("file:", "");
+
+    String basePath = configuration.get("baseinputpath");
+    if (!basePath.endsWith("/")) {
+      basePath += "/";
+    }
+    basePath = basePath.replaceFirst("file:", "");
+    // BUGFIX: String.split() interprets its argument as a regular expression; quote the
+    // base path so directories containing regex metacharacters (e.g. '+', '(', '[')
+    // neither throw PatternSyntaxException nor split at the wrong place.
+    String[] parts = currentPath.split(java.util.regex.Pattern.quote(basePath));
+
+    if (parts.length == 2) {
+      return parts[1];
+    } else if (parts.length == 1) {
+      return parts[0];
+    }
+    return currentPath;
+  }
+
+  /**
+   * Finds a file in the DistributedCache
+   *
+   * @param partOfFilename a substring of the file name
+   * @param localFiles holds references to files stored in distributed cache
+   * @return Path to first matched file or null if nothing was found
+   **/
+  public static Path findInCacheByPartOfFilename(String partOfFilename, URI[] localFiles) {
+    for (URI distCacheFile : localFiles) {
+      // Logs once per candidate; the array entries may contain nulls, hence the guard.
+      log.info("trying find a file in distributed cache containing [{}] in its name", partOfFilename);
+      if (distCacheFile != null && distCacheFile.toString().contains(partOfFilename)) {
+        log.info("found file [{}] containing [{}]", distCacheFile.toString(), partOfFilename);
+        return new Path(distCacheFile.getPath());
+      }
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java
new file mode 100644
index 0000000..dacd66f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java
@@ -0,0 +1,270 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+
+/**
+ * A {@link WritableComparable} which encapsulates an ordered pair of signed integers.
+ *
+ * <p>The pair is stored as 8 big-endian bytes (first int in bytes [0,4), second int in
+ * bytes [4,8)) so that the raw-byte {@link Comparator} below can order serialized
+ * instances without deserializing them.</p>
+ */
+public final class IntPairWritable extends BinaryComparable
+    implements WritableComparable<BinaryComparable>, Cloneable {
+
+  static final int INT_BYTE_LENGTH = 4;
+  static final int INT_PAIR_BYTE_LENGTH = 2 * INT_BYTE_LENGTH;
+  // Backing storage for both ints; layout described on the class javadoc.
+  private byte[] b = new byte[INT_PAIR_BYTE_LENGTH];
+  
+  /** Creates the pair (0, 0). */
+  public IntPairWritable() {
+    setFirst(0);
+    setSecond(0);
+  }
+  
+  /** Copy constructor; takes a defensive copy of the other pair's bytes. */
+  public IntPairWritable(IntPairWritable pair) {
+    b = Arrays.copyOf(pair.getBytes(), INT_PAIR_BYTE_LENGTH);
+  }
+  
+  public IntPairWritable(int x, int y) {
+    putInt(x, b, 0);
+    putInt(y, b, INT_BYTE_LENGTH);
+  }
+  
+  /** Sets both elements of the pair. */
+  public void set(int x, int y) {
+    putInt(x, b, 0);
+    putInt(y, b, INT_BYTE_LENGTH);
+  }
+  
+  public void setFirst(int x) {
+    putInt(x, b, 0);
+  }
+  
+  public int getFirst() {
+    return getInt(b, 0);
+  }
+  
+  public void setSecond(int y) {
+    putInt(y, b, INT_BYTE_LENGTH);
+  }
+  
+  public int getSecond() {
+    return getInt(b, INT_BYTE_LENGTH);
+  }
+  
+  // The wire format is exactly the 8 internal bytes, so (de)serialization is a raw copy.
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    in.readFully(b);
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.write(b);
+  }
+  
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(b);
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    // NOTE(review): BinaryComparable.equals delegates to compareTo, which casts its
+    // argument to IntPairWritable below -- confirm only like types are ever compared.
+    if (!super.equals(obj)) {
+      return false;
+    }
+    if (!(obj instanceof IntPairWritable)) {
+      return false;
+    }
+    IntPairWritable other = (IntPairWritable) obj;
+    return Arrays.equals(b, other.b);
+  }
+
+  @Override
+  public int compareTo(BinaryComparable other) {
+    // Assumes 'other' is an IntPairWritable; any other BinaryComparable throws
+    // ClassCastException here.
+    return Comparator.doCompare(b, 0, ((IntPairWritable) other).b, 0);
+  }
+
+  @Override
+  public Object clone() {
+    return new IntPairWritable(this);
+  }
+  
+  @Override
+  public String toString() {
+    return "(" + getFirst() + ", " + getSecond() + ')';
+  }
+  
+  @Override
+  public byte[] getBytes() {
+    // Exposes the internal array without copying; callers must not mutate it.
+    return b;
+  }
+  
+  @Override
+  public int getLength() {
+    return INT_PAIR_BYTE_LENGTH;
+  }
+  
+  // Writes 'value' big-endian into b[offset..offset+3].
+  private static void putInt(int value, byte[] b, int offset) {
+    for (int i = offset, j = 24; j >= 0; i++, j -= 8) {
+      b[i] = (byte) (value >> j);
+    }
+  }
+  
+  // Reads a big-endian int from b[offset..offset+3].
+  private static int getInt(byte[] b, int offset) {
+    int value = 0;
+    for (int i = offset, j = 24; j >= 0; i++, j -= 8) {
+      value |= (b[i] & 0xFF) << j;
+    }
+    return value;
+  }
+
+  // Registers the raw-byte comparator so Hadoop can sort serialized pairs directly.
+  static {
+    WritableComparator.define(IntPairWritable.class, new Comparator());
+  }
+
+  /** Raw-byte comparator: orders by first int, then second, respecting sign. */
+  public static final class Comparator extends WritableComparator implements Serializable {
+    public Comparator() {
+      super(IntPairWritable.class);
+    }
+    
+    @Override
+    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+      return doCompare(b1, s1, b2, s2);
+    }
+
+    // Compares the first ints, falling back to the second ints on a tie.
+    static int doCompare(byte[] b1, int s1, byte[] b2, int s2) {
+      int compare1 = compareInts(b1, s1, b2, s2);
+      if (compare1 != 0) {
+        return compare1;
+      }
+      return compareInts(b1, s1 + INT_BYTE_LENGTH, b2, s2 + INT_BYTE_LENGTH);
+    }
+
+    private static int compareInts(byte[] b1, int s1, byte[] b2, int s2) {
+      // Like WritableComparator.compareBytes(), but treats first byte as signed value
+      // so that negative ints order before non-negative ones.
+      int end1 = s1 + INT_BYTE_LENGTH;
+      for (int i = s1, j = s2; i < end1; i++, j++) {
+        int a = b1[i];
+        int b = b2[j];
+        if (i > s1) {
+          a &= 0xff;
+          b &= 0xff;
+        }
+        if (a != b) {
+          return a - b;
+        }
+      }
+      return 0;
+    }
+  }
+  
+  /**
+   * Compare only the first part of the pair, so that reduce is called once for each value of the first part.
+   */
+  public static class FirstGroupingComparator extends WritableComparator implements Serializable {
+    
+    public FirstGroupingComparator() {
+      super(IntPairWritable.class);
+    }
+    
+    @Override
+    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+      // Only the leading 4 bytes (the first int) of each key participate in grouping.
+      int firstb1 = WritableComparator.readInt(b1, s1);
+      int firstb2 = WritableComparator.readInt(b2, s2);
+      if (firstb1 < firstb2) {
+        return -1;
+      } else if (firstb1 > firstb2) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+    
+    @Override
+    public int compare(Object o1, Object o2) {
+      int firstb1 = ((IntPairWritable) o1).getFirst();
+      int firstb2 = ((IntPairWritable) o2).getFirst();
+      if (firstb1 < firstb2) {
+        return -1;
+      }
+      if (firstb1 > firstb2) {
+        return 1;
+      }
+      return 0;
+    }
+    
+  }
+  
+  /** A wrapper class that associates pairs with frequency (Occurrences) */
+  public static class Frequency implements Comparable<Frequency>, Serializable {
+    
+    private final IntPairWritable pair;
+    private final double frequency;
+
+    public Frequency(IntPairWritable bigram, double frequency) {
+      // Defensive copy: IntPairWritable is mutable.
+      this.pair = new IntPairWritable(bigram);
+      this.frequency = frequency;
+    }
+
+    public double getFrequency() {
+      return frequency;
+    }
+
+    public IntPairWritable getPair() {
+      return pair;
+    }
+
+    @Override
+    public int hashCode() {
+      return pair.hashCode() + RandomUtils.hashDouble(frequency);
+    }
+    
+    @Override
+    public boolean equals(Object right) {
+      if (!(right instanceof Frequency)) {
+        return false;
+      }
+      Frequency that = (Frequency) right;
+      return pair.equals(that.pair) && frequency == that.frequency;
+    }
+    
+    // NOTE(review): orders by frequency only, so compareTo is not consistent with
+    // equals (same frequency but different pair compares as 0 yet is not equal).
+    @Override
+    public int compareTo(Frequency that) {
+      if (frequency < that.frequency) {
+        return -1;
+      }
+      if (frequency > that.frequency) {
+        return 1;
+      }
+      return 0;
+    }
+    
+    @Override
+    public String toString() {
+      return pair + "\t" + frequency;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java
new file mode 100644
index 0000000..f456d4d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java
@@ -0,0 +1,176 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * An Ordered List of Integers which can be used in a Hadoop Map/Reduce Job.
+ * Serialized as a length-prefixed sequence of 4-byte ints (see {@link #write}).
+ */
+public final class IntegerTuple implements WritableComparable<IntegerTuple> {
+  
+  private List<Integer> tuple = Lists.newArrayList();
+  
+  /** Creates an empty tuple. */
+  public IntegerTuple() { }
+  
+  public IntegerTuple(Integer firstEntry) {
+    add(firstEntry);
+  }
+  
+  public IntegerTuple(Iterable<Integer> entries) {
+    for (Integer entry : entries) {
+      add(entry);
+    }
+  }
+  
+  public IntegerTuple(Integer[] entries) {
+    for (Integer entry : entries) {
+      add(entry);
+    }
+  }
+  
+  /**
+   * add an entry to the end of the list
+   * 
+   * @param entry value to append
+   * @return true if the items get added
+   */
+  public boolean add(Integer entry) {
+    return tuple.add(entry);
+  }
+  
+  /**
+   * Fetches the integer at the given location
+   * 
+   * @param index zero-based position within the tuple
+   * @return Integer value at the given location in the tuple list
+   */
+  public Integer integerAt(int index) {
+    return tuple.get(index);
+  }
+  
+  /**
+   * Replaces the integer at the given index with the given newInteger
+   * 
+   * @param index zero-based position to overwrite
+   * @param newInteger replacement value
+   * @return The previous value at that location
+   */
+  public Integer replaceAt(int index, Integer newInteger) {
+    return tuple.set(index, newInteger);
+  }
+  
+  /**
+   * Fetch the list of entries from the tuple
+   * 
+   * @return an unmodifiable List containing the integers in the order of insertion
+   */
+  public List<Integer> getEntries() {
+    return Collections.unmodifiableList(this.tuple);
+  }
+  
+  /**
+   * Returns the length of the tuple
+   * 
+   * @return length
+   */
+  public int length() {
+    return this.tuple.size();
+  }
+  
+  @Override
+  public String toString() {
+    return tuple.toString();
+  }
+  
+  @Override
+  public int hashCode() {
+    return tuple.hashCode();
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
+    IntegerTuple other = (IntegerTuple) obj;
+    // Defensive null handling, though 'tuple' is never null in the visible code paths.
+    if (tuple == null) {
+      if (other.tuple != null) {
+        return false;
+      }
+    } else if (!tuple.equals(other.tuple)) {
+      return false;
+    }
+    return true;
+  }
+  
+  // Reads a length-prefixed int sequence, replacing the current contents entirely.
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int len = in.readInt();
+    tuple = Lists.newArrayListWithCapacity(len);
+    for (int i = 0; i < len; i++) {
+      int data = in.readInt();
+      tuple.add(data);
+    }
+  }
+  
+  // Writes the size followed by each entry as a 4-byte int.
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(tuple.size());
+    for (Integer entry : tuple) {
+      out.writeInt(entry);
+    }
+  }
+  
+  // Lexicographic comparison: element by element, with a shorter tuple ordering first
+  // when it is a prefix of the longer one.
+  @Override
+  public int compareTo(IntegerTuple otherTuple) {
+    int thisLength = length();
+    int otherLength = otherTuple.length();
+    int min = Math.min(thisLength, otherLength);
+    for (int i = 0; i < min; i++) {
+      int ret = this.tuple.get(i).compareTo(otherTuple.integerAt(i));
+      if (ret == 0) {
+        continue;
+      }
+      return ret;
+    }
+    if (thisLength < otherLength) {
+      return -1;
+    } else if (thisLength > otherLength) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java
new file mode 100644
index 0000000..5215e3a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.Serializable;
+
+import com.google.common.primitives.Longs;
+
+/** An immutable, ordered pair of {@code long} values. */
+public final class LongPair implements Comparable<LongPair>, Serializable {
+
+  private final long first;
+  private final long second;
+
+  public LongPair(long first, long second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  public long getFirst() {
+    return first;
+  }
+
+  public long getSecond() {
+    return second;
+  }
+
+  /** @return a new pair with the two elements exchanged */
+  public LongPair swap() {
+    return new LongPair(second, first);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (!(obj instanceof LongPair)) {
+      return false;
+    }
+    LongPair that = (LongPair) obj;
+    return this.first == that.first && this.second == that.second;
+  }
+
+  @Override
+  public int hashCode() {
+    // Rotate the first hash by 16 bits so that (a,b) and (b,a) usually hash differently.
+    int firstHash = Longs.hashCode(first);
+    return (firstHash >>> 16 | firstHash << 16) ^ Longs.hashCode(second);
+  }
+
+  @Override
+  public String toString() {
+    return '(' + String.valueOf(first) + ',' + second + ')';
+  }
+
+  /** Orders by the first element, breaking ties with the second. */
+  @Override
+  public int compareTo(LongPair o) {
+    int byFirst = Long.compare(first, o.first);
+    return byFirst != 0 ? byFirst : Long.compare(second, o.second);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java
new file mode 100644
index 0000000..f241b53
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.common;
+
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Memory utilities: helpers for logging JVM heap usage, once or on a schedule.
+ */
+public final class MemoryUtil {
+
+  private static final Logger log = LoggerFactory.getLogger(MemoryUtil.class);
+
+  // Single-threaded scheduler driving the periodic logger; null when none is running.
+  private static volatile ScheduledExecutorService scheduler;
+
+  private MemoryUtil() {
+  }
+
+  /**
+   * Logs current heap memory statistics.
+   *
+   * @see Runtime
+   */
+  public static void logMemoryStatistics() {
+    Runtime runtime = Runtime.getRuntime();
+    long free = runtime.freeMemory();
+    long total = runtime.totalMemory();
+    long max = runtime.maxMemory();
+    log.info("Memory (bytes): {} used, {} heap, {} max", total - free, total, max);
+  }
+
+  /**
+   * Constructs and starts a memory logger thread, replacing any logger previously
+   * started by this class.
+   *
+   * @param rateInMillis how often memory info should be logged.
+   */
+  public static void startMemoryLogger(long rateInMillis) {
+    stopMemoryLogger(); // at most one logger runs at a time
+    scheduler = Executors.newScheduledThreadPool(1, new ThreadFactory() {
+      private final ThreadFactory delegate = Executors.defaultThreadFactory();
+
+      @Override
+      public Thread newThread(Runnable r) {
+        // Daemon threads never keep the JVM alive at shutdown.
+        Thread thread = delegate.newThread(r);
+        thread.setDaemon(true);
+        return thread;
+      }
+    });
+    Runnable loggingTask = new Runnable() {
+      @Override
+      public void run() {
+        logMemoryStatistics();
+      }
+    };
+    scheduler.scheduleAtFixedRate(loggingTask, rateInMillis, rateInMillis, TimeUnit.MILLISECONDS);
+  }
+
+  /**
+   * Constructs and starts a memory logger thread with a logging rate of 1000 milliseconds.
+   */
+  public static void startMemoryLogger() {
+    startMemoryLogger(1000);
+  }
+
+  /**
+   * Stops the memory logger, if any, started via {@link #startMemoryLogger(long)} or
+   * {@link #startMemoryLogger()}.
+   */
+  public static void stopMemoryLogger() {
+    ScheduledExecutorService running = scheduler;
+    if (running != null) {
+      running.shutdownNow();
+      scheduler = null;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java
new file mode 100644
index 0000000..d2ad6a1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.Serializable;
+
+/** An immutable, ordered pair of two objects; either element may be null. */
+public final class Pair<A,B> implements Comparable<Pair<A,B>>, Serializable {
+
+  private final A first;
+  private final B second;
+
+  public Pair(A first, B second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  public A getFirst() {
+    return first;
+  }
+
+  public B getSecond() {
+    return second;
+  }
+
+  /** @return a new pair with the elements exchanged */
+  public Pair<B, A> swap() {
+    return new Pair<>(second, first);
+  }
+
+  /** Static factory, equivalent to {@code new Pair<>(a, b)}. */
+  public static <A,B> Pair<A,B> of(A a, B b) {
+    return new Pair<>(a, b);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (!(obj instanceof Pair<?, ?>)) {
+      return false;
+    }
+    Pair<?, ?> that = (Pair<?, ?>) obj;
+    return elementsEqual(first, that.first) && elementsEqual(second, that.second);
+  }
+
+  // Null-safe equality for a single element.
+  private static boolean elementsEqual(Object a, Object b) {
+    return a == null ? b == null : a.equals(b);
+  }
+
+  @Override
+  public int hashCode() {
+    // Rotate the first hash by 16 bits so that (a,b) and (b,a) usually hash differently.
+    int firstHash = elementHash(first);
+    return (firstHash >>> 16 | firstHash << 16) ^ elementHash(second);
+  }
+
+  // Null-safe hash for a single element (null hashes to 0).
+  private static int elementHash(Object o) {
+    return o == null ? 0 : o.hashCode();
+  }
+
+  @Override
+  public String toString() {
+    return '(' + String.valueOf(first) + ',' + second + ')';
+  }
+
+  /**
+   * Defines an ordering on pairs that sorts by first value's natural ordering, ascending,
+   * and then by second value's natural ordering.
+   *
+   * @throws ClassCastException if types are not actually {@link Comparable}
+   */
+  @Override
+  public int compareTo(Pair<A,B> other) {
+    @SuppressWarnings("unchecked")
+    Comparable<A> left = (Comparable<A>) first;
+    int result = left.compareTo(other.first);
+    if (result != 0) {
+      return result;
+    }
+    @SuppressWarnings("unchecked")
+    Comparable<B> right = (Comparable<B>) second;
+    return right.compareTo(other.second);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java
new file mode 100644
index 0000000..e74c534
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.IOException;
+import java.util.Map;
+
+import com.google.common.collect.Maps;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.util.GenericsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A simple string-to-string parameter map that can round-trip through a single
+ * serialized string using Hadoop's {@link DefaultStringifier}.
+ */
+public class Parameters {
+  
+  private static final Logger log = LoggerFactory.getLogger(Parameters.class);
+  
+  /** Backing store for the key/value pairs. */
+  private Map<String,String> params = Maps.newHashMap();
+
+  public Parameters() {
+
+  }
+
+  /**
+   * Restores an instance from the string produced by {@link #toString()}.
+   *
+   * @throws IOException if the serialized form cannot be parsed
+   */
+  public Parameters(String serializedString) throws IOException {
+    this(parseParams(serializedString));
+  }
+
+  protected Parameters(Map<String,String> params) {
+    this.params = params;
+  }
+
+  /** @return the value for {@code key}, or {@code null} if absent */
+  public String get(String key) {
+    return params.get(key);
+  }
+  
+  /** @return the value for {@code key}, or {@code defaultValue} if absent */
+  public String get(String key, String defaultValue) {
+    String ret = params.get(key);
+    return ret == null ? defaultValue : ret;
+  }
+  
+  public void set(String key, String value) {
+    params.put(key, value);
+  }
+
+  /**
+   * @return the value for {@code key} parsed as an int, or {@code defaultValue} if absent
+   * @throws NumberFormatException if a stored value is present but not a valid integer
+   */
+  public int getInt(String key, int defaultValue) {
+    String ret = params.get(key);
+    return ret == null ? defaultValue : Integer.parseInt(ret);
+  }
+
+  /** Builds the Hadoop configuration used for (de)serializing the map. */
+  private static Configuration serializationConf() {
+    Configuration conf = new Configuration();
+    conf.set("io.serializations",
+             "org.apache.hadoop.io.serializer.JavaSerialization,"
+             + "org.apache.hadoop.io.serializer.WritableSerialization");
+    return conf;
+  }
+
+  /**
+   * Serializes the parameter map to a single string; the result can be fed back
+   * to {@link #parseParams(String)}. Returns an empty string on failure.
+   */
+  @Override
+  public String toString() {
+    DefaultStringifier<Map<String,String>> mapStringifier = new DefaultStringifier<>(serializationConf(),
+        GenericsUtil.getClass(params));
+    try {
+      return mapStringifier.toString(params);
+    } catch (IOException e) {
+      log.info("Encountered IOException while serializing; returning empty string", e);
+      return "";
+    }
+    
+  }
+  
+  /** @return a human-readable (non-round-trippable) rendering of the map */
+  public String print() {
+    return params.toString();
+  }
+
+  /**
+   * Inverse of {@link #toString()}.
+   *
+   * @throws IOException if the serialized form cannot be parsed
+   */
+  public static Map<String,String> parseParams(String serializedString) throws IOException {
+    Map<String,String> params = Maps.newHashMap();
+    DefaultStringifier<Map<String,String>> mapStringifier = new DefaultStringifier<>(serializationConf(),
+        GenericsUtil.getClass(params));
+    return mapStringifier.fromString(serializedString);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java
new file mode 100644
index 0000000..0de1a4a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * An Ordered List of Strings which can be used in a Hadoop Map/Reduce Job
+ */
+public final class StringTuple implements WritableComparable<StringTuple> {
+  
+  private List<String> tuple = Lists.newArrayList();
+  
+  public StringTuple() { }
+  
+  public StringTuple(String firstEntry) {
+    add(firstEntry);
+  }
+  
+  public StringTuple(Iterable<String> entries) {
+    for (String entry : entries) {
+      add(entry);
+    }
+  }
+  
+  public StringTuple(String[] entries) {
+    for (String entry : entries) {
+      add(entry);
+    }
+  }
+  
+  /**
+   * add an entry to the end of the list
+   * 
+   * @param entry
+   * @return true if the items get added
+   */
+  public boolean add(String entry) {
+    return tuple.add(entry);
+  }
+  
+  /**
+   * Fetches the string at the given location
+   * 
+   * @param index
+   * @return String value at the given location in the tuple list
+   */
+  public String stringAt(int index) {
+    return tuple.get(index);
+  }
+  
+  /**
+   * Replaces the string at the given index with the given newString
+   * 
+   * @param index
+   * @param newString
+   * @return The previous value at that location
+   */
+  public String replaceAt(int index, String newString) {
+    return tuple.set(index, newString);
+  }
+  
+  /**
+   * Fetch the list of entries from the tuple
+   * 
+   * @return a List containing the strings in the order of insertion
+   */
+  public List<String> getEntries() {
+    return Collections.unmodifiableList(this.tuple);
+  }
+  
+  /**
+   * Returns the length of the tuple
+   * 
+   * @return length
+   */
+  public int length() {
+    return this.tuple.size();
+  }
+  
+  @Override
+  public String toString() {
+    return tuple.toString();
+  }
+  
+  @Override
+  public int hashCode() {
+    return tuple.hashCode();
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
+    StringTuple other = (StringTuple) obj;
+    if (tuple == null) {
+      if (other.tuple != null) {
+        return false;
+      }
+    } else if (!tuple.equals(other.tuple)) {
+      return false;
+    }
+    return true;
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int len = in.readInt();
+    tuple = Lists.newArrayListWithCapacity(len);
+    Text value = new Text();
+    for (int i = 0; i < len; i++) {
+      value.readFields(in);
+      tuple.add(value.toString());
+    }
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(tuple.size());
+    Text value = new Text();
+    for (String entry : tuple) {
+      value.set(entry);
+      value.write(out);
+    }
+  }
+  
+  @Override
+  public int compareTo(StringTuple otherTuple) {
+    int thisLength = length();
+    int otherLength = otherTuple.length();
+    int min = Math.min(thisLength, otherLength);
+    for (int i = 0; i < min; i++) {
+      int ret = this.tuple.get(i).compareTo(otherTuple.stringAt(i));
+      if (ret != 0) {
+        return ret;
+      }
+    }
+    if (thisLength < otherLength) {
+      return -1;
+    } else if (thisLength > otherLength) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java
new file mode 100644
index 0000000..a064596
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.util.regex.Pattern;
+
+import com.thoughtworks.xstream.XStream;
+
+/**
+ * Offers two methods to convert an object to a string representation and restore the object given its string
+ * representation. Should use Hadoop Stringifier whenever available.
+ */
+/**
+ * Offers two methods to convert an object to a string representation and restore the object given its string
+ * representation. Should use Hadoop Stringifier whenever available.
+ */
+public final class StringUtils {
+  
+  private static final XStream XSTREAM = new XStream();
+  private static final Pattern NEWLINE_PATTERN = Pattern.compile("\n");
+  private static final Pattern XMLRESERVED = Pattern.compile("\"|\\&|\\<|\\>|\'");
+
+  private StringUtils() {
+  // do nothing
+  }
+  
+  /**
+   * Converts the object to a one-line string representation
+   * 
+   * @param obj
+   *          the object to convert
+   * @return the string representation of the object (XStream XML with newlines stripped)
+   */
+  public static String toString(Object obj) {
+    return NEWLINE_PATTERN.matcher(XSTREAM.toXML(obj)).replaceAll("");
+  }
+  
+  /**
+   * Restores the object from its string representation.
+   * 
+   * @param str
+   *          the string representation of the object
+   * @return restored object
+   * @throws ClassCastException if the restored object is not actually of type {@code T}
+   */
+  @SuppressWarnings("unchecked") // XStream returns Object; the caller chooses T
+  public static <T> T fromString(String str) {
+    return (T) XSTREAM.fromXML(str);
+  }
+
+  /**
+   * Replaces each XML-reserved character ({@code " & < > '}) in the input with an underscore.
+   */
+  public static String escapeXML(CharSequence input) {
+    return XMLRESERVED.matcher(input).replaceAll("_");
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java
new file mode 100644
index 0000000..5ee2066
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.Serializable;
+import java.text.DecimalFormat;
+
+/**
+ * Accumulates call-timing statistics (count, min, max, sum, sum of squares) for
+ * measurements taken with {@link System#nanoTime()} via the nested {@link Call}
+ * objects. All mutation and reads are synchronized on this instance.
+ */
+public final class TimingStatistics implements Serializable {
+  private static final DecimalFormat DF = new DecimalFormat("#.##");
+  private int nCalls;
+  // All times below are in nanoseconds, as recorded by Call via System.nanoTime().
+  private long minTime;
+  private long maxTime;
+  private long sumTime;
+  // Time accumulated during the JIT warm-up phase; excluded from the statistics.
+  private long leadSumTime;
+  private double sumSquaredTime;
+
+
+  /** Creates a new instance of TimingStatistics */
+  public TimingStatistics() { }
+
+  public TimingStatistics(int nCalls, long minTime, long maxTime, long sumTime, double sumSquaredTime) {
+    this.nCalls = nCalls;
+    this.minTime = minTime;
+    this.maxTime = maxTime;
+    this.sumTime = sumTime;
+    this.sumSquaredTime = sumSquaredTime;
+  }
+
+  public synchronized int getNCalls() {
+    return nCalls;
+  }
+
+  // Clamped to zero so an uninitialized (no calls yet) minTime never reads negative.
+  public synchronized long getMinTime() {
+    return Math.max(0, minTime);
+  }
+
+  public synchronized long getMaxTime() {
+    return maxTime;
+  }
+
+  public synchronized long getSumTime() {
+    return sumTime;
+  }
+
+  public synchronized double getSumSquaredTime() {
+    return sumSquaredTime;
+  }
+
+  public synchronized long getMeanTime() {
+    return nCalls == 0 ? 0 : sumTime / nCalls;
+  }
+
+  // Population standard deviation via E[X^2] - E[X]^2.
+  public synchronized long getStdDevTime() {
+    if (nCalls == 0) {
+      return 0;
+    }
+    double mean = getMeanTime();
+    double meanSquared = mean * mean;
+    double meanOfSquares = sumSquaredTime / nCalls;
+    double variance = meanOfSquares - meanSquared;
+    if (variance < 0) {
+      return 0; // might happen due to rounding error
+    }
+    return (long) Math.sqrt(variance);
+  }
+
+  @Override
+  public synchronized String toString() {
+    return '\n'
+        + "nCalls = " + nCalls + ";\n"
+        + "sum    = " + DF.format(sumTime / 1000000000.0) + "s;\n"
+        + "min    = " + DF.format(minTime / 1000000.0) + "ms;\n"
+        + "max    = " + DF.format(maxTime / 1000000.0) + "ms;\n"
+        + "mean   = " + DF.format(getMeanTime() / 1000.0) + "us;\n"
+        + "stdDev = " + DF.format(getStdDevTime() / 1000.0) + "us;";
+  }
+
+  /** Ignores counting the performance metrics until the lead time is finished. The caller should
+   *  allow enough time for the JIT to warm up.
+   *  NOTE(review): leadSumTime accumulates nanoseconds (see LeadTimeCall.end) but the parameter
+   *  name says usec — the units look inconsistent; confirm the intended scale. */
+  public Call newCall(long leadTimeUsec) {
+    if (leadSumTime > leadTimeUsec) {
+      return new Call();
+    } else {
+      return new LeadTimeCall();
+    }
+  }
+
+  /** Ignores counting the performance metrics. The caller should allow enough time for the JIT
+   *  to warm up. */
+  public final class LeadTimeCall extends Call {
+
+    private LeadTimeCall() { }
+
+    @Override
+    public void end() {
+      long elapsed = System.nanoTime() - startTime;
+      synchronized (TimingStatistics.this) {
+        leadSumTime += elapsed;
+      }
+    }
+
+    @Override
+    public boolean end(long sumMaxUsec) {
+      end();
+      return false; // lead-time calls never signal that the budget is exhausted
+    }
+  }
+
+  /**
+   * A call object that can update performance metrics.
+   */
+  public class Call {
+    protected final long startTime = System.nanoTime();
+
+    private Call() { }
+
+    public void end() {
+      long elapsed = System.nanoTime() - startTime;
+      synchronized (TimingStatistics.this) {
+        nCalls++;
+        // nCalls == 1 seeds minTime on the first recorded call.
+        if (elapsed < minTime || nCalls == 1) {
+          minTime = elapsed;
+        }
+        if (elapsed > maxTime) {
+          maxTime = elapsed;
+        }
+        sumTime += elapsed;
+        // Note: elapsed * elapsed is long arithmetic and could overflow for very long calls.
+        sumSquaredTime += elapsed * elapsed;
+      }
+    }
+
+    /**
+     * Returns true if the sumTime has reached this limit;
+     * NOTE(review): sumTime is in nanoseconds while the parameter name says usec — confirm units.
+     */
+    public boolean end(long sumMaxUsec) {
+      end();
+      return sumMaxUsec < sumTime;
+    }
+  }
+}


[10/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java
new file mode 100644
index 0000000..f56814b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java
@@ -0,0 +1,334 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleDoubleFunction;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.stats.GlobalOnlineAuc;
+import org.apache.mahout.math.stats.OnlineAuc;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Does cross-fold validation of log-likelihood and AUC on several online logistic regression
+ * models. Each record is passed to all but one of the models for training and to the remaining
+ * model for evaluation.  In order to maintain proper segregation between the different folds across
+ * training data iterations, data should either be passed to this learner in the same order each
+ * time the training data is traversed or a tracking key such as the file offset of the training
+ * record should be passed with each training example.
+ */
+public class CrossFoldLearner extends AbstractVectorClassifier implements OnlineLearner, Writable {
+  // Count of training examples seen; also used as the tracking key when none is supplied.
+  private int record;
+  // minimum score to be used for computing log likelihood
+  private static final double MIN_SCORE = 1.0e-50;
+  private OnlineAuc auc = new GlobalOnlineAuc();
+  private double logLikelihood;
+  // One model per fold; each example trains all folds but one (see train()).
+  private final List<OnlineLogisticRegression> models = new ArrayList<>();
+
+  // lambda, learningRate, perTermOffset, perTermExponent
+  private double[] parameters = new double[4];
+  private int numFeatures;
+  private PriorFunction prior;
+  private double percentCorrect;
+
+  // Window for the moving averages of logLikelihood / percentCorrect;
+  // MAX_VALUE means average over everything seen so far.
+  private int windowSize = Integer.MAX_VALUE;
+
+  public CrossFoldLearner() {
+  }
+
+  public CrossFoldLearner(int folds, int numCategories, int numFeatures, PriorFunction prior) {
+    this.numFeatures = numFeatures;
+    this.prior = prior;
+    for (int i = 0; i < folds; i++) {
+      OnlineLogisticRegression model = new OnlineLogisticRegression(numCategories, numFeatures, prior);
+      model.alpha(1).stepOffset(0).decayExponent(0);
+      models.add(model);
+    }
+  }
+
+  // -------- builder-like configuration methods
+  // Each setter is applied to every fold's model and returns this for chaining.
+
+  public CrossFoldLearner lambda(double v) {
+    for (OnlineLogisticRegression model : models) {
+      model.lambda(v);
+    }
+    return this;
+  }
+
+  public CrossFoldLearner learningRate(double x) {
+    for (OnlineLogisticRegression model : models) {
+      model.learningRate(x);
+    }
+    return this;
+  }
+
+  public CrossFoldLearner stepOffset(int x) {
+    for (OnlineLogisticRegression model : models) {
+      model.stepOffset(x);
+    }
+    return this;
+  }
+
+  public CrossFoldLearner decayExponent(double x) {
+    for (OnlineLogisticRegression model : models) {
+      model.decayExponent(x);
+    }
+    return this;
+  }
+
+  public CrossFoldLearner alpha(double alpha) {
+    for (OnlineLogisticRegression model : models) {
+      model.alpha(alpha);
+    }
+    return this;
+  }
+
+  // -------- training methods
+  @Override
+  public void train(int actual, Vector instance) {
+    // No tracking key supplied: fall back to the running record count,
+    // which keeps fold assignment stable only if data order is stable.
+    train(record, null, actual, instance);
+  }
+
+  @Override
+  public void train(long trackingKey, int actual, Vector instance) {
+    train(trackingKey, null, actual, instance);
+  }
+
+  // The fold selected by trackingKey evaluates this example (updating
+  // logLikelihood, percentCorrect and AUC); every other fold trains on it.
+  @Override
+  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+    record++;
+    int k = 0;
+    for (OnlineLogisticRegression model : models) {
+      if (k == mod(trackingKey, models.size())) {
+        Vector v = model.classifyFull(instance);
+        // Clamp the score so Math.log never sees zero.
+        double score = Math.max(v.get(actual), MIN_SCORE);
+        logLikelihood += (Math.log(score) - logLikelihood) / Math.min(record, windowSize);
+
+        int correct = v.maxValueIndex() == actual ? 1 : 0;
+        percentCorrect += (correct - percentCorrect) / Math.min(record, windowSize);
+        if (numCategories() == 2) {
+          auc.addSample(actual, groupKey, v.get(1));
+        }
+      } else {
+        model.train(trackingKey, groupKey, actual, instance);
+      }
+      k++;
+    }
+  }
+
+  // Floor modulus: result is always in [0, y), even for negative x.
+  private static long mod(long x, int y) {
+    long r = x % y;
+    return r < 0 ? r + y : r;
+  }
+
+  @Override
+  public void close() {
+    for (OnlineLogisticRegression m : models) {
+      m.close();
+    }
+  }
+
+  public void resetLineCounter() {
+    record = 0;
+  }
+
+  // True only if every fold's model is valid.
+  public boolean validModel() {
+    boolean r = true;
+    for (OnlineLogisticRegression model : models) {
+      r &= model.validModel();
+    }
+    return r;
+  }
+
+  // -------- classification methods
+  // Each classification is the average of the per-fold model outputs.
+
+  @Override
+  public Vector classify(Vector instance) {
+    Vector r = new DenseVector(numCategories() - 1);
+    DoubleDoubleFunction scale = Functions.plusMult(1.0 / models.size());
+    for (OnlineLogisticRegression model : models) {
+      r.assign(model.classify(instance), scale);
+    }
+    return r;
+  }
+
+  @Override
+  public Vector classifyNoLink(Vector instance) {
+    Vector r = new DenseVector(numCategories() - 1);
+    DoubleDoubleFunction scale = Functions.plusMult(1.0 / models.size());
+    for (OnlineLogisticRegression model : models) {
+      r.assign(model.classifyNoLink(instance), scale);
+    }
+    return r;
+  }
+
+  @Override
+  public double classifyScalar(Vector instance) {
+    double r = 0;
+    int n = 0;
+    for (OnlineLogisticRegression model : models) {
+      n++;
+      r += model.classifyScalar(instance);
+    }
+    return r / n;
+  }
+
+  // -------- status reporting methods
+  
+  @Override
+  public int numCategories() {
+    return models.get(0).numCategories();
+  }
+
+  public double auc() {
+    return auc.auc();
+  }
+
+  public double logLikelihood() {
+    return logLikelihood;
+  }
+
+  public double percentCorrect() {
+    return percentCorrect;
+  }
+
+  // -------- evolutionary optimization
+
+  // Deep copy: each fold's model is closed (flushed) and duplicated.
+  public CrossFoldLearner copy() {
+    CrossFoldLearner r = new CrossFoldLearner(models.size(), numCategories(), numFeatures, prior);
+    r.models.clear();
+    for (OnlineLogisticRegression model : models) {
+      model.close();
+      OnlineLogisticRegression newModel =
+          new OnlineLogisticRegression(model.numCategories(), model.numFeatures(), model.prior);
+      newModel.copyFrom(model);
+      r.models.add(newModel);
+    }
+    return r;
+  }
+
+  public int getRecord() {
+    return record;
+  }
+
+  public void setRecord(int record) {
+    this.record = record;
+  }
+
+  public OnlineAuc getAucEvaluator() {
+    return auc;
+  }
+
+  public void setAucEvaluator(OnlineAuc auc) {
+    this.auc = auc;
+  }
+
+  public double getLogLikelihood() {
+    return logLikelihood;
+  }
+
+  public void setLogLikelihood(double logLikelihood) {
+    this.logLikelihood = logLikelihood;
+  }
+
+  public List<OnlineLogisticRegression> getModels() {
+    return models;
+  }
+
+  public void addModel(OnlineLogisticRegression model) {
+    models.add(model);
+  }
+
+  public double[] getParameters() {
+    return parameters;
+  }
+
+  public void setParameters(double[] parameters) {
+    this.parameters = parameters;
+  }
+
+  public int getNumFeatures() {
+    return numFeatures;
+  }
+
+  public void setNumFeatures(int numFeatures) {
+    this.numFeatures = numFeatures;
+  }
+
+  public void setWindowSize(int windowSize) {
+    this.windowSize = windowSize;
+    auc.setWindowSize(windowSize);
+  }
+
+  public PriorFunction getPrior() {
+    return prior;
+  }
+
+  public void setPrior(PriorFunction prior) {
+    this.prior = prior;
+  }
+
+  // Serialization order below must mirror readFields() exactly.
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(record);
+    PolymorphicWritable.write(out, auc);
+    out.writeDouble(logLikelihood);
+    out.writeInt(models.size());
+    for (OnlineLogisticRegression model : models) {
+      model.write(out);
+    }
+
+    for (double x : parameters) {
+      out.writeDouble(x);
+    }
+    out.writeInt(numFeatures);
+    PolymorphicWritable.write(out, prior);
+    out.writeDouble(percentCorrect);
+    out.writeInt(windowSize);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    record = in.readInt();
+    auc = PolymorphicWritable.read(in, OnlineAuc.class);
+    logLikelihood = in.readDouble();
+    int n = in.readInt();
+    for (int i = 0; i < n; i++) {
+      OnlineLogisticRegression olr = new OnlineLogisticRegression();
+      olr.readFields(in);
+      models.add(olr);
+    }
+    parameters = new double[4];
+    for (int i = 0; i < 4; i++) {
+      parameters[i] = in.readDouble();
+    }
+    numFeatures = in.readInt();
+    prior = PolymorphicWritable.read(in, PriorFunction.class);
+    percentCorrect = in.readDouble();
+    windowSize = in.readInt();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
new file mode 100644
index 0000000..dbf3198
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+
+import org.apache.commons.csv.CSVUtils;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.ContinuousValueEncoder;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
+import org.apache.mahout.vectorizer.encoders.TextValueEncoder;
+
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+/**
+ * Converts CSV data lines to vectors.
+ *
+ * Use of this class proceeds in a few steps.
+ * <ul>
+ * <li> At construction time, you tell the class about the target variable and provide
+ * a dictionary of the types of the predictor values.  At this point,
+ * the class cannot yet decode inputs because it doesn't know the fields that are in the
+ * data records, nor their order.
+ * <li> Optionally, you tell the parser object about the possible values of the target
+ * variable.  If you don't do this then you probably should set the number of distinct
+ * values so that the target variable values will be taken from a restricted range.
+ * <li> Later, when you get a list of the fields, typically from the first line of a CSV
+ * file, you tell the factory about these fields and it builds internal data structures
+ * that allow it to decode inputs.  The most important internal state is the field numbers
+ * for various fields.  After this point, you can use the factory for decoding data.
+ * <li> To encode data as a vector, you present a line of input to the factory and it
+ * mutates a vector that you provide.  The factory also retains trace information so
+ * that it can approximately reverse engineer vectors later.
+ * <li> After converting data, you can ask for an explanation of the data in terms of
+ * terms and weights.  In order to explain a vector accurately, the factory needs to
+ * have seen the particular values of categorical fields (typically during encoding vectors)
+ * and needs to have a reasonably small number of collisions in the vector encoding.
+ * </ul>
+ */
+public class CsvRecordFactory implements RecordFactory {
+  // Name used for the synthetic bias (intercept) predictor, encoded at column -1.
+  private static final String INTERCEPT_TERM = "Intercept Term";
+
+  // Maps the user-supplied type names (and their one-letter abbreviations)
+  // to the encoder class used for that kind of predictor.
+  private static final Map<String, Class<? extends FeatureVectorEncoder>> TYPE_DICTIONARY =
+          ImmutableMap.<String, Class<? extends FeatureVectorEncoder>>builder()
+                  .put("continuous", ContinuousValueEncoder.class)
+                  .put("numeric", ContinuousValueEncoder.class)
+                  .put("n", ContinuousValueEncoder.class)
+                  .put("word", StaticWordValueEncoder.class)
+                  .put("w", StaticWordValueEncoder.class)
+                  .put("text", TextValueEncoder.class)
+                  .put("t", TextValueEncoder.class)
+                  .build();
+
+  // Records which vector cells each term hashed into, for later explanation.
+  private final Map<String, Set<Integer>> traceDictionary = new TreeMap<>();
+
+  private int target;
+  private final Dictionary targetDictionary;
+  
+  // Which column is used to identify a CSV file line; -1 means no id column.
+  private String idName;
+  private int id = -1;
+
+  private List<Integer> predictors;
+  private Map<Integer, FeatureVectorEncoder> predictorEncoders;
+  private int maxTargetValue = Integer.MAX_VALUE;
+  private final String targetName;
+  private final Map<String, String> typeMap;
+  private List<String> variableNames;
+  private boolean includeBiasTerm;
+  private static final String CANNOT_CONSTRUCT_CONVERTER =
+      "Unable to construct type converter... shouldn't be possible";
+
+  /**
+   * Parse a single line of CSV-formatted text.
+   *
+   * Separated to make changing this functionality for the entire class easier
+   * in the future.  On a parse failure the whole line is returned as a single
+   * field rather than propagating the exception.
+   * @param line - CSV formatted text
+   * @return the parsed fields as a {@code List<String>}
+   */
+  private List<String> parseCsvLine(String line) {
+    try {
+      return Arrays.asList(CSVUtils.parseLine(line));
+	   }
+	   catch (IOException e) {
+      // Fall back to treating the whole line as one field.
+      List<String> list = new ArrayList<>();
+      list.add(line);
+      return list;
+   	}
+  }
+
+  private List<String> parseCsvLine(CharSequence line) {
+    return parseCsvLine(line.toString());
+  }
+
+  /**
+   * Construct a parser for CSV lines that encodes the parsed data in vector form.
+   * @param targetName            The name of the target variable.
+   * @param typeMap               A map describing the types of the predictor variables.
+   */
+  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
+    this.targetName = targetName;
+    this.typeMap = typeMap;
+    targetDictionary = new Dictionary();
+  }
+
+  /**
+   * As above, but also names the column used as a per-line identifier.
+   * @param idName  The name of the id column.
+   */
+  public CsvRecordFactory(String targetName, String idName, Map<String, String> typeMap) {
+    this(targetName, typeMap);
+    this.idName = idName;
+  }
+
+  /**
+   * Defines the values and thus the encoding of values of the target variables.  Note
+   * that any values of the target variable not present in this list will be given the
+   * value of the last member of the list.
+   * @param values  The values the target variable can have.
+   */
+  @Override
+  public void defineTargetCategories(List<String> values) {
+    Preconditions.checkArgument(
+        values.size() <= maxTargetValue,
+        "Must have less than or equal to " + maxTargetValue + " categories for target variable, but found "
+            + values.size());
+    if (maxTargetValue == Integer.MAX_VALUE) {
+      maxTargetValue = values.size();
+    }
+
+    for (String value : values) {
+      targetDictionary.intern(value);
+    }
+  }
+
+  /**
+   * Defines the number of target variable categories, but allows this parser to
+   * pick encodings for them as they appear.
+   * @param max  The number of categories that will be expected.  Once this many have been
+   * seen, all others will get the encoding max-1.
+   */
+  @Override
+  public CsvRecordFactory maxTargetValue(int max) {
+    maxTargetValue = max;
+    return this;
+  }
+
+  @Override
+  public boolean usesFirstLineAsSchema() {
+    return true;
+  }
+
+  /**
+   * Processes the first line of a file (which should contain the variable names). The target and
+   * predictor column numbers are set from the names on this line.
+   *
+   * @param line       Header line for the file.
+   */
+  @Override
+  public void firstLine(String line) {
+    // read variable names, build map of name -> column
+    final Map<String, Integer> vars = new HashMap<>();
+    variableNames = parseCsvLine(line);
+    int column = 0;
+    for (String var : variableNames) {
+      vars.put(var, column++);
+    }
+
+    // record target column and establish dictionary for decoding target
+    target = vars.get(targetName);
+    
+    // record id column
+    if (idName != null) {
+      id = vars.get(idName);
+    }
+
+    // create list of predictor column numbers
+    predictors = new ArrayList<>(Collections2.transform(typeMap.keySet(), new Function<String, Integer>() {
+      @Override
+      public Integer apply(String from) {
+        Integer r = vars.get(from);
+        Preconditions.checkArgument(r != null, "Can't find variable %s, only know about %s", from, vars);
+        return r;
+      }
+    }));
+
+    // -1 is the sentinel column for the bias (intercept) term
+    if (includeBiasTerm) {
+      predictors.add(-1);
+    }
+    Collections.sort(predictors);
+
+    // and map from column number to type encoder for each column that is a predictor
+    predictorEncoders = new HashMap<>();
+    for (Integer predictor : predictors) {
+      String name;
+      Class<? extends FeatureVectorEncoder> c;
+      if (predictor == -1) {
+        name = INTERCEPT_TERM;
+        c = ConstantValueEncoder.class;
+      } else {
+        name = variableNames.get(predictor);
+        c = TYPE_DICTIONARY.get(typeMap.get(name));
+      }
+      try {
+        Preconditions.checkArgument(c != null, "Invalid type of variable %s,  wanted one of %s",
+          typeMap.get(name), TYPE_DICTIONARY.keySet());
+        Constructor<? extends FeatureVectorEncoder> constructor = c.getConstructor(String.class);
+        Preconditions.checkArgument(constructor != null, "Can't find correct constructor for %s", typeMap.get(name));
+        FeatureVectorEncoder encoder = constructor.newInstance(name);
+        predictorEncoders.put(predictor, encoder);
+        encoder.setTraceDictionary(traceDictionary);
+      } catch (InstantiationException e) {
+        throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+      } catch (IllegalAccessException e) {
+        throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+      } catch (InvocationTargetException e) {
+        throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+      } catch (NoSuchMethodException e) {
+        throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+      }
+    }
+  }
+
+
+  /**
+   * Decodes a single line of CSV data and records the target and predictor variables in a record.
+   * As a side effect, features are added into the featureVector.  Returns the value of the target
+   * variable.
+   *
+   * @param line          The raw data.
+   * @param featureVector Where to fill in the features.  Should be zeroed before calling
+   *                      processLine.
+   * @return The value of the target variable.
+   */
+  @Override
+  public int processLine(String line, Vector featureVector) {
+    List<String> values = parseCsvLine(line);
+
+    // Unseen target values beyond the cap all collapse to the last category.
+    int targetValue = targetDictionary.intern(values.get(target));
+    if (targetValue >= maxTargetValue) {
+      targetValue = maxTargetValue - 1;
+    }
+
+    for (Integer predictor : predictors) {
+      String value;
+      if (predictor >= 0) {
+        value = values.get(predictor);
+      } else {
+        // bias term: the ConstantValueEncoder ignores the (null) value
+        value = null;
+      }
+      predictorEncoders.get(predictor).addToVector(value, featureVector);
+    }
+    return targetValue;
+  }
+  
+  /***
+   * Decodes a single line of CSV data and records the target (if returnTarget is true)
+   * and predictor variables in a record. As a side effect, features are added into the featureVector.
+   * Returns the value of the target variable. When used during classify against production data without
+   * target value, the method will be called with returnTarget = false. 
+   * @param line The raw data.
+   * @param featureVector Where to fill in the features.  Should be zeroed before calling
+   *                      processLine.
+   * @param returnTarget whether process and return target value, -1 will be returned if false.
+   * @return The value of the target variable.
+   */
+  public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) {
+    List<String> values = parseCsvLine(line);
+    int targetValue = -1;
+    if (returnTarget) {
+      targetValue = targetDictionary.intern(values.get(target));
+      if (targetValue >= maxTargetValue) {
+        targetValue = maxTargetValue - 1;
+      }
+    }
+
+    for (Integer predictor : predictors) {
+      String value = predictor >= 0 ? values.get(predictor) : null;
+      predictorEncoders.get(predictor).addToVector(value, featureVector);
+    }
+    return targetValue;
+  }
+  
+  /***
+   * Extract the raw target string from a line read from a CSV file.
+   * @param line the line of content read from CSV file
+   * @return the raw target value in the corresponding column of CSV line 
+   */
+  public String getTargetString(CharSequence line) {
+    List<String> values = parseCsvLine(line);
+    return values.get(target);
+
+  }
+
+  /***
+   * Extract the corresponding raw target label according to a code 
+   * @param code the integer code encoded during training process
+   * @return the raw target label, or null if the code is unknown
+   */  
+  public String getTargetLabel(int code) {
+    // linear scan over the dictionary; fine for the small label sets used here
+    for (String key : targetDictionary.values()) {
+      if (targetDictionary.intern(key) == code) {
+        return key;
+      }
+    }
+    return null;
+  }
+  
+  /***
+   * Extract the id column value from the CSV record
+   * @param line the line of content read from CSV file
+   * @return the id value of the CSV record
+   */
+  public String getIdString(CharSequence line) {
+    List<String> values = parseCsvLine(line);
+    return values.get(id);
+  }
+
+  /**
+   * Returns a list of the names of the predictor variables.
+   *
+   * @return A list of variable names.
+   */
+  @Override
+  public Iterable<String> getPredictors() {
+    return Lists.transform(predictors, new Function<Integer, String>() {
+      @Override
+      public String apply(Integer v) {
+        if (v >= 0) {
+          return variableNames.get(v);
+        } else {
+          return INTERCEPT_TERM;
+        }
+      }
+    });
+  }
+
+  @Override
+  public Map<String, Set<Integer>> getTraceDictionary() {
+    return traceDictionary;
+  }
+
+  @Override
+  public CsvRecordFactory includeBiasTerm(boolean useBias) {
+    includeBiasTerm = useBias;
+    return this;
+  }
+
+  /**
+   * Returns the target categories, truncated to at most maxTargetValue entries.
+   * NOTE(review): the truncation mutates the list returned by the dictionary
+   * via subList().clear() — confirm Dictionary.values() returns a copy.
+   */
+  @Override
+  public List<String> getTargetCategories() {
+    List<String> r = targetDictionary.values();
+    if (r.size() > maxTargetValue) {
+      r.subList(maxTargetValue, r.size()).clear();
+    }
+    return r;
+  }
+
+  public String getIdName() {
+    return idName;
+  }
+
+  public void setIdName(String idName) {
+    this.idName = idName;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java
new file mode 100644
index 0000000..f81d8ce
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * Implements the basic logistic training law.
+ */
+public class DefaultGradient implements Gradient {
+  /**
+   * Provides a default gradient computation useful for logistic regression.  
+   *
+   * @param groupKey     A grouping key to allow per-something AUC loss to be used for training.
+   * @param actual       The target variable value.
+   * @param instance     The current feature vector to use for gradient computation
+   * @param classifier   The classifier that can compute scores
+   * @return  The gradient to be applied to beta
+   */
+  @Override
+  public final Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) {
+    // what does the current model say?
+    Vector v = classifier.classify(instance);
+
+    // Gradient is (one-hot target) - (predicted probabilities).  Category 0 is
+    // the reference class and is represented by the all-zero target vector,
+    // which is why actual is shifted down by one.
+    Vector r = v.like();
+    if (actual != 0) {
+      r.setQuick(actual - 1, 1);
+    }
+    r.assign(v, Functions.MINUS);
+    return r;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java
new file mode 100644
index 0000000..8128370
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Implements a linear combination of L1 and L2 priors.  This can give an
+ * interesting mixture of sparsity and load-sharing between redundant predictors.
+ */
+public class ElasticBandPrior implements PriorFunction {
+  // Ratio of the L2 coefficient to the L1 coefficient.
+  private double alphaByLambda;
+  private L1 l1;
+  private L2 l2;
+
+  // Exists for Writable
+  public ElasticBandPrior() {
+    this(0.0);
+  }
+
+  public ElasticBandPrior(double alphaByLambda) {
+    this.alphaByLambda = alphaByLambda;
+    l1 = new L1();
+    l2 = new L2(1);
+  }
+
+  /**
+   * Ages a coefficient by applying the L2 decay (multiplicative shrink) followed
+   * by the L1 soft-threshold step.  The coefficient is clamped to zero rather
+   * than being allowed to cross it.
+   */
+  @Override
+  public double age(double oldValue, double generations, double learningRate) {
+    oldValue *= Math.pow(1 - alphaByLambda * learningRate, generations);
+    double newValue = oldValue - Math.signum(oldValue) * learningRate * generations;
+    if (newValue * oldValue < 0.0) {
+      // don't allow the value to change sign
+      return 0.0;
+    } else {
+      return newValue;
+    }
+  }
+
+  /**
+   * @return The (unnormalized) log prior: L1 term plus alphaByLambda-weighted L2 term.
+   */
+  @Override
+  public double logP(double betaIJ) {
+    return l1.logP(betaIJ) + alphaByLambda * l2.logP(betaIJ);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(alphaByLambda);
+    l1.write(out);
+    l2.write(out);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    alphaByLambda = in.readDouble();
+    l1 = new L1();
+    l1.readFields(in);
+    l2 = new L2();
+    l2.readFields(in);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java
new file mode 100644
index 0000000..524fc06
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Provides the ability to inject a gradient into the SGD logistic regression.
+ * Typical uses of this are to use a ranking score such as AUC instead of a
+ * normal loss function.
+ */
+public interface Gradient {
+  Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier);
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java
new file mode 100644
index 0000000..90ef7a8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java
@@ -0,0 +1,405 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+
+/**
+ * Online gradient machine learner that tries to minimize the label ranking hinge loss.
+ * Implements a gradient machine with one sigmoid hidden layer.
+ * It tries to minimize the ranking loss of some given set of labels,
+ * so this can be used for multi-class, multi-label
+ * or auto-encoding of sparse data (e.g. text).
+ */
+public class GradientMachine extends AbstractVectorClassifier implements OnlineLearner, Writable {
+
+  public static final int WRITABLE_VERSION = 1;
+
+  // the learning rate of the algorithm
+  private double learningRate = 0.1;
+
+  // the regularization term, a positive number that controls the size of the weight vector
+  private double regularization = 0.1;
+
+  // the sparsity term, a positive number that controls the sparsity of the hidden layer. (0 - 1)
+  private double sparsity = 0.1;
+
+  // the sparsity learning rate.
+  private double sparsityLearningRate = 0.1;
+
+  // the number of features
+  private int numFeatures = 10;
+  // the number of hidden nodes
+  private int numHidden = 100;
+  // the number of output nodes
+  private int numOutput = 2;
+
+  // coefficients for the input to hidden layer.
+  // There are numHidden Vectors of dimension numFeatures.
+  private Vector[] hiddenWeights;
+
+  // coefficients for the hidden to output layer.
+  // There are numOutput Vectors of dimension numHidden.
+  private Vector[] outputWeights;
+
+  // hidden unit bias
+  private Vector hiddenBias;
+
+  // output unit bias
+  private Vector outputBias;
+
+  private final Random rnd;
+
+  /**
+   * Creates a gradient machine with the given layer sizes.  All weights and
+   * biases are initialized to zero; call {@link #initWeights(Random)} to
+   * randomize them before training.
+   *
+   * @param numFeatures number of input features
+   * @param numHidden   number of hidden units
+   * @param numOutput   number of output units (labels)
+   */
+  public GradientMachine(int numFeatures, int numHidden, int numOutput) {
+    this.numFeatures = numFeatures;
+    this.numHidden = numHidden;
+    this.numOutput = numOutput;
+    hiddenWeights = new DenseVector[numHidden];
+    for (int i = 0; i < numHidden; i++) {
+      hiddenWeights[i] = new DenseVector(numFeatures);
+      hiddenWeights[i].assign(0);
+    }
+    hiddenBias = new DenseVector(numHidden);
+    hiddenBias.assign(0);
+    outputWeights = new DenseVector[numOutput];
+    for (int i = 0; i < numOutput; i++) {
+      outputWeights[i] = new DenseVector(numHidden);
+      outputWeights[i].assign(0);
+    }
+    outputBias = new DenseVector(numOutput);
+    outputBias.assign(0);
+    rnd = RandomUtils.getRandom();
+  }
+
+  /**
+   * Initialize weights.  Each weight is drawn uniformly from
+   * [-1/sqrt(fanIn), +1/sqrt(fanIn)] where fanIn is the size of the layer
+   * feeding the weight.  Biases are left untouched.
+   *
+   * @param gen random number generator.
+   */
+  public void initWeights(Random gen) {
+    double hiddenFanIn = 1.0 / Math.sqrt(numFeatures);
+    for (int i = 0; i < numHidden; i++) {
+      for (int j = 0; j < numFeatures; j++) {
+        double val = (2.0 * gen.nextDouble() - 1.0) * hiddenFanIn;
+        hiddenWeights[i].setQuick(j, val);
+      }
+    }
+    double outputFanIn = 1.0 / Math.sqrt(numHidden);
+    for (int i = 0; i < numOutput; i++) {
+      for (int j = 0; j < numHidden; j++) {
+        double val = (2.0 * gen.nextDouble() - 1.0) * outputFanIn;
+        outputWeights[i].setQuick(j, val);
+      }
+    }
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param learningRate New value of initial learning rate.
+   * @return This, so other configurations can be chained.
+   */
+  public GradientMachine learningRate(double learningRate) {
+    this.learningRate = learningRate;
+    return this;
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param regularization A positive value that controls the weight vector size.
+   * @return This, so other configurations can be chained.
+   */
+  public GradientMachine regularization(double regularization) {
+    this.regularization = regularization;
+    return this;
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param sparsity A value between zero and one that controls the fraction of hidden units
+   *                 that are activated on average.
+   * @return This, so other configurations can be chained.
+   */
+  public GradientMachine sparsity(double sparsity) {
+    this.sparsity = sparsity;
+    return this;
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param sparsityLearningRate New value of initial learning rate for sparsity.
+   * @return This, so other configurations can be chained.
+   */
+  public GradientMachine sparsityLearningRate(double sparsityLearningRate) {
+    this.sparsityLearningRate = sparsityLearningRate;
+    return this;
+  }
+
+  /**
+   * Deep-copies all hyper-parameters and weights from another machine into
+   * this one.  Weight vectors are cloned so the two machines share no state.
+   */
+  public void copyFrom(GradientMachine other) {
+    numFeatures = other.numFeatures;
+    numHidden = other.numHidden;
+    numOutput = other.numOutput;
+    learningRate = other.learningRate;
+    regularization = other.regularization;
+    sparsity = other.sparsity;
+    sparsityLearningRate = other.sparsityLearningRate;
+    hiddenWeights = new DenseVector[numHidden];
+    for (int i = 0; i < numHidden; i++) {
+      hiddenWeights[i] = other.hiddenWeights[i].clone();
+    }
+    hiddenBias = other.hiddenBias.clone();
+    outputWeights = new DenseVector[numOutput];
+    for (int i = 0; i < numOutput; i++) {
+      outputWeights[i] = other.outputWeights[i].clone();
+    }
+    outputBias = other.outputBias.clone();
+  }
+
+  /** @return The number of output categories (equal to the number of output units). */
+  @Override
+  public int numCategories() {
+    return numOutput;
+  }
+
+  /** @return The number of input features this machine expects. */
+  public int numFeatures() {
+    return numFeatures;
+  }
+
+  /** @return The number of hidden units. */
+  public int numHidden() {
+    return numHidden;
+  }
+
+  /**
+   * Feeds forward from input to hidden unit..
+   *
+   * @return Hidden unit activations.
+   */
+  public DenseVector inputToHidden(Vector input) {
+    DenseVector activations = new DenseVector(numHidden);
+    for (int i = 0; i < numHidden; i++) {
+      activations.setQuick(i, hiddenWeights[i].dot(input));
+    }
+    activations.assign(hiddenBias, Functions.PLUS);
+    // clamp pre-activations to [-40, 40] so the sigmoid can't over/underflow
+    activations.assign(Functions.min(40.0)).assign(Functions.max(-40));
+    activations.assign(Functions.SIGMOID);
+    return activations;
+  }
+
+  /**
+   * Feeds forward from hidden to output.  Output units are linear (no
+   * squashing function is applied).
+   *
+   * @return Output unit activations.
+   */
+  public DenseVector hiddenToOutput(Vector hiddenActivation) {
+    DenseVector activations = new DenseVector(numOutput);
+    for (int i = 0; i < numOutput; i++) {
+      activations.setQuick(i, outputWeights[i].dot(hiddenActivation));
+    }
+    activations.assign(outputBias, Functions.PLUS);
+    return activations;
+  }
+
+  /**
+   * Updates using ranking loss.  For every good label, the highest-scoring bad
+   * label found in {@code numTrials} random draws is pushed below it by at
+   * least a margin of 1 (hinge loss); weights and biases of both layers are
+   * adjusted accordingly.
+   *
+   * @param hiddenActivation the hidden unit's activation
+   * @param goodLabels       the labels you want ranked above others.
+   * @param numTrials        how many times you want to search for the highest scoring bad label.
+   * @param gen              Random number generator.
+   */
+  public void updateRanking(Vector hiddenActivation,
+                            Collection<Integer> goodLabels,
+                            int numTrials,
+                            Random gen) {
+    // All the labels are good, do nothing.
+    if (goodLabels.size() >= numOutput) {
+      return;
+    }
+    for (Integer good : goodLabels) {
+      double goodScore = outputWeights[good].dot(hiddenActivation);
+      int highestBad = -1;
+      double highestBadScore = Double.NEGATIVE_INFINITY;
+      for (int i = 0; i < numTrials; i++) {
+        // rejection-sample a label that is not in the good set
+        int bad = gen.nextInt(numOutput);
+        while (goodLabels.contains(bad)) {
+          bad = gen.nextInt(numOutput);
+        }
+        double badScore = outputWeights[bad].dot(hiddenActivation);
+        if (badScore > highestBadScore) {
+          highestBadScore = badScore;
+          highestBad = bad;
+        }
+      }
+      int bad = highestBad;
+      // hinge loss: only update when the margin of 1 is violated
+      double loss = 1.0 - goodScore + highestBadScore;
+      if (loss < 0.0) {
+        continue;
+      }
+      // Note from the loss above the gradient dloss/dy , y being the label is -1 for good
+      // and +1 for bad.
+      // dy / dw is just w since  y = x' * w + b.
+      // Hence by the chain rule, dloss / dw = dloss / dy * dy / dw = -w.
+      // For the regularization part, 0.5 * lambda * w' w, the gradient is lambda * w.
+      // dy / db = 1.
+      Vector gradGood = outputWeights[good].clone();
+      gradGood.assign(Functions.NEGATE);
+      Vector propHidden = gradGood.clone();
+      Vector gradBad = outputWeights[bad].clone();
+      propHidden.assign(gradBad, Functions.PLUS);
+      gradGood.assign(Functions.mult(-learningRate * (1.0 - regularization)));
+      outputWeights[good].assign(gradGood, Functions.PLUS);
+      gradBad.assign(Functions.mult(-learningRate * (1.0 + regularization)));
+      outputWeights[bad].assign(gradBad, Functions.PLUS);
+      outputBias.setQuick(good, outputBias.get(good) + learningRate);
+      outputBias.setQuick(bad, outputBias.get(bad) - learningRate);
+      // Gradient of sigmoid is s * (1 -s).
+      Vector gradSig = hiddenActivation.clone();
+      gradSig.assign(Functions.SIGMOIDGRADIENT);
+      // Multiply by the change caused by the ranking loss.
+      for (int i = 0; i < numHidden; i++) {
+        gradSig.setQuick(i, gradSig.get(i) * propHidden.get(i));
+      }
+      // Back-propagate into the input-to-hidden weights with L2 regularization.
+      // NOTE(review): the update uses gradSig alone, without multiplying by the
+      // input feature value — confirm this is intentional for sparse inputs.
+      for (int i = 0; i < numHidden; i++) {
+        for (int j = 0; j < numFeatures; j++) {
+          double v = hiddenWeights[i].get(j);
+          v -= learningRate * (gradSig.get(i) + regularization * v);
+          hiddenWeights[i].setQuick(j, v);
+        }
+      }
+    }
+  }
+
+  @Override
+  public Vector classify(Vector instance) {
+    Vector result = classifyNoLink(instance);
+    // Find the max value's index.
+    int max = result.maxValueIndex();
+    result.assign(0);
+    result.setQuick(max, 1.0);
+    return result.viewPart(1, result.size() - 1);
+  }
+
+  @Override
+  public Vector classifyNoLink(Vector instance) {
+    DenseVector hidden = inputToHidden(instance);
+    return hiddenToOutput(hidden);
+  }
+
+  @Override
+  public double classifyScalar(Vector instance) {
+    Vector output = classifyNoLink(instance);
+    if (output.get(0) > output.get(1)) {
+      return 0;
+    }
+    return 1;
+  }
+
+  public GradientMachine copy() {
+    close();
+    GradientMachine r = new GradientMachine(numFeatures(), numHidden(), numCategories());
+    r.copyFrom(this);
+    return r;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(WRITABLE_VERSION);
+    out.writeDouble(learningRate);
+    out.writeDouble(regularization);
+    out.writeDouble(sparsity);
+    out.writeDouble(sparsityLearningRate);
+    out.writeInt(numFeatures);
+    out.writeInt(numHidden);
+    out.writeInt(numOutput);
+    VectorWritable.writeVector(out, hiddenBias);
+    for (int i = 0; i < numHidden; i++) {
+      VectorWritable.writeVector(out, hiddenWeights[i]);
+    }
+    VectorWritable.writeVector(out, outputBias);
+    for (int i = 0; i < numOutput; i++) {
+      VectorWritable.writeVector(out, outputWeights[i]);
+    }
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int version = in.readInt();
+    if (version == WRITABLE_VERSION) {
+      learningRate = in.readDouble();
+      regularization = in.readDouble();
+      sparsity = in.readDouble();
+      sparsityLearningRate = in.readDouble();
+      numFeatures = in.readInt();
+      numHidden = in.readInt();
+      numOutput = in.readInt();
+      hiddenWeights = new DenseVector[numHidden];
+      hiddenBias = VectorWritable.readVector(in);
+      for (int i = 0; i < numHidden; i++) {
+        hiddenWeights[i] = VectorWritable.readVector(in);
+      }
+      outputWeights = new DenseVector[numOutput];
+      outputBias = VectorWritable.readVector(in);
+      for (int i = 0; i < numOutput; i++) {
+        outputWeights[i] = VectorWritable.readVector(in);
+      }
+    } else {
+      throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version);
+    }
+  }
+
  /** No-op: an online learner holds no resources that need releasing. */
  @Override
  public void close() {
    // This is an online classifier, nothing to do.
  }
+
+  @Override
+  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+    Vector hiddenActivation = inputToHidden(instance);
+    hiddenToOutput(hiddenActivation);
+    Collection<Integer> goodLabels = new HashSet<>();
+    goodLabels.add(actual);
+    updateRanking(hiddenActivation, goodLabels, 2, rnd);
+  }
+
+  @Override
+  public void train(long trackingKey, int actual, Vector instance) {
+    train(trackingKey, null, actual, instance);
+  }
+
+  @Override
+  public void train(int actual, Vector instance) {
+    train(0, null, actual, instance);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java
new file mode 100644
index 0000000..28a05f2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Implements the Laplacian or bi-exponential prior.  This prior has a strong tendency to set coefficients to zero
+ * and thus is useful as an alternative to variable selection.  This version implements truncation which prevents
+ * a coefficient from changing sign.  If a correction would change the sign, the coefficient is truncated to zero.
+ *
+ * Note that it doesn't matter to have a scale for this distribution because after taking the derivative of the logP,
+ * the lambda coefficient used to combine the prior with the observations has the same effect.  If we had a scale here,
+ * then it would be the same effect as just changing lambda.
+ */
+public class L1 implements PriorFunction {
+  @Override
+  public double age(double oldValue, double generations, double learningRate) {
+    double newValue = oldValue - Math.signum(oldValue) * learningRate * generations;
+    if (newValue * oldValue < 0) {
+      // don't allow the value to change sign
+      return 0;
+    } else {
+      return newValue;
+    }
+  }
+
+  @Override
+  public double logP(double betaIJ) {
+    return -Math.abs(betaIJ);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    // stateless class has nothing to serialize
+  }
+
+  @Override
+  public void readFields(DataInput dataInput) throws IOException {
+    // stateless class has nothing to serialize
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java
new file mode 100644
index 0000000..3dfb9fc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Implements the Gaussian prior.  This prior has a tendency to decrease large coefficients toward zero, but
+ * doesn't tend to set them to exactly zero.
+ */
+public class L2 implements PriorFunction {
+
+  private static final double HALF_LOG_2PI = Math.log(2.0 * Math.PI) / 2.0;
+
+  private double s2;
+  private double s;
+
+  public L2(double scale) {
+    s = scale;
+    s2 = scale * scale;
+  }
+
+  public L2() {
+    s = 1.0;
+    s2 = 1.0;
+  }
+
+  @Override
+  public double age(double oldValue, double generations, double learningRate) {
+    return oldValue * Math.pow(1.0 - learningRate / s2, generations);
+  }
+
+  @Override
+  public double logP(double betaIJ) {
+    return -betaIJ * betaIJ / s2 / 2.0 - Math.log(s) - HALF_LOG_2PI;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(s2);
+    out.writeDouble(s);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    s2 = in.readDouble();
+    s = in.readDouble();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java
new file mode 100644
index 0000000..a290b22
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Vector;
+
+import java.util.Random;
+
+/**
+ * <p>Provides a stochastic mixture of ranking updates and normal logistic updates. This uses a
+ * combination of AUC driven learning to improve ranking performance and traditional log-loss driven
+ * learning to improve log-likelihood.</p>
+ *
+ * <p>See www.eecs.tufts.edu/~dsculley/papers/combined-ranking-and-regression.pdf</p>
+ *
+ * <p>This implementation only makes sense for the binomial case.</p>
+ */
+public class MixedGradient implements Gradient {
+
+  private final double alpha;
+  private final RankingGradient rank;
+  private final Gradient basic;
+  private final Random random = RandomUtils.getRandom();
+  private boolean hasZero;
+  private boolean hasOne;
+
+  public MixedGradient(double alpha, int window) {
+    this.alpha = alpha;
+    this.rank = new RankingGradient(window);
+    this.basic = this.rank.getBaseGradient();
+  }
+
+  @Override
+  public Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) {
+    if (random.nextDouble() < alpha) {
+      // one option is to apply a ranking update relative to our recent history
+      if (!hasZero || !hasOne) {
+        throw new IllegalStateException();
+      }
+      return rank.apply(groupKey, actual, instance, classifier);
+    } else {
+      hasZero |= actual == 0;
+      hasOne |= actual == 1;
+      // the other option is a normal update, but we have to update our history on the way
+      rank.addToHistory(actual, instance);
+      return basic.apply(groupKey, actual, instance, classifier);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java
new file mode 100644
index 0000000..bcd2ebc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.Ordering;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Vector;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.Queue;
+import java.util.Set;
+
+/**
+ * Uses sample data to reverse engineer a feature-hashed model.
+ *
+ * The result gives approximate weights for features and interactions
+ * in the original space.
+ *
+ * The idea is that the hashed encoders have the option of having a trace dictionary.  This
+ * tells us where each feature is hashed to, or each feature/value combination in the case
+ * of word-like values.  Using this dictionary, we can put values into a synthetic feature
+ * vector in just the locations specified by a single feature or interaction.  Then we can
+ * push this through a linear part of a model to see the contribution of that input. For
+ * any generalized linear model like logistic regression, there is a linear part of the
+ * model that allows this.
+ *
+ * What the ModelDissector does is to accept a trace dictionary and a model in an update
+ * method.  It figures out the weights for the elements in the trace dictionary and stashes
+ * them.  Then in a summary method, the biggest weights are returned.  This update/flush
+ * style is used so that the trace dictionary doesn't have to grow to enormous levels,
+ * but instead can be cleared between updates.
+ */
public class ModelDissector {

  // feature name -> vector of per-category scores observed when only that
  // feature's hashed locations are turned on
  private final Map<String,Vector> weightMap;

  public ModelDissector() {
    weightMap = new HashMap<>();
  }

  /**
   * Probes a model to determine the effect of a particular variable.  This is done
   * with the aid of a trace dictionary which has recorded the locations in the feature
   * vector that are modified by various variable values.  We can set these locations to
   * 1 and then look at the resulting score.  This tells us the weight the model places
   * on that variable.
   * @param features               A feature vector to use (destructively)
   * @param traceDictionary        A trace dictionary containing variables and what locations
   *                               in the feature vector are affected by them
   * @param learner                The model that we are probing to find weights on features
   */

  public void update(Vector features, Map<String, Set<Integer>> traceDictionary, AbstractVectorClassifier learner) {
    // zero out feature vector
    features.assign(0);
    for (Map.Entry<String, Set<Integer>> entry : traceDictionary.entrySet()) {
      // get a feature and locations where it is stored in the feature vector
      String key = entry.getKey();
      Set<Integer> value = entry.getValue();

      // if we haven't looked at this feature yet
      if (!weightMap.containsKey(key)) {
        // put probe values in the feature vector
        for (Integer where : value) {
          features.set(where, 1);
        }

        // see what the model says
        Vector v = learner.classifyNoLink(features);
        weightMap.put(key, v);

        // and zero out those locations again
        for (Integer where : value) {
          features.set(where, 0);
        }
      }
    }
  }

  /**
   * Returns the n most important features with their
   * weights, most important category and the top few
   * categories that they affect.
   * @param n      How many results to return.
   * @return       A list of the top variables.
   */
  public List<Weight> summary(int n) {
    // Min-heap of size <= n: polling evicts the currently-smallest weight, so
    // the heap retains the n largest weights seen so far.
    Queue<Weight> pq = new PriorityQueue<>();
    for (Map.Entry<String, Vector> entry : weightMap.entrySet()) {
      pq.add(new Weight(entry.getKey(), entry.getValue()));
      while (pq.size() > n) {
        pq.poll();
      }
    }
    // sort descending by importance for presentation
    List<Weight> r = new ArrayList<>(pq);
    Collections.sort(r, Ordering.natural().reverse());
    return r;
  }

  // A (category index, weight) pair ordered primarily by |weight|; ties are
  // broken by index (descending) so the ordering is deterministic.
  private static final class Category implements Comparable<Category> {
    private final int index;
    private final double weight;

    private Category(int index, double weight) {
      this.index = index;
      this.weight = weight;
    }

    @Override
    public int compareTo(Category o) {
      // compare magnitudes only; the sign of the weight is preserved in the
      // data but deliberately ignored for ranking
      int r = Double.compare(Math.abs(weight), Math.abs(o.weight));
      if (r == 0) {
        if (o.index < index) {
          return -1;
        }
        if (o.index > index) {
          return 1;
        }
        return 0;
      }
      return r;
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof Category)) {
        return false;
      }
      Category other = (Category) o;
      return index == other.index && weight == other.weight;
    }

    @Override
    public int hashCode() {
      return RandomUtils.hashDouble(weight) ^ index;
    }

  }

  /**
   * The importance of a single feature: the signed weight of the category it
   * affects most strongly, plus the top few categories it affects.
   */
  public static class Weight implements Comparable<Weight> {
    private final String feature;
    // signed weight of the most strongly affected category
    private final double value;
    // index of the most strongly affected category
    private final int maxIndex;
    // the top-n affected categories, strongest first
    private final List<Category> categories;

    public Weight(String feature, Vector weights) {
      this(feature, weights, 3);
    }

    public Weight(String feature, Vector weights, int n) {
      this.feature = feature;
      // pick out the weight with the largest abs value, but don't forget the sign
      Queue<Category> biggest = new PriorityQueue<>(n + 1, Ordering.natural());
      for (Vector.Element element : weights.all()) {
        biggest.add(new Category(element.index(), element.get()));
        while (biggest.size() > n) {
          biggest.poll();
        }
      }
      categories = new ArrayList<>(biggest);
      Collections.sort(categories, Ordering.natural().reverse());
      value = categories.get(0).weight;
      maxIndex = categories.get(0).index;
    }

    @Override
    public int compareTo(Weight other) {
      // magnitude first; break ties lexically by feature name for determinism
      int r = Double.compare(Math.abs(this.value), Math.abs(other.value));
      if (r == 0) {
        return feature.compareTo(other.feature);
      }
      return r;
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof Weight)) {
        return false;
      }
      Weight other = (Weight) o;
      return feature.equals(other.feature)
          && value == other.value
          && maxIndex == other.maxIndex
          && categories.equals(other.categories);
    }

    @Override
    public int hashCode() {
      return feature.hashCode() ^ RandomUtils.hashDouble(value) ^ maxIndex ^ categories.hashCode();
    }

    public String getFeature() {
      return feature;
    }

    public double getWeight() {
      return value;
    }

    public double getWeight(int n) {
      return categories.get(n).weight;
    }

    // NOTE: returns the category index widened to double; kept as-is for API compatibility
    public double getCategory(int n) {
      return categories.get(n).index;
    }

    public int getMaxImpact() {
      return maxIndex;
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
new file mode 100644
index 0000000..f89b245
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Provides the ability to store SGD model-related objects as binary files.
+ */
+public final class ModelSerializer {
+
+  // static class ... don't instantiate
+  private ModelSerializer() {
+  }
+
+  public static void writeBinary(String path, CrossFoldLearner model) throws IOException {
+    try (DataOutputStream out = new DataOutputStream(new FileOutputStream(path))) {
+      PolymorphicWritable.write(out, model);
+    }
+  }
+
+  public static void writeBinary(String path, OnlineLogisticRegression model) throws IOException {
+    try (DataOutputStream out = new DataOutputStream(new FileOutputStream(path))) {
+      PolymorphicWritable.write(out, model);
+    }
+  }
+
+  public static void writeBinary(String path, AdaptiveLogisticRegression model) throws IOException {
+    try (DataOutputStream out = new DataOutputStream(new FileOutputStream(path))){
+      PolymorphicWritable.write(out, model);
+    }
+  }
+
+  public static <T extends Writable> T readBinary(InputStream in, Class<T> clazz) throws IOException {
+    DataInput dataIn = new DataInputStream(in);
+    try {
+      return PolymorphicWritable.read(dataIn, clazz);
+    } finally {
+      Closeables.close(in, false);
+    }
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java
new file mode 100644
index 0000000..7a9ca83
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Extends the basic on-line logistic regression learner with a specific set of learning
+ * rate annealing schedules.
+ */
public class OnlineLogisticRegression extends AbstractOnlineLogisticRegression implements Writable {
  // version tag written first by write(); bump whenever the serialized layout changes
  public static final int WRITABLE_VERSION = 1;

  // these next two control decayFactor^steps exponential type of annealing
  // learning rate and decay factor
  private double mu0 = 1;
  private double decayFactor = 1 - 1.0e-3;

  // these next two control 1/steps^forget type annealing
  private int stepOffset = 10;
  // -1 equals even weighting of all examples, 0 means only use exponential annealing
  private double forgettingExponent = -0.5;

  // controls how per term annealing works
  private int perTermAnnealingOffset = 20;

  public OnlineLogisticRegression() {
    // private constructor available for serialization, but not normal use
  }

  public OnlineLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) {
    this.numCategories = numCategories;
    this.prior = prior;

    updateSteps = new DenseVector(numFeatures);
    // start per-term counts at the annealing offset so early per-term rates are bounded
    updateCounts = new DenseVector(numFeatures).assign(perTermAnnealingOffset);
    beta = new DenseMatrix(numCategories - 1, numFeatures);
  }

  /**
   * Chainable configuration option.
   *
   * @param alpha New value of decayFactor, the exponential decay rate for the learning rate.
   * @return This, so other configurations can be chained.
   */
  public OnlineLogisticRegression alpha(double alpha) {
    this.decayFactor = alpha;
    return this;
  }

  @Override
  public OnlineLogisticRegression lambda(double lambda) {
    // we only over-ride this to provide a more restrictive return type
    super.lambda(lambda);
    return this;
  }

  /**
   * Chainable configuration option.
   *
   * @param learningRate New value of initial learning rate.
   * @return This, so other configurations can be chained.
   */
  public OnlineLogisticRegression learningRate(double learningRate) {
    this.mu0 = learningRate;
    return this;
  }

  /**
   * Chainable configuration option.
   *
   * @param stepOffset New offset added to the step count in the 1/steps^forget schedule.
   * @return This, so other configurations can be chained.
   */
  public OnlineLogisticRegression stepOffset(int stepOffset) {
    this.stepOffset = stepOffset;
    return this;
  }

  /**
   * Chainable configuration option.  Positive values are silently negated so
   * the stored exponent never grows the learning rate over time.
   *
   * @param decayExponent New value of forgettingExponent for the 1/steps^forget schedule.
   * @return This, so other configurations can be chained.
   */
  public OnlineLogisticRegression decayExponent(double decayExponent) {
    if (decayExponent > 0) {
      decayExponent = -decayExponent;
    }
    this.forgettingExponent = decayExponent;
    return this;
  }


  /**
   * Per-term rate sqrt(offset / updateCount(j)): rarely-updated terms keep a
   * larger learning rate than frequently-updated ones.
   */
  @Override
  public double perTermLearningRate(int j) {
    return Math.sqrt(perTermAnnealingOffset / updateCounts.get(j));
  }

  /**
   * Combines the exponential schedule (decayFactor^step) with the polynomial
   * schedule ((step + stepOffset)^forgettingExponent), scaled by mu0.
   */
  @Override
  public double currentLearningRate() {
    return mu0 * Math.pow(decayFactor, getStep()) * Math.pow(getStep() + stepOffset, forgettingExponent);
  }

  /** Copies both the base learner state and this class's annealing configuration. */
  public void copyFrom(OnlineLogisticRegression other) {
    super.copyFrom(other);
    mu0 = other.mu0;
    decayFactor = other.decayFactor;

    stepOffset = other.stepOffset;
    forgettingExponent = other.forgettingExponent;

    perTermAnnealingOffset = other.perTermAnnealingOffset;
  }

  /** Returns a closed, independent copy of this learner. */
  public OnlineLogisticRegression copy() {
    close();
    OnlineLogisticRegression r = new OnlineLogisticRegression(numCategories(), numFeatures(), prior);
    r.copyFrom(this);
    return r;
  }

  // Serialization order here must mirror readFields() exactly.
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(WRITABLE_VERSION);
    out.writeDouble(mu0);
    out.writeDouble(getLambda());
    out.writeDouble(decayFactor);
    out.writeInt(stepOffset);
    out.writeInt(step);
    out.writeDouble(forgettingExponent);
    out.writeInt(perTermAnnealingOffset);
    out.writeInt(numCategories);
    MatrixWritable.writeMatrix(out, beta);
    PolymorphicWritable.write(out, prior);
    VectorWritable.writeVector(out, updateCounts);
    VectorWritable.writeVector(out, updateSteps);
  }

  // Deserialization order here must mirror write() exactly.
  @Override
  public void readFields(DataInput in) throws IOException {
    int version = in.readInt();
    if (version == WRITABLE_VERSION) {
      mu0 = in.readDouble();
      lambda(in.readDouble());
      decayFactor = in.readDouble();
      stepOffset = in.readInt();
      step = in.readInt();
      forgettingExponent = in.readDouble();
      perTermAnnealingOffset = in.readInt();
      numCategories = in.readInt();
      beta = MatrixWritable.readMatrix(in);
      prior = PolymorphicWritable.read(in, PriorFunction.class);

      updateCounts = VectorWritable.readVector(in);
      updateSteps = VectorWritable.readVector(in);
    } else {
      throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version);
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java
new file mode 100644
index 0000000..c51361c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Online passive aggressive learner that tries to minimize the label ranking hinge loss.
+ * Implements a multi-class linear classifier minimizing rank loss.
+ *  based on "Online Passive-Aggressive Algorithms" by Crammer et al., 2006.
+ *  Note: It's better to use classifyNoLink because the loss function is based
+ *  on ensuring that the score of the good label is larger than the next
+ *  highest label by some margin. The conversion to probability is just done
+ *  by exponentiating and dividing by the sum and is empirical at best.
+ *  Your features should be pre-normalized to some sensible range, for example,
+ *  by subtracting the mean and dividing by the standard deviation, if they
+ *  are very different in magnitude from each other.
+ */
+public class PassiveAggressive extends AbstractVectorClassifier implements OnlineLearner, Writable {
+
+  private static final Logger log = LoggerFactory.getLogger(PassiveAggressive.class);
+
+  public static final int WRITABLE_VERSION = 1;
+
+  // the learning rate of the algorithm; larger values make updates more aggressive
+  private double learningRate = 0.1;
+
+  // rolling loss statistics, logged and reset roughly every 1000 training examples
+  private int lossCount = 0;
+  private double lossSum = 0;
+
+  // coefficients for the classification.  This is a dense matrix
+  // that is numCategories x numFeatures
+  private Matrix weights;
+
+  // number of categories we are classifying.
+  private int numCategories;
+
+  /**
+   * Creates a zero-initialized passive aggressive learner.
+   *
+   * @param numCategories Number of classes to discriminate between.
+   * @param numFeatures   Dimension of the input feature vectors.
+   */
+  public PassiveAggressive(int numCategories, int numFeatures) {
+    this.numCategories = numCategories;
+    weights = new DenseMatrix(numCategories, numFeatures);
+    weights.assign(0.0);
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param learningRate New value of initial learning rate.
+   * @return This, so other configurations can be chained.
+   */
+  public PassiveAggressive learningRate(double learningRate) {
+    this.learningRate = learningRate;
+    return this;
+  }
+
+  /**
+   * Copies configuration and model state from another learner.  The weight
+   * matrix is cloned so that the two learners can subsequently be trained
+   * independently; previously the matrix was shared by reference, so training
+   * the copy silently mutated the original.
+   *
+   * @param other The learner to copy from.
+   */
+  public void copyFrom(PassiveAggressive other) {
+    learningRate = other.learningRate;
+    numCategories = other.numCategories;
+    weights = other.weights.clone();
+  }
+
+  @Override
+  public int numCategories() {
+    return numCategories;
+  }
+
+  /**
+   * Returns per-category probabilities for categories 1..numCategories-1
+   * (category 0 is implied, since all probabilities sum to one).  The
+   * probabilities are derived from raw scores by soft-max style
+   * exponentiation, which is empirical at best; prefer
+   * {@link #classifyNoLink(Vector)} when ranking scores suffice.
+   */
+  @Override
+  public Vector classify(Vector instance) {
+    Vector result = classifyNoLink(instance);
+    // Convert to probabilities by exponentiation.  Subtracting the max first
+    // avoids overflow in exp().
+    double max = result.maxValue();
+    result.assign(Functions.minus(max)).assign(Functions.EXP);
+    result = result.divide(result.norm(1));
+
+    return result.viewPart(1, result.size() - 1);
+  }
+
+  /**
+   * Returns the raw linear score for each category (no link function applied).
+   */
+  @Override
+  public Vector classifyNoLink(Vector instance) {
+    Vector result = new DenseVector(weights.numRows());
+    result.assign(0);
+    for (int i = 0; i < weights.numRows(); i++) {
+      result.setQuick(i, weights.viewRow(i).dot(instance));
+    }
+    return result;
+  }
+
+  /**
+   * Binary convenience method: probability of category 1, computed as a
+   * two-way soft-max over the first two weight rows.
+   */
+  @Override
+  public double classifyScalar(Vector instance) {
+    double v1 = weights.viewRow(0).dot(instance);
+    double v2 = weights.viewRow(1).dot(instance);
+    v1 = Math.exp(v1);
+    v2 = Math.exp(v2);
+    return v2 / (v1 + v2);
+  }
+
+  public int numFeatures() {
+    return weights.numCols();
+  }
+
+  /**
+   * Returns an independent copy of this learner with identical configuration
+   * and model state.
+   */
+  public PassiveAggressive copy() {
+    close();
+    PassiveAggressive r = new PassiveAggressive(numCategories(), numFeatures());
+    r.copyFrom(this);
+    return r;
+  }
+
+  /**
+   * Serializes this model: a version marker, the learning rate, the category
+   * count and the weight matrix.  Field order must match
+   * {@link #readFields(DataInput)}.
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(WRITABLE_VERSION);
+    out.writeDouble(learningRate);
+    out.writeInt(numCategories);
+    MatrixWritable.writeMatrix(out, weights);
+  }
+
+  /**
+   * Restores a model written by {@link #write(DataOutput)}, rejecting streams
+   * with an unexpected version marker.
+   *
+   * @throws IOException if the stream fails or the version does not match
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int version = in.readInt();
+    if (version == WRITABLE_VERSION) {
+      learningRate = in.readDouble();
+      numCategories = in.readInt();
+      weights = MatrixWritable.readMatrix(in);
+    } else {
+      throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version);
+    }
+  }
+
+  @Override
+  public void close() {
+      // This is an online classifier, nothing to do.
+  }
+
+  /**
+   * Applies one multi-class passive aggressive update: if the true label does
+   * not beat the runner-up by a margin of 1, both weight rows are moved by a
+   * loss-proportional step.
+   *
+   * @param trackingKey Unused here; present for the OnlineLearner contract.
+   * @param groupKey    Unused here; present for the OnlineLearner contract.
+   * @param actual      Index of the true category.
+   * @param instance    Feature vector of the example.
+   */
+  @Override
+  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+    if (lossCount > 1000) {
+      log.info("Avg. Loss = {}", lossSum / lossCount);
+      lossCount = 0;
+      lossSum = 0;
+    }
+    Vector result = classifyNoLink(instance);
+    double myScore = result.get(actual);
+    // Find the highest score that is not actual.
+    int otherIndex = result.maxValueIndex();
+    double otherValue = result.get(otherIndex);
+    if (otherIndex == actual) {
+      result.setQuick(otherIndex, Double.NEGATIVE_INFINITY);
+      otherIndex = result.maxValueIndex();
+      otherValue = result.get(otherIndex);
+    }
+    // Hinge loss on the margin between the true label and the runner-up.
+    double loss = 1.0 - myScore + otherValue;
+    lossCount += 1;
+    if (loss >= 0) {
+      lossSum += loss;
+      // Step size from the PA update rule; 0.5 / learningRate acts as the
+      // aggressiveness regularizer on the instance norm.
+      double tau = loss / (instance.dot(instance) + 0.5 / learningRate);
+      Vector delta = instance.clone();
+      delta.assign(Functions.mult(tau));
+      // Push the true label's weights toward the instance ...
+      weights.viewRow(actual).assign(delta, Functions.PLUS);
+      // ... and the runner-up's weights away from it.
+      delta.assign(Functions.mult(-1));
+      weights.viewRow(otherIndex).assign(delta, Functions.PLUS);
+    }
+  }
+
+  @Override
+  public void train(long trackingKey, int actual, Vector instance) {
+    train(trackingKey, null, actual, instance);
+  }
+
+  @Override
+  public void train(int actual, Vector instance) {
+    train(0, null, actual, instance);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java
new file mode 100644
index 0000000..90062a6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.ClassUtils;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Utilities that write a class name and then serialize using writables.
+ */
+/**
+ * Serialization helpers for values whose concrete type is only known at
+ * runtime: the class name is written ahead of the Writable payload so the
+ * reader can instantiate the right implementation.
+ */
+public final class PolymorphicWritable {
+
+  private PolymorphicWritable() {
+    // static utility class, never instantiated
+  }
+
+  /**
+   * Writes {@code value}'s fully qualified class name followed by its
+   * Writable-serialized fields.
+   */
+  public static <T extends Writable> void write(DataOutput dataOutput, T value) throws IOException {
+    String valueClassName = value.getClass().getName();
+    dataOutput.writeUTF(valueClassName);
+    value.write(dataOutput);
+  }
+
+  /**
+   * Reads a value written by {@link #write}: instantiates the recorded class
+   * (checked against {@code clazz}) and populates it from the stream.
+   */
+  public static <T extends Writable> T read(DataInput dataInput, Class<? extends T> clazz) throws IOException {
+    T instance = ClassUtils.instantiateAs(dataInput.readUTF(), clazz);
+    instance.readFields(dataInput);
+    return instance;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java
new file mode 100644
index 0000000..857f061
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A prior is used to regularize the learning algorithm.  This allows a trade-off to
+ * be made between complexity of the model being learned and the accuracy with which
+ * the model fits the training data.  There are different definitions of complexity
+ * which can be approximated using different priors.  For large sparse systems, such
+ * as text classification, the L1 prior is often used which favors sparse models.
+ */
+public interface PriorFunction extends Writable {
+  // Extends Writable so that a trained model (which embeds its prior) can be
+  // serialized; see PolymorphicWritable for how the concrete class is recorded.
+
+  /**
+   * Applies the regularization to a coefficient.
+   * @param oldValue        The previous value.
+   * @param generations     The number of generations.
+   * @param learningRate    The learning rate with lambda baked in.
+   * @return                The new coefficient value after regularization.
+   */
+  double age(double oldValue, double generations, double learningRate);
+
+  /**
+   * Returns the log of the probability of a particular coefficient value according to the prior.
+   * @param betaIJ          The coefficient.
+   * @return                The log probability.
+   */
+  double logP(double betaIJ);
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java
new file mode 100644
index 0000000..a04fc8b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+
+/**
+ * Uses the difference between this instance and recent history to get a
+ * gradient that optimizes ranking performance.  Essentially this is the
+ * same as directly optimizing AUC.  It isn't expected that this would
+ * be used alone, but rather that a MixedGradient would use it and a
+ * DefaultGradient together to combine both ranking and log-likelihood
+ * goals.
+ */
+public class RankingGradient implements Gradient {
+
+  // Underlying per-pair gradient; each ranking comparison delegates to this.
+  private static final Gradient BASIC = new DefaultGradient();
+
+  // Maximum amount of recent history kept per class.
+  private int window = 10;
+
+  // One deque of recent instances per class, indexed by label.
+  private final List<Deque<Vector>> history = new ArrayList<>();
+
+  /**
+   * @param window How many recent instances per class to compare against.
+   */
+  public RankingGradient(int window) {
+    this.window = window;
+  }
+
+  /**
+   * Records the instance and returns the average of the basic gradient
+   * evaluated on the differences between this instance and recently seen
+   * instances of the other class.
+   *
+   * NOTE(review): {@code history.get(1 - actual)} assumes binary labels
+   * (actual in {0, 1}) — confirm this class is only used for two-class
+   * problems.  Also, the first gradient is taken at full weight while
+   * subsequent ones are added scaled by 1/n, so this is not a strict
+   * uniform average.
+   */
+  @Override
+  public final Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) {
+    addToHistory(actual, instance);
+
+    // now compute average gradient versus saved vectors from the other side
+    Deque<Vector> otherSide = history.get(1 - actual);
+    int n = otherSide.size();
+
+    // Returns null when the other side's history is still empty.
+    Vector r = null;
+    for (Vector other : otherSide) {
+      Vector g = BASIC.apply(groupKey, actual, instance.minus(other), classifier);
+
+      if (r == null) {
+        r = g;
+      } else {
+        r.assign(g, Functions.plusMult(1.0 / n));
+      }
+    }
+    return r;
+  }
+
+  /**
+   * Saves an instance into the bounded per-class history, growing the list
+   * of deques on demand for previously unseen labels.
+   *
+   * NOTE(review): the {@code >= window} trim runs after the add, so at most
+   * window - 1 instances are actually retained — confirm whether that
+   * off-by-one is intended.
+   */
+  public void addToHistory(int actual, Vector instance) {
+    while (history.size() <= actual) {
+      history.add(new ArrayDeque<Vector>(window));
+    }
+    // save this instance
+    Deque<Vector> ourSide = history.get(actual);
+    ourSide.add(instance);
+    while (ourSide.size() >= window) {
+      ourSide.pollFirst();
+    }
+  }
+
+  /** Returns the shared underlying gradient used for each comparison. */
+  public Gradient getBaseGradient() {
+    return BASIC;
+  }
+}


[18/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java
new file mode 100644
index 0000000..20446f8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.common.RandomUtils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+/**
+ * SVD++, an enhancement of classical matrix factorization for rating prediction.
+ * Additionally to using ratings (how did people rate?) for learning, this model also takes into account
+ * who rated what.
+ *
+ * Yehuda Koren: Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model, KDD 2008.
+ * http://research.yahoo.com/files/kdd08koren.pdf
+ */
+public final class SVDPlusPlusFactorizer extends RatingSGDFactorizer {
+
+  /** Free user factors p_u — the directly learned part of each user vector. */
+  private double[][] p;
+  /** Implicit-feedback item factors y_i, summed over each user's rated items. */
+  private double[][] y;
+  /** Internal item indexes rated by each user, keyed by internal user index. */
+  private Map<Integer, List<Integer>> itemsByUser;
+
+  public SVDPlusPlusFactorizer(DataModel dataModel, int numFeatures, int numIterations) throws TasteException {
+    this(dataModel, numFeatures, 0.01, 0.1, 0.01, numIterations, 1.0);
+    biasLearningRate = 0.7;
+    biasReg = 0.33;
+  }
+
+  public SVDPlusPlusFactorizer(DataModel dataModel, int numFeatures, double learningRate, double preventOverfitting,
+      double randomNoise, int numIterations, double learningRateDecay) throws TasteException {
+    super(dataModel, numFeatures, learningRate, preventOverfitting, randomNoise, numIterations, learningRateDecay);
+  }
+
+  /**
+   * Allocates and initializes the SVD++-specific factors (p and y) and caches
+   * each user's rated items.  Bias slots (below FEATURE_OFFSET) start at zero;
+   * the remaining features start as small Gaussian noise.
+   */
+  @Override
+  protected void prepareTraining() throws TasteException {
+    super.prepareTraining();
+    Random random = RandomUtils.getRandom();
+
+    p = new double[dataModel.getNumUsers()][numFeatures];
+    for (int i = 0; i < p.length; i++) {
+      for (int feature = 0; feature < FEATURE_OFFSET; feature++) {
+        p[i][feature] = 0;
+      }
+      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+        p[i][feature] = random.nextGaussian() * randomNoise;
+      }
+    }
+
+    y = new double[dataModel.getNumItems()][numFeatures];
+    for (int i = 0; i < y.length; i++) {
+      for (int feature = 0; feature < FEATURE_OFFSET; feature++) {
+        y[i][feature] = 0;
+      }
+      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+        y[i][feature] = random.nextGaussian() * randomNoise;
+      }
+    }
+
+    /* get internal item IDs which we will need several times */
+    itemsByUser = new HashMap<>();
+    LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+    while (userIDs.hasNext()) {
+      long userId = userIDs.nextLong();
+      int userIndex = userIndex(userId);
+      FastIDSet itemIDsFromUser = dataModel.getItemIDsFromUser(userId);
+      List<Integer> itemIndexes = new ArrayList<>(itemIDsFromUser.size());
+      itemsByUser.put(userIndex, itemIndexes);
+      for (long itemID2 : itemIDsFromUser) {
+        int i2 = itemIndex(itemID2);
+        itemIndexes.add(i2);
+      }
+    }
+  }
+
+  /**
+   * Runs SGD training, then folds the implicit y factors into the final user
+   * vectors: u = p_u + (1/sqrt(|N(u)|)) * sum of y_i over rated items i.
+   */
+  @Override
+  public Factorization factorize() throws TasteException {
+    prepareTraining();
+
+    super.factorize();
+
+    for (int userIndex = 0; userIndex < userVectors.length; userIndex++) {
+      for (int itemIndex : itemsByUser.get(userIndex)) {
+        for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+          userVectors[userIndex][feature] += y[itemIndex][feature];
+        }
+      }
+      double denominator = Math.sqrt(itemsByUser.get(userIndex).size());
+      for (int feature = 0; feature < userVectors[userIndex].length; feature++) {
+        userVectors[userIndex][feature] =
+            (float) (userVectors[userIndex][feature] / denominator + p[userIndex][feature]);
+      }
+    }
+
+    return createFactorization(userVectors, itemVectors);
+  }
+
+
+  /**
+   * One SGD step for a single (user, item, rating) observation, updating
+   * biases, the user factors p, the item factors, and every y factor of the
+   * user's rated items.
+   */
+  @Override
+  protected void updateParameters(long userID, long itemID, float rating, double currentLearningRate) {
+    int userIndex = userIndex(userID);
+    int itemIndex = itemIndex(itemID);
+
+    double[] userVector = p[userIndex];
+    double[] itemVector = itemVectors[itemIndex];
+
+    // Effective user vector: p_u plus the normalized sum of the y factors.
+    double[] pPlusY = new double[numFeatures];
+    for (int i2 : itemsByUser.get(userIndex)) {
+      for (int f = FEATURE_OFFSET; f < numFeatures; f++) {
+        pPlusY[f] += y[i2][f];
+      }
+    }
+    double denominator = Math.sqrt(itemsByUser.get(userIndex).size());
+    for (int feature = 0; feature < pPlusY.length; feature++) {
+      pPlusY[feature] = (float) (pPlusY[feature] / denominator + p[userIndex][feature]);
+    }
+
+    double prediction = predictRating(pPlusY, itemIndex);
+    double err = rating - prediction;
+    double normalizedError = err / denominator;
+
+    // adjust user bias
+    userVector[USER_BIAS_INDEX] +=
+        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * userVector[USER_BIAS_INDEX]);
+
+    // adjust item bias
+    itemVector[ITEM_BIAS_INDEX] +=
+        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * itemVector[ITEM_BIAS_INDEX]);
+
+    // adjust features
+    for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+      double pF = userVector[feature];
+      double iF = itemVector[feature];
+
+      double deltaU = err * iF - preventOverfitting * pF;
+      userVector[feature] += currentLearningRate * deltaU;
+
+      double deltaI = err * pPlusY[feature] - preventOverfitting * iF;
+      itemVector[feature] += currentLearningRate * deltaI;
+
+      double commonUpdate = normalizedError * iF;
+      // NOTE(review): this uses the base learningRate rather than
+      // currentLearningRate, so the y factors do not see learning-rate decay
+      // — confirm whether that asymmetry is intended.
+      for (int itemIndex2 : itemsByUser.get(userIndex)) {
+        double deltaI2 = commonUpdate - preventOverfitting * y[itemIndex2][feature];
+        y[itemIndex2][feature] += learningRate * deltaI2;
+      }
+    }
+  }
+
+  /** Dot product of a user vector against the item factors at itemIndex. */
+  private double predictRating(double[] userVector, int itemIndex) {
+    double sum = 0;
+    for (int feature = 0; feature < numFeatures; feature++) {
+      sum += userVector[feature] * itemVectors[itemIndex][feature];
+    }
+    return sum;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java
new file mode 100644
index 0000000..45c54da
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+
+/**
+ * A {@link GenericPreference} augmented with a mutable scratch value used by
+ * the SVD training code.  The cached value is required to be a real number.
+ */
+final class SVDPreference extends GenericPreference {
+
+  // per-preference scratch value maintained during factorization
+  private double cache;
+
+  SVDPreference(long userID, long itemID, float value, double cache) {
+    super(userID, itemID, value);
+    setCache(cache);
+  }
+
+  /** Replaces the cached value; rejects NaN to keep training numerically sane. */
+  public void setCache(double value) {
+    Preconditions.checkArgument(!Double.isNaN(value), "NaN cache value");
+    this.cache = value;
+  }
+
+  /** Returns the current cached value. */
+  public double getCache() {
+    return cache;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java
new file mode 100644
index 0000000..45d4af7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java
@@ -0,0 +1,185 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.recommender.AbstractRecommender;
+import org.apache.mahout.cf.taste.impl.recommender.AllUnknownItemsCandidateItemsStrategy;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A {@link org.apache.mahout.cf.taste.recommender.Recommender} that uses matrix factorization (a projection of users
+ * and items onto a feature space)
+ */
+public final class SVDRecommender extends AbstractRecommender {
+
+  // Current factorization; replaced on refresh or when training completes.
+  private Factorization factorization;
+  private final Factorizer factorizer;
+  private final PersistenceStrategy persistenceStrategy;
+  private final RefreshHelper refreshHelper;
+
+  private static final Logger log = LoggerFactory.getLogger(SVDRecommender.class);
+
+  public SVDRecommender(DataModel dataModel, Factorizer factorizer) throws TasteException {
+    this(dataModel, factorizer, new AllUnknownItemsCandidateItemsStrategy(), getDefaultPersistenceStrategy());
+  }
+
+  public SVDRecommender(DataModel dataModel, Factorizer factorizer, CandidateItemsStrategy candidateItemsStrategy)
+    throws TasteException {
+    this(dataModel, factorizer, candidateItemsStrategy, getDefaultPersistenceStrategy());
+  }
+
+  /**
+   * Create an SVDRecommender using a persistent store to cache factorizations. A factorization is loaded from the
+   * store if present, otherwise a new factorization is computed and saved in the store.
+   *
+   * The {@link #refresh(java.util.Collection) refresh} method recomputes the factorization and overwrites the store.
+   *
+   * @param dataModel the preference data to recommend from
+   * @param factorizer computes the matrix factorization
+   * @param persistenceStrategy loads and saves previously computed factorizations
+   * @throws TasteException if the factorization cannot be computed or loaded
+   *         (I/O failures are wrapped in TasteException)
+   */
+  public SVDRecommender(DataModel dataModel, Factorizer factorizer, PersistenceStrategy persistenceStrategy) 
+    throws TasteException {
+    this(dataModel, factorizer, getDefaultCandidateItemsStrategy(), persistenceStrategy);
+  }
+
+  /**
+   * Create an SVDRecommender using a persistent store to cache factorizations. A factorization is loaded from the
+   * store if present, otherwise a new factorization is computed and saved in the store. 
+   *
+   * The {@link #refresh(java.util.Collection) refresh} method recomputes the factorization and overwrites the store.
+   *
+   * @param dataModel the preference data to recommend from
+   * @param factorizer computes the matrix factorization
+   * @param candidateItemsStrategy chooses which items are eligible for recommendation
+   * @param persistenceStrategy loads and saves previously computed factorizations
+   *
+   * @throws TasteException if the factorization cannot be computed or loaded
+   *         (I/O failures are wrapped in TasteException)
+   */
+  public SVDRecommender(DataModel dataModel, Factorizer factorizer, CandidateItemsStrategy candidateItemsStrategy,
+      PersistenceStrategy persistenceStrategy) throws TasteException {
+    super(dataModel, candidateItemsStrategy);
+    this.factorizer = Preconditions.checkNotNull(factorizer);
+    this.persistenceStrategy = Preconditions.checkNotNull(persistenceStrategy);
+    try {
+      factorization = persistenceStrategy.load();
+    } catch (IOException e) {
+      throw new TasteException("Error loading factorization", e);
+    }
+    
+    // Only train when no persisted factorization was available.
+    if (factorization == null) {
+      train();
+    }
+    
+    refreshHelper = new RefreshHelper(new Callable<Object>() {
+      @Override
+      public Object call() throws TasteException {
+        train();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(getDataModel());
+    refreshHelper.addDependency(factorizer);
+    refreshHelper.addDependency(candidateItemsStrategy);
+  }
+
+  /** Default: never persist; a fresh factorization is computed per instance. */
+  static PersistenceStrategy getDefaultPersistenceStrategy() {
+    return new NoPersistenceStrategy();
+  }
+
+  /** Computes a new factorization and (best-effort) persists it. */
+  private void train() throws TasteException {
+    factorization = factorizer.factorize();
+    try {
+      persistenceStrategy.maybePersist(factorization);
+    } catch (IOException e) {
+      throw new TasteException("Error persisting factorization", e);
+    }
+  }
+  
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+    log.debug("Recommending items for user ID '{}'", userID);
+
+    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
+    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);
+
+    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
+        new Estimator(userID));
+    log.debug("Recommendations are: {}", topItems);
+
+    return topItems;
+  }
+
+  /**
+   * a preference is estimated by computing the dot-product of the user and item feature vectors
+   */
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    double[] userFeatures = factorization.getUserFeatures(userID);
+    double[] itemFeatures = factorization.getItemFeatures(itemID);
+    double estimate = 0;
+    for (int feature = 0; feature < userFeatures.length; feature++) {
+      estimate += userFeatures[feature] * itemFeatures[feature];
+    }
+    return (float) estimate;
+  }
+
+  /** Adapts estimatePreference to the TopItems.Estimator callback for one user. */
+  private final class Estimator implements TopItems.Estimator<Long> {
+
+    private final long theUserID;
+
+    private Estimator(long theUserID) {
+      this.theUserID = theUserID;
+    }
+
+    @Override
+    public double estimate(Long itemID) throws TasteException {
+      return estimatePreference(theUserID, itemID);
+    }
+  }
+
+  /**
+   * Refresh the data model and factorization.
+   */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
new file mode 100644
index 0000000..e0d6f59
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+import java.util.Collection;
+
+public abstract class AbstractItemSimilarity implements ItemSimilarity {
+
+  private final DataModel dataModel;
+  private final RefreshHelper refreshHelper;
+
+  protected AbstractItemSimilarity(DataModel dataModel) {
+    Preconditions.checkArgument(dataModel != null, "dataModel is null");
+    this.dataModel = dataModel;
+    this.refreshHelper = new RefreshHelper(null);
+    refreshHelper.addDependency(this.dataModel);
+  }
+
+  protected DataModel getDataModel() {
+    return dataModel;
+  }
+
+  @Override
+  public long[] allSimilarItemIDs(long itemID) throws TasteException {
+    FastIDSet allSimilarItemIDs = new FastIDSet();
+    LongPrimitiveIterator allItemIDs = dataModel.getItemIDs();
+    while (allItemIDs.hasNext()) {
+      long possiblySimilarItemID = allItemIDs.nextLong();
+      if (!Double.isNaN(itemSimilarity(itemID, possiblySimilarItemID))) {
+        allSimilarItemIDs.add(possiblySimilarItemID);
+      }
+    }
+    return allSimilarItemIDs.toArray();
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
new file mode 100644
index 0000000..59c30d9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
@@ -0,0 +1,343 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
/** Abstract superclass encapsulating functionality that is common to most implementations in this package. */
abstract class AbstractSimilarity extends AbstractItemSimilarity implements UserSimilarity {

  // Optional strategy for estimating preferences a user never expressed; when null,
  // userSimilarity() only counts items both users rated.
  private PreferenceInferrer inferrer;
  private final boolean weighted;
  private final boolean centerData;
  // Cached corpus sizes used by normalizeWeightResult(); refreshed via refreshHelper below.
  private int cachedNumItems;
  private int cachedNumUsers;
  private final RefreshHelper refreshHelper;

  /**
   * <p>
   * Creates a possibly weighted {@link AbstractSimilarity}.
   * </p>
   *
   * @param dataModel model to read preferences from
   * @param weighting if {@link Weighting#WEIGHTED}, results are stretched toward -1/1 based on
   *          how many data points they rest on (see {@link #normalizeWeightResult})
   * @param centerData if true, X and Y values are mean-centered before computing the result
   * @throws TasteException if the data model cannot report its item or user count
   */
  AbstractSimilarity(final DataModel dataModel, Weighting weighting, boolean centerData) throws TasteException {
    super(dataModel);
    this.weighted = weighting == Weighting.WEIGHTED;
    this.centerData = centerData;
    this.cachedNumItems = dataModel.getNumItems();
    this.cachedNumUsers = dataModel.getNumUsers();
    // Re-read the corpus sizes whenever the underlying data is refreshed.
    this.refreshHelper = new RefreshHelper(new Callable<Object>() {
      @Override
      public Object call() throws TasteException {
        cachedNumItems = dataModel.getNumItems();
        cachedNumUsers = dataModel.getNumUsers();
        return null;
      }
    });
  }

  /** @return the {@link PreferenceInferrer} in use, or null if none has been set */
  final PreferenceInferrer getPreferenceInferrer() {
    return inferrer;
  }
  
  @Override
  public final void setPreferenceInferrer(PreferenceInferrer inferrer) {
    Preconditions.checkArgument(inferrer != null, "inferrer is null");
    // Swap refresh dependencies: track the new inferrer, drop the previous one (if any).
    refreshHelper.addDependency(inferrer);
    refreshHelper.removeDependency(this.inferrer);
    this.inferrer = inferrer;
  }
  
  /** @return true iff this similarity applies count-based weighting to its results */
  final boolean isWeighted() {
    return weighted;
  }
  
  /**
   * <p>
   * Several subclasses in this package implement this method to actually compute the similarity from figures
   * computed over users or items. Note that the computations in this class "center" the data, such that X and
   * Y's mean are 0.
   * </p>
   * 
   * <p>
   * Note that the sum of all X and Y values must then be 0. This value isn't passed down into the standard
   * similarity computations as a result.
   * </p>
   * 
   * @param n
   *          total number of users or items
   * @param sumXY
   *          sum of product of user/item preference values, over all items/users preferred by both
   *          users/items
   * @param sumX2
   *          sum of the square of user/item preference values, over the first item/user
   * @param sumY2
   *          sum of the square of the user/item preference values, over the second item/user
   * @param sumXYdiff2
   *          sum of squares of differences in X and Y values
   * @return similarity value between -1.0 and 1.0, inclusive, or {@link Double#NaN} if no similarity can be
   *         computed (e.g. when no items have been rated by both users)
   */
  abstract double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2);
  
  /**
   * Computes the similarity of two users by walking their preference arrays in a single merge
   * pass, accumulating the sums fed to {@link #computeResult}. The merge assumes both arrays
   * are ordered by item ID (the pointer-advancing logic below relies on it). When a
   * {@link PreferenceInferrer} is set, items rated by only one user are still counted, using
   * an inferred value for the missing side.
   */
  @Override
  public double userSimilarity(long userID1, long userID2) throws TasteException {
    DataModel dataModel = getDataModel();
    PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
    PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
    int xLength = xPrefs.length();
    int yLength = yPrefs.length();
    
    // No preferences on either side: no similarity can be defined.
    if (xLength == 0 || yLength == 0) {
      return Double.NaN;
    }
    
    // xIndex/yIndex are the item IDs currently under each merge pointer.
    long xIndex = xPrefs.getItemID(0);
    long yIndex = yPrefs.getItemID(0);
    int xPrefIndex = 0;
    int yPrefIndex = 0;
    
    double sumX = 0.0;
    double sumX2 = 0.0;
    double sumY = 0.0;
    double sumY2 = 0.0;
    double sumXY = 0.0;
    double sumXYdiff2 = 0.0;
    int count = 0;
    
    boolean hasInferrer = inferrer != null;
    
    while (true) {
      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
      if (hasInferrer || compare == 0) {
        double x;
        double y;
        if (xIndex == yIndex) {
          // Both users expressed a preference for the item
          x = xPrefs.getValue(xPrefIndex);
          y = yPrefs.getValue(yPrefIndex);
        } else {
          // Only one user expressed a preference, but infer the other one's preference and tally
          // as if the other user expressed that preference
          if (compare < 0) {
            // X has a value; infer Y's
            x = xPrefs.getValue(xPrefIndex);
            y = inferrer.inferPreference(userID2, xIndex);
          } else {
            // compare > 0
            // Y has a value; infer X's
            x = inferrer.inferPreference(userID1, yIndex);
            y = yPrefs.getValue(yPrefIndex);
          }
        }
        sumXY += x * y;
        sumX += x;
        sumX2 += x * x;
        sumY += y;
        sumY2 += y * y;
        double diff = x - y;
        sumXYdiff2 += diff * diff;
        count++;
      }
      // Advance whichever pointer(s) were consumed; with an inferrer, an exhausted side is
      // marked Long.MAX_VALUE so the other side keeps being tallied until both are done.
      if (compare <= 0) {
        if (++xPrefIndex >= xLength) {
          if (hasInferrer) {
            // Must count other Ys; pretend next X is far away
            if (yIndex == Long.MAX_VALUE) {
              // ... but stop if both are done!
              break;
            }
            xIndex = Long.MAX_VALUE;
          } else {
            break;
          }
        } else {
          xIndex = xPrefs.getItemID(xPrefIndex);
        }
      }
      if (compare >= 0) {
        if (++yPrefIndex >= yLength) {
          if (hasInferrer) {
            // Must count other Xs; pretend next Y is far away
            if (xIndex == Long.MAX_VALUE) {
              // ... but stop if both are done!
              break;
            }
            yIndex = Long.MAX_VALUE;
          } else {
            break;
          }
        } else {
          yIndex = yPrefs.getItemID(yPrefIndex);
        }
      }
    }
    
    // "Center" the data. If my math is correct, this'll do it.
    double result;
    if (centerData) {
      // NOTE: if count == 0 these divisions yield NaN, which propagates through
      // computeResult and is reported as "no similarity" — intentional fallthrough.
      double meanX = sumX / count;
      double meanY = sumY / count;
      // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
      // (the last two terms cancel because n * meanY == sumY, hence the shorter forms below)
      double centeredSumXY = sumXY - meanY * sumX;
      // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
      double centeredSumX2 = sumX2 - meanX * sumX;
      // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
      double centeredSumY2 = sumY2 - meanY * sumY;
      result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
    } else {
      result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
    }
    
    if (!Double.isNaN(result)) {
      result = normalizeWeightResult(result, count, cachedNumItems);
    }
    return result;
  }
  
  /**
   * Computes the similarity of two items from the preferences of users who rated both.
   * Same merge-pass structure as {@link #userSimilarity}, but over user-ID-ordered
   * preference arrays and without preference inference.
   */
  @Override
  public final double itemSimilarity(long itemID1, long itemID2) throws TasteException {
    DataModel dataModel = getDataModel();
    PreferenceArray xPrefs = dataModel.getPreferencesForItem(itemID1);
    PreferenceArray yPrefs = dataModel.getPreferencesForItem(itemID2);
    int xLength = xPrefs.length();
    int yLength = yPrefs.length();
    
    if (xLength == 0 || yLength == 0) {
      return Double.NaN;
    }
    
    long xIndex = xPrefs.getUserID(0);
    long yIndex = yPrefs.getUserID(0);
    int xPrefIndex = 0;
    int yPrefIndex = 0;
    
    double sumX = 0.0;
    double sumX2 = 0.0;
    double sumY = 0.0;
    double sumY2 = 0.0;
    double sumXY = 0.0;
    double sumXYdiff2 = 0.0;
    int count = 0;
    
    // No, pref inferrers and transforms don't apply here. I think.
    
    while (true) {
      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
      if (compare == 0) {
        // The same user expressed a preference for both items
        double x = xPrefs.getValue(xPrefIndex);
        double y = yPrefs.getValue(yPrefIndex);
        sumXY += x * y;
        sumX += x;
        sumX2 += x * x;
        sumY += y;
        sumY2 += y * y;
        double diff = x - y;
        sumXYdiff2 += diff * diff;
        count++;
      }
      if (compare <= 0) {
        if (++xPrefIndex == xLength) {
          break;
        }
        xIndex = xPrefs.getUserID(xPrefIndex);
      }
      if (compare >= 0) {
        if (++yPrefIndex == yLength) {
          break;
        }
        yIndex = yPrefs.getUserID(yPrefIndex);
      }
    }

    double result;
    if (centerData) {
      // See comments above on these computations
      double n = (double) count;
      double meanX = sumX / n;
      double meanY = sumY / n;
      // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
      double centeredSumXY = sumXY - meanY * sumX;
      // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
      double centeredSumX2 = sumX2 - meanX * sumX;
      // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
      double centeredSumY2 = sumY2 - meanY * sumY;
      result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
    } else {
      result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
    }
    
    if (!Double.isNaN(result)) {
      result = normalizeWeightResult(result, count, cachedNumUsers);
    }
    return result;
  }

  /** Bulk form of {@link #itemSimilarity(long, long)}: one result per ID in {@code itemID2s}. */
  @Override
  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
    int length = itemID2s.length;
    double[] result = new double[length];
    for (int i = 0; i < length; i++) {
      result[i] = itemSimilarity(itemID1, itemID2s[i]);
    }
    return result;
  }
  
  /**
   * If weighting is enabled, stretches the result toward -1 or 1 in proportion to how many
   * data points ({@code count} of {@code num}) support it; finally clamps to [-1.0, 1.0].
   */
  final double normalizeWeightResult(double result, int count, int num) {
    double normalizedResult = result;
    if (weighted) {
      double scaleFactor = 1.0 - (double) count / (double) (num + 1);
      if (normalizedResult < 0.0) {
        normalizedResult = -1.0 + scaleFactor * (1.0 + normalizedResult);
      } else {
        normalizedResult = 1.0 - scaleFactor * (1.0 - normalizedResult);
      }
    }
    // Make sure the result is not accidentally a little outside [-1.0, 1.0] due to rounding:
    if (normalizedResult < -1.0) {
      normalizedResult = -1.0;
    } else if (normalizedResult > 1.0) {
      normalizedResult = 1.0;
    }
    return normalizedResult;
  }
  
  @Override
  public final void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Refresh the superclass's dependencies (the data model) and then our own
    // (cached counts, the inferrer if set).
    super.refresh(alreadyRefreshed);
    refreshHelper.refresh(alreadyRefreshed);
  }
  
  @Override
  public final String toString() {
    return this.getClass().getSimpleName() + "[dataModel:" + getDataModel() + ",inferrer:" + inferrer + ']';
  }
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
new file mode 100644
index 0000000..7c655fe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+
+/**
+ * <p>
+ * Implementations of this interface compute an inferred preference for a user and an item that the user has
+ * not expressed any preference for. This might be an average of other preferences scores from that user, for
+ * example. This technique is sometimes called "default voting".
+ * </p>
+ */
+public final class AveragingPreferenceInferrer implements PreferenceInferrer {
+  
+  private static final Float ZERO = 0.0f;
+  
+  private final DataModel dataModel;
+  private final Cache<Long,Float> averagePreferenceValue;
+  
+  public AveragingPreferenceInferrer(DataModel dataModel) throws TasteException {
+    this.dataModel = dataModel;
+    Retriever<Long,Float> retriever = new PrefRetriever();
+    averagePreferenceValue = new Cache<>(retriever, dataModel.getNumUsers());
+    refresh(null);
+  }
+  
+  @Override
+  public float inferPreference(long userID, long itemID) throws TasteException {
+    return averagePreferenceValue.get(userID);
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    averagePreferenceValue.clear();
+  }
+  
+  private final class PrefRetriever implements Retriever<Long,Float> {
+    
+    @Override
+    public Float get(Long key) throws TasteException {
+      PreferenceArray prefs = dataModel.getPreferencesFromUser(key);
+      int size = prefs.length();
+      if (size == 0) {
+        return ZERO;
+      }
+      RunningAverage average = new FullRunningAverage();
+      for (int i = 0; i < size; i++) {
+        average.addDatum(prefs.getValue(i));
+      }
+      return (float) average.getAverage();
+    }
+  }
+  
+  @Override
+  public String toString() {
+    return "AveragingPreferenceInferrer";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
new file mode 100644
index 0000000..87aeae9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.LongPair;
+import com.google.common.base.Preconditions;
+
+/**
+ * Caches the results from an underlying {@link ItemSimilarity} implementation.
+ */
+public final class CachingItemSimilarity implements ItemSimilarity {
+
+  private final ItemSimilarity similarity;
+  private final Cache<LongPair,Double> similarityCache;
+  private final RefreshHelper refreshHelper;
+
+  /**
+   * Creates this on top of the given {@link ItemSimilarity}.
+   * The cache is sized according to properties of the given {@link DataModel}.
+   */
+  public CachingItemSimilarity(ItemSimilarity similarity, DataModel dataModel) throws TasteException {
+    this(similarity, dataModel.getNumItems());
+  }
+
+  /**
+   * Creates this on top of the given {@link ItemSimilarity}.
+   * The cache size is capped by the given size.
+   */
+  public CachingItemSimilarity(ItemSimilarity similarity, int maxCacheSize) {
+    Preconditions.checkArgument(similarity != null, "similarity is null");
+    this.similarity = similarity;
+    this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize);
+    this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+      @Override
+      public Void call() {
+        similarityCache.clear();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(similarity);
+  }
+  
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    LongPair key = itemID1 < itemID2 ? new LongPair(itemID1, itemID2) : new LongPair(itemID2, itemID1);
+    return similarityCache.get(key);
+  }
+
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    int length = itemID2s.length;
+    double[] result = new double[length];
+    for (int i = 0; i < length; i++) {
+      result[i] = itemSimilarity(itemID1, itemID2s[i]);
+    }
+    return result;
+  }
+
+  @Override
+  public long[] allSimilarItemIDs(long itemID) throws TasteException {
+    return similarity.allSimilarItemIDs(itemID);
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+
+  public void clearCacheForItem(long itemID) {
+    similarityCache.removeKeysMatching(new LongPairMatchPredicate(itemID));
+  }
+  
+  private static final class SimilarityRetriever implements Retriever<LongPair,Double> {
+    private final ItemSimilarity similarity;
+    
+    private SimilarityRetriever(ItemSimilarity similarity) {
+      this.similarity = similarity;
+    }
+    
+    @Override
+    public Double get(LongPair key) throws TasteException {
+      return similarity.itemSimilarity(key.getFirst(), key.getSecond());
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
new file mode 100644
index 0000000..873568a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.LongPair;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Caches the results from an underlying {@link UserSimilarity} implementation.
+ */
+public final class CachingUserSimilarity implements UserSimilarity {
+  
+  private final UserSimilarity similarity;
+  private final Cache<LongPair,Double> similarityCache;
+  private final RefreshHelper refreshHelper;
+
+  /**
+   * Creates this on top of the given {@link UserSimilarity}.
+   * The cache is sized according to properties of the given {@link DataModel}.
+   */
+  public CachingUserSimilarity(UserSimilarity similarity, DataModel dataModel) throws TasteException {
+    this(similarity, dataModel.getNumUsers());
+  }
+
+  /**
+   * Creates this on top of the given {@link UserSimilarity}.
+   * The cache size is capped by the given size.
+   */
+  public CachingUserSimilarity(UserSimilarity similarity, int maxCacheSize) {
+    Preconditions.checkArgument(similarity != null, "similarity is null");
+    this.similarity = similarity;
+    this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize);
+    this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+      @Override
+      public Void call() {
+        similarityCache.clear();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(similarity);
+  }
+  
+  @Override
+  public double userSimilarity(long userID1, long userID2) throws TasteException {
+    LongPair key = userID1 < userID2 ? new LongPair(userID1, userID2) : new LongPair(userID2, userID1);
+    return similarityCache.get(key);
+  }
+  
+  @Override
+  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+    similarityCache.clear();
+    similarity.setPreferenceInferrer(inferrer);
+  }
+
+  public void clearCacheForUser(long userID) {
+    similarityCache.removeKeysMatching(new LongPairMatchPredicate(userID));
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+  
+  private static final class SimilarityRetriever implements Retriever<LongPair,Double> {
+    private final UserSimilarity similarity;
+    
+    private SimilarityRetriever(UserSimilarity similarity) {
+      this.similarity = similarity;
+    }
+    
+    @Override
+    public Double get(LongPair key) throws TasteException {
+      return similarity.userSimilarity(key.getFirst(), key.getSecond());
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
new file mode 100644
index 0000000..88fbe58
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * Implementation of City Block distance (also known as Manhattan distance) - the absolute value of the difference of
+ * each direction is summed.  The resulting unbounded distance is then mapped between 0 and 1.
+ */
+public final class CityBlockSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+  public CityBlockSimilarity(DataModel dataModel) {
+    super(dataModel);
+  }
+
+  /**
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    Collection<Refreshable> refreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+    RefreshHelper.maybeRefresh(refreshed, getDataModel());
+  }
+
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    DataModel dataModel = getDataModel();
+    int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+    int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+    int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+    return doSimilarity(preferring1, preferring2, intersection);
+  }
+
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    DataModel dataModel = getDataModel();
+    int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+    double[] distance = new double[itemID2s.length];
+    for (int i = 0; i < itemID2s.length; ++i) {
+      int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2s[i]);
+      int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2s[i]);
+      distance[i] = doSimilarity(preferring1, preferring2, intersection);
+    }
+    return distance;
+  }
+
+  @Override
+  public double userSimilarity(long userID1, long userID2) throws TasteException {
+    DataModel dataModel = getDataModel();
+    FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
+    FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
+    int prefs1Size = prefs1.size();
+    int prefs2Size = prefs2.size();
+    int intersectionSize = prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
+    return doSimilarity(prefs1Size, prefs2Size, intersectionSize);
+  }
+
+  /**
+   * Calculate City Block Distance from total non-zero values and intersections and map to a similarity value.
+   *
+   * @param pref1        number of non-zero values in left vector
+   * @param pref2        number of non-zero values in right vector
+   * @param intersection number of overlapping non-zero values
+   */
+  private static double doSimilarity(int pref1, int pref2, int intersection) {
+    int distance = pref1 + pref2 - 2 * intersection;
+    return 1.0 / (1.0 + distance);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
new file mode 100644
index 0000000..990e9ea
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking
+ * of items as dimensions and preferences as points along those dimensions, a distance is computed using all
+ * items (dimensions) where both users have expressed a preference for that item. This is simply the square
+ * root of the sum of the squares of differences in position (preference) along each dimension.</p>
+ * 
+ * <p>The similarity is computed as 1 / (1 + distance / sqrt(n)), where n is the number of dimensions
+ * (items) over which both users expressed a preference, so the resulting values are in the range (0,1].
+ * Dividing the distance by sqrt(n) corrects for the fact that pairs overlapping in more dimensions have
+ * more opportunities to be farther apart, even though more overlap should indicate more similarity;
+ * sqrt(n) is chosen since randomly-chosen points have a distance that grows as sqrt(n).</p>
+ *
+ * <p>Because of this normalization, the similarity never exceeds 1.</p>
+ * 
+ * <p>Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from
+ * different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as
+ * it doesn't change ordering.</p>
+ */
+public final class EuclideanDistanceSimilarity extends AbstractSimilarity {
+
+  /**
+   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+   */
+  public EuclideanDistanceSimilarity(DataModel dataModel) throws TasteException {
+    this(dataModel, Weighting.UNWEIGHTED);
+  }
+
+  /**
+   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+   */
+  public EuclideanDistanceSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+    super(dataModel, weighting, false);
+    Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+  }
+  
+  /**
+   * Maps the Euclidean distance sqrt(sumXYdiff2) over the n overlapping dimensions to a similarity
+   * in (0,1] via 1 / (1 + distance / sqrt(n)). The division by sqrt(n) normalizes for the number of
+   * dimensions compared, since distance between random points grows as sqrt(n).
+   */
+  @Override
+  double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+    return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n));
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
new file mode 100644
index 0000000..d0c9b8c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
@@ -0,0 +1,358 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A "generic" {@link ItemSimilarity} which takes a static list of precomputed item similarities and bases its
+ * responses on that alone. The values may have been precomputed offline by another process, stored in a file,
+ * and then read and fed into an instance of this class.
+ * </p>
+ * 
+ * <p>
+ * This is perhaps the best {@link ItemSimilarity} to use with
+ * {@link org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender}, for now, since the point
+ * of item-based recommenders is that they can take advantage of the fact that item similarity is relatively
+ * static, can be precomputed, and then used in computation to gain a significant performance advantage.
+ * </p>
+ */
+public final class GenericItemSimilarity implements ItemSimilarity {
+
+  private static final long[] NO_IDS = new long[0];
+  
+  // Maps smaller item ID -> (larger item ID -> similarity); pairs are stored once, ordered.
+  private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>();
+  // Reverse index: item ID -> set of all item IDs with a known similarity to it (both directions).
+  private final FastByIDMap<FastIDSet> similarItemIDsIndex = new FastByIDMap<>();
+
+  /**
+   * <p>
+   * Creates a {@link GenericItemSimilarity} from a precomputed list of {@link ItemItemSimilarity}s. Each
+   * represents the similarity between two distinct items. Since similarity is assumed to be symmetric, it is
+   * not necessary to specify similarity between item1 and item2, and item2 and item1. Both are the same. It
+   * is also not necessary to specify a similarity between any item and itself; these are assumed to be 1.0.
+   * </p>
+   *
+   * <p>
+   * Note that specifying a similarity between two items twice is not an error, but, the later value will win.
+   * </p>
+   *
+   * @param similarities
+   *          set of {@link ItemItemSimilarity}s on which to base this instance
+   */
+  public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities) {
+    initSimilarityMaps(similarities.iterator());
+  }
+
+  /**
+   * <p>
+   * Like {@link #GenericItemSimilarity(Iterable)}, but will only keep the specified number of similarities
+   * from the given {@link Iterable} of similarities. It will keep those with the highest similarity -- those
+   * that are therefore most important.
+   * </p>
+   * 
+   * <p>
+   * Thanks to tsmorton for suggesting this and providing part of the implementation.
+   * </p>
+   * 
+   * @param similarities
+   *          set of {@link ItemItemSimilarity}s on which to base this instance
+   * @param maxToKeep
+   *          maximum number of similarities to keep
+   */
+  public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities, int maxToKeep) {
+    Iterable<ItemItemSimilarity> keptSimilarities =
+        TopItems.getTopItemItemSimilarities(maxToKeep, similarities.iterator());
+    initSimilarityMaps(keptSimilarities.iterator());
+  }
+
+  /**
+   * <p>
+   * Builds a list of item-item similarities given an {@link ItemSimilarity} implementation and a
+   * {@link DataModel}, rather than a list of {@link ItemItemSimilarity}s.
+   * </p>
+   * 
+   * <p>
+   * It's valid to build a {@link GenericItemSimilarity} this way, but perhaps missing some of the point of an
+   * item-based recommender. Item-based recommenders use the assumption that item-item similarities are
+   * relatively fixed, and might be known already independent of user preferences. Hence it is useful to
+   * inject that information, using {@link #GenericItemSimilarity(Iterable)}.
+   * </p>
+   * 
+   * @param otherSimilarity
+   *          other {@link ItemSimilarity} to get similarities from
+   * @param dataModel
+   *          data model to get items from
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel} items
+   */
+  public GenericItemSimilarity(ItemSimilarity otherSimilarity, DataModel dataModel) throws TasteException {
+    long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
+    initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, itemIDs));
+  }
+
+  /**
+   * <p>
+   * Like {@link #GenericItemSimilarity(ItemSimilarity, DataModel)}, but will only keep the specified
+   * number of similarities from the given {@link DataModel}. It will keep those with the highest similarity
+   * -- those that are therefore most important.
+   * </p>
+   * 
+   * <p>
+   * Thanks to tsmorton for suggesting this and providing part of the implementation.
+   * </p>
+   * 
+   * @param otherSimilarity
+   *          other {@link ItemSimilarity} to get similarities from
+   * @param dataModel
+   *          data model to get items from
+   * @param maxToKeep
+   *          maximum number of similarities to keep
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel} items
+   */
+  public GenericItemSimilarity(ItemSimilarity otherSimilarity,
+                               DataModel dataModel,
+                               int maxToKeep) throws TasteException {
+    long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
+    Iterator<ItemItemSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, itemIDs);
+    Iterable<ItemItemSimilarity> keptSimilarities = TopItems.getTopItemItemSimilarities(maxToKeep, it);
+    initSimilarityMaps(keptSimilarities.iterator());
+  }
+
+  /**
+   * Populates {@link #similarityMaps} (pair keyed by the smaller ID first) and the reverse index
+   * {@link #similarItemIDsIndex} (both directions) from the given similarities. A pair specified
+   * more than once keeps the value seen last.
+   */
+  private void initSimilarityMaps(Iterator<ItemItemSimilarity> similarities) {
+    while (similarities.hasNext()) {
+      ItemItemSimilarity iic = similarities.next();
+      long similarityItemID1 = iic.getItemID1();
+      long similarityItemID2 = iic.getItemID2();
+      if (similarityItemID1 != similarityItemID2) {
+        // Order them -- first key should be the "smaller" one
+        long itemID1;
+        long itemID2;
+        if (similarityItemID1 < similarityItemID2) {
+          itemID1 = similarityItemID1;
+          itemID2 = similarityItemID2;
+        } else {
+          itemID1 = similarityItemID2;
+          itemID2 = similarityItemID1;
+        }
+        FastByIDMap<Double> map = similarityMaps.get(itemID1);
+        if (map == null) {
+          map = new FastByIDMap<>();
+          similarityMaps.put(itemID1, map);
+        }
+        map.put(itemID2, iic.getValue());
+
+        doIndex(itemID1, itemID2);
+        doIndex(itemID2, itemID1);
+      }
+      // else similarity between item and itself already assumed to be 1.0
+    }
+  }
+
+  /** Records in the reverse index that {@code toItemID} has a known similarity to {@code fromItemID}. */
+  private void doIndex(long fromItemID, long toItemID) {
+    FastIDSet similarItemIDs = similarItemIDsIndex.get(fromItemID);
+    if (similarItemIDs == null) {
+      similarItemIDs = new FastIDSet();
+      similarItemIDsIndex.put(fromItemID, similarItemIDs);
+    }
+    similarItemIDs.add(toItemID);
+  }
+
+  /**
+   * <p>
+   * Returns the similarity between two items. Note that similarity is assumed to be symmetric, that
+   * {@code itemSimilarity(item1, item2) == itemSimilarity(item2, item1)}, and that
+   * {@code itemSimilarity(item1,item1) == 1.0} for all items.
+   * </p>
+   *
+   * @param itemID1
+   *          first item
+   * @param itemID2
+   *          second item
+   * @return similarity between the two, or {@link Double#NaN} if no similarity is known for the pair
+   */
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) {
+    if (itemID1 == itemID2) {
+      return 1.0;
+    }
+    // Pairs are stored keyed by the smaller ID; normalize the lookup order to match.
+    long firstID;
+    long secondID;
+    if (itemID1 < itemID2) {
+      firstID = itemID1;
+      secondID = itemID2;
+    } else {
+      firstID = itemID2;
+      secondID = itemID1;
+    }
+    FastByIDMap<Double> nextMap = similarityMaps.get(firstID);
+    if (nextMap == null) {
+      return Double.NaN;
+    }
+    Double similarity = nextMap.get(secondID);
+    return similarity == null ? Double.NaN : similarity;
+  }
+
+  /** Bulk variant of {@link #itemSimilarity(long, long)}, one result per entry of {@code itemID2s}. */
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) {
+    int length = itemID2s.length;
+    double[] result = new double[length];
+    for (int i = 0; i < length; i++) {
+      result[i] = itemSimilarity(itemID1, itemID2s[i]);
+    }
+    return result;
+  }
+
+  /** @return IDs of all items with a known similarity to {@code itemID}, or an empty array if none */
+  @Override
+  public long[] allSimilarItemIDs(long itemID) {
+    FastIDSet similarItemIDs = similarItemIDsIndex.get(itemID);
+    return similarItemIDs != null ? similarItemIDs.toArray() : NO_IDS;
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+  // Do nothing: the precomputed similarities are static by design.
+  }
+  
+  /** Encapsulates a similarity between two items. Similarity must be in the range [-1.0,1.0]. */
+  public static final class ItemItemSimilarity implements Comparable<ItemItemSimilarity> {
+    
+    private final long itemID1;
+    private final long itemID2;
+    private final double value;
+    
+    /**
+     * @param itemID1
+     *          first item
+     * @param itemID2
+     *          second item
+     * @param value
+     *          similarity between the two
+     * @throws IllegalArgumentException
+     *           if value is NaN, less than -1.0 or greater than 1.0
+     */
+    public ItemItemSimilarity(long itemID1, long itemID2, double value) {
+      Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0");
+      this.itemID1 = itemID1;
+      this.itemID2 = itemID2;
+      this.value = value;
+    }
+    
+    public long getItemID1() {
+      return itemID1;
+    }
+    
+    public long getItemID2() {
+      return itemID2;
+    }
+    
+    public double getValue() {
+      return value;
+    }
+    
+    @Override
+    public String toString() {
+      return "ItemItemSimilarity[" + itemID1 + ',' + itemID2 + ':' + value + ']';
+    }
+    
+    /** Defines an ordering from highest similarity to lowest. */
+    @Override
+    public int compareTo(ItemItemSimilarity other) {
+      double otherValue = other.getValue();
+      return value > otherValue ? -1 : value < otherValue ? 1 : 0;
+    }
+    
+    // NOTE(review): compareTo orders by value only while equals also compares IDs, so the
+    // ordering is not consistent with equals; existing callers may rely on this.
+    @Override
+    public boolean equals(Object other) {
+      if (!(other instanceof ItemItemSimilarity)) {
+        return false;
+      }
+      ItemItemSimilarity otherSimilarity = (ItemItemSimilarity) other;
+      return otherSimilarity.getItemID1() == itemID1
+          && otherSimilarity.getItemID2() == itemID2
+          && otherSimilarity.getValue() == value;
+    }
+    
+    @Override
+    public int hashCode() {
+      return (int) itemID1 ^ (int) itemID2 ^ RandomUtils.hashDouble(value);
+    }
+    
+  }
+  
+  /**
+   * Lazily enumerates the similarity of every unordered pair of distinct item IDs (i &lt; j only,
+   * since similarity is symmetric), skipping pairs whose similarity is NaN.
+   */
+  private static final class DataModelSimilaritiesIterator extends AbstractIterator<ItemItemSimilarity> {
+    
+    private final ItemSimilarity otherSimilarity;
+    private final long[] itemIDs;
+    private int i;       // index of the current "left" item
+    private long itemID1; // itemIDs[i], cached
+    private int j;       // index of the current "right" item; always > i
+
+    private DataModelSimilaritiesIterator(ItemSimilarity otherSimilarity, long[] itemIDs) {
+      this.otherSimilarity = otherSimilarity;
+      this.itemIDs = itemIDs;
+      i = 0;
+      itemID1 = itemIDs[0];
+      j = 1;
+    }
+
+    @Override
+    protected ItemItemSimilarity computeNext() {
+      int size = itemIDs.length;
+      ItemItemSimilarity result = null;
+      while (result == null && i < size - 1) {
+        long itemID2 = itemIDs[j];
+        double similarity;
+        try {
+          similarity = otherSimilarity.itemSimilarity(itemID1, itemID2);
+        } catch (TasteException te) {
+          // ugly: AbstractIterator.computeNext cannot throw a checked exception
+          throw new IllegalStateException(te);
+        }
+        if (!Double.isNaN(similarity)) {
+          result = new ItemItemSimilarity(itemID1, itemID2, similarity);
+        }
+        if (++j == size) {
+          itemID1 = itemIDs[++i];
+          j = i + 1;
+        }
+      }
+      if (result == null) {
+        return endOfData();
+      } else {
+        return result;
+      }
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
new file mode 100644
index 0000000..1c221c2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A "generic" {@link UserSimilarity} analogous to {@link GenericItemSimilarity}: responses are based
+ * solely on a static collection of precomputed user-user similarities. Similarity is assumed symmetric,
+ * a user's similarity to itself is assumed to be 1.0, and unknown pairs yield {@link Double#NaN}.
+ */
+public final class GenericUserSimilarity implements UserSimilarity {
+  
+  // Maps smaller user ID -> (larger user ID -> similarity); pairs are stored once, ordered.
+  private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>();
+  
+  public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities) {
+    initSimilarityMaps(similarities.iterator());
+  }
+  
+  public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities, int maxToKeep) {
+    Iterable<UserUserSimilarity> keptSimilarities =
+        TopItems.getTopUserUserSimilarities(maxToKeep, similarities.iterator());
+    initSimilarityMaps(keptSimilarities.iterator());
+  }
+  
+  public GenericUserSimilarity(UserSimilarity otherSimilarity, DataModel dataModel) throws TasteException {
+    long[] userIDs = longIteratorToList(dataModel.getUserIDs());
+    initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, userIDs));
+  }
+  
+  public GenericUserSimilarity(UserSimilarity otherSimilarity,
+                               DataModel dataModel,
+                               int maxToKeep) throws TasteException {
+    long[] userIDs = longIteratorToList(dataModel.getUserIDs());
+    Iterator<UserUserSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, userIDs);
+    Iterable<UserUserSimilarity> keptSimilarities = TopItems.getTopUserUserSimilarities(maxToKeep, it);
+    initSimilarityMaps(keptSimilarities.iterator());
+  }
+
+  /**
+   * Drains the iterator into an exactly-sized {@code long[]}, growing an internal buffer by
+   * doubling and trimming at the end. Also used by {@link GenericItemSimilarity}.
+   */
+  static long[] longIteratorToList(LongPrimitiveIterator iterator) {
+    long[] result = new long[5];
+    int size = 0;
+    while (iterator.hasNext()) {
+      if (size == result.length) {
+        long[] newResult = new long[result.length << 1];
+        System.arraycopy(result, 0, newResult, 0, result.length);
+        result = newResult;
+      }
+      result[size++] = iterator.next();
+    }
+    if (size != result.length) {
+      long[] newResult = new long[size];
+      System.arraycopy(result, 0, newResult, 0, size);
+      result = newResult;
+    }
+    return result;
+  }
+  
+  /**
+   * Populates {@link #similarityMaps}, keying each pair by the smaller user ID first. A pair
+   * specified more than once keeps the value seen last.
+   */
+  private void initSimilarityMaps(Iterator<UserUserSimilarity> similarities) {
+    while (similarities.hasNext()) {
+      UserUserSimilarity uuc = similarities.next();
+      long similarityUser1 = uuc.getUserID1();
+      long similarityUser2 = uuc.getUserID2();
+      if (similarityUser1 != similarityUser2) {
+        // Order them -- first key should be the "smaller" one
+        long user1;
+        long user2;
+        if (similarityUser1 < similarityUser2) {
+          user1 = similarityUser1;
+          user2 = similarityUser2;
+        } else {
+          user1 = similarityUser2;
+          user2 = similarityUser1;
+        }
+        FastByIDMap<Double> map = similarityMaps.get(user1);
+        if (map == null) {
+          map = new FastByIDMap<>();
+          similarityMaps.put(user1, map);
+        }
+        map.put(user2, uuc.getValue());
+      }
+      // else similarity between user and itself already assumed to be 1.0
+    }
+  }
+  
+  /** @return the precomputed similarity, 1.0 for identical IDs, or {@link Double#NaN} if unknown */
+  @Override
+  public double userSimilarity(long userID1, long userID2) {
+    if (userID1 == userID2) {
+      return 1.0;
+    }
+    // Pairs are stored keyed by the smaller ID; normalize the lookup order to match.
+    long first;
+    long second;
+    if (userID1 < userID2) {
+      first = userID1;
+      second = userID2;
+    } else {
+      first = userID2;
+      second = userID1;
+    }
+    FastByIDMap<Double> nextMap = similarityMaps.get(first);
+    if (nextMap == null) {
+      return Double.NaN;
+    }
+    Double similarity = nextMap.get(second);
+    return similarity == null ? Double.NaN : similarity;
+  }
+  
+  /** @throws UnsupportedOperationException always; inference is meaningless for static similarities */
+  @Override
+  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+  // Do nothing: the precomputed similarities are static by design.
+  }
+  
+  /** Encapsulates a similarity between two users. Similarity must be in the range [-1.0,1.0]. */
+  public static final class UserUserSimilarity implements Comparable<UserUserSimilarity> {
+
+    private final long userID1;
+    private final long userID2;
+    private final double value;
+    
+    /**
+     * @throws IllegalArgumentException if value is NaN, less than -1.0 or greater than 1.0
+     */
+    public UserUserSimilarity(long userID1, long userID2, double value) {
+      Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0");
+      this.userID1 = userID1;
+      this.userID2 = userID2;
+      this.value = value;
+    }
+    
+    public long getUserID1() {
+      return userID1;
+    }
+    
+    public long getUserID2() {
+      return userID2;
+    }
+    
+    public double getValue() {
+      return value;
+    }
+    
+    @Override
+    public String toString() {
+      return "UserUserSimilarity[" + userID1 + ',' + userID2 + ':' + value + ']';
+    }
+    
+    /** Defines an ordering from highest similarity to lowest. */
+    @Override
+    public int compareTo(UserUserSimilarity other) {
+      double otherValue = other.getValue();
+      return value > otherValue ? -1 : value < otherValue ? 1 : 0;
+    }
+    
+    // NOTE(review): compareTo orders by value only while equals also compares IDs, so the
+    // ordering is not consistent with equals; existing callers may rely on this.
+    @Override
+    public boolean equals(Object other) {
+      if (!(other instanceof UserUserSimilarity)) {
+        return false;
+      }
+      UserUserSimilarity otherSimilarity = (UserUserSimilarity) other;
+      return otherSimilarity.getUserID1() == userID1
+          && otherSimilarity.getUserID2() == userID2
+          && otherSimilarity.getValue() == value;
+    }
+    
+    @Override
+    public int hashCode() {
+      return (int) userID1 ^ (int) userID2 ^ RandomUtils.hashDouble(value);
+    }
+    
+  }
+  
+  /**
+   * Lazily enumerates the similarity of every unordered pair of distinct user IDs (i &lt; j only,
+   * since similarity is symmetric), skipping pairs whose similarity is NaN.
+   * Note: the fields are named {@code item*} but hold user IDs here.
+   */
+  private static final class DataModelSimilaritiesIterator extends AbstractIterator<UserUserSimilarity> {
+
+    private final UserSimilarity otherSimilarity;
+    private final long[] itemIDs;
+    private int i;       // index of the current "left" ID
+    private long itemID1; // itemIDs[i], cached
+    private int j;       // index of the current "right" ID; always > i
+
+    private DataModelSimilaritiesIterator(UserSimilarity otherSimilarity, long[] itemIDs) {
+      this.otherSimilarity = otherSimilarity;
+      this.itemIDs = itemIDs;
+      i = 0;
+      itemID1 = itemIDs[0];
+      j = 1;
+    }
+
+    @Override
+    protected UserUserSimilarity computeNext() {
+      int size = itemIDs.length;
+      while (i < size - 1) {
+        long itemID2 = itemIDs[j];
+        double similarity;
+        try {
+          similarity = otherSimilarity.userSimilarity(itemID1, itemID2);
+        } catch (TasteException te) {
+          // ugly: AbstractIterator.computeNext cannot throw a checked exception
+          throw new IllegalStateException(te);
+        }
+        if (!Double.isNaN(similarity)) {
+          return new UserUserSimilarity(itemID1, itemID2, similarity);
+        }
+        if (++j == size) {
+          itemID1 = itemIDs[++i];
+          j = i + 1;
+        }
+      }
+      return endOfData();
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
new file mode 100644
index 0000000..3084c8f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.math.stats.LogLikelihood;
+
+/**
+ * See <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962">
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962</a> and
+ * <a href="http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html">
+ * http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html</a>.
+ */
+public final class LogLikelihoodSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+  public LogLikelihoodSimilarity(DataModel dataModel) {
+    super(dataModel);
+  }
+  
+  /**
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /**
+   * Computes log-likelihood-ratio similarity between two users from the sets of items each has a
+   * preference for. Returns {@link Double#NaN} when the sets do not overlap; otherwise a value in
+   * [0,1) that grows with the log-likelihood ratio.
+   */
+  @Override
+  public double userSimilarity(long userID1, long userID2) throws TasteException {
+
+    DataModel dataModel = getDataModel();
+    FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
+    FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
+    
+    long prefs1Size = prefs1.size();
+    long prefs2Size = prefs2.size();
+    long intersectionSize =
+        prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
+    if (intersectionSize == 0) {
+      return Double.NaN;
+    }
+    long numItems = dataModel.getNumItems();
+    // Arguments form the 2x2 contingency table over items:
+    // (both users, only user2, only user1, neither user).
+    double logLikelihood =
+        LogLikelihood.logLikelihoodRatio(intersectionSize,
+                                         prefs2Size - intersectionSize,
+                                         prefs1Size - intersectionSize,
+                                         numItems - prefs1Size - prefs2Size + intersectionSize);
+    // Map LLR in [0,inf) to a similarity in [0,1).
+    return 1.0 - 1.0 / (1.0 + logLikelihood);
+  }
+  
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    DataModel dataModel = getDataModel();
+    long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+    long numUsers = dataModel.getNumUsers();
+    return doItemSimilarity(itemID1, itemID2, preferring1, numUsers);
+  }
+
+  /** Bulk variant of {@link #itemSimilarity(long, long)}; counts for item 1 are fetched once. */
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    DataModel dataModel = getDataModel();
+    long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+    long numUsers = dataModel.getNumUsers();
+    int length = itemID2s.length;
+    double[] result = new double[length];
+    for (int i = 0; i < length; i++) {
+      result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1, numUsers);
+    }
+    return result;
+  }
+
+  /**
+   * Computes log-likelihood-ratio similarity between two items given the precomputed number of
+   * users preferring item 1 and the total user count. Returns {@link Double#NaN} when no user
+   * prefers both items; otherwise a value in [0,1).
+   */
+  private double doItemSimilarity(long itemID1, long itemID2, long preferring1, long numUsers) throws TasteException {
+    DataModel dataModel = getDataModel();
+    long preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+    if (preferring1and2 == 0) {
+      return Double.NaN;
+    }
+    long preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+    // Arguments form the 2x2 contingency table over users:
+    // (both items, only item2, only item1, neither item).
+    double logLikelihood =
+        LogLikelihood.logLikelihoodRatio(preferring1and2,
+                                         preferring2 - preferring1and2,
+                                         preferring1 - preferring1and2,
+                                         numUsers - preferring1 - preferring2 + preferring1and2);
+    // Map LLR in [0,inf) to a similarity in [0,1).
+    return 1.0 - 1.0 / (1.0 + logLikelihood);
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+    RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel());
+  }
+  
+  @Override
+  public String toString() {
+    return "LogLikelihoodSimilarity[dataModel:" + getDataModel() + ']';
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
new file mode 100644
index 0000000..48dc4e0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * A {@link Cache.MatchPredicate} which will match an ID against either element of a
+ * {@link LongPair}.
+ */
+final class LongPairMatchPredicate implements Cache.MatchPredicate<LongPair> {
+
+  private final long id;
+
+  LongPairMatchPredicate(long id) {
+    this.id = id;
+  }
+
+  @Override
+  public boolean matches(LongPair pair) {
+    return pair.getFirst() == id || pair.getSecond() == id;
+  }
+
+}


[46/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
deleted file mode 100644
index a99d54c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.common.RandomUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Collection;
-import java.util.Random;
-
-/**
- * {@link Factorizer} based on Simon Funk's famous article <a href="http://sifter.org/~simon/journal/20061211.html">
- * "Netflix Update: Try this at home"</a>.
- *
- * Attempts to be as memory efficient as possible, only iterating once through the
- * {@link FactorizablePreferences} or {@link DataModel} while copying everything to primitive arrays.
- * Learning works in place on these datastructures after that.
- */
-public class ParallelArraysSGDFactorizer implements Factorizer {
-
-  public static final double DEFAULT_LEARNING_RATE = 0.005;
-  public static final double DEFAULT_PREVENT_OVERFITTING = 0.02;
-  public static final double DEFAULT_RANDOM_NOISE = 0.005;
-
-  private final int numFeatures;
-  private final int numIterations;
-  private final float minPreference;
-  private final float maxPreference;
-
-  private final Random random;
-  private final double learningRate;
-  private final double preventOverfitting;
-
-  private final FastByIDMap<Integer> userIDMapping;
-  private final FastByIDMap<Integer> itemIDMapping;
-
-  private final double[][] userFeatures;
-  private final double[][] itemFeatures;
-
-  private final int[] userIndexes;
-  private final int[] itemIndexes;
-  private final float[] values;
-
-  private final double defaultValue;
-  private final double interval;
-  private final double[] cachedEstimates;
-
-
-  private static final Logger log = LoggerFactory.getLogger(ParallelArraysSGDFactorizer.class);
-
-  public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations) {
-    this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, DEFAULT_LEARNING_RATE,
-        DEFAULT_PREVENT_OVERFITTING, DEFAULT_RANDOM_NOISE);
-  }
-
-  public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations, double learningRate,
-                                     double preventOverfitting, double randomNoise) {
-    this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, learningRate, preventOverfitting,
-        randomNoise);
-  }
-
-  public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePrefs, int numFeatures, int numIterations) {
-    this(factorizablePrefs, numFeatures, numIterations, DEFAULT_LEARNING_RATE, DEFAULT_PREVENT_OVERFITTING,
-        DEFAULT_RANDOM_NOISE);
-  }
-
-  public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePreferences, int numFeatures,
-      int numIterations, double learningRate, double preventOverfitting, double randomNoise) {
-
-    this.numFeatures = numFeatures;
-    this.numIterations = numIterations;
-    minPreference = factorizablePreferences.getMinPreference();
-    maxPreference = factorizablePreferences.getMaxPreference();
-
-    this.random = RandomUtils.getRandom();
-    this.learningRate = learningRate;
-    this.preventOverfitting = preventOverfitting;
-
-    int numUsers = factorizablePreferences.numUsers();
-    int numItems = factorizablePreferences.numItems();
-    int numPrefs = factorizablePreferences.numPreferences();
-
-    log.info("Mapping {} users...", numUsers);
-    userIDMapping = new FastByIDMap<>(numUsers);
-    int index = 0;
-    LongPrimitiveIterator userIterator = factorizablePreferences.getUserIDs();
-    while (userIterator.hasNext()) {
-      userIDMapping.put(userIterator.nextLong(), index++);
-    }
-
-    log.info("Mapping {} items", numItems);
-    itemIDMapping = new FastByIDMap<>(numItems);
-    index = 0;
-    LongPrimitiveIterator itemIterator = factorizablePreferences.getItemIDs();
-    while (itemIterator.hasNext()) {
-      itemIDMapping.put(itemIterator.nextLong(), index++);
-    }
-
-    this.userIndexes = new int[numPrefs];
-    this.itemIndexes = new int[numPrefs];
-    this.values = new float[numPrefs];
-    this.cachedEstimates = new double[numPrefs];
-
-    index = 0;
-    log.info("Loading {} preferences into memory", numPrefs);
-    RunningAverage average = new FullRunningAverage();
-    for (Preference preference : factorizablePreferences.getPreferences()) {
-      userIndexes[index] = userIDMapping.get(preference.getUserID());
-      itemIndexes[index] = itemIDMapping.get(preference.getItemID());
-      values[index] = preference.getValue();
-      cachedEstimates[index] = 0;
-
-      average.addDatum(preference.getValue());
-
-      index++;
-      if (index % 1000000 == 0) {
-        log.info("Processed {} preferences", index);
-      }
-    }
-    log.info("Processed {} preferences, done.", index);
-
-    double averagePreference = average.getAverage();
-    log.info("Average preference value is {}", averagePreference);
-
-    double prefInterval = factorizablePreferences.getMaxPreference() - factorizablePreferences.getMinPreference();
-    defaultValue = Math.sqrt((averagePreference - prefInterval * 0.1) / numFeatures);
-    interval = prefInterval * 0.1 / numFeatures;
-
-    userFeatures = new double[numUsers][numFeatures];
-    itemFeatures = new double[numItems][numFeatures];
-
-    log.info("Initializing feature vectors...");
-    for (int feature = 0; feature < numFeatures; feature++) {
-      for (int userIndex = 0; userIndex < numUsers; userIndex++) {
-        userFeatures[userIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
-      }
-      for (int itemIndex = 0; itemIndex < numItems; itemIndex++) {
-        itemFeatures[itemIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
-      }
-    }
-  }
-
-  @Override
-  public Factorization factorize() throws TasteException {
-    for (int feature = 0; feature < numFeatures; feature++) {
-      log.info("Shuffling preferences...");
-      shufflePreferences();
-      log.info("Starting training of feature {} ...", feature);
-      for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
-        if (currentIteration == numIterations - 1) {
-          double rmse = trainingIterationWithRmse(feature);
-          log.info("Finished training feature {} with RMSE {}", feature, rmse);
-        } else {
-          trainingIteration(feature);
-        }
-      }
-      if (feature < numFeatures - 1) {
-        log.info("Updating cache...");
-        for (int index = 0; index < userIndexes.length; index++) {
-          cachedEstimates[index] = estimate(userIndexes[index], itemIndexes[index], feature, cachedEstimates[index],
-              false);
-        }
-      }
-    }
-    log.info("Factorization done");
-    return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
-  }
-
-  private void trainingIteration(int feature) {
-    for (int index = 0; index < userIndexes.length; index++) {
-      train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
-    }
-  }
-
-  private double trainingIterationWithRmse(int feature) {
-    double rmse = 0.0;
-    for (int index = 0; index < userIndexes.length; index++) {
-      double error = train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
-      rmse += error * error;
-    }
-    return Math.sqrt(rmse / userIndexes.length);
-  }
-
-  private double estimate(int userIndex, int itemIndex, int feature, double cachedEstimate, boolean trailing) {
-    double sum = cachedEstimate;
-    sum += userFeatures[userIndex][feature] * itemFeatures[itemIndex][feature];
-    if (trailing) {
-      sum += (numFeatures - feature - 1) * (defaultValue + interval) * (defaultValue + interval);
-      if (sum > maxPreference) {
-        sum = maxPreference;
-      } else if (sum < minPreference) {
-        sum = minPreference;
-      }
-    }
-    return sum;
-  }
-
-  public double train(int userIndex, int itemIndex, int feature, double original, double cachedEstimate) {
-    double error = original - estimate(userIndex, itemIndex, feature, cachedEstimate, true);
-    double[] userVector = userFeatures[userIndex];
-    double[] itemVector = itemFeatures[itemIndex];
-
-    userVector[feature] += learningRate * (error * itemVector[feature] - preventOverfitting * userVector[feature]);
-    itemVector[feature] += learningRate * (error * userVector[feature] - preventOverfitting * itemVector[feature]);
-
-    return error;
-  }
-
-  protected void shufflePreferences() {
-    /* Durstenfeld shuffle */
-    for (int currentPos = userIndexes.length - 1; currentPos > 0; currentPos--) {
-      int swapPos = random.nextInt(currentPos + 1);
-      swapPreferences(currentPos, swapPos);
-    }
-  }
-
-  private void swapPreferences(int posA, int posB) {
-    int tmpUserIndex = userIndexes[posA];
-    int tmpItemIndex = itemIndexes[posA];
-    float tmpValue = values[posA];
-    double tmpEstimate = cachedEstimates[posA];
-
-    userIndexes[posA] = userIndexes[posB];
-    itemIndexes[posA] = itemIndexes[posB];
-    values[posA] = values[posB];
-    cachedEstimates[posA] = cachedEstimates[posB];
-
-    userIndexes[posB] = tmpUserIndex;
-    itemIndexes[posB] = tmpItemIndex;
-    values[posB] = tmpValue;
-    cachedEstimates[posB] = tmpEstimate;
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    // do nothing
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
deleted file mode 100644
index 5cce02d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.NoSuchUserException;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.example.kddcup.track1.EstimateConverter;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * run an SVD factorization of the KDD track1 data.
- *
- * needs at least 6-7GB of memory, tested with -Xms6700M -Xmx6700M
- *
- */
-public final class Track1SVDRunner {
-
-  private static final Logger log = LoggerFactory.getLogger(Track1SVDRunner.class);
-
-  private Track1SVDRunner() {
-  }
-
-  public static void main(String[] args) throws Exception {
-
-    if (args.length != 2) {
-      System.err.println("Necessary arguments: <kddDataFileDirectory> <resultFile>");
-      return;
-    }
-
-    File dataFileDirectory = new File(args[0]);
-    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
-      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
-    }
-
-    File resultFile = new File(args[1]);
-
-    /* the knobs to turn */
-    int numFeatures = 20;
-    int numIterations = 5;
-    double learningRate = 0.0001;
-    double preventOverfitting = 0.002;
-    double randomNoise = 0.0001;
-
-
-    KDDCupFactorizablePreferences factorizablePreferences =
-        new KDDCupFactorizablePreferences(KDDCupDataModel.getTrainingFile(dataFileDirectory));
-
-    Factorizer sgdFactorizer = new ParallelArraysSGDFactorizer(factorizablePreferences, numFeatures, numIterations,
-        learningRate, preventOverfitting, randomNoise);
-
-    Factorization factorization = sgdFactorizer.factorize();
-
-    log.info("Estimating validation preferences...");
-    int prefsProcessed = 0;
-    RunningAverage average = new FullRunningAverage();
-    for (Pair<PreferenceArray,long[]> validationPair
-        : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
-      for (Preference validationPref : validationPair.getFirst()) {
-        double estimate = estimatePreference(factorization, validationPref.getUserID(), validationPref.getItemID(),
-            factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
-        double error = validationPref.getValue() - estimate;
-        average.addDatum(error * error);
-        prefsProcessed++;
-        if (prefsProcessed % 100000 == 0) {
-          log.info("Computed {} estimations", prefsProcessed);
-        }
-      }
-    }
-    log.info("Computed {} estimations, done.", prefsProcessed);
-
-    double rmse = Math.sqrt(average.getAverage());
-    log.info("RMSE {}", rmse);
-
-    log.info("Estimating test preferences...");
-    OutputStream out = null;
-    try {
-      out = new BufferedOutputStream(new FileOutputStream(resultFile));
-
-      for (Pair<PreferenceArray,long[]> testPair
-          : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
-        for (Preference testPref : testPair.getFirst()) {
-          double estimate = estimatePreference(factorization, testPref.getUserID(), testPref.getItemID(),
-              factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
-          byte result = EstimateConverter.convert(estimate, testPref.getUserID(), testPref.getItemID());
-          out.write(result);
-        }
-      }
-    } finally {
-      Closeables.close(out, false);
-    }
-    log.info("wrote estimates to {}, done.", resultFile.getAbsolutePath());
-  }
-
-  static double estimatePreference(Factorization factorization, long userID, long itemID, float minPreference,
-      float maxPreference) throws NoSuchUserException, NoSuchItemException {
-    double[] userFeatures = factorization.getUserFeatures(userID);
-    double[] itemFeatures = factorization.getItemFeatures(itemID);
-    double estimate = 0;
-    for (int feature = 0; feature < userFeatures.length; feature++) {
-      estimate += userFeatures[feature] * itemFeatures[feature];
-    }
-    if (estimate < minPreference) {
-      estimate = minPreference;
-    } else if (estimate > maxPreference) {
-      estimate = maxPreference;
-    }
-    return estimate;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
deleted file mode 100644
index ce025a9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.similarity.AbstractItemSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-final class HybridSimilarity extends AbstractItemSimilarity {
-
-  private final ItemSimilarity cfSimilarity;
-  private final ItemSimilarity contentSimilarity;
-
-  HybridSimilarity(DataModel dataModel, File dataFileDirectory) throws IOException {
-    super(dataModel);
-    cfSimilarity = new LogLikelihoodSimilarity(dataModel);
-    contentSimilarity = new TrackItemSimilarity(dataFileDirectory);
-  }
-
-  @Override
-  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
-    return contentSimilarity.itemSimilarity(itemID1, itemID2) * cfSimilarity.itemSimilarity(itemID1, itemID2);
-  }
-
-  @Override
-  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
-    double[] result = contentSimilarity.itemSimilarities(itemID1, itemID2s);
-    double[] multipliers = cfSimilarity.itemSimilarities(itemID1, itemID2s);
-    for (int i = 0; i < result.length; i++) {
-      result[i] *= multipliers[i];
-    }
-    return result;
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    cfSimilarity.refresh(alreadyRefreshed);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
deleted file mode 100644
index 50fd35e..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.TreeMap;
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-final class Track2Callable implements Callable<UserResult> {
-
-  private static final Logger log = LoggerFactory.getLogger(Track2Callable.class);
-  private static final AtomicInteger COUNT = new AtomicInteger();
-
-  private final Recommender recommender;
-  private final PreferenceArray userTest;
-
-  Track2Callable(Recommender recommender, PreferenceArray userTest) {
-    this.recommender = recommender;
-    this.userTest = userTest;
-  }
-
-  @Override
-  public UserResult call() throws TasteException {
-
-    int testSize = userTest.length();
-    if (testSize != 6) {
-      throw new IllegalArgumentException("Expecting 6 items for user but got " + userTest);
-    }
-    long userID = userTest.get(0).getUserID();
-    TreeMap<Double,Long> estimateToItemID = new TreeMap<>(Collections.reverseOrder());
-
-    for (int i = 0; i < testSize; i++) {
-      long itemID = userTest.getItemID(i);
-      double estimate;
-      try {
-        estimate = recommender.estimatePreference(userID, itemID);
-      } catch (NoSuchItemException nsie) {
-        // OK in the sample data provided before the contest, should never happen otherwise
-        log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
-        continue;
-      }
-
-      if (!Double.isNaN(estimate)) {
-        estimateToItemID.put(estimate, itemID);
-      }
-    }
-
-    Collection<Long> itemIDs = estimateToItemID.values();
-    List<Long> topThree = new ArrayList<>(itemIDs);
-    if (topThree.size() > 3) {
-      topThree = topThree.subList(0, 3);
-    } else if (topThree.size() < 3) {
-      log.warn("Unable to recommend three items for {}", userID);
-      // Some NaNs - just guess at the rest then
-      Collection<Long> newItemIDs = new HashSet<>(3);
-      newItemIDs.addAll(itemIDs);
-      int i = 0;
-      while (i < testSize && newItemIDs.size() < 3) {
-        newItemIDs.add(userTest.getItemID(i));
-        i++;
-      }
-      topThree = new ArrayList<>(newItemIDs);
-    }
-    if (topThree.size() != 3) {
-      throw new IllegalStateException();
-    }
-
-    boolean[] result = new boolean[testSize];
-    for (int i = 0; i < testSize; i++) {
-      result[i] = topThree.contains(userTest.getItemID(i));
-    }
-
-    if (COUNT.incrementAndGet() % 1000 == 0) {
-      log.info("Completed {} users", COUNT.get());
-    }
-
-    return new UserResult(userID, result);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
deleted file mode 100644
index 185a00d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefItemBasedRecommender;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-public final class Track2Recommender implements Recommender {
-
-  private final Recommender recommender;
-
-  public Track2Recommender(DataModel dataModel, File dataFileDirectory) throws TasteException {
-    // Change this to whatever you like!
-    ItemSimilarity similarity;
-    try {
-      similarity = new HybridSimilarity(dataModel, dataFileDirectory);
-    } catch (IOException ioe) {
-      throw new TasteException(ioe);
-    }
-    recommender = new GenericBooleanPrefItemBasedRecommender(dataModel, similarity);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
-    return recommender.recommend(userID, howMany);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
-    return recommend(userID, howMany, null, includeKnownItems);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, false);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
-    throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
-  }
-  
-  @Override
-  public float estimatePreference(long userID, long itemID) throws TasteException {
-    return recommender.estimatePreference(userID, itemID);
-  }
-  
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    recommender.setPreference(userID, itemID, value);
-  }
-  
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    recommender.removePreference(userID, itemID);
-  }
-  
-  @Override
-  public DataModel getDataModel() {
-    return recommender.getDataModel();
-  }
-  
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    recommender.refresh(alreadyRefreshed);
-  }
-  
-  @Override
-  public String toString() {
-    return "Track1Recommender[recommender:" + recommender + ']';
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
deleted file mode 100644
index 09ade5d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class Track2RecommenderBuilder implements RecommenderBuilder {
-  
-  @Override
-  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
-    return new Track2Recommender(dataModel, ((KDDCupDataModel) dataModel).getDataFileDirectory());
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
deleted file mode 100644
index 3cbb61c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-/**
- * <p>Runs "track 2" of the KDD Cup competition using whatever recommender is inside {@link Track2Recommender}
- * and attempts to output the result in the correct contest format.</p>
- *
- * <p>Run as: {@code Track2Runner [track 2 data file directory] [output file]}</p>
- */
-public final class Track2Runner {
-
-  private static final Logger log = LoggerFactory.getLogger(Track2Runner.class);
-
-  private Track2Runner() {
-  }
-
-  public static void main(String[] args) throws Exception {
-
-    File dataFileDirectory = new File(args[0]);
-    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
-      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
-    }
-
-    long start = System.currentTimeMillis();
-
-    KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
-    Track2Recommender recommender = new Track2Recommender(model, dataFileDirectory);
-
-    long end = System.currentTimeMillis();
-    log.info("Loaded model in {}s", (end - start) / 1000);
-    start = end;
-
-    Collection<Track2Callable> callables = new ArrayList<>();
-    for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
-      PreferenceArray userTest = tests.getFirst();
-      callables.add(new Track2Callable(recommender, userTest));
-    }
-
-    int cores = Runtime.getRuntime().availableProcessors();
-    log.info("Running on {} cores", cores);
-    ExecutorService executor = Executors.newFixedThreadPool(cores);
-    List<Future<UserResult>> futures = executor.invokeAll(callables);
-    executor.shutdown();
-
-    end = System.currentTimeMillis();
-    log.info("Ran recommendations in {}s", (end - start) / 1000);
-    start = end;
-
-    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
-      long lastUserID = Long.MIN_VALUE;
-      for (Future<UserResult> future : futures) {
-        UserResult result = future.get();
-        long userID = result.getUserID();
-        if (userID <= lastUserID) {
-          throw new IllegalStateException();
-        }
-        lastUserID = userID;
-        out.write(result.getResultBytes());
-      }
-    }
-
-    end = System.currentTimeMillis();
-    log.info("Wrote output in {}s", (end - start) / 1000);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
deleted file mode 100644
index abd15f8..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.util.regex.Pattern;
-
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-
-final class TrackData {
-
-  private static final Pattern PIPE = Pattern.compile("\\|");
-  private static final String NO_VALUE = "None";
-  static final long NO_VALUE_ID = Long.MIN_VALUE;
-  private static final FastIDSet NO_GENRES = new FastIDSet();
-
-  private final long trackID;
-  private final long albumID;
-  private final long artistID;
-  private final FastIDSet genreIDs;
-
-  TrackData(CharSequence line) {
-    String[] tokens = PIPE.split(line);
-    trackID = Long.parseLong(tokens[0]);
-    albumID = parse(tokens[1]);
-    artistID = parse(tokens[2]);
-    if (tokens.length > 3) {
-      genreIDs = new FastIDSet(tokens.length - 3);
-      for (int i = 3; i < tokens.length; i++) {
-        genreIDs.add(Long.parseLong(tokens[i]));
-      }
-    } else {
-      genreIDs = NO_GENRES;
-    }
-  }
-
-  private static long parse(String value) {
-    return NO_VALUE.equals(value) ? NO_VALUE_ID : Long.parseLong(value);
-  }
-
-  public long getTrackID() {
-    return trackID;
-  }
-
-  public long getAlbumID() {
-    return albumID;
-  }
-
-  public long getArtistID() {
-    return artistID;
-  }
-
-  public FastIDSet getGenreIDs() {
-    return genreIDs;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
deleted file mode 100644
index 3012a84..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.apache.mahout.common.iterator.FileLineIterable;
-
-final class TrackItemSimilarity implements ItemSimilarity {
-
-  private final FastByIDMap<TrackData> trackData;
-
-  TrackItemSimilarity(File dataFileDirectory) throws IOException {
-    trackData = new FastByIDMap<>();
-    for (String line : new FileLineIterable(KDDCupDataModel.getTrackFile(dataFileDirectory))) {
-      TrackData trackDatum = new TrackData(line);
-      trackData.put(trackDatum.getTrackID(), trackDatum);
-    }
-  }
-
-  @Override
-  public double itemSimilarity(long itemID1, long itemID2) {
-    if (itemID1 == itemID2) {
-      return 1.0;
-    }
-    TrackData data1 = trackData.get(itemID1);
-    TrackData data2 = trackData.get(itemID2);
-    if (data1 == null || data2 == null) {
-      return 0.0;
-    }
-
-    // Arbitrarily decide that same album means "very similar"
-    if (data1.getAlbumID() != TrackData.NO_VALUE_ID && data1.getAlbumID() == data2.getAlbumID()) {
-      return 0.9;
-    }
-    // ... and same artist means "fairly similar"
-    if (data1.getArtistID() != TrackData.NO_VALUE_ID && data1.getArtistID() == data2.getArtistID()) {
-      return 0.7;
-    }
-
-    // Tanimoto coefficient similarity based on genre, but maximum value of 0.25
-    FastIDSet genres1 = data1.getGenreIDs();
-    FastIDSet genres2 = data2.getGenreIDs();
-    if (genres1 == null || genres2 == null) {
-      return 0.0;
-    }
-    int intersectionSize = genres1.intersectionSize(genres2);
-    if (intersectionSize == 0) {
-      return 0.0;
-    }
-    int unionSize = genres1.size() + genres2.size() - intersectionSize;
-    return intersectionSize / (4.0 * unionSize);
-  }
-
-  @Override
-  public double[] itemSimilarities(long itemID1, long[] itemID2s) {
-    int length = itemID2s.length;
-    double[] result = new double[length];
-    for (int i = 0; i < length; i++) {
-      result[i] = itemSimilarity(itemID1, itemID2s[i]);
-    }
-    return result;
-  }
-
-  @Override
-  public long[] allSimilarItemIDs(long itemID) {
-    FastIDSet allSimilarItemIDs = new FastIDSet();
-    LongPrimitiveIterator allItemIDs = trackData.keySetIterator();
-    while (allItemIDs.hasNext()) {
-      long possiblySimilarItemID = allItemIDs.nextLong();
-      if (!Double.isNaN(itemSimilarity(itemID, possiblySimilarItemID))) {
-        allSimilarItemIDs.add(possiblySimilarItemID);
-      }
-    }
-    return allSimilarItemIDs.toArray();
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    // do nothing
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
deleted file mode 100644
index e554d10..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-final class UserResult {
-
-  private final long userID;
-  private final byte[] resultBytes;
-
-  UserResult(long userID, boolean[] result) {
-
-    this.userID = userID;
-
-    int trueCount = 0;
-    for (boolean b : result) {
-      if (b) {
-        trueCount++;
-      }
-    }
-    if (trueCount != 3) {
-      throw new IllegalStateException();
-    }
-
-    resultBytes = new byte[result.length];
-    for (int i = 0; i < result.length; i++) {
-      resultBytes[i] = (byte) (result[i] ? '1' : '0');
-    }
-  }
-
-  public long getUserID() {
-    return userID;
-  }
-
-  public byte[] getResultBytes() {
-    return resultBytes;
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
deleted file mode 100644
index 22f122e..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.hadoop.example.als.netflix;
-
-import com.google.common.base.Preconditions;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.common.iterator.FileLineIterable;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** converts the raw files provided by netflix to an appropriate input format */
-public final class NetflixDatasetConverter {
-
-  private static final Logger log = LoggerFactory.getLogger(NetflixDatasetConverter.class);
-
-  private static final Pattern SEPARATOR = Pattern.compile(",");
-  private static final String MOVIE_DENOTER = ":";
-  private static final String TAB = "\t";
-  private static final String NEWLINE = "\n";
-
-  private NetflixDatasetConverter() {
-  }
-
-  public static void main(String[] args) throws IOException {
-
-    if (args.length != 4) {
-      System.err.println("Usage: NetflixDatasetConverter /path/to/training_set/ /path/to/qualifying.txt "
-          + "/path/to/judging.txt /path/to/destination");
-      return;
-    }
-
-    String trainingDataDir = args[0];
-    String qualifyingTxt = args[1];
-    String judgingTxt = args[2];
-    Path outputPath = new Path(args[3]);
-
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
-
-    Preconditions.checkArgument(trainingDataDir != null, "Training Data location needs to be specified");
-    log.info("Creating training set at {}/trainingSet/ratings.tsv ...", outputPath);
-    try (BufferedWriter writer =
-             new BufferedWriter(
-                 new OutputStreamWriter(
-                     fs.create(new Path(outputPath, "trainingSet/ratings.tsv")), Charsets.UTF_8))){
-
-      int ratingsProcessed = 0;
-      for (File movieRatings : new File(trainingDataDir).listFiles()) {
-        try (FileLineIterator lines = new FileLineIterator(movieRatings)) {
-          boolean firstLineRead = false;
-          String movieID = null;
-          while (lines.hasNext()) {
-            String line = lines.next();
-            if (firstLineRead) {
-              String[] tokens = SEPARATOR.split(line);
-              String userID = tokens[0];
-              String rating = tokens[1];
-              writer.write(userID + TAB + movieID + TAB + rating + NEWLINE);
-              ratingsProcessed++;
-              if (ratingsProcessed % 1000000 == 0) {
-                log.info("{} ratings processed...", ratingsProcessed);
-              }
-            } else {
-              movieID = line.replaceAll(MOVIE_DENOTER, "");
-              firstLineRead = true;
-            }
-          }
-        }
-
-      }
-      log.info("{} ratings processed. done.", ratingsProcessed);
-    }
-
-    log.info("Reading probes...");
-    List<Preference> probes = new ArrayList<>(2817131);
-    long currentMovieID = -1;
-    for (String line : new FileLineIterable(new File(qualifyingTxt))) {
-      if (line.contains(MOVIE_DENOTER)) {
-        currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
-      } else {
-        long userID = Long.parseLong(SEPARATOR.split(line)[0]);
-        probes.add(new GenericPreference(userID, currentMovieID, 0));
-      }
-    }
-    log.info("{} probes read...", probes.size());
-
-    log.info("Reading ratings, creating probe set at {}/probeSet/ratings.tsv ...", outputPath);
-    try (BufferedWriter writer =
-             new BufferedWriter(new OutputStreamWriter(
-                 fs.create(new Path(outputPath, "probeSet/ratings.tsv")), Charsets.UTF_8))){
-      int ratingsProcessed = 0;
-      for (String line : new FileLineIterable(new File(judgingTxt))) {
-        if (line.contains(MOVIE_DENOTER)) {
-          currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
-        } else {
-          float rating = Float.parseFloat(SEPARATOR.split(line)[0]);
-          Preference pref = probes.get(ratingsProcessed);
-          Preconditions.checkState(pref.getItemID() == currentMovieID);
-          ratingsProcessed++;
-          writer.write(pref.getUserID() + TAB + pref.getItemID() + TAB + rating + NEWLINE);
-          if (ratingsProcessed % 1000000 == 0) {
-            log.info("{} ratings processed...", ratingsProcessed);
-          }
-        }
-      }
-      log.info("{} ratings processed. done.", ratingsProcessed);
-    }
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
deleted file mode 100644
index 8021d00..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.similarity.precompute.example;
-
-import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.precompute.FileSimilarItemsWriter;
-import org.apache.mahout.cf.taste.impl.similarity.precompute.MultithreadedBatchItemSimilarities;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
-import org.apache.mahout.cf.taste.similarity.precompute.BatchItemSimilarities;
-
-import java.io.File;
-
-/**
- * Example that precomputes all item similarities of the Movielens1M dataset
- *
- * Usage: download movielens1M from http://www.grouplens.org/node/73 , unzip it and invoke this code with the path
- * to the ratings.dat file as argument
- *
- */
-public final class BatchItemSimilaritiesGroupLens {
-
-  private BatchItemSimilaritiesGroupLens() {}
-
-  public static void main(String[] args) throws Exception {
-
-    if (args.length != 1) {
-      System.err.println("Need path to ratings.dat of the movielens1M dataset as argument!");
-      System.exit(-1);
-    }
-
-    File resultFile = new File(System.getProperty("java.io.tmpdir"), "similarities.csv");
-    if (resultFile.exists()) {
-      resultFile.delete();
-    }
-
-    DataModel dataModel = new GroupLensDataModel(new File(args[0]));
-    ItemBasedRecommender recommender = new GenericItemBasedRecommender(dataModel,
-        new LogLikelihoodSimilarity(dataModel));
-    BatchItemSimilarities batch = new MultithreadedBatchItemSimilarities(recommender, 5);
-
-    int numSimilarities = batch.computeItemSimilarities(Runtime.getRuntime().availableProcessors(), 1,
-        new FileSimilarItemsWriter(resultFile));
-
-    System.out.println("Computed " + numSimilarities + " similarities for " + dataModel.getNumItems() + " items "
-        + "and saved them to " + resultFile.getAbsolutePath());
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
deleted file mode 100644
index 7ee9b17..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.similarity.precompute.example;
-
-import com.google.common.io.Files;
-import com.google.common.io.InputSupplier;
-import com.google.common.io.Resources;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.net.URL;
-import java.util.regex.Pattern;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
-import org.apache.mahout.common.iterator.FileLineIterable;
-
-public final class GroupLensDataModel extends FileDataModel {
-  
-  private static final String COLON_DELIMTER = "::";
-  private static final Pattern COLON_DELIMITER_PATTERN = Pattern.compile(COLON_DELIMTER);
-  
-  public GroupLensDataModel() throws IOException {
-    this(readResourceToTempFile("/org/apache/mahout/cf/taste/example/grouplens/ratings.dat"));
-  }
-  
-  /**
-   * @param ratingsFile GroupLens ratings.dat file in its native format
-   * @throws IOException if an error occurs while reading or writing files
-   */
-  public GroupLensDataModel(File ratingsFile) throws IOException {
-    super(convertGLFile(ratingsFile));
-  }
-  
-  private static File convertGLFile(File originalFile) throws IOException {
-    // Now translate the file; remove commas, then convert "::" delimiter to comma
-    File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "ratings.txt");
-    if (resultFile.exists()) {
-      resultFile.delete();
-    }
-    try (Writer writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8)){
-      for (String line : new FileLineIterable(originalFile, false)) {
-        int lastDelimiterStart = line.lastIndexOf(COLON_DELIMTER);
-        if (lastDelimiterStart < 0) {
-          throw new IOException("Unexpected input format on line: " + line);
-        }
-        String subLine = line.substring(0, lastDelimiterStart);
-        String convertedLine = COLON_DELIMITER_PATTERN.matcher(subLine).replaceAll(",");
-        writer.write(convertedLine);
-        writer.write('\n');
-      }
-    } catch (IOException ioe) {
-      resultFile.delete();
-      throw ioe;
-    }
-    return resultFile;
-  }
-
-  public static File readResourceToTempFile(String resourceName) throws IOException {
-    InputSupplier<? extends InputStream> inSupplier;
-    try {
-      URL resourceURL = Resources.getResource(GroupLensDataModel.class, resourceName);
-      inSupplier = Resources.newInputStreamSupplier(resourceURL);
-    } catch (IllegalArgumentException iae) {
-      File resourceFile = new File("src/main/java" + resourceName);
-      inSupplier = Files.newInputStreamSupplier(resourceFile);
-    }
-    File tempFile = File.createTempFile("taste", null);
-    tempFile.deleteOnExit();
-    Files.copy(inSupplier, tempFile);
-    return tempFile;
-  }
-
-  @Override
-  public String toString() {
-    return "GroupLensDataModel";
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
deleted file mode 100644
index 5cec51c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
+++ /dev/null
@@ -1,128 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier;
-
-import com.google.common.collect.ConcurrentHashMultiset;
-import com.google.common.collect.Multiset;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
-import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
-import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.text.SimpleDateFormat;
-import java.util.Collection;
-import java.util.Date;
-import java.util.Locale;
-import java.util.Random;
-
-public final class NewsgroupHelper {
-  
-  private static final SimpleDateFormat[] DATE_FORMATS = {
-    new SimpleDateFormat("", Locale.ENGLISH),
-    new SimpleDateFormat("MMM-yyyy", Locale.ENGLISH),
-    new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH)
-  };
-
-  public static final int FEATURES = 10000;
-  // 1997-01-15 00:01:00 GMT
-  private static final long DATE_REFERENCE = 853286460;
-  private static final long MONTH = 30 * 24 * 3600;
-  private static final long WEEK = 7 * 24 * 3600;
-  
-  private final Random rand = RandomUtils.getRandom();  
-  private final Analyzer analyzer = new StandardAnalyzer();
-  private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
-  private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
-  
-  public FeatureVectorEncoder getEncoder() {
-    return encoder;
-  }
-  
-  public FeatureVectorEncoder getBias() {
-    return bias;
-  }
-  
-  public Random getRandom() {
-    return rand;
-  }
-
-  public Vector encodeFeatureVector(File file, int actual, int leakType, Multiset<String> overallCounts)
-    throws IOException {
-    long date = (long) (1000 * (DATE_REFERENCE + actual * MONTH + 1 * WEEK * rand.nextDouble()));
-    Multiset<String> words = ConcurrentHashMultiset.create();
-
-    try (BufferedReader reader = Files.newReader(file, Charsets.UTF_8)) {
-      String line = reader.readLine();
-      Reader dateString = new StringReader(DATE_FORMATS[leakType % 3].format(new Date(date)));
-      countWords(analyzer, words, dateString, overallCounts);
-      while (line != null && !line.isEmpty()) {
-        boolean countHeader = (
-                line.startsWith("From:") || line.startsWith("Subject:")
-                        || line.startsWith("Keywords:") || line.startsWith("Summary:")) && leakType < 6;
-        do {
-          Reader in = new StringReader(line);
-          if (countHeader) {
-            countWords(analyzer, words, in, overallCounts);
-          }
-          line = reader.readLine();
-        } while (line != null && line.startsWith(" "));
-      }
-      if (leakType < 3) {
-        countWords(analyzer, words, reader, overallCounts);
-      }
-    }
-
-    Vector v = new RandomAccessSparseVector(FEATURES);
-    bias.addToVector("", 1, v);
-    for (String word : words.elementSet()) {
-      encoder.addToVector(word, Math.log1p(words.count(word)), v);
-    }
-
-    return v;
-  }
-
-  public static void countWords(Analyzer analyzer,
-                                 Collection<String> words,
-                                 Reader in,
-                                 Multiset<String> overallCounts) throws IOException {
-    TokenStream ts = analyzer.tokenStream("text", in);
-    ts.addAttribute(CharTermAttribute.class);
-    ts.reset();
-    while (ts.incrementToken()) {
-      String s = ts.getAttribute(CharTermAttribute.class).toString();
-      words.add(s);
-    }
-    overallCounts.addAll(words);
-    ts.end();
-    Closeables.close(ts, true);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
deleted file mode 100644
index 16e9d80..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VectorWritable;
-
-import java.io.IOException;
-import java.util.Locale;
-import java.util.regex.Pattern;
-
-/**
- * Convert the labels created by the {@link org.apache.mahout.utils.email.MailProcessor} to one consumable
- * by the classifiers
- */
-public class PrepEmailMapper extends Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
-
-  private static final Pattern DASH_DOT = Pattern.compile("-|\\.");
-  private static final Pattern SLASH = Pattern.compile("\\/");
-
-  private boolean useListName = false; //if true, use the project name and the list name in label creation
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    useListName = Boolean.parseBoolean(context.getConfiguration().get(PrepEmailVectorsDriver.USE_LIST_NAME));
-  }
-
-  @Override
-  protected void map(WritableComparable<?> key, VectorWritable value, Context context)
-    throws IOException, InterruptedException {
-    String input = key.toString();
-    ///Example: /cocoon.apache.org/dev/200307.gz/001401c3414f$8394e160$1e01a8c0@WRPO
-    String[] splits = SLASH.split(input);
-    //we need the first two splits;
-    if (splits.length >= 3) {
-      StringBuilder bldr = new StringBuilder();
-      bldr.append(escape(splits[1]));
-      if (useListName) {
-        bldr.append('_').append(escape(splits[2]));
-      }
-      context.write(new Text(bldr.toString()), value);
-    }
-
-  }
-  
-  private static String escape(CharSequence value) {
-    return DASH_DOT.matcher(value).replaceAll("_").toLowerCase(Locale.ENGLISH);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
deleted file mode 100644
index da6e613..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.math.VectorWritable;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-public class PrepEmailReducer extends Reducer<Text, VectorWritable, Text, VectorWritable> {
-
-  private long maxItemsPerLabel = 10000;
-
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    maxItemsPerLabel = Long.parseLong(context.getConfiguration().get(PrepEmailVectorsDriver.ITEMS_PER_CLASS));
-  }
-
-  @Override
-  protected void reduce(Text key, Iterable<VectorWritable> values, Context context)
-    throws IOException, InterruptedException {
-    //TODO: support randomization?  Likely not needed due to the SplitInput utility which does random selection
-    long i = 0;
-    Iterator<VectorWritable> iterator = values.iterator();
-    while (i < maxItemsPerLabel && iterator.hasNext()) {
-      context.write(key, iterator.next());
-      i++;
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
deleted file mode 100644
index 8fba739..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.email;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.VectorWritable;
-
-import java.util.List;
-import java.util.Map;
-
-/**
- * Convert the labels generated by {@link org.apache.mahout.text.SequenceFilesFromMailArchives} and
- * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles} to ones consumable by the classifiers. We do this
- * here b/c if it is done in the creation of sparse vectors, the Reducer collapses all the vectors.
- */
-public class PrepEmailVectorsDriver extends AbstractJob {
-
-  public static final String ITEMS_PER_CLASS = "itemsPerClass";
-  public static final String USE_LIST_NAME = "USE_LIST_NAME";
-
-  public static void main(String[] args) throws Exception {
-    ToolRunner.run(new Configuration(), new PrepEmailVectorsDriver(), args);
-  }
-
-  @Override
-  public int run(String[] args) throws Exception {
-    addInputOption();
-    addOutputOption();
-    addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption("maxItemsPerLabel", "mipl", "The maximum number of items per label.  Can be useful for making the "
-        + "training sets the same size", String.valueOf(100000));
-    addOption(buildOption("useListName", "ul", "Use the name of the list as part of the label.  If not set, then "
-        + "just use the project name", false, false, "false"));
-    Map<String,List<String>> parsedArgs = parseArguments(args);
-    if (parsedArgs == null) {
-      return -1;
-    }
-
-    Path input = getInputPath();
-    Path output = getOutputPath();
-    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
-      HadoopUtil.delete(getConf(), output);
-    }
-    Job convertJob = prepareJob(input, output, SequenceFileInputFormat.class, PrepEmailMapper.class, Text.class,
-        VectorWritable.class, PrepEmailReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
-    convertJob.getConfiguration().set(ITEMS_PER_CLASS, getOption("maxItemsPerLabel"));
-    convertJob.getConfiguration().set(USE_LIST_NAME, String.valueOf(hasOption("useListName")));
-
-    boolean succeeded = convertJob.waitForCompletion(true);
-    return succeeded ? 0 : -1;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
deleted file mode 100644
index 9c0ef56..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sequencelearning.hmm;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.math.Matrix;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * This class implements a sample program that uses a pre-tagged training data
- * set to train an HMM model as a POS tagger. The training data is automatically
- * downloaded from the following URL:
- * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt It then
- * trains an HMM Model using supervised learning and tests the model on the
- * following test data set:
- * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt Further
- * details regarding the data files can be found at
- * http://flexcrfs.sourceforge.net/#Case_Study
- */
-public final class PosTagger {
-
-  private static final Logger log = LoggerFactory.getLogger(PosTagger.class);
-
-  private static final Pattern SPACE = Pattern.compile(" ");
-  private static final Pattern SPACES = Pattern.compile("[ ]+");
-
-  /**
-   * No public constructors for utility classes.
-   */
-  private PosTagger() {
-    // nothing to do here really.
-  }
-
-  /**
-   * Model trained in the example.
-   */
-  private static HmmModel taggingModel;
-
-  /**
-   * Map for storing the IDs for the POS tags (hidden states)
-   */
-  private static Map<String, Integer> tagIDs;
-
-  /**
-   * Counter for the next assigned POS tag ID The value of 0 is reserved for
-   * "unknown POS tag"
-   */
-  private static int nextTagId;
-
-  /**
-   * Map for storing the IDs for observed words (observed states)
-   */
-  private static Map<String, Integer> wordIDs;
-
-  /**
-   * Counter for the next assigned word ID The value of 0 is reserved for
-   * "unknown word"
-   */
-  private static int nextWordId = 1; // 0 is reserved for "unknown word"
-
-  /**
-   * Used for storing a list of POS tags of read sentences.
-   */
-  private static List<int[]> hiddenSequences;
-
-  /**
-   * Used for storing a list of word tags of read sentences.
-   */
-  private static List<int[]> observedSequences;
-
-  /**
-   * number of read lines
-   */
-  private static int readLines;
-
-  /**
-   * Given an URL, this function fetches the data file, parses it, assigns POS
-   * Tag/word IDs and fills the hiddenSequences/observedSequences lists with
-   * data from those files. The data is expected to be in the following format
-   * (one word per line): word pos-tag np-tag sentences are closed with the .
-   * pos tag
-   *
-   * @param url       Where the data file is stored
-   * @param assignIDs Should IDs for unknown words/tags be assigned? (Needed for
-   *                  training data, not needed for test data)
-   * @throws IOException in case data file cannot be read.
-   */
-  private static void readFromURL(String url, boolean assignIDs) throws IOException {
-    // initialize the data structure
-    hiddenSequences = new LinkedList<>();
-    observedSequences = new LinkedList<>();
-    readLines = 0;
-
-    // now read line by line of the input file
-    List<Integer> observedSequence = new LinkedList<>();
-    List<Integer> hiddenSequence = new LinkedList<>();
-
-    for (String line :Resources.readLines(new URL(url), Charsets.UTF_8)) {
-      if (line.isEmpty()) {
-        // new sentence starts
-        int[] observedSequenceArray = new int[observedSequence.size()];
-        int[] hiddenSequenceArray = new int[hiddenSequence.size()];
-        for (int i = 0; i < observedSequence.size(); ++i) {
-          observedSequenceArray[i] = observedSequence.get(i);
-          hiddenSequenceArray[i] = hiddenSequence.get(i);
-        }
-        // now register those arrays
-        hiddenSequences.add(hiddenSequenceArray);
-        observedSequences.add(observedSequenceArray);
-        // and reset the linked lists
-        observedSequence.clear();
-        hiddenSequence.clear();
-        continue;
-      }
-      readLines++;
-      // we expect the format [word] [POS tag] [NP tag]
-      String[] tags = SPACE.split(line);
-      // when analyzing the training set, assign IDs
-      if (assignIDs) {
-        if (!wordIDs.containsKey(tags[0])) {
-          wordIDs.put(tags[0], nextWordId++);
-        }
-        if (!tagIDs.containsKey(tags[1])) {
-          tagIDs.put(tags[1], nextTagId++);
-        }
-      }
-      // determine the IDs
-      Integer wordID = wordIDs.get(tags[0]);
-      Integer tagID = tagIDs.get(tags[1]);
-      // now construct the current sequence
-      if (wordID == null) {
-        observedSequence.add(0);
-      } else {
-        observedSequence.add(wordID);
-      }
-
-      if (tagID == null) {
-        hiddenSequence.add(0);
-      } else {
-        hiddenSequence.add(tagID);
-      }
-    }
-
-    // if there is still something in the pipe, register it
-    if (!observedSequence.isEmpty()) {
-      int[] observedSequenceArray = new int[observedSequence.size()];
-      int[] hiddenSequenceArray = new int[hiddenSequence.size()];
-      for (int i = 0; i < observedSequence.size(); ++i) {
-        observedSequenceArray[i] = observedSequence.get(i);
-        hiddenSequenceArray[i] = hiddenSequence.get(i);
-      }
-      // now register those arrays
-      hiddenSequences.add(hiddenSequenceArray);
-      observedSequences.add(observedSequenceArray);
-    }
-  }
-
-  private static void trainModel(String trainingURL) throws IOException {
-    tagIDs = new HashMap<>(44); // we expect 44 distinct tags
-    wordIDs = new HashMap<>(19122); // we expect 19122
-    // distinct words
-    log.info("Reading and parsing training data file from URL: {}", trainingURL);
-    long start = System.currentTimeMillis();
-    readFromURL(trainingURL, true);
-    long end = System.currentTimeMillis();
-    double duration = (end - start) / 1000.0;
-    log.info("Parsing done in {} seconds!", duration);
-    log.info("Read {} lines containing {} sentences with a total of {} distinct words and {} distinct POS tags.",
-             readLines, hiddenSequences.size(), nextWordId - 1, nextTagId - 1);
-    start = System.currentTimeMillis();
-    taggingModel = HmmTrainer.trainSupervisedSequence(nextTagId, nextWordId,
-        hiddenSequences, observedSequences, 0.05);
-    // we have to adjust the model a bit,
-    // since we assume a higher probability that a given unknown word is NNP
-    // than anything else
-    Matrix emissions = taggingModel.getEmissionMatrix();
-    for (int i = 0; i < taggingModel.getNrOfHiddenStates(); ++i) {
-      emissions.setQuick(i, 0, 0.1 / taggingModel.getNrOfHiddenStates());
-    }
-    int nnptag = tagIDs.get("NNP");
-    emissions.setQuick(nnptag, 0, 1 / (double) taggingModel.getNrOfHiddenStates());
-    // re-normalize the emission probabilities
-    HmmUtils.normalizeModel(taggingModel);
-    // now register the names
-    taggingModel.registerHiddenStateNames(tagIDs);
-    taggingModel.registerOutputStateNames(wordIDs);
-    end = System.currentTimeMillis();
-    duration = (end - start) / 1000.0;
-    log.info("Trained HMM models in {} seconds!", duration);
-  }
-
-  private static void testModel(String testingURL) throws IOException {
-    log.info("Reading and parsing test data file from URL: {}", testingURL);
-    long start = System.currentTimeMillis();
-    readFromURL(testingURL, false);
-    long end = System.currentTimeMillis();
-    double duration = (end - start) / 1000.0;
-    log.info("Parsing done in {} seconds!", duration);
-    log.info("Read {} lines containing {} sentences.", readLines, hiddenSequences.size());
-
-    start = System.currentTimeMillis();
-    int errorCount = 0;
-    int totalCount = 0;
-    for (int i = 0; i < observedSequences.size(); ++i) {
-      // fetch the viterbi path as the POS tag for this observed sequence
-      int[] posEstimate = HmmEvaluator.decode(taggingModel, observedSequences.get(i), false);
-      // compare with the expected
-      int[] posExpected = hiddenSequences.get(i);
-      for (int j = 0; j < posExpected.length; ++j) {
-        totalCount++;
-        if (posEstimate[j] != posExpected[j]) {
-          errorCount++;
-        }
-      }
-    }
-    end = System.currentTimeMillis();
-    duration = (end - start) / 1000.0;
-    log.info("POS tagged test file in {} seconds!", duration);
-    double errorRate = (double) errorCount / totalCount;
-    log.info("Tagged the test file with an error rate of: {}", errorRate);
-  }
-
-  private static List<String> tagSentence(String sentence) {
-    // first, we need to isolate all punctuation characters, so that they
-    // can be recognized
-    sentence = sentence.replaceAll("[,.!?:;\"]", " $0 ");
-    sentence = sentence.replaceAll("''", " '' ");
-    // now we tokenize the sentence
-    String[] tokens = SPACES.split(sentence);
-    // now generate the observed sequence
-    int[] observedSequence = HmmUtils.encodeStateSequence(taggingModel, Arrays.asList(tokens), true, 0);
-    // POS tag this observedSequence
-    int[] hiddenSequence = HmmEvaluator.decode(taggingModel, observedSequence, false);
-    // and now decode the tag names
-    return HmmUtils.decodeStateSequence(taggingModel, hiddenSequence, false, null);
-  }
-
-  public static void main(String[] args) throws IOException {
-    // generate the model from URL
-    trainModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt");
-    testModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt");
-    // tag an exemplary sentence
-    String test = "McDonalds is a huge company with many employees .";
-    String[] testWords = SPACE.split(test);
-    List<String> posTags = tagSentence(test);
-    for (int i = 0; i < posTags.size(); ++i) {
-      log.info("{}[{}]", testWords[i], posTags.get(i));
-    }
-  }
-
-}


[06/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java
new file mode 100644
index 0000000..96f36d4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+/**
+ * Run ensemble learning via loading the {@link ModelTrainer} with two {@link TopicModel} instances:
+ * one from the previous iteration, the other empty.  Inference is done on the first, and the
+ * learning updates are stored in the second, and only emitted at cleanup().
+ * <p/>
+ * In terms of obvious performance improvements still available, the memory footprint in this
+ * Mapper could be dropped by half if we accumulated model updates onto the model we're using
+ * for inference, which might also speed up convergence, as we'd be able to take advantage of
+ * learning <em>during</em> iteration, not just after each one is done.  Most likely we don't
+ * really need to accumulate double values in the model either, floats would most likely be
+ * sufficient.  Between these two, we could squeeze another factor of 4 in memory efficiency.
+ * <p/>
+ * In terms of CPU, we're re-learning the p(topic|doc) distribution on every iteration, starting
+ * from scratch.  This is usually only 10 fixed-point iterations per doc, but that's 10x more than
+ * only 1.  To avoid having to do this, we would need to do a map-side join of the unchanging
+ * corpus with the continually-improving p(topic|doc) matrix, and then emit multiple outputs
+ * from the mappers to make sure we can do the reduce model averaging as well.  Tricky, but
+ * possibly worth it.
+ * <p/>
+ * {@link ModelTrainer} already takes advantage (in maybe the not-nice way) of multi-core
+ * availability by doing multithreaded learning, see that class for details.
+ */
+public class CachingCVB0Mapper
+    extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(CachingCVB0Mapper.class);
+
+  private ModelTrainer modelTrainer;
+  private TopicModel readModel;
+  private TopicModel writeModel;
+  private int maxIters;
+  private int numTopics;
+
+  protected ModelTrainer getModelTrainer() {
+    return modelTrainer;
+  }
+
+  protected int getMaxIters() {
+    return maxIters;
+  }
+
+  protected int getNumTopics() {
+    return numTopics;
+  }
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    log.info("Retrieving configuration");
+    Configuration conf = context.getConfiguration();
+    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
+    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
+    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
+    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
+    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
+    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
+    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
+    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
+    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);
+
+    log.info("Initializing read model");
+    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
+    if (modelPaths != null && modelPaths.length > 0) {
+      readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
+    } else {
+      log.info("No model files found");
+      readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
+          numTrainThreads, modelWeight);
+    }
+
+    log.info("Initializing write model");
+    writeModel = modelWeight == 1
+        ? new TopicModel(numTopics, numTerms, eta, alpha, null, numUpdateThreads)
+        : readModel;
+
+    log.info("Initializing model trainer");
+    modelTrainer = new ModelTrainer(readModel, writeModel, numTrainThreads, numTopics, numTerms);
+    modelTrainer.start();
+  }
+
+  @Override
+  public void map(IntWritable docId, VectorWritable document, Context context)
+      throws IOException, InterruptedException {
+    /* where to get docTopics? */
+    Vector topicVector = new DenseVector(numTopics).assign(1.0 / numTopics);
+    modelTrainer.train(document.get(), topicVector, true, maxIters);
+  }
+
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    log.info("Stopping model trainer");
+    modelTrainer.stop();
+
+    log.info("Writing model");
+    TopicModel readFrom = modelTrainer.getReadModel();
+    for (MatrixSlice topic : readFrom) {
+      context.write(new IntWritable(topic.index()), new VectorWritable(topic.vector()));
+    }
+    readModel.stop();
+    writeModel.stop();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java
new file mode 100644
index 0000000..da77baf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.MemoryUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * Mapper that computes model perplexity over a random sample of the corpus.
+ * For each sampled document it emits (document L1 norm, document perplexity),
+ * which a downstream consumer can aggregate into a corpus-level perplexity.
+ */
+public class CachingCVB0PerplexityMapper extends
+    Mapper<IntWritable, VectorWritable, DoubleWritable, DoubleWritable> {
+  /**
+   * Hadoop counters for {@link CachingCVB0PerplexityMapper}, to aid in debugging.
+   */
+  public enum Counters {
+    SAMPLED_DOCUMENTS
+  }
+
+  private static final Logger log = LoggerFactory.getLogger(CachingCVB0PerplexityMapper.class);
+
+  private ModelTrainer modelTrainer;   // wraps readModel; used here only for calculatePerplexity
+  private TopicModel readModel;        // current topic model, loaded in setup()
+  private int maxIters;                // max inference iterations per document
+  private int numTopics;
+  private float testFraction;          // fraction of documents to sample
+  private Random random;               // seeded in setup() for reproducible sampling
+  private Vector topicVector;          // scratch p(topic|doc) buffer, reused across map() calls
+  private final DoubleWritable outKey = new DoubleWritable();
+  private final DoubleWritable outValue = new DoubleWritable();
+
+  /**
+   * Reads all CVB0 configuration (smoothing parameters, topic/term counts,
+   * thread counts, iteration limit, sampling fraction), loads the topic model
+   * from the configured model paths (or randomly initializes one if none are
+   * found), and starts a periodic memory logger (every 5000 ms).
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    MemoryUtil.startMemoryLogger(5000);
+
+    log.info("Retrieving configuration");
+    Configuration conf = context.getConfiguration();
+    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
+    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
+    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
+    random = RandomUtils.getRandom(seed);
+    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
+    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
+    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
+    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
+    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
+    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);
+    testFraction = conf.getFloat(CVB0Driver.TEST_SET_FRACTION, 0.1f);
+
+    log.info("Initializing read model");
+    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
+    if (modelPaths != null && modelPaths.length > 0) {
+      readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
+    } else {
+      log.info("No model files found");
+      readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
+          numTrainThreads, modelWeight);
+    }
+
+    log.info("Initializing model trainer");
+    // NOTE(review): no write model and no modelTrainer.start() here, unlike the
+    // training mapper -- calculatePerplexity is presumably synchronous; confirm.
+    modelTrainer = new ModelTrainer(readModel, null, numTrainThreads, numTopics, numTerms);
+
+    log.info("Initializing topic vector");
+    topicVector = new DenseVector(new double[numTopics]);
+  }
+
+  /** Stops the read model and the periodic memory logger started in setup(). */
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    readModel.stop();
+    MemoryUtil.stopMemoryLogger();
+  }
+
+  /**
+   * Randomly samples documents at rate {@code testFraction}; for each sampled
+   * document, resets the shared topic vector to uniform and emits
+   * (document L1 norm, perplexity).
+   */
+  @Override
+  public void map(IntWritable docId, VectorWritable document, Context context)
+    throws IOException, InterruptedException {
+    if (testFraction < 1.0f && random.nextFloat() >= testFraction) {
+      return;
+    }
+    context.getCounter(Counters.SAMPLED_DOCUMENTS).increment(1);
+    outKey.set(document.get().norm(1));
+    outValue.set(modelTrainer.calculatePerplexity(document.get(), topicVector.assign(1.0 / numTopics), maxIters));
+    context.write(outKey, outValue);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java
new file mode 100644
index 0000000..d7d09c5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java
@@ -0,0 +1,492 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.DistributedRowMatrixWriter;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs the same algorithm as {@link CVB0Driver}, but sequentially, in memory.  Memory requirements
+ * are currently: the entire corpus is read into RAM, two copies of the model (each of size
+ * numTerms * numTopics), and another matrix of size numDocs * numTopics is held in memory
+ * (to store p(topic|doc) for all docs).
+ *
+ * But if all this fits in memory, this can be significantly faster than an iterative MR job.
+ */
+public class InMemoryCollapsedVariationalBayes0 extends AbstractJob {
+
+  private static final Logger log = LoggerFactory.getLogger(InMemoryCollapsedVariationalBayes0.class);
+
+  private int numTopics;
+  private int numTerms;
+  private int numDocuments;
+  private double alpha;
+  private double eta;
+  //private int minDfCt;
+  //private double maxDfPct;
+  private boolean verbose = false;
+  private String[] terms;  // of length numTerms;
+  private Matrix corpusWeights; // length numDocs;
+  private double totalCorpusWeight;
+  private double initialModelCorpusFraction;
+  private Matrix docTopicCounts;
+  private int numTrainingThreads;
+  private int numUpdatingThreads;
+  private ModelTrainer modelTrainer;
+
+  private InMemoryCollapsedVariationalBayes0() {
+    // only for main usage
+  }
+
+  /** Enables per-iteration logging of the read model's state. */
+  public void setVerbose(boolean verbose) {
+    this.verbose = verbose;
+  }
+    
+  /**
+   * Builds an in-memory CVB0 trainer over the given corpus.
+   *
+   * @param corpus              term-weight matrix, one row per document
+   * @param terms               term dictionary (may be null; then numTerms = corpus.numCols())
+   * @param numTopics           number of topics to learn
+   * @param alpha               smoothing for the p(topic|doc) prior
+   * @param eta                 smoothing for the p(term|topic) prior
+   * @param numTrainingThreads  threads used for training
+   * @param numUpdatingThreads  threads used for model updates
+   * @param modelCorpusFraction for online updates, initial |model|/|corpus| ratio
+   */
+  public InMemoryCollapsedVariationalBayes0(Matrix corpus,
+                                            String[] terms,
+                                            int numTopics,
+                                            double alpha,
+                                            double eta,
+                                            int numTrainingThreads,
+                                            int numUpdatingThreads,
+                                            double modelCorpusFraction) {
+    //this.seed = seed;
+    this.numTopics = numTopics;
+    this.alpha = alpha;
+    this.eta = eta;
+    //this.minDfCt = 0;
+    //this.maxDfPct = 1.0f;
+    corpusWeights = corpus;
+    numDocuments = corpus.numRows();
+    this.terms = terms;
+    this.initialModelCorpusFraction = modelCorpusFraction;
+    numTerms = terms != null ? terms.length : corpus.numCols();
+    // NOTE(review): termIdMap is populated below but never read afterwards.
+    Map<String, Integer> termIdMap = new HashMap<>();
+    if (terms != null) {
+      for (int t = 0; t < terms.length; t++) {
+        termIdMap.put(terms[t], t);
+      }
+    }
+    this.numTrainingThreads = numTrainingThreads;
+    this.numUpdatingThreads = numUpdatingThreads;
+    postInitCorpus();
+    initializeModel();
+  }
+
+  /**
+   * Scans the corpus once to compute its total term weight (L1 norm over all
+   * rows) and logs basic corpus statistics.
+   */
+  private void postInitCorpus() {
+    totalCorpusWeight = 0;
+    int numNonZero = 0;
+    for (int i = 0; i < numDocuments; i++) {
+      Vector v = corpusWeights.viewRow(i);
+      double norm;
+      if (v != null && (norm = v.norm(1)) != 0) {
+        numNonZero += v.getNumNondefaultElements();
+        totalCorpusWeight += norm;
+      }
+    }
+    String s = "Initializing corpus with %d docs, %d terms, %d nonzero entries, total termWeight %f";
+    log.info(String.format(s, numDocuments, numTerms, numNonZero, totalCorpusWeight));
+  }
+
+  /**
+   * Creates the read/write topic model pair (aliased to one model when
+   * initialModelCorpusFraction != 0), a uniform docTopicCounts matrix, and
+   * the ModelTrainer that drives training.
+   */
+  private void initializeModel() {
+    TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(), terms,
+        numUpdatingThreads, initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight);
+    topicModel.setConf(getConf());
+
+    TopicModel updatedModel = initialModelCorpusFraction == 0
+        ? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1)
+        : topicModel;
+    updatedModel.setConf(getConf());
+    docTopicCounts = new DenseMatrix(numDocuments, numTopics);
+    docTopicCounts.assign(1.0 / numTopics);
+    modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms);
+  }
+
+  /*
+  private void inferDocuments(double convergence, int maxIter, boolean recalculate) {
+    for (int docId = 0; docId < corpusWeights.numRows() ; docId++) {
+      Vector inferredDocument = topicModel.infer(corpusWeights.viewRow(docId),
+          docTopicCounts.viewRow(docId));
+      // do what now?
+    }
+  }
+   */
+
+  /** Trains on the full corpus (no held-out test documents). */
+  public void trainDocuments() {
+    trainDocuments(0);
+  }
+
+  /**
+   * Runs one full training pass.  When testFraction != 0, every
+   * (1/testFraction)-th document (by docId) is held out from training.
+   * Each document starts from a fresh uniform topic distribution.
+   */
+  public void trainDocuments(double testFraction) {
+    long start = System.nanoTime();
+    modelTrainer.start();
+    for (int docId = 0; docId < corpusWeights.numRows(); docId++) {
+      if (testFraction == 0 || docId % (1 / testFraction) != 0) {
+        Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics); // docTopicCounts.getRow(docId)
+        modelTrainer.trainSync(corpusWeights.viewRow(docId), docTopics , true, 10);
+      }
+    }
+    modelTrainer.stop();
+    logTime("train documents", System.nanoTime() - start);
+  }
+
+  /*
+  private double error(int docId) {
+    Vector docTermCounts = corpusWeights.viewRow(docId);
+    if (docTermCounts == null) {
+      return 0;
+    } else {
+      Vector expectedDocTermCounts =
+          topicModel.infer(corpusWeights.viewRow(docId), docTopicCounts.viewRow(docId));
+      double expectedNorm = expectedDocTermCounts.norm(1);
+      return expectedDocTermCounts.times(docTermCounts.norm(1)/expectedNorm)
+          .minus(docTermCounts).norm(1);
+    }
+  }
+
+  private double error() {
+    long time = System.nanoTime();
+    double error = 0;
+    for (int docId = 0; docId < numDocuments; docId++) {
+      error += error(docId);
+    }
+    logTime("error calculation", System.nanoTime() - time);
+    return error / totalCorpusWeight;
+  }
+   */
+
+  /** Convenience overload with no held-out test fraction. */
+  public double iterateUntilConvergence(double minFractionalErrorChange,
+      int maxIterations, int minIter, double testFraction) {
+    return iterateUntilConvergence(minFractionalErrorChange, maxIterations, minIter, 0);
+  }
+
+  /**
+   * Trains until the fractional change in perplexity drops below
+   * minFractionalErrorChange (after at least minIter burn-in passes) or
+   * maxIterations is reached; returns the final perplexity.
+   */
+  public double iterateUntilConvergence(double minFractionalErrorChange,
+      int maxIterations, int minIter, double testFraction) {
+    int iter = 0;
+    double oldPerplexity = 0;
+    // Burn-in passes: always run minIter iterations before convergence checks.
+    while (iter < minIter) {
+      trainDocuments(testFraction);
+      if (verbose) {
+        log.info("model after: {}: {}", iter, modelTrainer.getReadModel());
+      }
+      log.info("iteration {} complete", iter);
+      oldPerplexity = modelTrainer.calculatePerplexity(corpusWeights, docTopicCounts,
+          testFraction);
+      log.info("{} = perplexity", oldPerplexity);
+      iter++;
+    }
+    double newPerplexity = 0;
+    double fractionalChange = Double.MAX_VALUE;
+    // NOTE(review): if minIter == 0, oldPerplexity stays 0 and the first
+    // fractionalChange computation below divides by zero (yielding NaN, which
+    // ends the loop after one pass) -- confirm minIter >= 1 is always used.
+    // NOTE(review): this loop calls trainDocuments() (testFraction 0) while
+    // the burn-in loop above passes testFraction -- possibly intentional
+    // (train on the full corpus after burn-in), but worth confirming.
+    while (iter < maxIterations && fractionalChange > minFractionalErrorChange) {
+      trainDocuments();
+      if (verbose) {
+        log.info("model after: {}: {}", iter, modelTrainer.getReadModel());
+      }
+      newPerplexity = modelTrainer.calculatePerplexity(corpusWeights, docTopicCounts,
+          testFraction);
+      log.info("{} = perplexity", newPerplexity);
+      iter++;
+      fractionalChange = Math.abs(newPerplexity - oldPerplexity) / oldPerplexity;
+      log.info("{} = fractionalChange", fractionalChange);
+      oldPerplexity = newPerplexity;
+    }
+    if (iter < maxIterations) {
+      log.info(String.format("Converged! fractional error change: %f, error %f",
+          fractionalChange, newPerplexity));
+    } else {
+      log.info(String.format("Reached max iteration count (%d), fractional error change: %f, error: %f",
+          maxIterations, fractionalChange, newPerplexity));
+    }
+    return newPerplexity;
+  }
+
+  /** Persists the trained topic model to the given path. */
+  public void writeModel(Path outputPath) throws IOException {
+    modelTrainer.persist(outputPath);
+  }
+
+  /** Logs an elapsed time (given in nanoseconds) in milliseconds. */
+  private static void logTime(String label, long nanos) {
+    log.info("{} time: {}ms", label, nanos / 1.0e6);
+  }
+
+  /**
+   * Command-line entry point body: parses options, loads the dictionary and
+   * corpus, trains to convergence, and writes out both p(term|topic) and
+   * p(topic|docid).  Returns 0 on success, -1 when help was requested.
+   */
+  public static int main2(String[] args, Configuration conf) throws Exception {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option helpOpt = DefaultOptionCreator.helpOption();
+
+    Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+      abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The Directory on HDFS containing the collapsed, properly formatted files having "
+          + "one doc per line").withShortName("i").create();
+
+    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
+      abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The path to the term-dictionary format is ... ").withShortName("d").create();
+
+    Option dfsOpt = obuilder.withLongName("dfs").withRequired(false).withArgument(
+      abuilder.withName("dfs").withMinimum(1).withMaximum(1).create()).withDescription(
+      "HDFS namenode URI").withShortName("dfs").create();
+
+    Option numTopicsOpt = obuilder.withLongName("numTopics").withRequired(true).withArgument(abuilder
+        .withName("numTopics").withMinimum(1).withMaximum(1)
+        .create()).withDescription("Number of topics to learn").withShortName("top").create();
+
+    Option outputTopicFileOpt = obuilder.withLongName("topicOutputFile").withRequired(true).withArgument(
+        abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
+        .withDescription("File to write out p(term | topic)").withShortName("to").create();
+
+    Option outputDocFileOpt = obuilder.withLongName("docOutputFile").withRequired(true).withArgument(
+        abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
+        .withDescription("File to write out p(topic | docid)").withShortName("do").create();
+
+    Option alphaOpt = obuilder.withLongName("alpha").withRequired(false).withArgument(abuilder
+        .withName("alpha").withMinimum(1).withMaximum(1).withDefault("0.1").create())
+        .withDescription("Smoothing parameter for p(topic | document) prior").withShortName("a").create();
+
+    Option etaOpt = obuilder.withLongName("eta").withRequired(false).withArgument(abuilder
+        .withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
+        .withDescription("Smoothing parameter for p(term | topic)").withShortName("e").create();
+
+    Option maxIterOpt = obuilder.withLongName("maxIterations").withRequired(false).withArgument(abuilder
+        .withName("maxIterations").withMinimum(1).withMaximum(1).withDefault("10").create())
+        .withDescription("Maximum number of training passes").withShortName("m").create();
+
+    Option modelCorpusFractionOption = obuilder.withLongName("modelCorpusFraction")
+        .withRequired(false).withArgument(abuilder.withName("modelCorpusFraction").withMinimum(1)
+        .withMaximum(1).withDefault("0.0").create()).withShortName("mcf")
+        .withDescription("For online updates, initial value of |model|/|corpus|").create();
+
+    Option burnInOpt = obuilder.withLongName("burnInIterations").withRequired(false).withArgument(abuilder
+        .withName("burnInIterations").withMinimum(1).withMaximum(1).withDefault("5").create())
+        .withDescription("Minimum number of iterations").withShortName("b").create();
+
+    Option convergenceOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(abuilder
+        .withName("convergence").withMinimum(1).withMaximum(1).withDefault("0.0").create())
+        .withDescription("Fractional rate of perplexity to consider convergence").withShortName("c").create();
+
+    Option reInferDocTopicsOpt = obuilder.withLongName("reInferDocTopics").withRequired(false)
+        .withArgument(abuilder.withName("reInferDocTopics").withMinimum(1).withMaximum(1)
+        .withDefault("no").create())
+        .withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
+        .withShortName("rdt").create();
+
+    Option numTrainThreadsOpt = obuilder.withLongName("numTrainThreads").withRequired(false)
+        .withArgument(abuilder.withName("numTrainThreads").withMinimum(1).withMaximum(1)
+        .withDefault("1").create())
+        .withDescription("number of threads to train with")
+        .withShortName("ntt").create();
+
+    Option numUpdateThreadsOpt = obuilder.withLongName("numUpdateThreads").withRequired(false)
+        .withArgument(abuilder.withName("numUpdateThreads").withMinimum(1).withMaximum(1)
+        .withDefault("1").create())
+        .withDescription("number of threads to update the model with")
+        .withShortName("nut").create();
+
+    Option verboseOpt = obuilder.withLongName("verbose").withRequired(false)
+        .withArgument(abuilder.withName("verbose").withMinimum(1).withMaximum(1)
+        .withDefault("false").create())
+        .withDescription("print verbose information, like top-terms in each topic, during iteration")
+        .withShortName("v").create();
+
+    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(numTopicsOpt)
+        .withOption(alphaOpt).withOption(etaOpt)
+        .withOption(maxIterOpt).withOption(burnInOpt).withOption(convergenceOpt)
+        .withOption(dictOpt).withOption(reInferDocTopicsOpt)
+        .withOption(outputDocFileOpt).withOption(outputTopicFileOpt).withOption(dfsOpt)
+        .withOption(numTrainThreadsOpt).withOption(numUpdateThreadsOpt)
+        .withOption(modelCorpusFractionOption).withOption(verboseOpt).create();
+
+    try {
+      Parser parser = new Parser();
+
+      parser.setGroup(group);
+      parser.setHelpOption(helpOpt);
+      CommandLine cmdLine = parser.parse(args);
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return -1;
+      }
+
+      String inputDirString = (String) cmdLine.getValue(inputDirOpt);
+      String dictDirString = cmdLine.hasOption(dictOpt) ? (String)cmdLine.getValue(dictOpt) : null;
+      int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
+      double alpha = Double.parseDouble((String)cmdLine.getValue(alphaOpt));
+      double eta = Double.parseDouble((String)cmdLine.getValue(etaOpt));
+      int maxIterations = Integer.parseInt((String)cmdLine.getValue(maxIterOpt));
+      int burnInIterations = Integer.parseInt((String)cmdLine.getValue(burnInOpt));
+      double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
+      int numTrainThreads = Integer.parseInt((String)cmdLine.getValue(numTrainThreadsOpt));
+      int numUpdateThreads = Integer.parseInt((String)cmdLine.getValue(numUpdateThreadsOpt));
+      String topicOutFile = (String)cmdLine.getValue(outputTopicFileOpt);
+      String docOutFile = (String)cmdLine.getValue(outputDocFileOpt);
+      //String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt);
+      boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
+      double modelCorpusFraction = Double.parseDouble((String)cmdLine.getValue(modelCorpusFractionOption));
+
+      long start = System.nanoTime();
+
+      // NOTE(review): if fs.default.name is unset and --dfs was not supplied,
+      // dfsNameNode is null here and conf.set(...) may fail -- confirm intended.
+      if (conf.get("fs.default.name") == null) {
+        String dfsNameNode = (String)cmdLine.getValue(dfsOpt);
+        conf.set("fs.default.name", dfsNameNode);
+      }
+      String[] terms = loadDictionary(dictDirString, conf);
+      logTime("dictionary loading", System.nanoTime() - start);
+      start = System.nanoTime();
+      Matrix corpus = loadVectors(inputDirString, conf);
+      logTime("vector seqfile corpus loading", System.nanoTime() - start);
+      start = System.nanoTime();
+      InMemoryCollapsedVariationalBayes0 cvb0 =
+          new InMemoryCollapsedVariationalBayes0(corpus, terms, numTopics, alpha, eta,
+                                                 numTrainThreads, numUpdateThreads, modelCorpusFraction);
+      logTime("cvb0 init", System.nanoTime() - start);
+
+      start = System.nanoTime();
+      cvb0.setVerbose(verbose);
+      cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
+      logTime("total training time", System.nanoTime() - start);
+
+      /*
+      if ("randstart".equalsIgnoreCase(reInferDocTopics)) {
+        cvb0.inferDocuments(0.0, 100, true);
+      } else if ("continue".equalsIgnoreCase(reInferDocTopics)) {
+        cvb0.inferDocuments(0.0, 100, false);
+      }
+       */
+
+      start = System.nanoTime();
+      cvb0.writeModel(new Path(topicOutFile));
+      DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
+      logTime("printTopics", System.nanoTime() - start);
+    } catch (OptionException e) {
+      log.error("Error while parsing options", e);
+      CommandLineUtil.printHelp(group);
+    }
+    return 0;
+  }
+
+  /**
+   * Loads a (term -> id) dictionary sequence file into a String[] indexed by
+   * term id.  Returns null when no dictionary path is given; ids missing
+   * from the file leave null entries in the array.
+   */
+  private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
+    if (dictionaryPath == null) {
+      return null;
+    }
+    Path dictionaryFile = new Path(dictionaryPath);
+    List<Pair<Integer, String>> termList = new ArrayList<>();
+    int maxTermId = 0;
+     // key is word value is id
+    for (Pair<Writable, IntWritable> record
+            : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
+      termList.add(new Pair<>(record.getSecond().get(),
+          record.getFirst().toString()));
+      maxTermId = Math.max(maxTermId, record.getSecond().get());
+    }
+    String[] terms = new String[maxTermId + 1];
+    for (Pair<Integer, String> pair : termList) {
+      terms[pair.getFirst()] = pair.getSecond();
+    }
+    return terms;
+  }
+
+  // NOTE(review): trivial override that only delegates to super; could be removed.
+  @Override
+  public Configuration getConf() {
+    return super.getConf();
+  }
+
+  /**
+   * Reads a sequence file (or directory of sequence files) of
+   * (docId, vector) records into a SparseRowMatrix, unwrapping NamedVectors
+   * and sizing the matrix by the largest document id seen.
+   */
+  private static Matrix loadVectors(String vectorPathString, Configuration conf)
+    throws IOException {
+    Path vectorPath = new Path(vectorPathString);
+    FileSystem fs = vectorPath.getFileSystem(conf);
+    List<Path> subPaths = new ArrayList<>();
+    if (fs.isFile(vectorPath)) {
+      subPaths.add(vectorPath);
+    } else {
+      for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
+        subPaths.add(fileStatus.getPath());
+      }
+    }
+    List<Pair<Integer, Vector>> rowList = new ArrayList<>();
+    int numRows = Integer.MIN_VALUE;
+    int numCols = -1;
+    boolean sequentialAccess = false;
+    for (Path subPath : subPaths) {
+      for (Pair<IntWritable, VectorWritable> record
+          : new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
+        int id = record.getFirst().get();
+        Vector vector = record.getSecond().get();
+        if (vector instanceof NamedVector) {
+          vector = ((NamedVector)vector).getDelegate();
+        }
+        if (numCols < 0) {
+          numCols = vector.size();
+          sequentialAccess = vector.isSequentialAccess();
+        }
+        rowList.add(Pair.of(id, vector));
+        numRows = Math.max(numRows, id);
+      }
+    }
+    numRows++;
+    Vector[] rowVectors = new Vector[numRows];
+    for (Pair<Integer, Vector> pair : rowList) {
+      rowVectors[pair.getFirst()] = pair.getSecond();
+    }
+    return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
+
+  }
+
+  /** ToolRunner entry point; delegates to {@link #main2(String[], Configuration)}. */
+  @Override
+  public int run(String[] strings) throws Exception {
+    return main2(strings, getConf());
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new InMemoryCollapsedVariationalBayes0(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
new file mode 100644
index 0000000..c3f2bc0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
@@ -0,0 +1,301 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Multithreaded LDA model trainer class, which primarily operates by running a "map/reduce"
+ * operation, all in memory locally (ie not a hadoop job!) : the "map" operation is to take
+ * the "read-only" {@link TopicModel} and use it to iteratively learn the p(topic|term, doc)
+ * distribution for documents (this can be done in parallel across many documents, as the
+ * "read-only" model is, well, read-only.  Then the outputs of this are "reduced" onto the
+ * "write" model, and these updates are not parallelizable in the same way: individual
+ * documents can't be added to the same entries in different threads at the same time, but
+ * updates across many topics to the same term from the same document can be done in parallel,
+ * so they are.
+ *
+ * Because computation is done asynchronously, when iteration is done, it's important to call
+ * the stop() method, which blocks until work is complete.
+ *
+ * Setting the read model and the write model to be the same object may not quite work yet,
+ * on account of parallelism badness.
+ */
+public class ModelTrainer {
+
+  private static final Logger log = LoggerFactory.getLogger(ModelTrainer.class);
+
+  // Model dimensions: number of latent topics and vocabulary size.
+  private final int numTopics;
+  private final int numTerms;
+  // Model used for read-only inference; swapped with writeModel in stop().
+  private TopicModel readModel;
+  // Model that accumulates (reduces) updates produced during training.
+  private TopicModel writeModel;
+  // Pool executing TrainerRunnable tasks; (re)created by start(), drained by stop().
+  private ThreadPoolExecutor threadPool;
+  // Bounded queue feeding the pool; train(Vector, ...) blocks when it is full,
+  // which throttles document submission.
+  private BlockingQueue<Runnable> workQueue;
+  private final int numTrainThreads;
+  // True when readModel and writeModel are the same instance (in-place training);
+  // in that mode documents are trained in synchronized batches instead of async.
+  private final boolean isReadWrite;
+
+  /**
+   * @param initialReadModel model used for inference (read-only during training)
+   * @param initialWriteModel model that accumulates updates
+   * @param numTrainThreads number of worker threads for the training pool
+   * @param numTopics number of topics
+   * @param numTerms vocabulary size
+   */
+  public ModelTrainer(TopicModel initialReadModel, TopicModel initialWriteModel,
+      int numTrainThreads, int numTopics, int numTerms) {
+    this.readModel = initialReadModel;
+    this.writeModel = initialWriteModel;
+    this.numTrainThreads = numTrainThreads;
+    this.numTopics = numTopics;
+    this.numTerms = numTerms;
+    isReadWrite = initialReadModel == initialWriteModel;
+  }
+
+  /**
+   * WARNING: this constructor may not lead to good behavior.  What should be verified is that
+   * the model updating process does not conflict with model reading.  It might work, but then
+   * again, it might not!
+   * @param model to be used for both reading (inference) and accumulating (learning)
+   * @param numTrainThreads
+   * @param numTopics
+   * @param numTerms
+   */
+  public ModelTrainer(TopicModel model, int numTrainThreads, int numTopics, int numTerms) {
+    this(model, model, numTrainThreads, numTopics, numTerms);
+  }
+
+  public TopicModel getReadModel() {
+    return readModel;
+  }
+
+  /**
+   * Creates the worker thread pool (bounded queue of numTrainThreads * 10 slots)
+   * and resets the write model so a fresh round of updates can be accumulated.
+   * Called automatically at the top of {@code train(VectorIterable, ...)}.
+   */
+  public void start() {
+    log.info("Starting training threadpool with {} threads", numTrainThreads);
+    workQueue = new ArrayBlockingQueue<>(numTrainThreads * 10);
+    threadPool = new ThreadPoolExecutor(numTrainThreads, numTrainThreads, 0, TimeUnit.SECONDS,
+        workQueue);
+    threadPool.allowCoreThreadTimeOut(false);
+    threadPool.prestartAllCoreThreads();
+    writeModel.reset();
+  }
+
+  /** Trains with a single doc-topic inference iteration per document. */
+  public void train(VectorIterable matrix, VectorIterable docTopicCounts) {
+    train(matrix, docTopicCounts, 1);
+  }
+
+  /** Computes perplexity over all documents (testFraction == 0 means "use every doc"). */
+  public double calculatePerplexity(VectorIterable matrix, VectorIterable docTopicCounts) {
+    return calculatePerplexity(matrix, docTopicCounts, 0);
+  }
+
+  /**
+   * Computes corpus perplexity, normalized by the total token count (L1 norm of the
+   * sampled documents).  When testFraction > 0, only every (1/testFraction)-th
+   * document (by id) is scored.  Inference here is synchronous (trainSync) and does
+   * not update the write model.
+   */
+  public double calculatePerplexity(VectorIterable matrix, VectorIterable docTopicCounts,
+      double testFraction) {
+    Iterator<MatrixSlice> docIterator = matrix.iterator();
+    Iterator<MatrixSlice> docTopicIterator = docTopicCounts.iterator();
+    double perplexity = 0;
+    double matrixNorm = 0;
+    while (docIterator.hasNext() && docTopicIterator.hasNext()) {
+      MatrixSlice docSlice = docIterator.next();
+      MatrixSlice topicSlice = docTopicIterator.next();
+      int docId = docSlice.index();
+      Vector document = docSlice.vector();
+      Vector topicDist = topicSlice.vector();
+      if (testFraction == 0 || docId % (1 / testFraction) == 0) {
+        trainSync(document, topicDist, false, 10);
+        perplexity += readModel.perplexity(document, topicDist);
+        matrixNorm += document.norm(1);
+      }
+    }
+    return perplexity / matrixNorm;
+  }
+
+  /**
+   * Trains over the full corpus: iterates documents in lock-step with their
+   * doc-topic vectors, dispatching work asynchronously (or in synchronized
+   * batches when read and write models are the same instance), then calls
+   * stop() to drain the pool and swap models.
+   */
+  public void train(VectorIterable matrix, VectorIterable docTopicCounts, int numDocTopicIters) {
+    start();
+    Iterator<MatrixSlice> docIterator = matrix.iterator();
+    Iterator<MatrixSlice> docTopicIterator = docTopicCounts.iterator();
+    long startTime = System.nanoTime();
+    int i = 0;
+    double[] times = new double[100];
+    Map<Vector, Vector> batch = new HashMap<>();
+    int numTokensInBatch = 0;
+    long batchStart = System.nanoTime();
+    while (docIterator.hasNext() && docTopicIterator.hasNext()) {
+      i++;
+      Vector document = docIterator.next().vector();
+      Vector topicDist = docTopicIterator.next().vector();
+      if (isReadWrite) {
+        if (batch.size() < numTrainThreads) {
+          batch.put(document, topicDist);
+          if (log.isDebugEnabled()) {
+            numTokensInBatch += document.getNumNondefaultElements();
+          }
+        } else {
+          // NOTE(review): 'batch' is never cleared after batchTrain, and the current
+          // document is not added to it on this branch — it looks like the same full
+          // batch is re-trained on every subsequent iteration while new documents are
+          // dropped.  Verify intended behavior before relying on the read-write path.
+          batchTrain(batch, true, numDocTopicIters);
+          long time = System.nanoTime();
+          log.debug("trained {} docs with {} tokens, start time {}, end time {}",
+                    numTrainThreads, numTokensInBatch, batchStart, time);
+          batchStart = time;
+          numTokensInBatch = 0;
+        }
+      } else {
+        long start = System.nanoTime();
+        train(document, topicDist, true, numDocTopicIters);
+        if (log.isDebugEnabled()) {
+          // Rolling window of per-token training times, used for the median below.
+          times[i % times.length] =
+              (System.nanoTime() - start) / (1.0e6 * document.getNumNondefaultElements());
+          if (i % 100 == 0) {
+            long time = System.nanoTime() - startTime;
+            log.debug("trained {} documents in {}ms", i, time / 1.0e6);
+            if (i % 500 == 0) {
+              Arrays.sort(times);
+              log.debug("training took median {}ms per token-instance", times[times.length / 2]);
+            }
+          }
+        }
+      }
+    }
+    stop();
+  }
+
+  /**
+   * Trains a batch of documents synchronously via invokeAll, then (if update is
+   * true) folds each per-document topic model into the write model on the calling
+   * thread.  Retries the whole batch if interrupted.
+   */
+  public void batchTrain(Map<Vector, Vector> batch, boolean update, int numDocTopicsIters) {
+    while (true) {
+      try {
+        List<TrainerRunnable> runnables = new ArrayList<>();
+        for (Map.Entry<Vector, Vector> entry : batch.entrySet()) {
+          runnables.add(new TrainerRunnable(readModel, null, entry.getKey(),
+              entry.getValue(), new SparseRowMatrix(numTopics, numTerms, true),
+              numDocTopicsIters));
+        }
+        threadPool.invokeAll(runnables);
+        if (update) {
+          for (TrainerRunnable runnable : runnables) {
+            writeModel.update(runnable.docTopicModel);
+          }
+        }
+        break;
+      } catch (InterruptedException e) {
+        log.warn("Interrupted during batch training, retrying!", e);
+      }
+    }
+  }
+
+  /**
+   * Asynchronously submits one document for training.  Blocks (and retries on
+   * interrupt) until the bounded work queue has room — this is the back-pressure
+   * mechanism for the producer thread.
+   */
+  public void train(Vector document, Vector docTopicCounts, boolean update, int numDocTopicIters) {
+    while (true) {
+      try {
+        workQueue.put(new TrainerRunnable(readModel, update
+            ? writeModel
+            : null, document, docTopicCounts, new SparseRowMatrix(numTopics, numTerms, true), numDocTopicIters));
+        return;
+      } catch (InterruptedException e) {
+        log.warn("Interrupted waiting to submit document to work queue: {}", document, e);
+      }
+    }
+  }
+
+  /** Trains one document synchronously on the calling thread (no queue involved). */
+  public void trainSync(Vector document, Vector docTopicCounts, boolean update,
+      int numDocTopicIters) {
+    new TrainerRunnable(readModel, update
+        ? writeModel
+        : null, document, docTopicCounts, new SparseRowMatrix(numTopics, numTerms, true), numDocTopicIters).run();
+  }
+
+  /** Runs inference for one document and returns its perplexity; no model update. */
+  public double calculatePerplexity(Vector document, Vector docTopicCounts, int numDocTopicIters) {
+    TrainerRunnable runner =  new TrainerRunnable(readModel, null, document, docTopicCounts,
+        new SparseRowMatrix(numTopics, numTerms, true), numDocTopicIters);
+    return runner.call();
+  }
+
+  /**
+   * Blocks until outstanding work completes (up to 60s), stops both models'
+   * internal updater threads, then swaps read and write models so the freshly
+   * accumulated model becomes the read model for the next iteration.
+   */
+  public void stop() {
+    long startTime = System.nanoTime();
+    log.info("Initiating stopping of training threadpool");
+    try {
+      threadPool.shutdown();
+      if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
+        log.warn("Threadpool timed out on await termination - jobs still running!");
+      }
+      long newTime = System.nanoTime();
+      log.info("threadpool took: {}ms", (newTime - startTime) / 1.0e6);
+      startTime = newTime;
+      readModel.stop();
+      newTime = System.nanoTime();
+      log.info("readModel.stop() took {}ms", (newTime - startTime) / 1.0e6);
+      startTime = newTime;
+      writeModel.stop();
+      newTime = System.nanoTime();
+      log.info("writeModel.stop() took {}ms", (newTime - startTime) / 1.0e6);
+      // Swap: the model we just finished writing becomes the read model.
+      TopicModel tmpModel = writeModel;
+      writeModel = readModel;
+      readModel = tmpModel;
+    } catch (InterruptedException e) {
+      log.error("Interrupted shutting down!", e);
+    }
+  }
+
+  /** Persists the current read model to outputPath, overwriting any existing data. */
+  public void persist(Path outputPath) throws IOException {
+    readModel.persist(outputPath, true);
+  }
+
+  /**
+   * Unit of work for one document: iteratively refines p(topic|doc) against the
+   * read-only model, then optionally pushes the resulting doc-topic matrix to the
+   * write model.  As a Callable it additionally returns the document's perplexity.
+   */
+  private static final class TrainerRunnable implements Runnable, Callable<Double> {
+    private final TopicModel readModel;
+    // May be null: inference-only mode (no learning).
+    private final TopicModel writeModel;
+    private final Vector document;
+    private final Vector docTopics;
+    // Scratch (numTopics x numTerms) matrix holding this document's p(topic|term).
+    private final Matrix docTopicModel;
+    private final int numDocTopicIters;
+
+    private TrainerRunnable(TopicModel readModel, TopicModel writeModel, Vector document,
+        Vector docTopics, Matrix docTopicModel, int numDocTopicIters) {
+      this.readModel = readModel;
+      this.writeModel = writeModel;
+      this.document = document;
+      this.docTopics = docTopics;
+      this.docTopicModel = docTopicModel;
+      this.numDocTopicIters = numDocTopicIters;
+    }
+
+    @Override
+    public void run() {
+      for (int i = 0; i < numDocTopicIters; i++) {
+        // synchronous read-only call:
+        readModel.trainDocTopicModel(document, docTopics, docTopicModel);
+      }
+      if (writeModel != null) {
+        // parallel call which is read-only on the docTopicModel, and write-only on the writeModel
+        // this method does not return until all rows of the docTopicModel have been submitted
+        // to write work queues
+        writeModel.update(docTopicModel);
+      }
+    }
+
+    @Override
+    public Double call() {
+      run();
+      return readModel.perplexity(document, docTopics);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java
new file mode 100644
index 0000000..9ba77c1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java
@@ -0,0 +1,513 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.DistributedRowMatrixWriter;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.stats.Sampler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Thin wrapper around a {@link Matrix} of counts of occurrences of (topic, term) pairs.  Dividing
+ * {@code topicTermCount.viewRow(topic).get(term)} by the sum over the values for all terms in that
+ * row yields p(term | topic).  Instead dividing it by all topic columns for that term yields
+ * p(topic | term).
+ *
+ * Multithreading is enabled for the {@code update(Matrix)} method: this method is async, and
+ * merely submits the matrix to a work queue.  When all work has been submitted,
+ * {@code awaitTermination()} should be called, which will block until updates have been
+ * accumulated.
+ */
+public class TopicModel implements Configurable, Iterable<MatrixSlice> {
+  
+  private static final Logger log = LoggerFactory.getLogger(TopicModel.class);
+  
+  // Optional term dictionary (index -> term string); may be null.
+  private final String[] dictionary;
+  // Counts of (topic, term) co-occurrences; row = topic, column = term.
+  private final Matrix topicTermCounts;
+  // Per-topic row sums of topicTermCounts (L1 norms), cached for normalization.
+  private final Vector topicSums;
+  private final int numTopics;
+  private final int numTerms;
+  // Smoothing hyperparameters: eta smooths p(term|topic), alpha smooths p(topic|doc).
+  private final double eta;
+  private final double alpha;
+
+  private Configuration conf;
+
+  private final Sampler sampler;
+  private final int numThreads;
+  // Pool running the Updater workers that apply async count updates.
+  private ThreadPoolExecutor threadPool;
+  // One Updater per thread; update(Matrix) shards topic rows across them.
+  private Updater[] updaters;
+
+  public int getNumTerms() {
+    return numTerms;
+  }
+
+  public int getNumTopics() {
+    return numTopics;
+  }
+
+  /** Single-threaded dense model with no initial counts loaded from disk. */
+  public TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary,
+      double modelWeight) {
+    this(numTopics, numTerms, eta, alpha, null, dictionary, 1, modelWeight);
+  }
+
+  /** Loads the model matrix from one or more sequence files under modelpath. */
+  public TopicModel(Configuration conf, double eta, double alpha,
+      String[] dictionary, int numThreads, double modelWeight, Path... modelpath) throws IOException {
+    this(loadModel(conf, modelpath), eta, alpha, dictionary, numThreads, modelWeight);
+  }
+
+  /** Zero-initialized dense model. */
+  public TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary,
+      int numThreads, double modelWeight) {
+    this(new DenseMatrix(numTopics, numTerms), new DenseVector(numTopics), eta, alpha, dictionary,
+        numThreads, modelWeight);
+  }
+
+  /** Randomly-initialized model (uniform in [0,1) per cell) when random is non-null. */
+  public TopicModel(int numTopics, int numTerms, double eta, double alpha, Random random,
+      String[] dictionary, int numThreads, double modelWeight) {
+    this(randomMatrix(numTopics, numTerms, random), eta, alpha, dictionary, numThreads, modelWeight);
+  }
+
+  private TopicModel(Pair<Matrix, Vector> model, double eta, double alpha, String[] dict,
+      int numThreads, double modelWeight) {
+    this(model.getFirst(), model.getSecond(), eta, alpha, dict, numThreads, modelWeight);
+  }
+
+  public TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha,
+    String[] dictionary, double modelWeight) {
+    this(topicTermCounts, topicSums, eta, alpha, dictionary, 1, modelWeight);
+  }
+
+  public TopicModel(Matrix topicTermCounts, double eta, double alpha, String[] dictionary,
+      int numThreads, double modelWeight) {
+    this(topicTermCounts, viewRowSums(topicTermCounts),
+        eta, alpha, dictionary, numThreads, modelWeight);
+  }
+
+  /**
+   * Primary constructor.  When modelWeight != 1, all counts (and their sums) are
+   * scaled by it, which lets a loaded model act as a weighted prior.  Also starts
+   * the async updater thread pool.
+   */
+  public TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha,
+    String[] dictionary, int numThreads, double modelWeight) {
+    this.dictionary = dictionary;
+    this.topicTermCounts = topicTermCounts;
+    this.topicSums = topicSums;
+    this.numTopics = topicSums.size();
+    this.numTerms = topicTermCounts.numCols();
+    this.eta = eta;
+    this.alpha = alpha;
+    this.sampler = new Sampler(RandomUtils.getRandom());
+    this.numThreads = numThreads;
+    if (modelWeight != 1) {
+      topicSums.assign(Functions.mult(modelWeight));
+      for (int x = 0; x < numTopics; x++) {
+        topicTermCounts.viewRow(x).assign(Functions.mult(modelWeight));
+      }
+    }
+    initializeThreadPool();
+  }
+
+  /** Computes the L1 norm of every row of m, used as the per-topic count totals. */
+  private static Vector viewRowSums(Matrix m) {
+    Vector v = new DenseVector(m.numRows());
+    for (MatrixSlice slice : m) {
+      v.set(slice.index(), slice.vector().norm(1));
+    }
+    return v;
+  }
+
+  /**
+   * (Re)creates the updater pool: shuts down any previous pool (waiting up to
+   * 100s), then starts numThreads Updater workers, each with its own queue.
+   */
+  private synchronized void initializeThreadPool() {
+    if (threadPool != null) {
+      threadPool.shutdown();
+      try {
+        threadPool.awaitTermination(100, TimeUnit.SECONDS);
+      } catch (InterruptedException e) {
+        log.error("Could not terminate all threads for TopicModel in time.", e);
+      }
+    }
+    threadPool = new ThreadPoolExecutor(numThreads, numThreads, 0, TimeUnit.SECONDS,
+                                                           new ArrayBlockingQueue<Runnable>(numThreads * 10));
+    threadPool.allowCoreThreadTimeOut(false);
+    updaters = new Updater[numThreads];
+    for (int i = 0; i < numThreads; i++) {
+      updaters[i] = new Updater();
+      threadPool.submit(updaters[i]);
+    }
+  }
+
+  Matrix topicTermCounts() {
+    return topicTermCounts;
+  }
+
+  @Override
+  public Iterator<MatrixSlice> iterator() {
+    return topicTermCounts.iterateAll();
+  }
+
+  public Vector topicSums() {
+    return topicSums;
+  }
+
+  /**
+   * Builds a (counts, sums) pair.  With a non-null random, cells are uniform in
+   * [0,1) and sums are the true row norms; otherwise counts are zero and sums are
+   * set to 1.0 to avoid division by zero downstream.
+   */
+  private static Pair<Matrix,Vector> randomMatrix(int numTopics, int numTerms, Random random) {
+    Matrix topicTermCounts = new DenseMatrix(numTopics, numTerms);
+    Vector topicSums = new DenseVector(numTopics);
+    if (random != null) {
+      for (int x = 0; x < numTopics; x++) {
+        for (int term = 0; term < numTerms; term++) {
+          topicTermCounts.viewRow(x).set(term, random.nextDouble());
+        }
+      }
+    }
+    for (int x = 0; x < numTopics; x++) {
+      topicSums.set(x, random == null ? 1.0 : topicTermCounts.viewRow(x).norm(1));
+    }
+    return Pair.of(topicTermCounts, topicSums);
+  }
+
+  /**
+   * Reads (topicId -> term-count vector) rows from the given sequence files.
+   * The number of topics is inferred from the largest row id seen; numTerms from
+   * the first vector's size.
+   *
+   * @throws IOException if the paths contain no vectors
+   */
+  public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths)
+    throws IOException {
+    int numTopics = -1;
+    int numTerms = -1;
+    List<Pair<Integer, Vector>> rows = new ArrayList<>();
+    for (Path modelPath : modelPaths) {
+      for (Pair<IntWritable, VectorWritable> row
+          : new SequenceFileIterable<IntWritable, VectorWritable>(modelPath, true, conf)) {
+        rows.add(Pair.of(row.getFirst().get(), row.getSecond().get()));
+        numTopics = Math.max(numTopics, row.getFirst().get());
+        if (numTerms < 0) {
+          numTerms = row.getSecond().get().size();
+        }
+      }
+    }
+    if (rows.isEmpty()) {
+      throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it");
+    }
+    numTopics++;
+    Matrix model = new DenseMatrix(numTopics, numTerms);
+    Vector topicSums = new DenseVector(numTopics);
+    for (Pair<Integer, Vector> pair : rows) {
+      model.viewRow(pair.getFirst()).assign(pair.getSecond());
+      topicSums.set(pair.getFirst(), pair.getSecond().norm(1));
+    }
+    return Pair.of(model, topicSums);
+  }
+
+  // NOTE: this is purely for debug purposes.  It is not performant to "toString()" a real model
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder();
+    for (int x = 0; x < numTopics; x++) {
+      String v = dictionary != null
+          ? vectorToSortedString(topicTermCounts.viewRow(x).normalize(1), dictionary)
+          : topicTermCounts.viewRow(x).asFormatString();
+      buf.append(v).append('\n');
+    }
+    return buf.toString();
+  }
+
+  /** Samples a topic from topicDistribution, then a term from that topic's row. */
+  public int sampleTerm(Vector topicDistribution) {
+    return sampler.sample(topicTermCounts.viewRow(sampler.sample(topicDistribution)));
+  }
+
+  public int sampleTerm(int topic) {
+    return sampler.sample(topicTermCounts.viewRow(topic));
+  }
+
+  /**
+   * Clears all counts (rows become empty sparse vectors, sums reset to 1.0) and
+   * restarts the updater pool if it had been stopped.
+   */
+  public synchronized void reset() {
+    for (int x = 0; x < numTopics; x++) {
+      topicTermCounts.assignRow(x, new SequentialAccessSparseVector(numTerms));
+    }
+    topicSums.assign(1.0);
+    if (threadPool.isTerminated()) {
+      initializeThreadPool();
+    }
+  }
+
+  /**
+   * Shuts down each Updater (blocking until its queued updates have been applied),
+   * then shuts down the pool itself, waiting up to 60s.
+   */
+  public synchronized void stop() {
+    for (Updater updater : updaters) {
+      updater.shutdown();
+    }
+    threadPool.shutdown();
+    try {
+      if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
+        log.warn("Threadpool timed out on await termination - jobs still running!");
+      }
+    } catch (InterruptedException e) {
+      log.error("Interrupted shutting down!", e);
+    }
+  }
+
+  /** L1-normalizes each topic row in place and resets all sums to 1.0. */
+  public void renormalize() {
+    for (int x = 0; x < numTopics; x++) {
+      topicTermCounts.assignRow(x, topicTermCounts.viewRow(x).normalize(1));
+      topicSums.assign(1.0);
+    }
+  }
+
+  /**
+   * One inference step for a single document: refines both the per-term topic
+   * distributions in docTopicModel and the document's topic vector in place.
+   */
+  public void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel) {
+    // first calculate p(topic|term,document) for all terms in original, and all topics,
+    // using p(term|topic) and p(topic|doc)
+    pTopicGivenTerm(original, topics, docTopicModel);
+    normalizeByTopic(docTopicModel);
+    // now multiply, term-by-term, by the document, to get the weighted distribution of
+    // term-topic pairs from this document.
+    for (Element e : original.nonZeroes()) {
+      for (int x = 0; x < numTopics; x++) {
+        Vector docTopicModelRow = docTopicModel.viewRow(x);
+        docTopicModelRow.setQuick(e.index(), docTopicModelRow.getQuick(e.index()) * e.get());
+      }
+    }
+    // now recalculate \(p(topic|doc)\) by summing contributions from all of pTopicGivenTerm
+    topics.assign(0.0);
+    for (int x = 0; x < numTopics; x++) {
+      topics.set(x, docTopicModel.viewRow(x).norm(1));
+    }
+    // now renormalize so that \(sum_x(p(x|doc))\) = 1
+    topics.assign(Functions.mult(1 / topics.norm(1)));
+  }
+
+  /**
+   * Reconstructs the expected term distribution for a document from its topic
+   * mixture: p(term) = sum_x p(term|x) * p(x|doc), restricted to the document's
+   * non-zero terms.
+   */
+  public Vector infer(Vector original, Vector docTopics) {
+    Vector pTerm = original.like();
+    for (Element e : original.nonZeroes()) {
+      int term = e.index();
+      // p(a) = sum_x (p(a|x) * p(x|i))
+      double pA = 0;
+      for (int x = 0; x < numTopics; x++) {
+        pA += (topicTermCounts.viewRow(x).get(term) / topicSums.get(x)) * docTopics.get(x);
+      }
+      pTerm.set(term, pA);
+    }
+    return pTerm;
+  }
+
+  /**
+   * Asynchronously merges a document's topic-term counts into this model: each
+   * topic row is submitted to one of the Updater queues (sharded by topic index).
+   * Returns once all rows are queued, not once they are applied.
+   */
+  public void update(Matrix docTopicCounts) {
+    for (int x = 0; x < numTopics; x++) {
+      updaters[x % updaters.length].update(x, docTopicCounts.viewRow(x));
+    }
+  }
+
+  /** Synchronously adds docTopicCounts into one topic's row and updates its sum. */
+  public void updateTopic(int topic, Vector docTopicCounts) {
+    topicTermCounts.viewRow(topic).assign(docTopicCounts, Functions.PLUS);
+    topicSums.set(topic, topicSums.get(topic) + docTopicCounts.norm(1));
+  }
+
+  /** Adds a per-topic count vector for a single term across all topic rows. */
+  public void update(int termId, Vector topicCounts) {
+    for (int x = 0; x < numTopics; x++) {
+      Vector v = topicTermCounts.viewRow(x);
+      v.set(termId, v.get(termId) + topicCounts.get(x));
+    }
+    topicSums.assign(topicCounts, Functions.PLUS);
+  }
+
+  /**
+   * Writes the topic-term matrix as a distributed row matrix under outputDir.
+   * NOTE(review): uses the Configuration set via setConf(); presumably callers
+   * must configure the model before persisting — verify, as conf may be null here.
+   */
+  public void persist(Path outputDir, boolean overwrite) throws IOException {
+    FileSystem fs = outputDir.getFileSystem(conf);
+    if (overwrite) {
+      fs.delete(outputDir, true); // CHECK second arg
+    }
+    DistributedRowMatrixWriter.write(outputDir, conf, topicTermCounts);
+  }
+
+  /**
+   * Computes {@code \(p(topic x | term a, document i)\)} distributions given input document {@code i}.
+   * {@code \(pTGT[x][a]\)} is the (un-normalized) {@code \(p(x|a,i)\)}, or if docTopics is {@code null},
+   * {@code \(p(a|x)\)} (also un-normalized).
+   *
+   * @param document doc-term vector encoding {@code \(w(term a|document i)\)}.
+   * @param docTopics {@code docTopics[x]} is the overall weight of topic {@code x} in given
+   *          document. If {@code null}, a topic weight of {@code 1.0} is used for all topics.
+   * @param termTopicDist storage for output {@code \(p(x|a,i)\)} distributions.
+   */
+  private void pTopicGivenTerm(Vector document, Vector docTopics, Matrix termTopicDist) {
+    // for each topic x
+    for (int x = 0; x < numTopics; x++) {
+      // get p(topic x | document i), or 1.0 if docTopics is null
+      double topicWeight = docTopics == null ? 1.0 : docTopics.get(x);
+      // get w(term a | topic x)
+      Vector topicTermRow = topicTermCounts.viewRow(x);
+      // get \sum_a w(term a | topic x)
+      double topicSum = topicSums.get(x);
+      // get p(topic x | term a) distribution to update
+      Vector termTopicRow = termTopicDist.viewRow(x);
+
+      // for each term a in document i with non-zero weight
+      for (Element e : document.nonZeroes()) {
+        int termIndex = e.index();
+
+        // calc un-normalized p(topic x | term a, document i)
+        double termTopicLikelihood = (topicTermRow.get(termIndex) + eta) * (topicWeight + alpha)
+            / (topicSum + eta * numTerms);
+        termTopicRow.set(termIndex, termTopicLikelihood);
+      }
+    }
+  }
+
+  /**
+   * \(sum_x sum_a (c_ai * log(p(x|i) * p(a|x)))\)
+   */
+  public double perplexity(Vector document, Vector docTopics) {
+    double perplexity = 0;
+    double norm = docTopics.norm(1) + (docTopics.size() * alpha);
+    for (Element e : document.nonZeroes()) {
+      int term = e.index();
+      double prob = 0;
+      for (int x = 0; x < numTopics; x++) {
+        // smoothed p(x|doc) * smoothed p(term|x)
+        double d = (docTopics.get(x) + alpha) / norm;
+        double p = d * (topicTermCounts.viewRow(x).get(term) + eta)
+                   / (topicSums.get(x) + eta * numTerms);
+        prob += p;
+      }
+      perplexity += e.get() * Math.log(prob);
+    }
+    // negated so that lower (better) likelihood yields a positive perplexity score
+    return -perplexity;
+  }
+
+  /**
+   * Normalizes each term's column so that sum over topics of p(x|t,d) equals 1.
+   * NOTE(review): iterates the non-zeroes of row 0 only — this assumes every topic
+   * row shares row 0's sparsity pattern (true when rows were filled by
+   * pTopicGivenTerm over the same document); verify if called from elsewhere.
+   */
+  private void normalizeByTopic(Matrix perTopicSparseDistributions) {
+    // then make sure that each of these is properly normalized by topic: sum_x(p(x|t,d)) = 1
+    for (Element e : perTopicSparseDistributions.viewRow(0).nonZeroes()) {
+      int a = e.index();
+      double sum = 0;
+      for (int x = 0; x < numTopics; x++) {
+        sum += perTopicSparseDistributions.viewRow(x).get(a);
+      }
+      for (int x = 0; x < numTopics; x++) {
+        perTopicSparseDistributions.viewRow(x).set(a,
+            perTopicSparseDistributions.viewRow(x).get(a) / sum);
+      }
+    }
+  }
+
+  /**
+   * Debug helper: renders the top 25 entries of vector (descending by weight) as
+   * "{term:weight,...}", using dictionary to map indices to terms when available.
+   */
+  public static String vectorToSortedString(Vector vector, String[] dictionary) {
+    List<Pair<String,Double>> vectorValues = new ArrayList<>(vector.getNumNondefaultElements());
+    for (Element e : vector.nonZeroes()) {
+      vectorValues.add(Pair.of(dictionary != null ? dictionary[e.index()] : String.valueOf(e.index()),
+                               e.get()));
+    }
+    Collections.sort(vectorValues, new Comparator<Pair<String, Double>>() {
+      @Override public int compare(Pair<String, Double> x, Pair<String, Double> y) {
+        return y.getSecond().compareTo(x.getSecond());
+      }
+    });
+    Iterator<Pair<String,Double>> listIt = vectorValues.iterator();
+    StringBuilder bldr = new StringBuilder(2048);
+    bldr.append('{');
+    int i = 0;
+    while (listIt.hasNext() && i < 25) {
+      i++;
+      Pair<String,Double> p = listIt.next();
+      bldr.append(p.getFirst());
+      bldr.append(':');
+      bldr.append(p.getSecond());
+      bldr.append(',');
+    }
+    if (bldr.length() > 1) {
+      // replace the trailing comma with the closing brace
+      bldr.setCharAt(bldr.length() - 1, '}');
+    }
+    return bldr.toString();
+  }
+
+  @Override
+  public void setConf(Configuration configuration) {
+    this.conf = configuration;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Worker that drains (topic, counts) pairs from its own bounded queue and
+   * applies them via updateTopic().  shutdown() blocks (polling via wait(10s))
+   * until the run loop has drained remaining work and signaled completion.
+   */
+  private final class Updater implements Runnable {
+    private final ArrayBlockingQueue<Pair<Integer, Vector>> queue =
+        new ArrayBlockingQueue<>(100);
+    // Set by shutdown(); read by run() and update() without synchronization.
+    private boolean shutdown = false;
+    // Set by run() under 'this' lock once remaining work is drained.
+    private boolean shutdownComplete = false;
+
+    public void shutdown() {
+      try {
+        synchronized (this) {
+          while (!shutdownComplete) {
+            shutdown = true;
+            wait(10000L); // Arbitrarily, wait 10 seconds rather than forever for this
+          }
+        }
+      } catch (InterruptedException e) {
+        log.warn("Interrupted waiting to shutdown() : ", e);
+      }
+    }
+
+    public boolean update(int topic, Vector v) {
+      if (shutdown) { // maybe don't do this?
+        throw new IllegalStateException("In SHUTDOWN state: cannot submit tasks");
+      }
+      while (true) { // keep trying if interrupted
+        try {
+          // start async operation by submitting to the queue
+          queue.put(Pair.of(topic, v));
+          // return once you got access to the queue
+          return true;
+        } catch (InterruptedException e) {
+          log.warn("Interrupted trying to queue update:", e);
+        }
+      }
+    }
+
+    @Override
+    public void run() {
+      while (!shutdown) {
+        try {
+          // poll with a timeout so the shutdown flag is rechecked every second
+          Pair<Integer, Vector> pair = queue.poll(1, TimeUnit.SECONDS);
+          if (pair != null) {
+            updateTopic(pair.getFirst(), pair.getSecond());
+          }
+        } catch (InterruptedException e) {
+          log.warn("Interrupted waiting to poll for update", e);
+        }
+      }
+      // in shutdown mode, finish remaining tasks!
+      for (Pair<Integer, Vector> pair : queue) {
+        updateTopic(pair.getFirst(), pair.getSecond());
+      }
+      synchronized (this) {
+        shutdownComplete = true;
+        notifyAll();
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java
new file mode 100644
index 0000000..9926b91
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java
@@ -0,0 +1,13 @@
+/**
+ * <p>This package provides several clustering algorithm implementations. Clustering usually groups a set of
+ * objects into groups of similar items. The definition of similarity usually is up to you - for text documents,
+ * cosine-distance/-similarity is recommended. Mahout also features other types of distance measure like
+ * Euclidean distance.</p>
+ *
+ * <p>Input of each clustering algorithm is a set of vectors representing your items. For texts in general these are
+ * <a href="http://en.wikipedia.org/wiki/TFIDF">TFIDF</a> or
+ * <a href="http://en.wikipedia.org/wiki/Bag_of_words">Bag of words</a> representations of the documents.</p>
+ *
+ * <p>Output of each clustering algorithm is either a hard or soft assignment of items to clusters.</p>
+ */
+package org.apache.mahout.clustering;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java
new file mode 100644
index 0000000..aa12b9e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+
+public final class AffinityMatrixInputJob {
+
+  private AffinityMatrixInputJob() {
+  }
+
+  /**
+   * Reads the text files holding the affinity matrix in (x_i, x_j, value)
+   * form and runs the map/reduce pass that assembles them into row vectors.
+   *
+   * @param input  path to the raw (i,j,value) text input
+   * @param output destination for the sequence-file rows
+   * @param rows   number of rows in the affinity matrix
+   * @param cols   number of columns (not read by the job itself)
+   */
+  public static void runJob(Path input, Path output, int rows, int cols)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    Configuration configuration = new Configuration();
+    // Clear any output left over from a previous run before launching.
+    HadoopUtil.delete(configuration, output);
+
+    configuration.setInt(Keys.AFFINITY_DIMENSIONS, rows);
+    Job job = new Job(configuration, "AffinityMatrixInputJob: " + input + " -> M/R -> " + output);
+
+    job.setMapperClass(AffinityMatrixInputMapper.class);
+    job.setReducerClass(AffinityMatrixInputReducer.class);
+    job.setMapOutputKeyClass(IntWritable.class);
+    job.setMapOutputValueClass(DistributedRowMatrix.MatrixEntryWritable.class);
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(VectorWritable.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+    FileInputFormat.addInputPath(job, input);
+    FileOutputFormat.setOutputPath(job, output);
+
+    job.setJarByClass(AffinityMatrixInputJob.class);
+
+    if (!job.waitForCompletion(true)) {
+      throw new IllegalStateException("Job failed!");
+    }
+  }
+
+  /**
+   * A transparent wrapper for the above method which handles the tedious tasks
+   * of setting and retrieving system Paths. Hands back a fully-populated
+   * and initialized DistributedRowMatrix.
+   */
+  public static DistributedRowMatrix runJob(Path input, Path output, int dimensions)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    Path seqFilePath = new Path(output, "seqfiles-" + (System.nanoTime() & 0xFF));
+    runJob(input, seqFilePath, dimensions, dimensions);
+    DistributedRowMatrix matrix = new DistributedRowMatrix(seqFilePath,
+        new Path(seqFilePath, "seqtmp-" + (System.nanoTime() & 0xFF)),
+        dimensions, dimensions);
+    matrix.setConf(new Configuration());
+    return matrix;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java
new file mode 100644
index 0000000..30d2404
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>Handles reading the files representing the affinity matrix. Since the affinity
+ * matrix is representative of a graph, each line in all the files should
+ * take the form:</p>
+ *
+ * {@code i,j,value}
+ *
+ * <p>where {@code i} and {@code j} are the {@code i}th and
+ * {@code j} data points in the entire set, and {@code value}
+ * represents some measurement of their relative absolute magnitudes. This
+ * is, simply, a method for representing a graph textually.
+ */
+public class AffinityMatrixInputMapper
+    extends Mapper<LongWritable, Text, IntWritable, DistributedRowMatrix.MatrixEntryWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(AffinityMatrixInputMapper.class);
+
+  private static final Pattern COMMA_PATTERN = Pattern.compile(",");
+
+  /**
+   * Parses one "i,j,value" line into a matrix entry keyed by its row index.
+   *
+   * @throws IOException if the line does not contain exactly three
+   *         non-empty comma-separated fields
+   */
+  @Override
+  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+
+    String[] elements = COMMA_PATTERN.split(value.toString());
+    log.debug("(DEBUG - MAP) Key[{}], Value[{}]", key.get(), value);
+
+    // enforce well-formed textual representation of the graph
+    if (elements.length != 3) {
+      throw new IOException("Expected input of length 3, received "
+                            + elements.length + ". Please make sure you adhere to "
+                            + "the structure of (i,j,value) for representing a graph in text. "
+                            + "Input line was: '" + value + "'.");
+    }
+    if (elements[0].isEmpty() || elements[1].isEmpty() || elements[2].isEmpty()) {
+      throw new IOException("Found an element of 0 length. Please be sure you adhere to the structure of "
+          + "(i,j,value) for  representing a graph in text.");
+    }
+
+    // parse the line of text into a DistributedRowMatrix entry,
+    // making the row (elements[0]) the key to the Reducer, and
+    // setting the column (elements[1]) in the entry itself
+    DistributedRowMatrix.MatrixEntryWritable toAdd = new DistributedRowMatrix.MatrixEntryWritable();
+    // parseInt/parseDouble: same parsing and NumberFormatException behavior as
+    // Integer.valueOf/Double.valueOf but without boxing an object per field
+    IntWritable row = new IntWritable(Integer.parseInt(elements[0]));
+    toAdd.setRow(-1); // already set as the Reducer's key
+    toAdd.setCol(Integer.parseInt(elements[1]));
+    toAdd.setVal(Double.parseDouble(elements[2]));
+    context.write(row, toAdd);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java
new file mode 100644
index 0000000..d892969
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tasked with taking each DistributedRowMatrix entry and collecting them
+ * into vectors corresponding to rows. The input and output keys are the same,
+ * corresponding to the row in the ensuing matrix. The matrix entries are
+ * entered into a vector according to the column to which they belong, and
+ * the vector is then given the key corresponding to its row.
+ */
+public class AffinityMatrixInputReducer
+    extends Reducer<IntWritable, DistributedRowMatrix.MatrixEntryWritable, IntWritable, VectorWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(AffinityMatrixInputReducer.class);
+
+  /**
+   * Collects every matrix entry belonging to this row into a sparse vector
+   * and emits it, keyed by the row index, in sequential-access form.
+   */
+  @Override
+  protected void reduce(IntWritable row, Iterable<DistributedRowMatrix.MatrixEntryWritable> values, Context context)
+    throws IOException, InterruptedException {
+    int dimensions = context.getConfiguration().getInt(Keys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE);
+    // Random-access vector while filling; converted once at the end.
+    RandomAccessSparseVector accumulator = new RandomAccessSparseVector(dimensions, 100);
+
+    for (DistributedRowMatrix.MatrixEntryWritable entry : values) {
+      accumulator.setQuick(entry.getCol(), entry.getVal());
+      if (log.isDebugEnabled()) {
+        log.debug("(DEBUG - REDUCE) Row[{}], Column[{}], Value[{}]",
+                  row.get(), entry.getCol(), entry.getVal());
+      }
+    }
+    // Sequential-access form is more compact for downstream row iteration.
+    context.write(row, new VectorWritable(new SequentialAccessSparseVector(accumulator)));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java
new file mode 100644
index 0000000..593cc58
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A {@link Writable} pairing of an int key with a double value -- a
+ * specialized, serializable stand-in for the generic mahout.common.Pair,
+ * whose type parameters would otherwise each need to implement Writable.
+ *
+ * In essence, this can be treated as a single Vector Element.
+ */
+public class IntDoublePairWritable implements Writable {
+
+  private int key;
+  private double value;
+
+  public IntDoublePairWritable() {
+  }
+
+  public IntDoublePairWritable(int k, double v) {
+    this.key = k;
+    this.value = v;
+  }
+
+  public int getKey() {
+    return key;
+  }
+
+  public double getValue() {
+    return value;
+  }
+
+  public void setKey(int k) {
+    this.key = k;
+  }
+
+  public void setValue(double v) {
+    this.value = v;
+  }
+
+  /** Deserializes the key then the value, mirroring {@link #write}. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    key = in.readInt();
+    value = in.readDouble();
+  }
+
+  /** Serializes the key then the value. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(key);
+    out.writeDouble(value);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java
new file mode 100644
index 0000000..268a365
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+public class Keys {
+
+  /** Configuration key under which the affinity matrix dimensionality is stored. */
+  public static final String AFFINITY_DIMENSIONS = "org.apache.mahout.clustering.spectral.common.affinitydimensions";
+
+  /**
+   * Sets the SequenceFile index for the diagonal matrix.
+   */
+  public static final int DIAGONAL_CACHE_INDEX = 1;
+
+  // Constants holder: never instantiated.
+  private Keys() {}
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java
new file mode 100644
index 0000000..f245f99
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Given a matrix, this job returns a vector whose i_th element is the 
+ * sum of all the elements in the i_th row of the original matrix.
+ */
+public final class MatrixDiagonalizeJob {
+
+  private MatrixDiagonalizeJob() {
+  }
+
+  /**
+   * Runs the diagonalization job and returns the vector of row sums.
+   *
+   * @param affInput   path to the affinity matrix sequence files
+   * @param dimensions number of rows (and columns) in the matrix
+   * @return a dense vector whose i_th entry is the sum of row i
+   */
+  public static Vector runJob(Path affInput, int dimensions)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    
+    // set up all the job tasks
+    Configuration conf = new Configuration();
+    Path diagOutput = new Path(affInput.getParent(), "diagonal");
+    HadoopUtil.delete(conf, diagOutput);
+    conf.setInt(Keys.AFFINITY_DIMENSIONS, dimensions);
+    Job job = new Job(conf, "MatrixDiagonalizeJob");
+    
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setMapOutputKeyClass(NullWritable.class);
+    job.setMapOutputValueClass(IntDoublePairWritable.class);
+    job.setOutputKeyClass(NullWritable.class);
+    job.setOutputValueClass(VectorWritable.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    job.setMapperClass(MatrixDiagonalizeMapper.class);
+    job.setReducerClass(MatrixDiagonalizeReducer.class);
+    // The result is read back from the single file "part-r-00000" below, so
+    // force exactly one reducer: with a cluster default of several reducers,
+    // the lone NullWritable key group may be partitioned into a part file
+    // other than part-r-00000, which would then be missing or empty.
+    job.setNumReduceTasks(1);
+    
+    FileInputFormat.addInputPath(job, affInput);
+    FileOutputFormat.setOutputPath(job, diagOutput);
+    
+    job.setJarByClass(MatrixDiagonalizeJob.class);
+
+    boolean succeeded = job.waitForCompletion(true);
+    if (!succeeded) {
+      throw new IllegalStateException("Job failed!");
+    }
+
+    // read the results back from the path
+    return VectorCache.load(conf, new Path(diagOutput, "part-r-00000"));
+  }
+  
+  /** Emits, for each input row, the pair (row index, sum of the row's elements). */
+  public static class MatrixDiagonalizeMapper
+    extends Mapper<IntWritable, VectorWritable, NullWritable, IntDoublePairWritable> {
+    
+    @Override
+    protected void map(IntWritable key, VectorWritable row, Context context) 
+      throws IOException, InterruptedException {
+      // store the sum
+      IntDoublePairWritable store = new IntDoublePairWritable(key.get(), row.get().zSum());
+      context.write(NullWritable.get(), store);
+    }
+  }
+  
+  /** Collects all (row, sum) pairs into a single dense vector of row sums. */
+  public static class MatrixDiagonalizeReducer
+    extends Reducer<NullWritable, IntDoublePairWritable, NullWritable, VectorWritable> {
+    
+    @Override
+    protected void reduce(NullWritable key, Iterable<IntDoublePairWritable> values,
+      Context context) throws IOException, InterruptedException {
+      // create the return vector
+      Vector retval = new DenseVector(context.getConfiguration().getInt(Keys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE));
+      // put everything in its correct spot
+      for (IntDoublePairWritable e : values) {
+        retval.setQuick(e.getKey(), e.getValue());
+      }
+      // write it out
+      context.write(key, new VectorWritable(retval));
+    }
+  }
+}


[08/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
new file mode 100644
index 0000000..265d3da
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VectorWritable;
+
+@Deprecated
+class CanopyMapper extends
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
+
+  // Canopies accumulated over all points seen by this mapper instance.
+  private final Collection<Canopy> canopies = Lists.newArrayList();
+
+  private CanopyClusterer canopyClusterer;
+
+  // A canopy's centroid is emitted only if its observation count exceeds this.
+  private int clusterFilter;
+
+  @Override
+  protected void setup(Context context) throws IOException,
+      InterruptedException {
+    super.setup(context);
+    canopyClusterer = CanopyConfigKeys.configureCanopyClusterer(context.getConfiguration());
+    clusterFilter = Integer.parseInt(context.getConfiguration().get(
+        CanopyConfigKeys.CF_KEY));
+  }
+
+  @Override
+  protected void map(WritableComparable<?> key, VectorWritable point,
+      Context context) throws IOException, InterruptedException {
+    // Cluster each point locally; nothing is written until cleanup().
+    canopyClusterer.addPointToCanopies(point.get(), canopies);
+  }
+
+  @Override
+  protected void cleanup(Context context) throws IOException,
+      InterruptedException {
+    // Emit one centroid per sufficiently populated canopy.
+    for (Canopy canopy : canopies) {
+      canopy.computeParameters();
+      if (canopy.getNumObservations() <= clusterFilter) {
+        continue;
+      }
+      context.write(new Text("centroid"), new VectorWritable(canopy.getCenter()));
+    }
+    super.cleanup(context);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
new file mode 100644
index 0000000..cdd7d5e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+@Deprecated
+public class CanopyReducer extends Reducer<Text, VectorWritable, Text, ClusterWritable> {
+
+  // Canopies built from every point routed to this reducer.
+  private final Collection<Canopy> canopies = Lists.newArrayList();
+
+  private CanopyClusterer canopyClusterer;
+
+  // A canopy is written out only if its observation count exceeds this.
+  private int clusterFilter;
+
+  CanopyClusterer getCanopyClusterer() {
+    return canopyClusterer;
+  }
+
+  @Override
+  protected void setup(Context context) throws IOException,
+      InterruptedException {
+    super.setup(context);
+    canopyClusterer = CanopyConfigKeys.configureCanopyClusterer(context.getConfiguration());
+    canopyClusterer.useT3T4();
+    clusterFilter = Integer.parseInt(context.getConfiguration().get(
+        CanopyConfigKeys.CF_KEY));
+  }
+
+  @Override
+  protected void reduce(Text arg0, Iterable<VectorWritable> values,
+      Context context) throws IOException, InterruptedException {
+    // First pass: fold every incoming point into the canopy collection.
+    for (VectorWritable value : values) {
+      canopyClusterer.addPointToCanopies(value.get(), canopies);
+    }
+    // Second pass: finalize statistics and emit the canopies that qualify.
+    for (Canopy canopy : canopies) {
+      canopy.computeParameters();
+      if (canopy.getNumObservations() <= clusterFilter) {
+        continue;
+      }
+      ClusterWritable clusterWritable = new ClusterWritable();
+      clusterWritable.setValue(canopy);
+      context.write(new Text(canopy.getIdentifier()), clusterWritable);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java
new file mode 100644
index 0000000..6b88388
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+/**
+ * Constants used in Cluster Classification.
+ */
+public final class ClusterClassificationConfigKeys {
+
+  // Key for the path of the clusters used during classification.
+  public static final String CLUSTERS_IN = "clusters_in";
+
+  // Key for the pdf threshold below which points are treated as outliers.
+  public static final String OUTLIER_REMOVAL_THRESHOLD = "pdf_threshold";
+
+  // Key for the flag selecting emission of only the most likely cluster.
+  public static final String EMIT_MOST_LIKELY = "emit_most_likely";
+
+  private ClusterClassificationConfigKeys() {
+    // constants holder: never instantiated
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
new file mode 100644
index 0000000..ead95cf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
@@ -0,0 +1,313 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Classifies the vectors into different clusters found by the clustering
+ * algorithm.
+ */
+public final class ClusterClassificationDriver extends AbstractJob {
+  
+  /**
+   * CLI to run Cluster Classification Driver.
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.methodOption().create());
+    addOption(DefaultOptionCreator.clustersInOption()
+        .withDescription("The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.")
+        .create());
+    
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+    
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    
+    if (getConf() == null) {
+      setConf(new Configuration());
+    }
+    Path clustersIn = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
+    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+        DefaultOptionCreator.SEQUENTIAL_METHOD);
+    
+    double clusterClassificationThreshold = 0.0;
+    if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+      clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+    }
+    
+    run(getConf(), input, clustersIn, output, clusterClassificationThreshold, true, runSequential);
+    
+    return 0;
+  }
+  
+  /**
+   * Constructor to be used by the ToolRunner.
+   */
+  private ClusterClassificationDriver() {
+  }
+  
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new ClusterClassificationDriver(), args);
+  }
+  
+  /**
+   * Uses {@link ClusterClassifier} to classify input vectors into their
+   * respective clusters.
+   * 
+   * @param input
+   *          the input vectors
+   * @param clusteringOutputPath
+   *          the output path of clustering ( it reads clusters-*-final file
+   *          from here )
+   * @param output
+   *          the location to store the classified vectors
+   * @param clusterClassificationThreshold
+   *          the threshold value of probability distribution function from 0.0
+   *          to 1.0. Any vector with pdf less that this threshold will not be
+   *          classified for the cluster.
+   * @param runSequential
+   *          Run the process sequentially or in a mapreduce way.
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws ClassNotFoundException
+   */
+  public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output, Double clusterClassificationThreshold,
+      boolean emitMostLikely, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
+    if (runSequential) {
+      classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+    } else {
+      classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+    }
+    
+  }
+  
+  private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
+      Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
+    List<Cluster> clusterModels = populateClusterModels(clusters, conf);
+    ClusteringPolicy policy = ClusterClassifier.readPolicy(finalClustersPath(conf, clusters));
+    ClusterClassifier clusterClassifier = new ClusterClassifier(clusterModels, policy);
+    selectCluster(input, clusterModels, clusterClassifier, output, clusterClassificationThreshold, emitMostLikely);
+    
+  }
+  
+  /**
+   * Populates a list with clusters present in clusters-*-final directory.
+   * 
+   * @param clusterOutputPath
+   *          The output path of the clustering.
+   * @param conf
+   *          The Hadoop Configuration
+   * @return The list of clusters found by the clustering.
+   * @throws IOException
+   */
+  private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
+    List<Cluster> clusterModels = new ArrayList<>();
+    Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
+    Iterator<?> it = new SequenceFileDirValueIterator<>(finalClustersPath, PathType.LIST,
+        PathFilters.partFilter(), null, false, conf);
+    while (it.hasNext()) {
+      ClusterWritable next = (ClusterWritable) it.next();
+      Cluster cluster = next.getValue();
+      cluster.configure(conf);
+      clusterModels.add(cluster);
+    }
+    return clusterModels;
+  }
+  
+  private static Path finalClustersPath(Configuration conf, Path clusterOutputPath) throws IOException {
+    FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+    FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+    return clusterFiles[0].getPath();
+  }
+  
+  /**
+   * Classifies the vector into its respective cluster.
+   * 
+   * @param input
+   *          the path containing the input vector.
+   * @param clusterModels
+   *          the clusters
+   * @param clusterClassifier
+   *          used to classify the vectors into different clusters
+   * @param output
+   *          the path to store classified data
+   * @param clusterClassificationThreshold
+   *          the threshold value of probability distribution function from 0.0
+   *          to 1.0. Any vector with pdf less that this threshold will not be
+   *          classified for the cluster
+   * @param emitMostLikely
+   *          emit the vectors with the max pdf values per cluster
+   * @throws IOException
+   */
+  private static void selectCluster(Path input, List<Cluster> clusterModels, ClusterClassifier clusterClassifier,
+      Path output, Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
+    Configuration conf = new Configuration();
+    SequenceFile.Writer writer = new SequenceFile.Writer(input.getFileSystem(conf), conf, new Path(output,
+        "part-m-" + 0), IntWritable.class, WeightedPropertyVectorWritable.class);
+    for (Pair<Writable, VectorWritable> vw : new SequenceFileDirIterable<Writable, VectorWritable>(input, PathType.LIST,
+        PathFilters.logsCRCFilter(), conf)) {
+      // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point
+      // belongs to which cluster - fix for MAHOUT-1410
+      Class<? extends Writable> keyClass = vw.getFirst().getClass();
+      Vector vector = vw.getSecond().get();
+      if (!keyClass.equals(NamedVector.class)) {
+        if (keyClass.equals(Text.class)) {
+          vector = new NamedVector(vector, vw.getFirst().toString());
+        } else if (keyClass.equals(IntWritable.class)) {
+          vector = new NamedVector(vector, Integer.toString(((IntWritable) vw.getFirst()).get()));
+        }
+      }
+      Vector pdfPerCluster = clusterClassifier.classify(vector);
+      if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) {
+        classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster);
+      }
+    }
+    writer.close();
+  }
+  
+  private static void classifyAndWrite(List<Cluster> clusterModels, Double clusterClassificationThreshold,
+      boolean emitMostLikely, SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException {
+    Map<Text, Text> props = new HashMap<>();
+    if (emitMostLikely) {
+      int maxValueIndex = pdfPerCluster.maxValueIndex();
+      WeightedPropertyVectorWritable weightedPropertyVectorWritable =
+          new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props);
+      write(clusterModels, writer, weightedPropertyVectorWritable, maxValueIndex);
+    } else {
+      writeAllAboveThreshold(clusterModels, clusterClassificationThreshold, writer, vw, pdfPerCluster);
+    }
+  }
+  
+  private static void writeAllAboveThreshold(List<Cluster> clusterModels, Double clusterClassificationThreshold,
+      SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException {
+    Map<Text, Text> props = new HashMap<>();
+    for (Element pdf : pdfPerCluster.nonZeroes()) {
+      if (pdf.get() >= clusterClassificationThreshold) {
+        WeightedPropertyVectorWritable wvw = new WeightedPropertyVectorWritable(pdf.get(), vw.get(), props);
+        int clusterIndex = pdf.index();
+        write(clusterModels, writer, wvw, clusterIndex);
+      }
+    }
+  }
+
+  private static void write(List<Cluster> clusterModels, SequenceFile.Writer writer,
+      WeightedPropertyVectorWritable weightedPropertyVectorWritable,
+      int maxValueIndex) throws IOException {
+    Cluster cluster = clusterModels.get(maxValueIndex);
+
+    DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
+    DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
+    double distance = distanceMeasure.distance(cluster.getCenter(), weightedPropertyVectorWritable.getVector());
+
+    weightedPropertyVectorWritable.getProperties().put(new Text("distance"), new Text(Double.toString(distance)));
+    writer.append(new IntWritable(cluster.getId()), weightedPropertyVectorWritable);
+  }
+  
+  /**
+   * Decides whether the vector should be classified or not based on the max pdf
+   * value of the clusters and threshold value.
+   * 
+   * @return whether the vector should be classified or not.
+   */
+  private static boolean shouldClassify(Vector pdfPerCluster, Double clusterClassificationThreshold) {
+    return pdfPerCluster.maxValue() >= clusterClassificationThreshold;
+  }
+  
+  private static void classifyClusterMR(Configuration conf, Path input, Path clustersIn, Path output,
+      Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException, InterruptedException,
+      ClassNotFoundException {
+    
+    conf.setFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD,
+                  clusterClassificationThreshold.floatValue());
+    conf.setBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, emitMostLikely);
+    conf.set(ClusterClassificationConfigKeys.CLUSTERS_IN, clustersIn.toUri().toString());
+    
+    Job job = new Job(conf, "Cluster Classification Driver running over input: " + input);
+    job.setJarByClass(ClusterClassificationDriver.class);
+    
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    
+    job.setMapperClass(ClusterClassificationMapper.class);
+    job.setNumReduceTasks(0);
+    
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(WeightedPropertyVectorWritable.class);
+    
+    FileInputFormat.addInputPath(job, input);
+    FileOutputFormat.setOutputPath(job, output);
+    if (!job.waitForCompletion(true)) {
+      throw new InterruptedException("Cluster Classification Driver Job failed processing " + input);
+    }
+  }
+  
+  public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output,
+      double clusterClassificationThreshold, boolean emitMostLikely, boolean runSequential) throws IOException,
+      InterruptedException, ClassNotFoundException {
+    if (runSequential) {
+      classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+    } else {
+      classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
new file mode 100644
index 0000000..fffa7f9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Mapper for classifying vectors into clusters.
+ */
+public class ClusterClassificationMapper extends
+    Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable> {
+  
+  // Minimum pdf a cluster must reach before this vector is emitted for it.
+  private double threshold;
+  private List<Cluster> clusterModels;
+  private ClusterClassifier clusterClassifier;
+  // Reused across map() calls to avoid per-record allocation.
+  private IntWritable clusterId;
+  private boolean emitMostLikely;
+  
+  /**
+   * Loads the cluster models and clustering policy from the path configured
+   * under {@link ClusterClassificationConfigKeys#CLUSTERS_IN}.
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    
+    Configuration conf = context.getConfiguration();
+    String clustersIn = conf.get(ClusterClassificationConfigKeys.CLUSTERS_IN);
+    threshold = conf.getFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD, 0.0f);
+    emitMostLikely = conf.getBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, false);
+    
+    clusterModels = new ArrayList<>();
+    
+    if (clustersIn != null && !clustersIn.isEmpty()) {
+      Path clustersInPath = new Path(clustersIn);
+      clusterModels = populateClusterModels(clustersInPath, conf);
+      // Use the job configuration (rather than a fresh Configuration) so
+      // job-level filesystem settings are honoured when resolving the path.
+      ClusteringPolicy policy = ClusterClassifier
+          .readPolicy(finalClustersPath(clustersInPath, conf));
+      clusterClassifier = new ClusterClassifier(clusterModels, policy);
+    }
+    clusterId = new IntWritable();
+  }
+  
+  /**
+   * Classifies the vector against all cluster models and emits it for the
+   * most likely cluster, or for every cluster above the threshold.
+   */
+  @Override
+  protected void map(WritableComparable<?> key, VectorWritable vw, Context context)
+    throws IOException, InterruptedException {
+    if (!clusterModels.isEmpty()) {
+      // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point
+      // belongs to which cluster - fix for MAHOUT-1410
+      Class<? extends Vector> vectorClass = vw.get().getClass();
+      Vector vector = vw.get();
+      if (!vectorClass.equals(NamedVector.class)) {
+        if (key.getClass().equals(Text.class)) {
+          vector = new NamedVector(vector, key.toString());
+        } else if (key.getClass().equals(IntWritable.class)) {
+          vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get()));
+        }
+      }
+      Vector pdfPerCluster = clusterClassifier.classify(vector);
+      if (shouldClassify(pdfPerCluster)) {
+        if (emitMostLikely) {
+          int maxValueIndex = pdfPerCluster.maxValueIndex();
+          write(new VectorWritable(vector), context, maxValueIndex, 1.0);
+        } else {
+          writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster);
+        }
+      }
+    }
+  }
+  
+  /** Emits the vector once per cluster whose pdf meets the threshold, weighted by that pdf. */
+  private void writeAllAboveThreshold(VectorWritable vw, Context context,
+      Vector pdfPerCluster) throws IOException, InterruptedException {
+    for (Element pdf : pdfPerCluster.nonZeroes()) {
+      if (pdf.get() >= threshold) {
+        int clusterIndex = pdf.index();
+        write(vw, context, clusterIndex, pdf.get());
+      }
+    }
+  }
+  
+  /** Writes one (clusterId, weighted vector) record annotated with the distance to the center. */
+  private void write(VectorWritable vw, Context context, int clusterIndex, double weight)
+    throws IOException, InterruptedException {
+    Cluster cluster = clusterModels.get(clusterIndex);
+    clusterId.set(cluster.getId());
+
+    DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
+    DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
+    double distance = distanceMeasure.distance(cluster.getCenter(), vw.get());
+
+    Map<Text, Text> props = new HashMap<>();
+    props.put(new Text("distance"), new Text(Double.toString(distance)));
+    context.write(clusterId, new WeightedPropertyVectorWritable(weight, vw.get(), props));
+  }
+  
+  /** Reads every cluster model from the clusters-*-final directory under the clustering output. */
+  public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
+    List<Cluster> clusters = new ArrayList<>();
+    FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+    FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+    Iterator<?> it = new SequenceFileDirValueIterator<>(
+        clusterFiles[0].getPath(), PathType.LIST, PathFilters.partFilter(),
+        null, false, conf);
+    while (it.hasNext()) {
+      ClusterWritable next = (ClusterWritable) it.next();
+      Cluster cluster = next.getValue();
+      cluster.configure(conf);
+      clusters.add(cluster);
+    }
+    return clusters;
+  }
+  
+  /** A vector is classified only when its best cluster pdf reaches the threshold. */
+  private boolean shouldClassify(Vector pdfPerCluster) {
+    return pdfPerCluster.maxValue() >= threshold;
+  }
+  
+  /** Resolves the single clusters-*-final entry, using the supplied configuration. */
+  private static Path finalClustersPath(Path clusterOutputPath, Configuration conf) throws IOException {
+    FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+    FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+    return clusterFiles[0].getPath();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java
new file mode 100644
index 0000000..dcd4062
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java
@@ -0,0 +1,231 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.classify;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.ClusteringPolicyWritable;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * This classifier works with any ClusteringPolicy and its associated Clusters.
+ * It is initialized with a policy and a list of compatible clusters and
+ * thereafter it can classify any new Vector into one or more of the clusters
+ * based upon the pdf() function which each cluster supports.
+ * <p/>
+ * In addition, it is an OnlineLearner and can be trained. Training amounts to
+ * asking the actual model to observe the vector and closing the classifier
+ * causes all the models to computeParameters.
+ * <p/>
+ * Because a ClusterClassifier implements Writable, it can be written-to and
+ * read-from a sequence file as a single entity. For sequential and MapReduce
+ * clustering in conjunction with a ClusterIterator; however, it utilizes an
+ * exploded file format. In this format, the iterator writes the policy to a
+ * single POLICY_FILE_NAME file in the clustersOut directory and the models are
+ * written to one or more part-n files so that multiple reducers may employed to
+ * produce them.
+ */
+public class ClusterClassifier extends AbstractVectorClassifier implements OnlineLearner, Writable {
+
+  // Name of the single file in the exploded format that holds the serialized policy.
+  private static final String POLICY_FILE_NAME = "_policy";
+
+  private List<Cluster> models;
+
+  // Fully-qualified class name of the model type; all models are assumed homogeneous
+  // (it is taken from models.get(0) and used to instantiate every model on read).
+  private String modelClass;
+
+  private ClusteringPolicy policy;
+
+  /**
+   * The public constructor accepts a list of clusters to become the models
+   *
+   * @param models a List<Cluster>
+   * @param policy a ClusteringPolicy
+   */
+  public ClusterClassifier(List<Cluster> models, ClusteringPolicy policy) {
+    this.models = models;
+    modelClass = models.get(0).getClass().getName();
+    this.policy = policy;
+  }
+
+  // needed for serialization/De-serialization
+  public ClusterClassifier() {
+  }
+
+  // only used by MR ClusterIterator
+  protected ClusterClassifier(ClusteringPolicy policy) {
+    this.policy = policy;
+  }
+
+  /** Delegates to the policy, which produces a pdf vector over all models. */
+  @Override
+  public Vector classify(Vector instance) {
+    return policy.classify(instance, this);
+  }
+
+  /**
+   * Binary-only scalar score: the normalized pdf of model 0. Throws
+   * IllegalStateException when there are not exactly two models.
+   */
+  @Override
+  public double classifyScalar(Vector instance) {
+    if (models.size() == 2) {
+      double pdf0 = models.get(0).pdf(new VectorWritable(instance));
+      double pdf1 = models.get(1).pdf(new VectorWritable(instance));
+      return pdf0 / (pdf0 + pdf1);
+    }
+    throw new IllegalStateException();
+  }
+
+  @Override
+  public int numCategories() {
+    return models.size();
+  }
+
+  // Serialization order (size, modelClass, policy, models) must mirror readFields exactly.
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(models.size());
+    out.writeUTF(modelClass);
+    new ClusteringPolicyWritable(policy).write(out);
+    for (Cluster cluster : models) {
+      cluster.write(out);
+    }
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int size = in.readInt();
+    modelClass = in.readUTF();
+    models = new ArrayList<>();
+    ClusteringPolicyWritable clusteringPolicyWritable = new ClusteringPolicyWritable();
+    clusteringPolicyWritable.readFields(in);
+    policy = clusteringPolicyWritable.getValue();
+    for (int i = 0; i < size; i++) {
+      // All models are instantiated from the single recorded class name.
+      Cluster element = ClassUtils.instantiateAs(modelClass, Cluster.class);
+      element.readFields(in);
+      models.add(element);
+    }
+  }
+
+  /** Training = letting the chosen model observe the instance. */
+  @Override
+  public void train(int actual, Vector instance) {
+    models.get(actual).observe(new VectorWritable(instance));
+  }
+
+  /**
+   * Train the models given an additional weight. Unique to ClusterClassifier
+   *
+   * @param actual the int index of a model
+   * @param data   a data Vector
+   * @param weight a double weighting factor
+   */
+  public void train(int actual, Vector data, double weight) {
+    models.get(actual).observe(new VectorWritable(data), weight);
+  }
+
+  // trackingKey/groupKey are ignored; only the model index matters here.
+  @Override
+  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+    models.get(actual).observe(new VectorWritable(instance));
+  }
+
+  @Override
+  public void train(long trackingKey, int actual, Vector instance) {
+    models.get(actual).observe(new VectorWritable(instance));
+  }
+
+  /** Closing asks the policy to finalize (computeParameters) all models. */
+  @Override
+  public void close() {
+    policy.close(this);
+  }
+
+  public List<Cluster> getModels() {
+    return models;
+  }
+
+  public ClusteringPolicy getPolicy() {
+    return policy;
+  }
+
+  /**
+   * Writes the exploded format: the policy to _policy plus one part-NNNNN
+   * sequence file per model under the given directory.
+   */
+  public void writeToSeqFiles(Path path) throws IOException {
+    writePolicy(policy, path);
+    Configuration config = new Configuration();
+    FileSystem fs = FileSystem.get(path.toUri(), config);
+    ClusterWritable cw = new ClusterWritable();
+    for (int i = 0; i < models.size(); i++) {
+      try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, config,
+          new Path(path, "part-" + String.format(Locale.ENGLISH, "%05d", i)), IntWritable.class,
+          ClusterWritable.class)) {
+        Cluster cluster = models.get(i);
+        cw.setValue(cluster);
+        Writable key = new IntWritable(i);
+        writer.append(key, cw);
+      }
+    }
+  }
+
+  /**
+   * Reads the exploded format back: all part files become models, _policy the policy.
+   * NOTE(review): iteration uses a fresh Configuration while cluster.configure uses
+   * the caller's conf — presumably intentional, but worth confirming.
+   */
+  public void readFromSeqFiles(Configuration conf, Path path) throws IOException {
+    Configuration config = new Configuration();
+    List<Cluster> clusters = new ArrayList<>();
+    for (ClusterWritable cw : new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST,
+        PathFilters.logsCRCFilter(), config)) {
+      Cluster cluster = cw.getValue();
+      cluster.configure(conf);
+      clusters.add(cluster);
+    }
+    this.models = clusters;
+    modelClass = models.get(0).getClass().getName();
+    this.policy = readPolicy(path);
+  }
+
+  /** Reads the policy from the _policy file under the given directory. */
+  public static ClusteringPolicy readPolicy(Path path) throws IOException {
+    Path policyPath = new Path(path, POLICY_FILE_NAME);
+    Configuration config = new Configuration();
+    FileSystem fs = FileSystem.get(policyPath.toUri(), config);
+    SequenceFile.Reader reader = new SequenceFile.Reader(fs, policyPath, config);
+    Text key = new Text();
+    ClusteringPolicyWritable cpw = new ClusteringPolicyWritable();
+    reader.next(key, cpw);
+    // swallowIOException=true: a failure to close must not mask the value already read.
+    Closeables.close(reader, true);
+    return cpw.getValue();
+  }
+
+  /** Writes the policy as the single record of the _policy file under the given directory. */
+  public static void writePolicy(ClusteringPolicy policy, Path path) throws IOException {
+    Path policyPath = new Path(path, POLICY_FILE_NAME);
+    Configuration config = new Configuration();
+    FileSystem fs = FileSystem.get(policyPath.toUri(), config);
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, policyPath, Text.class,
+        ClusteringPolicyWritable.class);
+    writer.append(new Text(), new ClusteringPolicyWritable(policy));
+    Closeables.close(writer, false);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java
new file mode 100644
index 0000000..567659b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.math.Vector;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A {@link WeightedVectorWritable} that additionally carries a map of named string properties
+ * (e.g. the distance of the point from its cluster center).
+ */
+public class WeightedPropertyVectorWritable extends WeightedVectorWritable {
+
+  // Optional per-point properties; may be null when the record carries none.
+  private Map<Text, Text> properties;
+
+  public WeightedPropertyVectorWritable() {
+  }
+
+  public WeightedPropertyVectorWritable(Map<Text, Text> properties) {
+    this.properties = properties;
+  }
+
+  public WeightedPropertyVectorWritable(double weight, Vector vector, Map<Text, Text> properties) {
+    super(weight, vector);
+    this.properties = properties;
+  }
+
+  public Map<Text, Text> getProperties() {
+    return properties;
+  }
+
+  public void setProperties(Map<Text, Text> properties) {
+    this.properties = properties;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    super.readFields(in);
+    int size = in.readInt();
+    // Reset before reading: Hadoop reuses Writable instances across records, so a record with
+    // zero properties must not retain the previous record's map.
+    properties = size > 0 ? new HashMap<Text, Text>() : null;
+    for (int i = 0; i < size; i++) {
+      Text key = new Text(in.readUTF());
+      Text val = new Text(in.readUTF());
+      properties.put(key, val);
+    }
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    super.write(out);
+    // A null map is serialized as size 0 and deserializes back to null.
+    out.writeInt(properties != null ? properties.size() : 0);
+    if (properties != null) {
+      for (Map.Entry<Text, Text> entry : properties.entrySet()) {
+        out.writeUTF(entry.getKey().toString());
+        out.writeUTF(entry.getValue().toString());
+      }
+    }
+  }
+
+  @Override
+  public String toString() {
+    Vector vector = getVector();
+    StringBuilder bldr = new StringBuilder("wt: ").append(getWeight()).append(' ');
+    if (properties != null && !properties.isEmpty()) {
+      for (Map.Entry<Text, Text> entry : properties.entrySet()) {
+        bldr.append(entry.getKey().toString()).append(": ").append(entry.getValue().toString()).append(' ');
+      }
+    }
+    bldr.append(" vec: ").append(vector == null ? "null" : AbstractCluster.formatVector(vector, null));
+    return bldr.toString();
+  }
+
+}
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java
new file mode 100644
index 0000000..510dd39
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * A Writable pairing a vector with a double weight, e.g. the probability that a point belongs
+ * to a cluster. The weight is serialized immediately after the vector.
+ */
+public class WeightedVectorWritable implements Writable {
+
+  private final VectorWritable vectorWritable = new VectorWritable();
+  private double weight;
+
+  /** Required for Writable deserialization. */
+  public WeightedVectorWritable() {
+  }
+
+  public WeightedVectorWritable(double weight, Vector vector) {
+    this.vectorWritable.set(vector);
+    this.weight = weight;
+  }
+
+  /** @return the wrapped vector, or null if none has been set */
+  public Vector getVector() {
+    return this.vectorWritable.get();
+  }
+
+  public void setVector(Vector vector) {
+    this.vectorWritable.set(vector);
+  }
+
+  /** @return the weight associated with the vector */
+  public double getWeight() {
+    return this.weight;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    this.vectorWritable.readFields(in);
+    this.weight = in.readDouble();
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    this.vectorWritable.write(out);
+    out.writeDouble(this.weight);
+  }
+
+  @Override
+  public String toString() {
+    Vector contents = this.vectorWritable.get();
+    String formatted = contents == null ? "null" : AbstractCluster.formatVector(contents, null);
+    return weight + ": " + formatted;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
new file mode 100644
index 0000000..ff02a4c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Computes fuzzy-membership probabilities for points relative to a set of soft clusters,
+ * parameterized by the fuzzification factor m.
+ */
+public class FuzzyKMeansClusterer {
+
+  // Substituted for zero distances to avoid division by zero in the membership formula.
+  private static final double MINIMAL_VALUE = 0.0000000001;
+
+  // Fuzzification factor; 2.0 is the conventional default.
+  private double m = 2.0;
+
+  /**
+   * Build the membership-probability vector (pi) of a point over all clusters.
+   *
+   * @param clusters the candidate clusters
+   * @param clusterDistanceList the point's distance to each cluster, in cluster order
+   * @return a dense vector of per-cluster membership probabilities
+   */
+  public Vector computePi(Collection<SoftCluster> clusters, List<Double> clusterDistanceList) {
+    int numClusters = clusters.size();
+    Vector pi = new DenseVector(numClusters);
+    for (int clusterIx = 0; clusterIx < numClusters; clusterIx++) {
+      pi.set(clusterIx, computeProbWeight(clusterDistanceList.get(clusterIx), clusterDistanceList));
+    }
+    return pi;
+  }
+
+  /** Computes the probability of a point belonging to a cluster */
+  public double computeProbWeight(double clusterDistance, Iterable<Double> clusterDistanceList) {
+    double dist = clusterDistance == 0 ? MINIMAL_VALUE : clusterDistance;
+    double exponent = 2.0 / (m - 1);
+    double denom = 0.0;
+    for (double otherDist : clusterDistanceList) {
+      double safeDist = otherDist == 0.0 ? MINIMAL_VALUE : otherDist;
+      denom += Math.pow(dist / safeDist, exponent);
+    }
+    return 1.0 / denom;
+  }
+
+  public void setM(double m) {
+    this.m = m;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
new file mode 100644
index 0000000..98eb944
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
@@ -0,0 +1,324 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Driver for the Fuzzy k-Means clustering job: parses command-line options, iterates the
+ * input vectors to build clusters, and optionally classifies the input points against the
+ * final clusters.
+ */
+public class FuzzyKMeansDriver extends AbstractJob {
+
+  public static final String M_OPTION = "m";
+
+  private static final Logger log = LoggerFactory.getLogger(FuzzyKMeansDriver.class);
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new FuzzyKMeansDriver(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.clustersInOption()
+        .withDescription("The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
+            + "If k is also specified, then a random set of vectors will be selected"
+            + " and written out to this path first")
+        .create());
+    addOption(DefaultOptionCreator.numClustersOption()
+        .withDescription("The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
+            + " as the Centroid and written to the clusters input path.").create());
+    addOption(DefaultOptionCreator.convergenceOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true);
+    addOption(DefaultOptionCreator.clusteringOption().create());
+    addOption(DefaultOptionCreator.emitMostLikelyOption().create());
+    addOption(DefaultOptionCreator.thresholdOption().create());
+    addOption(DefaultOptionCreator.methodOption().create());
+    addOption(DefaultOptionCreator.useSetRandomSeedOption().create());
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Path input = getInputPath();
+    Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
+    Path output = getOutputPath();
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    if (measureClass == null) {
+      // Default distance measure when none is supplied on the command line.
+      measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+    }
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    float fuzziness = Float.parseFloat(getOption(M_OPTION));
+
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+    boolean emitMostLikely = Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION));
+    double threshold = Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION));
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+
+    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+      // When k is given, seed the clusters path with k randomly chosen input vectors.
+      int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+
+      Long seed = null;
+      if (hasOption(DefaultOptionCreator.RANDOM_SEED)) {
+        seed = Long.parseLong(getOption(DefaultOptionCreator.RANDOM_SEED));
+      }
+
+      clusters = RandomSeedGenerator.buildRandom(getConf(), input, clusters, numClusters, measure, seed);
+    }
+
+    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
+    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+        DefaultOptionCreator.SEQUENTIAL_METHOD);
+
+    run(getConf(),
+        input,
+        clusters,
+        output,
+        convergenceDelta,
+        maxIterations,
+        fuzziness,
+        runClustering,
+        emitMostLikely,
+        threshold,
+        runSequential);
+    return 0;
+  }
+
+  /**
+   * Iterate over the input vectors to produce clusters and, if requested, use the
+   * results of the final iteration to cluster the input vectors. Uses a fresh
+   * default {@link Configuration}.
+   *
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the directory pathname for initial & computed clusters
+   * @param output
+   *          the directory pathname for output points
+   * @param convergenceDelta
+   *          the convergence delta value
+   * @param maxIterations
+   *          the maximum number of iterations
+   * @param m
+   *          the fuzzification factor, see
+   *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param runClustering
+   *          true if points are to be clustered after iterations complete
+   * @param emitMostLikely
+   *          a boolean if true emit only most likely cluster for each point
+   * @param threshold
+   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+   * @param runSequential if true run in sequential execution mode
+   */
+  public static void run(Path input,
+                         Path clustersIn,
+                         Path output,
+                         double convergenceDelta,
+                         int maxIterations,
+                         float m,
+                         boolean runClustering,
+                         boolean emitMostLikely,
+                         double threshold,
+                         boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException {
+    // Delegate to the Configuration-accepting overload to avoid duplicating the pipeline.
+    run(new Configuration(),
+        input,
+        clustersIn,
+        output,
+        convergenceDelta,
+        maxIterations,
+        m,
+        runClustering,
+        emitMostLikely,
+        threshold,
+        runSequential);
+  }
+
+  /**
+   * Iterate over the input vectors to produce clusters and, if requested, use the
+   * results of the final iteration to cluster the input vectors.
+   *
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the directory pathname for initial & computed clusters
+   * @param output
+   *          the directory pathname for output points
+   * @param convergenceDelta
+   *          the convergence delta value
+   * @param maxIterations
+   *          the maximum number of iterations
+   * @param m
+   *          the fuzzification factor, see
+   *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param runClustering
+   *          true if points are to be clustered after iterations complete
+   * @param emitMostLikely
+   *          a boolean if true emit only most likely cluster for each point
+   * @param threshold
+   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+   * @param runSequential if true run in sequential execution mode
+   */
+  public static void run(Configuration conf,
+                         Path input,
+                         Path clustersIn,
+                         Path output,
+                         double convergenceDelta,
+                         int maxIterations,
+                         float m,
+                         boolean runClustering,
+                         boolean emitMostLikely,
+                         double threshold,
+                         boolean runSequential)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    Path clustersOut =
+        buildClusters(conf, input, clustersIn, output, convergenceDelta, maxIterations, m, runSequential);
+    if (runClustering) {
+      log.info("Clustering");
+      clusterData(conf, 
+                  input,
+                  clustersOut,
+                  output,
+                  convergenceDelta,
+                  m,
+                  emitMostLikely,
+                  threshold,
+                  runSequential);
+    }
+  }
+
+  /**
+   * Iterate over the input vectors to produce cluster directories for each iteration
+   *
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the file pathname for initial cluster centers
+   * @param output
+   *          the directory pathname for output points
+   * @param convergenceDelta
+   *          the convergence delta value
+   * @param maxIterations
+   *          the maximum number of iterations
+   * @param m
+   *          the fuzzification factor, see
+   *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param runSequential if true run in sequential execution mode
+   *
+   * @return the Path of the final clusters directory
+   */
+  public static Path buildClusters(Configuration conf,
+                                   Path input,
+                                   Path clustersIn,
+                                   Path output,
+                                   double convergenceDelta,
+                                   int maxIterations,
+                                   float m,
+                                   boolean runSequential)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    
+    // Default the configuration BEFORE it is used; previously the null check came after
+    // configureWithClusterInfo(conf, ...), which would have dereferenced a null conf.
+    if (conf == null) {
+      conf = new Configuration();
+    }
+    
+    List<Cluster> clusters = new ArrayList<>();
+    FuzzyKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
+    
+    if (clusters.isEmpty()) {
+      throw new IllegalStateException("No input clusters found in " + clustersIn + ". Check your -c argument.");
+    }
+    
+    Path priorClustersPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);   
+    ClusteringPolicy policy = new FuzzyKMeansClusteringPolicy(m, convergenceDelta);
+    ClusterClassifier prior = new ClusterClassifier(clusters, policy);
+    prior.writeToSeqFiles(priorClustersPath);
+    
+    if (runSequential) {
+      ClusterIterator.iterateSeq(conf, input, priorClustersPath, output, maxIterations);
+    } else {
+      ClusterIterator.iterateMR(conf, input, priorClustersPath, output, maxIterations);
+    }
+    return output;
+  }
+
+  /**
+   * Run the job using supplied arguments
+   *
+   * @param input
+   *          the directory pathname for input points
+   * @param clustersIn
+   *          the directory pathname for input clusters
+   * @param output
+   *          the directory pathname for output points
+   * @param convergenceDelta
+   *          the convergence delta value
+   * @param emitMostLikely
+   *          a boolean if true emit only most likely cluster for each point
+   * @param threshold
+   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+   * @param runSequential if true run in sequential execution mode
+   */
+  public static void clusterData(Configuration conf,
+                                 Path input,
+                                 Path clustersIn,
+                                 Path output,
+                                 double convergenceDelta,
+                                 float m,
+                                 boolean emitMostLikely,
+                                 double threshold,
+                                 boolean runSequential)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    
+    ClusterClassifier.writePolicy(new FuzzyKMeansClusteringPolicy(m, convergenceDelta), clustersIn);
+    ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
+        threshold, emitMostLikely, runSequential);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
new file mode 100644
index 0000000..25621bb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+
+/** Helper for loading prior clusters of various concrete types as {@link SoftCluster}s. */
+final class FuzzyKMeansUtil {
+  
+  private FuzzyKMeansUtil() {}
+  
+  /**
+   * Create a list of SoftClusters from whatever type is passed in as the prior
+   * 
+   * @param conf
+   *          the Configuration
+   * @param clusterPath
+   *          the path to the prior Clusters
+   * @param clusters
+   *          a List<Cluster> to put values into
+   */
+  public static void configureWithClusterInfo(Configuration conf, Path clusterPath, List<Cluster> clusters) {
+    for (Writable value : new SequenceFileDirValueIterable<>(clusterPath, PathType.LIST,
+        PathFilters.partFilter(), conf)) {
+      // Unwrap a ClusterWritable to expose the underlying cluster implementation.
+      if (value.getClass().equals(ClusterWritable.class)) {
+        value = ((ClusterWritable) value).getValue();
+      }
+      Class<? extends Writable> valueClass = value.getClass();
+      
+      if (valueClass.equals(Kluster.class)) {
+        Kluster kluster = (Kluster) value;
+        clusters.add(new SoftCluster(kluster.getCenter(), kluster.getId(), kluster.getMeasure()));
+      } else if (valueClass.equals(SoftCluster.class)) {
+        // Already the right type; add it directly.
+        clusters.add((SoftCluster) value);
+      } else if (valueClass.equals(Canopy.class)) {
+        Canopy canopy = (Canopy) value;
+        clusters.add(new SoftCluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
+      } else {
+        throw new IllegalStateException("Bad value class: " + valueClass);
+      }
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
new file mode 100644
index 0000000..52fd764
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public class SoftCluster extends Kluster {
+  
+  // For Writable
+  public SoftCluster() {}
+  
+  /**
+   * Construct a new SoftCluster with the given point as its center
+   * 
+   * @param center
+   *          the center point
+   * @param measure
+   *          the DistanceMeasure
+   */
+  public SoftCluster(Vector center, int clusterId, DistanceMeasure measure) {
+    super(center, clusterId, measure);
+  }
+  
+  @Override
+  public String asFormatString() {
+    return this.getIdentifier() + ": "
+        + this.computeCentroid().asFormatString();
+  }
+  
+  @Override
+  public String getIdentifier() {
+    return (isConverged() ? "SV-" : "SC-") + getId();
+  }
+  
+  @Override
+  public double pdf(VectorWritable vw) {
+    // SoftCluster pdf cannot be calculated out of context. See
+    // FuzzyKMeansClusterer
+    throw new UnsupportedOperationException(
+        "SoftCluster pdf cannot be calculated out of context. See FuzzyKMeansClusterer");
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java
new file mode 100644
index 0000000..07cc7e3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.TimesFunction;
+
+/**
+ * Base class for clustering policies providing default winner-take-all selection and
+ * normalized-pdf classification; serialization is left to concrete subclasses.
+ */
+public abstract class AbstractClusteringPolicy implements ClusteringPolicy {
+  
+  @Override
+  public abstract void write(DataOutput out) throws IOException;
+  
+  @Override
+  public abstract void readFields(DataInput in) throws IOException;
+  
+  /**
+   * Winner-take-all: a sparse weight vector with 1.0 at the index of the most probable
+   * model and zero everywhere else.
+   */
+  @Override
+  public Vector select(Vector probabilities) {
+    Vector weights = new SequentialAccessSparseVector(probabilities.size());
+    weights.set(probabilities.maxValueIndex(), 1.0);
+    return weights;
+  }
+  
+  @Override
+  public void update(ClusterClassifier posterior) {
+    // No-op by default; subclasses may refresh state from the classifier here.
+  }
+  
+  /**
+   * Evaluate every model's pdf for the data point, then normalize so the result sums to 1.
+   */
+  @Override
+  public Vector classify(Vector data, ClusterClassifier prior) {
+    List<Cluster> models = prior.getModels();
+    Vector pdfs = new DenseVector(models.size());
+    int index = 0;
+    for (Cluster model : models) {
+      pdfs.set(index, model.pdf(new VectorWritable(data)));
+      index++;
+    }
+    return pdfs.assign(new TimesFunction(), 1.0 / pdfs.zSum());
+  }
+  
+  @Override
+  public void close(ClusterClassifier posterior) {
+    // Finalize each model so its parameters reflect the points it observed.
+    for (Cluster cluster : posterior.getModels()) {
+      cluster.computeParameters();
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java
new file mode 100644
index 0000000..fb2db49
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.iterator;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Mapper for one iteration of cluster refinement. In setup() it loads the prior
+ * clusters from the path stored under {@code ClusterIterator.PRIOR_PATH_KEY};
+ * each map() call classifies the input vector and trains the policy-selected
+ * cluster(s); the updated clusters are emitted once, keyed by model index, in
+ * cleanup().
+ */
+public class CIMapper extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,ClusterWritable> {
+  
+  // Classifier holding the prior clusters read from the previous iteration's output.
+  private ClusterClassifier classifier;
+  // Policy governing how classification probabilities map to training weights.
+  private ClusteringPolicy policy;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    Configuration conf = context.getConfiguration();
+    String priorClustersPath = conf.get(ClusterIterator.PRIOR_PATH_KEY);
+    // NOTE(review): priorClustersPath is not null-checked here; a missing
+    // PRIOR_PATH_KEY would surface as a failure constructing the Path below.
+    classifier = new ClusterClassifier();
+    classifier.readFromSeqFiles(conf, new Path(priorClustersPath));
+    policy = classifier.getPolicy();
+    policy.update(classifier);
+    super.setup(context);
+  }
+
+  /**
+   * Classifies the vector against the current models, then trains each cluster
+   * the policy selected using its non-zero weight. Nothing is written here;
+   * output is deferred to cleanup().
+   */
+  @Override
+  protected void map(WritableComparable<?> key, VectorWritable value, Context context) throws IOException,
+      InterruptedException {
+    Vector probabilities = classifier.classify(value.get());
+    Vector selections = policy.select(probabilities);
+    for (Element el : selections.nonZeroes()) {
+      classifier.train(el.index(), value.get(), el.get());
+    }
+  }
+
+  /** Emits every (possibly updated) cluster keyed by its index; the ClusterWritable is reused across writes. */
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    List<Cluster> clusters = classifier.getModels();
+    ClusterWritable cw = new ClusterWritable();
+    for (int index = 0; index < clusters.size(); index++) {
+      cw.setValue(clusters.get(index));
+      context.write(new IntWritable(index), cw);
+    }
+    super.cleanup(context);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java
new file mode 100644
index 0000000..ca63b0f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.iterator;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+
+/**
+ * Reducer for one iteration of cluster refinement: merges the partial clusters
+ * emitted by {@link CIMapper} for each model index into a single cluster,
+ * finalizes its parameters, and writes it out under the same key.
+ */
+public class CIReducer extends Reducer<IntWritable,ClusterWritable,IntWritable,ClusterWritable> {
+  
+  // Rebuilt in reduce() around the merged cluster so close() can finalize it.
+  private ClusterClassifier classifier;
+  // Policy loaded from the prior-clusters path; shared by the rebuilt classifier.
+  private ClusteringPolicy policy;
+  
+  /**
+   * Folds all partial clusters for this key into the first via observe(),
+   * computes the merged cluster's parameters through the classifier's close(),
+   * and emits the result.
+   */
+  @Override
+  protected void reduce(IntWritable key, Iterable<ClusterWritable> values, Context context) throws IOException,
+      InterruptedException {
+    Iterator<ClusterWritable> iter = values.iterator();
+    Cluster first = iter.next().getValue(); // there must always be at least one
+    while (iter.hasNext()) {
+      Cluster cluster = iter.next().getValue();
+      first.observe(cluster);
+    }
+    List<Cluster> models = new ArrayList<>();
+    models.add(first);
+    classifier = new ClusterClassifier(models, policy);
+    classifier.close();
+    context.write(key, new ClusterWritable(first));
+  }
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    Configuration conf = context.getConfiguration();
+    String priorClustersPath = conf.get(ClusterIterator.PRIOR_PATH_KEY);
+    classifier = new ClusterClassifier();
+    classifier.readFromSeqFiles(conf, new Path(priorClustersPath));
+    policy = classifier.getPolicy();
+    policy.update(classifier);
+    super.setup(context);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java
new file mode 100644
index 0000000..c9a0940
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+@Deprecated
+public class CanopyClusteringPolicy extends AbstractClusteringPolicy {
+
+  private double t1;
+  private double t2;
+
+  @Override
+  public Vector select(Vector probabilities) {
+    int maxValueIndex = probabilities.maxValueIndex();
+    Vector weights = new SequentialAccessSparseVector(probabilities.size());
+    weights.set(maxValueIndex, 1.0);
+    return weights;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(t1);
+    out.writeDouble(t2);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    this.t1 = in.readDouble();
+    this.t2 = in.readDouble();
+  }
+  
+}


[38/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/bank-full.csv b/community/mahout-mr/mr-examples/bin/resources/bank-full.csv
new file mode 100644
index 0000000..d7a2ede
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/bank-full.csv
@@ -0,0 +1,45212 @@
+"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
+58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
+44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
+33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
+35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
+28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
+58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
+45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
+57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
+54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
+58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
+36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
+44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
+32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
+24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
+38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
+40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
+46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
+41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
+46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
+57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
+39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
+27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
+59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
+29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
+56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
+57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
+43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
+31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
+55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
+55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
+32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
+28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
+53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
+34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
+57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
+43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
+26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
+39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
+48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
+52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
+54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
+54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
+50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
+44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
+35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
+51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
+31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
+35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
+36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
+40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
+51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
+50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
+61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
+35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
+39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
+42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
+59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
+40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
+47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
+53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
+46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
+53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
+57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
+49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
+42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
+22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
+51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
+50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
+59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
+39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
+42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
+40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
+56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
+37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
+39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
+38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
+54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
+58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
+40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
+56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
+42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
+51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
+36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
+54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
+37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
+33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
+46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
+51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
+40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
+48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
+32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
+55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
+40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
+58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
+45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
+51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
+43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
+44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
+46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
+59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
+44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
+33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
+46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
+43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
+23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
+25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
+40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
+58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
+32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
+58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
+37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
+27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
+42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
+29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
+58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
+46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
+34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
+49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
+32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
+43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
+58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
+24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
+51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
+50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
+40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
+33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
+36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
+57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
+36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
+44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
+39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
+40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
+54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
+50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
+37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
+46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
+32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
+48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
+41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
+44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
+38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
+48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
+42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
+34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
+56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
+39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
+46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
+38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
+56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
+37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
+37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
+48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
+30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
+48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
+31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
+37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
+49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
+43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
+32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
+55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
+31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
+35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
+34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
+32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
+33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
+52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
+55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
+38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
+31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
+28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
+45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
+35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
+60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
+49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
+38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
+40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
+36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
+44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
+40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
+30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
+57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
+24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
+33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
+43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
+43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
+35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
+56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
+40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
+44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
+28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
+47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
+56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
+31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
+30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
+38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
+55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
+59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
+33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
+30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
+42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
+55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
+51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
+32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
+29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
+46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
+56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
+29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
+47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
+56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
+45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
+31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
+37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
+30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
+58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
+36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
+40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
+42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
+35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
+44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
+31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
+36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
+47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
+37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
+26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
+52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
+55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
+32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
+37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";

<TRUNCATED>

[32/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
new file mode 100644
index 0000000..bd1149b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.tools;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Static helpers that lazily adapt iterables of sequence-file writables
+ * ({@link CentroidWritable}, {@link ClusterWritable}, {@link VectorWritable})
+ * into iterables of plain {@link Centroid} / {@link Vector} values.
+ */
+public class IOUtils {
+
+  // Utility class: no instances.
+  private IOUtils() {}
+
+  /**
+   * Converts CentroidWritable values in a sequence file into Centroids lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Centroid> with the converted vectors.
+   */
+  public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable(
+      Iterable<CentroidWritable>  dirIterable) {
+    return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() {
+      @Override
+      public Centroid apply(CentroidWritable input) {
+        Preconditions.checkNotNull(input);
+        // clone() hands the caller an independent copy, since the writable
+        // instance may be reused by the underlying sequence-file reader.
+        return input.getCentroid().clone();
+      }
+    });
+  }
+
+  /**
+   * Converts CentroidWritable values in a sequence file into Centroids lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Centroid> with the converted vectors.
+   */
+  public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable>  dirIterable) {
+    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
+      // NOTE(review): this Function is stateful — numClusters keeps incrementing
+      // across calls, so iterating the returned Iterable more than once assigns
+      // different (continuing) indexes on each pass. Safe only for single-pass use.
+      int numClusters = 0;
+      @Override
+      public Centroid apply(ClusterWritable input) {
+        Preconditions.checkNotNull(input);
+        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
+            input.getValue().getTotalObservations());
+      }
+    });
+  }
+
+  /**
+   * Converts VectorWritable values in a sequence file into Vectors lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Vector> with the converted vectors.
+   */
+  public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) {
+    return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() {
+      @Override
+      public Vector apply(VectorWritable input) {
+        Preconditions.checkNotNull(input);
+        // Defensive copy, as above: the reader may recycle the writable.
+        return input.get().clone();
+      }
+    });
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
new file mode 100644
index 0000000..083cd8c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.conversion.InputDriver;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Deprecated
+/**
+ * End-to-end driver for canopy clustering of the synthetic-control dataset:
+ * converts the text input to vectors, runs {@link CanopyDriver}, then dumps
+ * the resulting clusters with {@link ClusterDumper}.
+ */
+public final class Job extends AbstractJob {
+
+  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
+
+  // Instantiated only via main()/ToolRunner within this class.
+  private Job() {
+  }
+
+  private static final Logger log = LoggerFactory.getLogger(Job.class);
+
+  public static void main(String[] args) throws Exception {
+    if (args.length > 0) {
+      log.info("Running with only user-supplied arguments");
+      ToolRunner.run(new Configuration(), new Job(), args);
+    } else {
+      log.info("Running with default arguments");
+      Path output = new Path("output");
+      // Remove any previous output so the run starts clean.
+      HadoopUtil.delete(new Configuration(), output);
+      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
+    }
+  }
+
+  /**
+   * Run the canopy clustering job on an input dataset using the given distance
+   * measure, t1 and t2 parameters. All output data will be written to the
+   * output directory, which will be initially deleted if it exists. The
+   * clustered points will reside in the path {@code <output>/clusteredPoints}. By
+   * default, the job expects a file containing synthetic_control.data as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+   * resides in a directory named "testdata", and writes output to a directory
+   * named "output".
+   * 
+   * @param input
+   *          the Path of the input directory
+   * @param output
+   *          the Path of the output directory
+   * @param measure
+   *          the DistanceMeasure to use
+   * @param t1
+   *          the canopy T1 threshold
+   * @param t2
+   *          the canopy T2 threshold
+   */
+  private static void run(Path input, Path output, DistanceMeasure measure,
+      double t1, double t2) throws Exception {
+    Path directoryContainingConvertedInput = new Path(output,
+        DIRECTORY_CONTAINING_CONVERTED_INPUT);
+    // Convert the space-delimited text input into RandomAccessSparseVectors.
+    InputDriver.runJob(input, directoryContainingConvertedInput,
+        "org.apache.mahout.math.RandomAccessSparseVector");
+    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
+        output, measure, t1, t2, true, 0.0, false);
+    // run ClusterDumper
+    ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
+        "clusters-0-final"), new Path(output, "clusteredPoints"));
+    clusterDumper.printClusters(null);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.t1Option().create());
+    addOption(DefaultOptionCreator.t2Option().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+
+    // parseArguments populates the option state read via getOption/hasOption;
+    // a null return means help was requested or parsing failed.
+    Map<String, List<String>> argMap = parseArguments(args);
+    if (argMap == null) {
+      return -1;
+    }
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(new Configuration(), output);
+    }
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+
+    run(input, output, measure, t1, t2);
+    return 0;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
new file mode 100644
index 0000000..43beb78
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.syntheticcontrol.fuzzykmeans;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.conversion.InputDriver;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class Job extends AbstractJob {
+  
+  private static final Logger log = LoggerFactory.getLogger(Job.class);
+  
+  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
+  
+  private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
+  
+  private Job() {
+  }
+  
+  public static void main(String[] args) throws Exception {
+    if (args.length > 0) {
+      log.info("Running with only user-supplied arguments");
+      ToolRunner.run(new Configuration(), new Job(), args);
+    } else {
+      log.info("Running with default arguments");
+      Path output = new Path("output");
+      Configuration conf = new Configuration();
+      HadoopUtil.delete(conf, output);
+      run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5);
+    }
+  }
+  
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.convergenceOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption(DefaultOptionCreator.t1Option().create());
+    addOption(DefaultOptionCreator.t2Option().create());
+    addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true);
+    
+    Map<String,List<String>> argMap = parseArguments(args);
+    if (argMap == null) {
+      return -1;
+    }
+    
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    if (measureClass == null) {
+      measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+    }
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    float fuzziness = Float.parseFloat(getOption(M_OPTION));
+    
+    addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
+        .withArgument(new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
+        .withDescription("coefficient normalization factor, must be greater than 1").withShortName(M_OPTION).create());
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+    run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, convergenceDelta);
+    return 0;
+  }
+  
+  /**
+   * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
+   * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
+   * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
+   * containing synthetic_control.data as obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
+   * and writes output to a directory named "output".
+   * 
+   * @param input
+   *          the String denoting the input directory path
+   * @param output
+   *          the String denoting the output directory path
+   * @param t1
+   *          the canopy T1 threshold
+   * @param t2
+   *          the canopy T2 threshold
+   * @param maxIterations
+   *          the int maximum number of iterations
+   * @param fuzziness
+   *          the float "m" fuzziness coefficient
+   * @param convergenceDelta
+   *          the double convergence criteria for iterations
+   */
+  public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
+      int maxIterations, float fuzziness, double convergenceDelta) throws Exception {
+    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+    log.info("Preparing Input");
+    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+    log.info("Running Canopy to get initial clusters");
+    Path canopyOutput = new Path(output, "canopies");
+    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false);
+    log.info("Running FuzzyKMeans");
+    FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(canopyOutput, "clusters-0-final"), output,
+        convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false);
+    // run ClusterDumper
+    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints"));
+    clusterDumper.printClusters(null);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
new file mode 100644
index 0000000..70c41fe
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
@@ -0,0 +1,187 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.conversion.InputDriver;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * End-to-end driver for k-means clustering of the synthetic-control dataset:
+ * converts the text input to vectors, seeds initial clusters (random seeds or
+ * canopy), runs {@link KMeansDriver}, then dumps the clusters.
+ */
+public final class Job extends AbstractJob {
+  
+  private static final Logger log = LoggerFactory.getLogger(Job.class);
+  
+  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
+  
+  // Instantiated only via main()/ToolRunner within this class.
+  private Job() {
+  }
+  
+  public static void main(String[] args) throws Exception {
+    if (args.length > 0) {
+      log.info("Running with only user-supplied arguments");
+      ToolRunner.run(new Configuration(), new Job(), args);
+    } else {
+      log.info("Running with default arguments");
+      Path output = new Path("output");
+      Configuration conf = new Configuration();
+      // Remove any previous output so the run starts clean.
+      HadoopUtil.delete(conf, output);
+      run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
+    }
+  }
+  
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.numClustersOption().create());
+    addOption(DefaultOptionCreator.t1Option().create());
+    addOption(DefaultOptionCreator.t2Option().create());
+    addOption(DefaultOptionCreator.convergenceOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    
+    // parseArguments populates the option state read via getOption/hasOption;
+    // a null return means help was requested or parsing failed.
+    Map<String,List<String>> argMap = parseArguments(args);
+    if (argMap == null) {
+      return -1;
+    }
+    
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    if (measureClass == null) {
+      measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+    }
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+    // With an explicit k, seed clusters randomly; otherwise derive seeds from
+    // a canopy pass driven by the t1/t2 thresholds.
+    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+      int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+      run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
+    } else {
+      double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+      double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+      run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
+    }
+    return 0;
+  }
+  
+  /**
+   * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
+   * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
+   * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
+   * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
+   * directory named "output".
+   * 
+   * @param conf
+   *          the Configuration to use
+   * @param input
+   *          the String denoting the input directory path
+   * @param output
+   *          the String denoting the output directory path
+   * @param measure
+   *          the DistanceMeasure to use
+   * @param k
+   *          the number of clusters in Kmeans
+   * @param convergenceDelta
+   *          the double convergence criteria for iterations
+   * @param maxIterations
+   *          the int maximum number of iterations
+   */
+  public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
+      double convergenceDelta, int maxIterations) throws Exception {
+    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+    log.info("Preparing Input");
+    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+    log.info("Running random seed to get initial clusters");
+    Path clusters = new Path(output, "random-seeds");
+    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
+    log.info("Running KMeans with k = {}", k);
+    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta,
+        maxIterations, true, 0.0, false);
+    // run ClusterDumper
+    Path outGlob = new Path(output, "clusters-*-final");
+    Path clusteredPoints = new Path(output,"clusteredPoints");
+    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
+    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
+    clusterDumper.printClusters(null);
+  }
+  
+  /**
+   * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
+   * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
+   * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
+   * containing synthetic_control.data as obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
+   * and writes output to a directory named "output".
+   * 
+   * @param conf
+   *          the Configuration to use
+   * @param input
+   *          the String denoting the input directory path
+   * @param output
+   *          the String denoting the output directory path
+   * @param measure
+   *          the DistanceMeasure to use
+   * @param t1
+   *          the canopy T1 threshold
+   * @param t2
+   *          the canopy T2 threshold
+   * @param convergenceDelta
+   *          the double convergence criteria for iterations
+   * @param maxIterations
+   *          the int maximum number of iterations
+   */
+  public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
+      double convergenceDelta, int maxIterations) throws Exception {
+    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+    log.info("Preparing Input");
+    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+    log.info("Running Canopy to get initial clusters");
+    Path canopyOutput = new Path(output, "canopies");
+    // Use the caller-supplied conf (previously a fresh Configuration was
+    // created here, silently dropping any caller settings).
+    CanopyDriver.run(conf, directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0,
+        false);
+    log.info("Running KMeans");
+    KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR
+        + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false);
+    // run ClusterDumper
+    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output,
+        "clusteredPoints"));
+    clusterDumper.printClusters(null);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
new file mode 100644
index 0000000..92363e5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth;
+
+import java.io.IOException;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper;
+
+/**
+ * Command-line entry point that configures and launches a
+ * {@link KeyBasedStringTupleGrouper} job over the Delicious tags dataset.
+ * Supported flags: input/output directories, an optional field-splitter
+ * regex, and an optional file encoding (default UTF-8).
+ */
+public final class DeliciousTagsExample {
+  private DeliciousTagsExample() { }
+
+  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
+    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    GroupBuilder groupBuilder = new GroupBuilder();
+
+    Option inputOption = DefaultOptionCreator.inputOption().create();
+    Option outputOption = DefaultOptionCreator.outputOption().create();
+    Option helpOption = DefaultOptionCreator.helpOption();
+    Option splitterOption = optionBuilder.withLongName("splitterPattern").withArgument(
+      argumentBuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()).withDescription(
+      "Regular Expression pattern used to split given line into fields."
+          + " Default value splits comma or tab separated fields."
+          + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ").withShortName("regex").create();
+    Option encodingOption = optionBuilder.withLongName("encoding").withArgument(
+      argumentBuilder.withName("encoding").withMinimum(1).withMaximum(1).create()).withDescription(
+      "(Optional) The file encoding.  Default value: UTF-8").withShortName("e").create();
+    Group optionGroup = groupBuilder.withName("Options").withOption(inputOption).withOption(outputOption)
+        .withOption(helpOption).withOption(splitterOption).withOption(encodingOption).create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(optionGroup);
+      CommandLine commandLine = parser.parse(args);
+
+      if (commandLine.hasOption(helpOption)) {
+        CommandLineUtil.printHelp(optionGroup);
+        return;
+      }
+
+      // Translate the parsed command line into job Parameters.
+      Parameters params = new Parameters();
+      if (commandLine.hasOption(splitterOption)) {
+        params.set("splitPattern", (String) commandLine.getValue(splitterOption));
+      }
+      params.set("encoding",
+          commandLine.hasOption(encodingOption) ? (String) commandLine.getValue(encodingOption) : "UTF-8");
+      params.set("input", (String) commandLine.getValue(inputOption));
+      params.set("output", (String) commandLine.getValue(outputOption));
+      // Fixed field layout for the Delicious dataset: group on fields 1 and 2,
+      // select field 3, cap transactions at 100 items.
+      params.set("groupingFieldCount", "2");
+      params.set("gfield0", "1");
+      params.set("gfield1", "2");
+      params.set("selectedFieldCount", "1");
+      params.set("field0", "3");
+      params.set("maxTransactionLength", "100");
+      KeyBasedStringTupleGrouper.startJob(params);
+    } catch (OptionException ex) {
+      // Bad or missing arguments: show usage instead of a stack trace.
+      CommandLineUtil.printHelp(optionGroup);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
new file mode 100644
index 0000000..4c80a31
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.StringTuple;
+
+public class KeyBasedStringTupleCombiner extends Reducer<Text,StringTuple,Text,StringTuple> {
+  
+  @Override
+  protected void reduce(Text key,
+                        Iterable<StringTuple> values,
+                        Context context) throws IOException, InterruptedException {
+    Set<String> outputValues = new HashSet<>();
+    for (StringTuple value : values) {
+      outputValues.addAll(value.getEntries());
+    }
+    context.write(key, new StringTuple(outputValues));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
new file mode 100644
index 0000000..cd17770
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+
+public final class KeyBasedStringTupleGrouper {
+  
+  private KeyBasedStringTupleGrouper() { }
+  
+  public static void startJob(Parameters params) throws IOException,
+                                                InterruptedException,
+                                                ClassNotFoundException {
+    Configuration conf = new Configuration();
+    
+    conf.set("job.parameters", params.toString());
+    conf.set("mapred.compress.map.output", "true");
+    conf.set("mapred.output.compression.type", "BLOCK");
+    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
+    
+    String input = params.get("input");
+    Job job = new Job(conf, "Generating dataset based from input" + input);
+    job.setJarByClass(KeyBasedStringTupleGrouper.class);
+    
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(StringTuple.class);
+    
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Text.class);
+    
+    FileInputFormat.addInputPath(job, new Path(input));
+    Path outPath = new Path(params.get("output"));
+    FileOutputFormat.setOutputPath(job, outPath);
+    
+    HadoopUtil.delete(conf, outPath);
+
+    job.setInputFormatClass(TextInputFormat.class);
+    job.setMapperClass(KeyBasedStringTupleMapper.class);
+    job.setCombinerClass(KeyBasedStringTupleCombiner.class);
+    job.setReducerClass(KeyBasedStringTupleReducer.class);
+    job.setOutputFormatClass(TextOutputFormat.class);
+    
+    boolean succeeded = job.waitForCompletion(true);
+    if (!succeeded) {
+      throw new IllegalStateException("Job failed!");
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
new file mode 100644
index 0000000..362d1ce
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Splits the line using a {@link Pattern} and outputs key as given by the groupingFields
+ * 
+ */
+public class KeyBasedStringTupleMapper extends Mapper<LongWritable,Text,Text,StringTuple> {
+  
+  private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class);
+  
+  private Pattern splitter;
+  
+  private int[] selectedFields;
+  
+  private int[] groupingFields;
+  
+  @Override
+  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+    String[] fields = splitter.split(value.toString());
+    if (fields.length != 4) {
+      log.info("{} {}", fields.length, value.toString());
+      context.getCounter("Map", "ERROR").increment(1);
+      return;
+    }
+    Collection<String> oKey = new ArrayList<>();
+    for (int groupingField : groupingFields) {
+      oKey.add(fields[groupingField]);
+      context.setStatus(fields[groupingField]);
+    }
+    
+    List<String> oValue = new ArrayList<>();
+    for (int selectedField : selectedFields) {
+      oValue.add(fields[selectedField]);
+    }
+    
+    context.write(new Text(oKey.toString()), new StringTuple(oValue));
+    
+  }
+  
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
+    splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*"));
+    
+    int selectedFieldCount = Integer.valueOf(params.get("selectedFieldCount", "0"));
+    selectedFields = new int[selectedFieldCount];
+    for (int i = 0; i < selectedFieldCount; i++) {
+      selectedFields[i] = Integer.valueOf(params.get("field" + i, "0"));
+    }
+    
+    int groupingFieldCount = Integer.valueOf(params.get("groupingFieldCount", "0"));
+    groupingFields = new int[groupingFieldCount];
+    for (int i = 0; i < groupingFieldCount; i++) {
+      groupingFields[i] = Integer.valueOf(params.get("gfield" + i, "0"));
+    }
+    
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
new file mode 100644
index 0000000..a7ef762
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+
+public class KeyBasedStringTupleReducer extends Reducer<Text,StringTuple,Text,Text> {
+  
+  private int maxTransactionLength = 100;
+  
+  @Override
+  protected void reduce(Text key, Iterable<StringTuple> values, Context context)
+    throws IOException, InterruptedException {
+    Collection<String> items = new HashSet<>();
+    
+    for (StringTuple value : values) {
+      for (String field : value.getEntries()) {
+        items.add(field);
+      }
+    }
+    if (items.size() > 1) {
+      int i = 0;
+      StringBuilder sb = new StringBuilder();
+      String sep = "";
+      for (String field : items) {
+        if (i % maxTransactionLength == 0) {
+          if (i != 0) {
+            context.write(null, new Text(sb.toString()));
+          }
+          sb.replace(0, sb.length(), "");
+          sep = "";
+        }
+        
+        sb.append(sep).append(field);
+        sep = "\t";
+        
+        i++;
+        
+      }
+      if (sb.length() > 0) {
+        context.write(null, new Text(sb.toString()));
+      }
+    }
+  }
+  
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
+    maxTransactionLength = Integer.valueOf(params.get("maxTransactionLength", "100"));
+  }
+}


[33/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
new file mode 100644
index 0000000..f4b8bcb
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.io.Resources;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Train a logistic regression for the examples from Chapter 13 of Mahout in Action
+ */
+public final class TrainLogistic {
+
+  private static String inputFile;
+  private static String outputFile;
+  private static LogisticModelParameters lmp;
+  private static int passes;
+  private static boolean scores;
+  private static OnlineLogisticRegression model;
+
+  private TrainLogistic() {
+  }
+
+  public static void main(String[] args) throws Exception {
+    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+  }
+
+  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+    if (parseArgs(args)) {
+      double logPEstimate = 0;
+      int samples = 0;
+
+      CsvRecordFactory csv = lmp.getCsvRecordFactory();
+      OnlineLogisticRegression lr = lmp.createRegression();
+      for (int pass = 0; pass < passes; pass++) {
+        try (BufferedReader in = open(inputFile)) {
+          // read variable names
+          csv.firstLine(in.readLine());
+
+          String line = in.readLine();
+          while (line != null) {
+            // for each new line, get target and predictors
+            Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
+            int targetValue = csv.processLine(line, input);
+
+            // check performance while this is still news
+            double logP = lr.logLikelihood(targetValue, input);
+            if (!Double.isInfinite(logP)) {
+              if (samples < 20) {
+                logPEstimate = (samples * logPEstimate + logP) / (samples + 1);
+              } else {
+                logPEstimate = 0.95 * logPEstimate + 0.05 * logP;
+              }
+              samples++;
+            }
+            double p = lr.classifyScalar(input);
+            if (scores) {
+              output.printf(Locale.ENGLISH, "%10d %2d %10.2f %2.4f %10.4f %10.4f%n",
+                samples, targetValue, lr.currentLearningRate(), p, logP, logPEstimate);
+            }
+
+            // now update model
+            lr.train(targetValue, input);
+
+            line = in.readLine();
+          }
+        }
+      }
+
+      try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
+        lmp.saveTo(modelOutput);
+      }
+
+      output.println(lmp.getNumFeatures());
+      output.println(lmp.getTargetVariable() + " ~ ");
+      String sep = "";
+      for (String v : csv.getTraceDictionary().keySet()) {
+        double weight = predictorWeight(lr, 0, csv, v);
+        if (weight != 0) {
+          output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
+          sep = " + ";
+        }
+      }
+      output.printf("%n");
+      model = lr;
+      for (int row = 0; row < lr.getBeta().numRows(); row++) {
+        for (String key : csv.getTraceDictionary().keySet()) {
+          double weight = predictorWeight(lr, row, csv, key);
+          if (weight != 0) {
+            output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
+          }
+        }
+        for (int column = 0; column < lr.getBeta().numCols(); column++) {
+          output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
+        }
+        output.println();
+      }
+    }
+  }
+
+  private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
+    double weight = 0;
+    for (Integer column : csv.getTraceDictionary().get(predictor)) {
+      weight += lr.getBeta().get(row, column);
+    }
+    return weight;
+  }
+
+  private static boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
+    Option scores = builder.withLongName("scores").withDescription("output score diagnostics during training").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFile = builder.withLongName("input")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+            .withDescription("where to get training data")
+            .create();
+
+    Option outputFile = builder.withLongName("output")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+            .withDescription("where to get training data")
+            .create();
+
+    Option predictors = builder.withLongName("predictors")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("p").create())
+            .withDescription("a list of predictor variables")
+            .create();
+
+    Option types = builder.withLongName("types")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("t").create())
+            .withDescription("a list of predictor variable types (numeric, word, or text)")
+            .create();
+
+    Option target = builder.withLongName("target")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("target").withMaximum(1).create())
+            .withDescription("the name of the target variable")
+            .create();
+
+    Option features = builder.withLongName("features")
+            .withArgument(
+                    argumentBuilder.withName("numFeatures")
+                            .withDefault("1000")
+                            .withMaximum(1).create())
+            .withDescription("the number of internal hashed features to use")
+            .create();
+
+    Option passes = builder.withLongName("passes")
+            .withArgument(
+                    argumentBuilder.withName("passes")
+                            .withDefault("2")
+                            .withMaximum(1).create())
+            .withDescription("the number of times to pass over the input data")
+            .create();
+
+    Option lambda = builder.withLongName("lambda")
+            .withArgument(argumentBuilder.withName("lambda").withDefault("1e-4").withMaximum(1).create())
+            .withDescription("the amount of coefficient decay to use")
+            .create();
+
+    Option rate = builder.withLongName("rate")
+            .withArgument(argumentBuilder.withName("learningRate").withDefault("1e-3").withMaximum(1).create())
+            .withDescription("the learning rate")
+            .create();
+
+    Option noBias = builder.withLongName("noBias")
+            .withDescription("don't include a bias term")
+            .create();
+
+    Option targetCategories = builder.withLongName("categories")
+            .withRequired(true)
+            .withArgument(argumentBuilder.withName("number").withMaximum(1).create())
+            .withDescription("the number of target categories to be considered")
+            .create();
+
+    Group normalArgs = new GroupBuilder()
+            .withOption(help)
+            .withOption(quiet)
+            .withOption(inputFile)
+            .withOption(outputFile)
+            .withOption(target)
+            .withOption(targetCategories)
+            .withOption(predictors)
+            .withOption(types)
+            .withOption(passes)
+            .withOption(lambda)
+            .withOption(rate)
+            .withOption(noBias)
+            .withOption(features)
+            .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    TrainLogistic.inputFile = getStringArgument(cmdLine, inputFile);
+    TrainLogistic.outputFile = getStringArgument(cmdLine, outputFile);
+
+    List<String> typeList = new ArrayList<>();
+    for (Object x : cmdLine.getValues(types)) {
+      typeList.add(x.toString());
+    }
+
+    List<String> predictorList = new ArrayList<>();
+    for (Object x : cmdLine.getValues(predictors)) {
+      predictorList.add(x.toString());
+    }
+
+    lmp = new LogisticModelParameters();
+    lmp.setTargetVariable(getStringArgument(cmdLine, target));
+    lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
+    lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
+    lmp.setUseBias(!getBooleanArgument(cmdLine, noBias));
+    lmp.setTypeMap(predictorList, typeList);
+
+    lmp.setLambda(getDoubleArgument(cmdLine, lambda));
+    lmp.setLearningRate(getDoubleArgument(cmdLine, rate));
+
+    TrainLogistic.scores = getBooleanArgument(cmdLine, scores);
+    TrainLogistic.passes = getIntegerArgument(cmdLine, passes);
+
+    return true;
+  }
+
+  private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
+    return (String) cmdLine.getValue(inputFile);
+  }
+
+  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+    return cmdLine.hasOption(option);
+  }
+
+  private static int getIntegerArgument(CommandLine cmdLine, Option features) {
+    return Integer.parseInt((String) cmdLine.getValue(features));
+  }
+
+  private static double getDoubleArgument(CommandLine cmdLine, Option op) {
+    return Double.parseDouble((String) cmdLine.getValue(op));
+  }
+
+  public static OnlineLogisticRegression getModel() {
+    return model;
+  }
+
+  public static LogisticModelParameters getParameters() {
+    return lmp;
+  }
+
+  static BufferedReader open(String inputFile) throws IOException {
+    InputStream in;
+    try {
+      in = Resources.getResource(inputFile).openStream();
+    } catch (IllegalArgumentException e) {
+      in = new FileInputStream(new File(inputFile));
+    }
+    return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
new file mode 100644
index 0000000..632b32c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import org.apache.mahout.classifier.NewsgroupHelper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Reads and trains an adaptive logistic regression model on the 20 newsgroups data.
+ * The first command line argument gives the path of the directory holding the training
+ * data.  The optional second argument, leakType, defines which classes of features to use.
+ * Importantly, leakType controls whether a synthetic date is injected into the data as
+ * a target leak and if so, how.
+ * <p/>
+ * The value of leakType % 3 determines whether the target leak is injected according to
+ * the following table:
+ * <p/>
+ * <table>
+ * <tr><td valign='top'>0</td><td>No leak injected</td></tr>
+ * <tr><td valign='top'>1</td><td>Synthetic date injected in MMM-yyyy format. This will be a single token and
+ * is a perfect target leak since each newsgroup is given a different month</td></tr>
+ * <tr><td valign='top'>2</td><td>Synthetic date injected in dd-MMM-yyyy HH:mm:ss format.  The day varies
+ * and thus there are more leak symbols that need to be learned.  Ultimately this is just
+ * as big a leak as case 1.</td></tr>
+ * </table>
+ * <p/>
+ * Leaktype also determines what other text will be indexed.  If leakType is greater
+ * than or equal to 6, then neither headers nor text body will be used for features and the leak is the only
+ * source of data.  If leakType is greater than or equal to 3, then subject words will be used as features.
+ * If leakType is less than 3, then both subject and body text will be used as features.
+ * <p/>
+ * A leakType of 0 gives no leak and all textual features.
+ * <p/>
+ * See the following table for a summary of commonly used values for leakType
+ * <p/>
+ * <table>
+ * <tr><td><b>leakType</b></td><td><b>Leak?</b></td><td><b>Subject?</b></td><td><b>Body?</b></td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * <tr><td>0</td><td>no</td><td>yes</td><td>yes</td></tr>
+ * <tr><td>1</td><td>mmm-yyyy</td><td>yes</td><td>yes</td></tr>
+ * <tr><td>2</td><td>dd-mmm-yyyy</td><td>yes</td><td>yes</td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * <tr><td>3</td><td>no</td><td>yes</td><td>no</td></tr>
+ * <tr><td>4</td><td>mmm-yyyy</td><td>yes</td><td>no</td></tr>
+ * <tr><td>5</td><td>dd-mmm-yyyy</td><td>yes</td><td>no</td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * <tr><td>6</td><td>no</td><td>no</td><td>no</td></tr>
+ * <tr><td>7</td><td>mmm-yyyy</td><td>no</td><td>no</td></tr>
+ * <tr><td>8</td><td>dd-mmm-yyyy</td><td>no</td><td>no</td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * </table>
+ */
public final class TrainNewsGroups {

  private TrainNewsGroups() {
  }

  public static void main(String[] args) throws IOException {
    // args[0]: directory whose subdirectories are the newsgroups.
    File base = new File(args[0]);

    // Global word-occurrence counts, filled in by encodeFeatureVector below.
    Multiset<String> overallCounts = HashMultiset.create();

    // Optional args[1] selects the leak/feature configuration documented
    // on the class javadoc; 0 means no leak and all textual features.
    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }

    Dictionary newsGroups = new Dictionary();

    NewsgroupHelper helper = new NewsgroupHelper();
    helper.getEncoder().setProbes(2);
    // 20 categories; regularized with L1 decay.
    AdaptiveLogisticRegression learningAlgorithm =
        new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    // Each newsgroup subdirectory contributes its name to the dictionary and
    // its files to the training set.
    // NOTE(review): listFiles() returns null if base is not a readable
    // directory, which would NPE here — TODO consider an explicit check.
    List<File> files = new ArrayList<>();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    // Shuffle so training sees categories in random order (online learner).
    Collections.shuffle(files);
    System.out.println(files.size() + " training files");
    SGDInfo info = new SGDInfo();

    int k = 0;

    // Single online pass: encode each document, train, and report progress.
    for (File file : files) {
      // The category label is the parent directory name.
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);

      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
      learningAlgorithm.train(actual, v);

      k++;
      State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();

      SGDHelper.analyzeState(info, leakType, k, best);
    }
    learningAlgorithm.close();
    SGDHelper.dissect(leakType, newsGroups, learningAlgorithm, files, overallCounts);
    System.out.println("exiting main");

    // Persist the first model of the best cross-fold learner to a temp file.
    File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group.model");
    ModelSerializer.writeBinary(modelFile.getAbsolutePath(),
        learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));

    // Print the top (up to ~1000) word counts in descending order.
    List<Integer> counts = new ArrayList<>();
    System.out.println("Word counts");
    for (String count : overallCounts.elementSet()) {
      counts.add(overallCounts.count(count));
    }
    Collections.sort(counts, Ordering.natural().reverse());
    k = 0;
    for (Integer count : counts) {
      System.out.println(k + "\t" + count);
      k++;
      if (k > 1000) {
        break;
      }
    }
  }


}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
new file mode 100644
index 0000000..7a74289
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Locale;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.ConfusionMatrix;
+import org.apache.mahout.classifier.evaluation.Auc;
+import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+/*
+ * AUC and averageLikelihood are always shown if possible. If the number of target values is
+ * more than 2, then the AUC and the entropy matrix are not shown regardless of the showAuc and
+ * showEntropy flags the user passes, because the current implementation only supports them for
+ * binary (two-value) targets.
+ */
+public final class ValidateAdaptiveLogistic {
+
+  // Configuration parsed from the command line by parseArgs().
+  private static String inputFile;
+  private static String modelFile;
+  private static String defaultCategory;
+  private static boolean showAuc;
+  private static boolean showScores;
+  private static boolean showConfusion;
+
+  private ValidateAdaptiveLogistic() {
+  }
+
+  public static void main(String[] args) throws IOException {
+    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+  }
+
+  /**
+   * Validates a previously trained adaptive logistic model against a CSV data set
+   * and writes the requested reports (log-likelihood summary, AUC, confusion
+   * matrix, per-record scores) to {@code output}.
+   */
+  static void mainToOutput(String[] args, PrintWriter output) throws IOException {
+    if (parseArgs(args)) {
+      // If the user asked for nothing specific, show the default reports.
+      if (!showAuc && !showConfusion && !showScores) {
+        showAuc = true;
+        showConfusion = true;
+      }
+
+      Auc collector = null;
+      AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
+          .loadFromFile(new File(modelFile));
+      CsvRecordFactory csv = lmp.getCsvRecordFactory();
+      AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
+
+      // AUC (and the entropy matrix derived from it) is only defined for binary targets.
+      if (lmp.getTargetCategories().size() <= 2) {
+        collector = new Auc();
+      }
+
+      OnlineSummarizer slh = new OnlineSummarizer();
+      ConfusionMatrix cm = new ConfusionMatrix(lmp.getTargetCategories(), defaultCategory);
+
+      State<Wrapper, CrossFoldLearner> best = lr.getBest();
+      if (best == null) {
+        output.println("AdaptiveLogisticRegression has not been trained properly.");
+        return;
+      }
+      CrossFoldLearner learner = best.getPayload().getLearner();
+
+      // try-with-resources so the validation data file is always closed,
+      // even if a line fails to parse.
+      try (BufferedReader in = TrainLogistic.open(inputFile)) {
+        String line = in.readLine();
+        csv.firstLine(line);
+        line = in.readLine();
+        if (showScores) {
+          output.println("\"target\", \"model-output\", \"log-likelihood\", \"average-likelihood\"");
+        }
+        while (line != null) {
+          Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
+          //TODO: How to avoid extra target values not shown in the training process.
+          int target = csv.processLine(line, v);
+          double likelihood = learner.logLikelihood(target, v);
+          double score = learner.classifyFull(v).maxValue();
+
+          slh.add(likelihood);
+          cm.addInstance(csv.getTargetString(line), csv.getTargetLabel(target));
+
+          if (showScores) {
+            // Reuse the likelihood computed above instead of recomputing it.
+            output.printf(Locale.ENGLISH, "%8d, %.12f, %.13f, %.13f%n", target,
+                score, likelihood, slh.getMean());
+          }
+          if (collector != null) {
+            collector.add(target, score);
+          }
+          line = in.readLine();
+        }
+      }
+
+      output.printf(Locale.ENGLISH,"\nLog-likelihood:");
+      output.printf(Locale.ENGLISH, "Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f%n",
+          slh.getMin(), slh.getMax(), slh.getMean(), slh.getMedian());
+
+      if (collector != null) {
+        output.printf(Locale.ENGLISH, "%nAUC = %.2f%n", collector.auc());
+      }
+
+      if (showConfusion) {
+        output.printf(Locale.ENGLISH, "%n%s%n%n", cm.toString());
+
+        // The entropy matrix is only available when an AUC collector exists (binary target).
+        if (collector != null) {
+          Matrix m = collector.entropy();
+          output.printf(Locale.ENGLISH,
+              "Entropy Matrix: [[%.1f, %.1f], [%.1f, %.1f]]%n", m.get(0, 0),
+              m.get(1, 0), m.get(0, 1), m.get(1, 1));
+        }
+      }
+
+    }
+  }
+
+  /**
+   * Parses command-line options into the static configuration fields.
+   *
+   * @return true if parsing succeeded and validation should proceed,
+   *         false if help was requested or the arguments were invalid
+   */
+  private static boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help")
+        .withDescription("print this list").create();
+
+    Option quiet = builder.withLongName("quiet")
+        .withDescription("be extra quiet").create();
+
+    Option auc = builder.withLongName("auc").withDescription("print AUC")
+        .create();
+    Option confusion = builder.withLongName("confusion")
+        .withDescription("print confusion matrix").create();
+
+    Option scores = builder.withLongName("scores")
+        .withDescription("print scores").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder
+        .withLongName("input")
+        .withRequired(true)
+        .withArgument(
+            argumentBuilder.withName("input").withMaximum(1)
+                .create())
+        .withDescription("where to get validate data").create();
+
+    Option modelFileOption = builder
+        .withLongName("model")
+        .withRequired(true)
+        .withArgument(
+            argumentBuilder.withName("model").withMaximum(1)
+                .create())
+        .withDescription("where to get the trained model").create();
+
+    Option defaultCategoryOption = builder
+      .withLongName("defaultCategory")
+      .withRequired(false)
+      .withArgument(
+          argumentBuilder.withName("defaultCategory").withMaximum(1).withDefault("unknown")
+          .create())
+      .withDescription("the default category value to use").create();
+
+    Group normalArgs = new GroupBuilder().withOption(help)
+        .withOption(quiet).withOption(auc).withOption(scores)
+        .withOption(confusion).withOption(inputFileOption)
+        .withOption(modelFileOption).withOption(defaultCategoryOption).create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = getStringArgument(cmdLine, inputFileOption);
+    modelFile = getStringArgument(cmdLine, modelFileOption);
+    defaultCategory = getStringArgument(cmdLine, defaultCategoryOption);
+    showAuc = getBooleanArgument(cmdLine, auc);
+    showScores = getBooleanArgument(cmdLine, scores);
+    showConfusion = getBooleanArgument(cmdLine, confusion);
+
+    return true;
+  }
+
+  /** @return true if the flag option was present on the command line */
+  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+    return cmdLine.hasOption(option);
+  }
+
+  /** @return the single string value supplied for the option */
+  private static String getStringArgument(CommandLine cmdLine, Option option) {
+    return (String) cmdLine.getValue(option);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
new file mode 100644
index 0000000..ab3c861
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd.bankmarketing;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.classifier.evaluation.Auc;
+import org.apache.mahout.classifier.sgd.L1;
+import org.apache.mahout.classifier.sgd.OnlineLogisticRegression;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Uses the SGD classifier on the 'Bank marketing' dataset from UCI.
+ *
+ * See http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
+ *
+ * Learn when people accept or reject an offer from the bank via telephone based on income, age, education and more.
+ */
+public class BankMarketingClassificationMain {
+
+  public static final int NUM_CATEGORIES = 2;
+
+  public static void main(String[] args) throws Exception {
+    // Load the whole data set into memory so it can be reshuffled for every run.
+    List<TelephoneCall> dataSet = Lists.newArrayList(new TelephoneCallParser("bank-full.csv"));
+
+    double holdoutFraction = 0.10;
+
+    for (int run = 0; run < 20; run++) {
+      // Fresh random split per run: the first 10% is held out, the rest trains the model.
+      Collections.shuffle(dataSet);
+      int holdoutSize = (int) (holdoutFraction * dataSet.size());
+      List<TelephoneCall> heldOut = dataSet.subList(0, holdoutSize);
+      List<TelephoneCall> trainingSet = dataSet.subList(holdoutSize, dataSet.size());
+
+      OnlineLogisticRegression model =
+          new OnlineLogisticRegression(NUM_CATEGORIES, TelephoneCall.FEATURES, new L1())
+              .learningRate(1)
+              .alpha(1)
+              .lambda(0.000001)
+              .stepOffset(10000)
+              .decayExponent(0.2);
+
+      for (int pass = 0; pass < 20; pass++) {
+        for (TelephoneCall sample : trainingSet) {
+          model.train(sample.getTarget(), sample.asVector());
+        }
+        // Report the held-out AUC every fifth pass.
+        if (pass % 5 == 0) {
+          Auc evaluation = new Auc(0.5);
+          for (TelephoneCall sample : heldOut) {
+            evaluation.add(sample.getTarget(), model.classifyScalar(sample.asVector()));
+          }
+          System.out.printf("%d, %.4f, %.4f\n", pass, model.currentLearningRate(), evaluation.auc());
+        }
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
new file mode 100644
index 0000000..728ec20
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd.bankmarketing;
+
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
+
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * One record of the UCI "Bank Marketing" data set, hash-encoded into a sparse
+ * feature vector suitable for online logistic regression.
+ *
+ * <p>Numeric fields are log-scaled before encoding; most categorical fields are
+ * encoded as "name:value" indicator features. The raw string values are also
+ * retained so the target label ("y") can be read back via {@link #getTarget()}.
+ */
+public class TelephoneCall {
+  // Cardinality of the hashed feature vector.
+  public static final int FEATURES = 100;
+  // Shared encoders: one constant bias ("intercept") term plus one hashed
+  // word-value encoder for all named features.  NOTE(review): sharing assumes
+  // the encoders keep no per-call state -- verify before using this class from
+  // multiple threads.
+  private static final ConstantValueEncoder interceptEncoder = new ConstantValueEncoder("intercept");
+  private static final FeatureVectorEncoder featureEncoder = new StaticWordValueEncoder("feature");
+
+  // Encoded features for this record.
+  private RandomAccessSparseVector vector;
+
+  // Raw field name -> raw field value, preserved in input order.
+  private Map<String, String> fields = new LinkedHashMap<>();
+
+  /**
+   * Builds the feature vector by pairing each header field name with the
+   * corresponding value of this record.
+   *
+   * @param fieldNames names from the data set's header line
+   * @param values     the field values of one record, in the same order
+   * @throws IllegalArgumentException on an unrecognized field name
+   */
+  public TelephoneCall(Iterable<String> fieldNames, Iterable<String> values) {
+    vector = new RandomAccessSparseVector(FEATURES);
+    Iterator<String> value = values.iterator();
+    interceptEncoder.addToVector("1", vector);
+    for (String name : fieldNames) {
+      String fieldValue = value.next();
+      fields.put(name, fieldValue);
+
+      switch (name) {
+        case "age": {
+          // Log-scale the age before hashing it in.
+          double v = Double.parseDouble(fieldValue);
+          featureEncoder.addToVector(name, Math.log(v), vector);
+          break;
+        }
+        case "balance": {
+          double v;
+          v = Double.parseDouble(fieldValue);
+          // Clamp large negative balances so the shifted log stays defined
+          // (v + 2001 >= 1).  The "- 8" offset presumably centers typical
+          // values near zero -- TODO confirm against the data distribution.
+          if (v < -2000) {
+            v = -2000;
+          }
+          featureEncoder.addToVector(name, Math.log(v + 2001) - 8, vector);
+          break;
+        }
+        case "duration": {
+          double v;
+          v = Double.parseDouble(fieldValue);
+          // "+ 1" guards against log(0) for zero-length calls.
+          featureEncoder.addToVector(name, Math.log(v + 1) - 5, vector);
+          break;
+        }
+        case "pdays": {
+          double v;
+          v = Double.parseDouble(fieldValue);
+          // "+ 2" keeps the log argument positive; presumably pdays can be -1
+          // in this data set (meaning "not previously contacted") -- verify.
+          featureEncoder.addToVector(name, Math.log(v + 2), vector);
+          break;
+        }
+        // Categorical fields: one indicator feature per (name, value) pair.
+        case "job":
+        case "marital":
+        case "education":
+        case "default":
+        case "housing":
+        case "loan":
+        case "contact":
+        case "campaign":
+        case "previous":
+        case "poutcome":
+          featureEncoder.addToVector(name + ":" + fieldValue, 1, vector);
+          break;
+        case "day":
+        case "month":
+        case "y":
+          // ignore these for vectorizing ("y" is the target, read back in getTarget())
+          break;
+        default:
+          throw new IllegalArgumentException(String.format("Bad field name: %s", name));
+      }
+    }
+  }
+
+  /** @return the encoded feature vector for this record */
+  public Vector asVector() {
+    return vector;
+  }
+
+  /** @return 0 when the "y" field is "no", 1 otherwise */
+  public int getTarget() {
+    return fields.get("y").equals("no") ? 0 : 1;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
new file mode 100644
index 0000000..5ef6490
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd.bankmarketing;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Resources;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+
+/** Parses semi-colon separated data as TelephoneCalls  */
+public class TelephoneCallParser implements Iterable<TelephoneCall> {
+
+  // Field values arrive quoted and semicolon-separated; strip quotes, blanks
+  // and stray semicolons from each token.
+  private final Splitter onSemi = Splitter.on(";").trimResults(CharMatcher.anyOf("\" ;"));
+  private String resourceName;
+
+  /**
+   * @param resourceName classpath resource holding the semicolon-separated data,
+   *                     with a header line of field names
+   */
+  public TelephoneCallParser(String resourceName) throws IOException {
+    this.resourceName = resourceName;
+  }
+
+  /**
+   * Returns a one-shot iterator over the records of the resource.  The header
+   * line is consumed eagerly to obtain the field names; each subsequent line
+   * becomes one {@link TelephoneCall}.
+   */
+  @Override
+  public Iterator<TelephoneCall> iterator() {
+    try {
+      return new AbstractIterator<TelephoneCall>() {
+        // Read the resource as UTF-8 explicitly rather than relying on the
+        // platform default charset.
+        BufferedReader input = new BufferedReader(new InputStreamReader(
+            Resources.getResource(resourceName).openStream(), StandardCharsets.UTF_8));
+        Iterable<String> fieldNames = onSemi.split(input.readLine());
+
+        @Override
+        protected TelephoneCall computeNext() {
+          try {
+            String line = input.readLine();
+            if (line == null) {
+              return endOfData();
+            }
+            return new TelephoneCall(fieldNames, onSemi.split(line));
+          } catch (IOException e) {
+            throw new RuntimeException("Error reading data", e);
+          }
+        }
+      };
+    } catch (IOException e) {
+      throw new RuntimeException("Error reading data", e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
new file mode 100644
index 0000000..a0b845f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+final class ClustersFilter implements PathFilter {
+
+  /** Accepts only paths whose string form contains a "/clusters-" segment. */
+  @Override
+  public boolean accept(Path path) {
+    return path.toString().contains("/clusters-");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
new file mode 100644
index 0000000..50dba99
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.BasicStroke;
+import java.awt.Color;
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
+
+/**
+ * Java desktop graphics class that runs canopy clustering and displays the results.
+ * This class generates random data and clusters it.
+ */
+@Deprecated
+public class DisplayCanopy extends DisplayClustering {
+
+  DisplayCanopy() {
+    initialize();
+    // Title reports the significance threshold used by isSignificant() as a percentage.
+    this.setTitle("Canopy Clusters (>" + (int) (significance * 100) + "% of population)");
+  }
+
+  @Override
+  public void paint(Graphics g) {
+    // Draw the raw sample points first, then the canopies on top.
+    plotSampleData((Graphics2D) g);
+    plotClusters((Graphics2D) g);
+  }
+
+  /**
+   * Draws every significant canopy of every recorded iteration: the T1 and T2
+   * distance thresholds as thin blue ellipses around the center, plus an
+   * ellipse at three times the cluster radius colored by iteration.  The last
+   * list in CLUSTERS (cx == 0) is drawn with a heavier stroke.
+   */
+  protected static void plotClusters(Graphics2D g2) {
+    // cx counts down so each successive iteration gets a different palette color.
+    int cx = CLUSTERS.size() - 1;
+    for (List<Cluster> clusters : CLUSTERS) {
+      for (Cluster cluster : clusters) {
+        if (isSignificant(cluster)) {
+          g2.setStroke(new BasicStroke(1));
+          g2.setColor(Color.BLUE);
+          double[] t1 = {T1, T1};
+          plotEllipse(g2, cluster.getCenter(), new DenseVector(t1));
+          double[] t2 = {T2, T2};
+          plotEllipse(g2, cluster.getCenter(), new DenseVector(t2));
+          g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx)]);
+          g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
+          plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
+        }
+      }
+      cx--;
+    }
+  }
+
+  /**
+   * Generates random sample data, runs canopy clustering over it and pops up a
+   * window displaying the result.  Uses the test seed for reproducibility.
+   */
+  public static void main(String[] args) throws Exception {
+    Path samples = new Path("samples");
+    Path output = new Path("output");
+    Configuration conf = new Configuration();
+    // Start from a clean slate in case a previous run left output behind.
+    HadoopUtil.delete(conf, samples);
+    HadoopUtil.delete(conf, output);
+    RandomUtils.useTestSeed();
+    generateSamples();
+    writeSampleData(samples);
+    CanopyDriver.buildClusters(conf, samples, output, new ManhattanDistanceMeasure(), T1, T2, 0, true);
+    loadClustersWritable(output);
+
+    new DisplayCanopy();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
new file mode 100644
index 0000000..ad85c6a
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.*;
+import java.awt.event.WindowAdapter;
+import java.awt.event.WindowEvent;
+import java.awt.geom.AffineTransform;
+import java.awt.geom.Ellipse2D;
+import java.awt.geom.Rectangle2D;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.UncommonDistributions;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class DisplayClustering extends Frame {
+  
+  private static final Logger log = LoggerFactory.getLogger(DisplayClustering.class);
+  
+  protected static final int DS = 72; // default scale = 72 pixels per inch
+  
+  protected static final int SIZE = 8; // screen size in inches
+  
+  // Parameters of the sample-generating distributions; rendered by plotSampleParameters().
+  private static final Collection<Vector> SAMPLE_PARAMS = new ArrayList<>();
+  
+  // The 2-d sample points; rendered by plotSampleData().
+  protected static final List<VectorWritable> SAMPLE_DATA = new ArrayList<>();
+  
+  // One list of clusters per recorded iteration; rendered by plotClusters().
+  protected static final List<List<Cluster>> CLUSTERS = new ArrayList<>();
+  
+  // Palette used to distinguish iterations when plotting clusters.
+  static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
+    Color.lightGray };
+  
+  // Distance thresholds (T2 < T1) used by the canopy-based display examples.
+  protected static final double T1 = 3.0;
+  
+  protected static final double T2 = 2.8;
+  
+  // Fraction-of-population threshold; presumably consulted by isSignificant() -- confirm.
+  static double significance = 0.05;
+  
+  protected static int res; // screen resolution
+  
+  public DisplayClustering() {
+    // NOTE(review): initialize() sets a title too, which is overwritten here.
+    initialize();
+    this.setTitle("Sample Data");
+  }
+  
+  /**
+   * Sizes the frame from the screen resolution, makes it visible and installs
+   * a window listener that exits the JVM on close.
+   *
+   * <p>NOTE(review): this public method is called from constructors, so a
+   * subclass override would run before the subclass is fully constructed;
+   * the constructors also overwrite the title set below.
+   */
+  public void initialize() {
+    // Get screen resolution
+    res = Toolkit.getDefaultToolkit().getScreenResolution();
+    
+    // Set Frame size in inches
+    this.setSize(SIZE * res, SIZE * res);
+    this.setVisible(true);
+    this.setTitle("Asymmetric Sample Data");
+    
+    // Window listener to terminate program.
+    this.addWindowListener(new WindowAdapter() {
+      @Override
+      public void windowClosing(WindowEvent e) {
+        System.exit(0);
+      }
+    });
+  }
+  
+  /** Generates sample data with a fixed test seed and displays it in a window. */
+  public static void main(String[] args) throws Exception {
+    RandomUtils.useTestSeed();
+    generateSamples();
+    new DisplayClustering();
+  }
+  
+  /** Renders the sample points, their generating parameters and the clusters. */
+  @Override
+  public void paint(Graphics g) {
+    final Graphics2D canvas = (Graphics2D) g;
+    plotSampleData(canvas);
+    plotSampleParameters(canvas);
+    plotClusters(canvas);
+  }
+  
+  /**
+   * Draws each recorded iteration's clusters as ellipses at three times the
+   * cluster radius, one palette color per iteration; the last list in CLUSTERS
+   * (cx == 0) is drawn with a heavier stroke.
+   */
+  protected static void plotClusters(Graphics2D g2) {
+    // cx counts down so each successive iteration gets a different color.
+    int cx = CLUSTERS.size() - 1;
+    for (List<Cluster> clusters : CLUSTERS) {
+      g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
+      g2.setColor(COLORS[Math.min(COLORS.length - 1, cx--)]);
+      for (Cluster cluster : clusters) {
+        plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
+      }
+    }
+  }
+  
+  /**
+   * Draws a red ellipse for each sample-generating parameter vector.  Each
+   * parameter vector presumably holds (meanX, meanY, sdX, sdY) -- the ellipse
+   * is centered on the mean with radii of three standard deviations.
+   */
+  protected static void plotSampleParameters(Graphics2D g2) {
+    g2.setColor(Color.RED);
+    Vector center = new DenseVector(2);
+    Vector radii = new DenseVector(2);
+    for (Vector param : SAMPLE_PARAMS) {
+      center.set(0, param.get(0));
+      center.set(1, param.get(1));
+      radii.set(0, param.get(2) * 3);
+      radii.set(1, param.get(3) * 3);
+      plotEllipse(g2, center, radii);
+    }
+  }
+  
+  /**
+   * Scales the graphics context to the screen resolution, draws the axes as
+   * two fixed rectangles, then draws every sample point as a small dark-gray
+   * square.
+   */
+  protected static void plotSampleData(Graphics2D g2) {
+    // Scale from the DS=72 dpi design space to the actual screen resolution.
+    double sx = (double) res / DS;
+    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
+    
+    // plot the axes
+    g2.setColor(Color.BLACK);
+    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
+    plotRectangle(g2, new DenseVector(2).assign(2), dv);
+    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
+    
+    // plot the sample data
+    g2.setColor(Color.DARK_GRAY);
+    dv.assign(0.03);
+    for (VectorWritable v : SAMPLE_DATA) {
+      plotRectangle(g2, v.get(), dv);
+    }
+  }
+  
+  /**
+   * Plots the sample points and colors them according to their cluster
+   * membership, rather than drawing ellipses.
+   * 
+   * <p>As of this commit, this method is used only by K-means spectral clustering.
+   * Since the cluster assignments are set within the eigenspace of the data, it
+   * is not inherent that the original data cluster as they would in K-means:
+   * that is, as symmetric gaussian mixtures.
+   * 
+   * <p>Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
+   * output is not directly usable. Rather, the cluster assignments from the raw
+   * output need to be transferred back to the original data. As such, this
+   * method reads the SequenceFile cluster results of K-means and transfers
+   * the cluster assignments to the original data, coloring them appropriately.
+   * 
+   * @param g2 a Graphics2D context
+   * @param data path to the clustering output directory; assignments are read
+   *             from {@code data/clusteredPoints/part-m-00000}
+   */
+  protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
+    double sx = (double) res / DS;
+    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
+    
+    g2.setColor(Color.BLACK);
+    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
+    plotRectangle(g2, new DenseVector(2).assign(2), dv);
+    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
+    
+    // plot the sample data, colored according to the cluster they belong to
+    dv.assign(0.03);
+    
+    Path clusteredPointsPath = new Path(data, "clusteredPoints");
+    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
+    Map<Integer,Color> colors = new HashMap<>();
+    int point = 0;
+    for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
+        inputPath, new Configuration())) {
+      // Records are assumed to be in the same order as SAMPLE_DATA, so each
+      // record's cluster id colors the corresponding sample point -- verify.
+      int clusterId = record.getFirst().get();
+      VectorWritable v = SAMPLE_DATA.get(point++);
+      Integer key = clusterId;
+      // The first time a cluster id is seen, assign it the next palette color.
+      if (!colors.containsKey(key)) {
+        colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
+      }
+      plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
+    }
+  }
+  
+  /**
+   * Identical to {@link #plotRectangle(Graphics2D, Vector, Vector)}, but
+   * with the option of setting the color of the rectangle's stroke.
+   * 
+   * @param g2
+   *          A Graphics2D context.
+   * @param v
+   *          A vector for the rectangle's center.
+   * @param dv
+   *          A vector for the rectangle's dimensions.
+   * @param color
+   *          The color of the rectangle's stroke.
+   */
+  protected static void plotClusteredRectangle(Graphics2D g2, Vector v, Vector dv, Color color) {
+    // Configure the stroke and color, then delegate the coordinate math to
+    // plotRectangle() instead of duplicating it here (resolves the old NOTE
+    // about copy/pasted geometry code).
+    g2.setStroke(new BasicStroke(1));
+    g2.setColor(color);
+    plotRectangle(g2, v, dv);
+  }
+  
+  /**
+   * Draw a rectangle on the graphics context.
+   * 
+   * @param g2
+   *          a Graphics2D context
+   * @param v
+   *          a Vector of rectangle center
+   * @param dv
+   *          a Vector of rectangle dimensions
+   */
+  protected static void plotRectangle(Graphics2D g2, Vector v, Vector dv) {
+    // Flip the y axis (screen coordinates grow downward), then shift the
+    // center to the rectangle's upper-left corner.
+    Vector corner = v.times(new DenseVector(new double[] {1, -1})).minus(dv.divide(2));
+    int half = SIZE / 2;
+    double x = corner.get(0) + half;
+    double y = corner.get(1) + half;
+    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
+  }
+  
+  /**
+   * Draw an ellipse on the graphics context.
+   * 
+   * @param g2
+   *          a Graphics2D context
+   * @param v
+   *          a Vector of ellipse center
+   * @param dv
+   *          a Vector of ellipse dimensions
+   */
+  protected static void plotEllipse(Graphics2D g2, Vector v, Vector dv) {
+    // Flip the y axis (screen coordinates grow downward), then shift the
+    // center to the bounding box's upper-left corner.
+    Vector corner = v.times(new DenseVector(new double[] {1, -1})).minus(dv.divide(2));
+    int half = SIZE / 2;
+    double x = corner.get(0) + half;
+    double y = corner.get(1) + half;
+    g2.draw(new Ellipse2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
+  }
+  
+  /**
+   * Generate the default data set: 1100 samples drawn from three
+   * symmetric 2-d distributions of differing size and spread.
+   */
+  protected static void generateSamples() {
+    generateSamples(500, 1, 1, 3);
+    generateSamples(300, 1, 0, 0.5);
+    generateSamples(300, 0, 2, 0.1);
+  }
+  
+  /**
+   * Generate the default anisotropic data set: 1100 samples from three
+   * 2-d distributions with per-axis standard deviations.
+   */
+  protected static void generate2dSamples() {
+    generate2dSamples(500, 1, 1, 3, 1);
+    generate2dSamples(300, 1, 0, 0.5, 1);
+    generate2dSamples(300, 0, 2, 0.1, 0.5);
+  }
+  
+  /**
+   * Generate random samples and add them to the sampleData.
+   * 
+   * @param num
+   *          int number of samples to generate
+   * @param mx
+   *          double x-value of the sample mean
+   * @param my
+   *          double y-value of the sample mean
+   * @param sd
+   *          double standard deviation of the samples
+   */
+  protected static void generateSamples(int num, double mx, double my, double sd) {
+    // record the generating parameters so plotSampleParameters can draw them
+    SAMPLE_PARAMS.add(new DenseVector(new double[] {mx, my, sd, sd}));
+    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
+    for (int i = 0; i < num; i++) {
+      double x = UncommonDistributions.rNorm(mx, sd);
+      double y = UncommonDistributions.rNorm(my, sd);
+      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {x, y})));
+    }
+  }
+  
+  /**
+   * Write all generated samples to a sequence file of
+   * (Text "sample_i", VectorWritable) pairs.
+   *
+   * @param output destination path for the sequence file
+   * @throws IOException if the file cannot be written
+   */
+  protected static void writeSampleData(Path output) throws IOException {
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class)) {
+      int index = 0;
+      for (VectorWritable sample : SAMPLE_DATA) {
+        writer.append(new Text("sample_" + index), sample);
+        index++;
+      }
+    }
+  }
+  
+  /**
+   * Read every ClusterWritable under the given path (skipping log and CRC
+   * files) and return the decoded clusters.
+   *
+   * @param clustersIn path containing cluster sequence files
+   * @return the clusters read, in iteration order
+   */
+  protected static List<Cluster> readClustersWritable(Path clustersIn) {
+    List<Cluster> result = new ArrayList<>();
+    Configuration conf = new Configuration();
+    Iterable<ClusterWritable> values = new SequenceFileDirValueIterable<ClusterWritable>(
+        clustersIn, PathType.LIST, PathFilters.logsCRCFilter(), conf);
+    for (ClusterWritable value : values) {
+      Cluster cluster = value.getValue();
+      log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}",
+          cluster.getId(),
+          AbstractCluster.formatVector(cluster.getCenter(), null),
+          cluster.getNumObservations(),
+          AbstractCluster.formatVector(cluster.getRadius(), null));
+      result.add(cluster);
+    }
+    return result;
+  }
+  
+  /**
+   * Load each per-iteration cluster state directory under {@code output}
+   * into CLUSTERS, one list of clusters per iteration.
+   *
+   * @param output the clustering output directory
+   * @throws IOException on filesystem errors
+   */
+  protected static void loadClustersWritable(Path output) throws IOException {
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    for (FileStatus status : fs.listStatus(output, new ClustersFilter())) {
+      CLUSTERS.add(readClustersWritable(status.getPath()));
+    }
+  }
+  
+  /**
+   * Generate random samples and add them to the sampleData.
+   * 
+   * @param num
+   *          int number of samples to generate
+   * @param mx
+   *          double x-value of the sample mean
+   * @param my
+   *          double y-value of the sample mean
+   * @param sdx
+   *          double x-value standard deviation of the samples
+   * @param sdy
+   *          double y-value standard deviation of the samples
+   */
+  protected static void generate2dSamples(int num, double mx, double my, double sdx, double sdy) {
+    // record the generating parameters so plotSampleParameters can draw them
+    SAMPLE_PARAMS.add(new DenseVector(new double[] {mx, my, sdx, sdy}));
+    log.info("Generating {} samples m=[{}, {}] sd=[{}, {}]", num, mx, my, sdx, sdy);
+    for (int i = 0; i < num; i++) {
+      double x = UncommonDistributions.rNorm(mx, sdx);
+      double y = UncommonDistributions.rNorm(my, sdy);
+      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {x, y})));
+    }
+  }
+  
+  /**
+   * A cluster is significant when it holds more than the
+   * {@code significance} fraction of all sample points.
+   *
+   * @param cluster the cluster to test
+   * @return true when the cluster's population fraction exceeds the threshold
+   */
+  protected static boolean isSignificant(Cluster cluster) {
+    double fraction = (double) cluster.getNumObservations() / SAMPLE_DATA.size();
+    return fraction > significance;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
new file mode 100644
index 0000000..f8ce7c7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Displays the generated 2-d sample points together with the fuzzy
+ * k-means clusters fitted to them over successive iterations.
+ */
+public class DisplayFuzzyKMeans extends DisplayClustering {
+  
+  DisplayFuzzyKMeans() {
+    initialize();
+    this.setTitle("Fuzzy k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
+  }
+  
+  /**
+   * Generates sample data, runs sequential fuzzy k-means over it, and
+   * opens the display window.
+   */
+  public static void main(String[] args) throws Exception {
+    DistanceMeasure measure = new ManhattanDistanceMeasure();
+    
+    Path samplesPath = new Path("samples");
+    Path outputPath = new Path("output");
+    Configuration conf = new Configuration();
+    HadoopUtil.delete(conf, outputPath);
+    HadoopUtil.delete(conf, samplesPath);
+    
+    // fixed seed so each run produces the same picture
+    RandomUtils.useTestSeed();
+    DisplayClustering.generateSamples();
+    writeSampleData(samplesPath);
+    
+    int maxIterations = 10;
+    float threshold = 0.001F;
+    float m = 1.1F; // fuzziness exponent
+    boolean runClusterer = true;
+    if (runClusterer) {
+      runSequentialFuzzyKClusterer(conf, samplesPath, outputPath, measure, maxIterations, m, threshold);
+    } else {
+      int numClusters = 3;
+      runSequentialFuzzyKClassifier(conf, samplesPath, outputPath, measure, numClusters, maxIterations, m,
+          threshold);
+    }
+    new DisplayFuzzyKMeans();
+  }
+  
+  /** Seed one soft cluster per initial sample and iterate via ClusterIterator. */
+  private static void runSequentialFuzzyKClassifier(Configuration conf, Path samples, Path output,
+      DistanceMeasure measure, int numClusters, int maxIterations, float m, double threshold) throws IOException {
+    List<Cluster> initialClusters = Lists.newArrayList();
+    for (int id = 0; id < numClusters; id++) {
+      initialClusters.add(new SoftCluster(SAMPLE_DATA.get(id).get(), id, measure));
+    }
+    ClusterClassifier prior = new ClusterClassifier(initialClusters, new FuzzyKMeansClusteringPolicy(m, threshold));
+    Path priorPath = new Path(output, "classifier-0");
+    prior.writeToSeqFiles(priorPath);
+    
+    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
+    loadClustersWritable(output);
+  }
+  
+  /** Seed 3 random clusters and run the sequential FuzzyKMeansDriver. */
+  private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output,
+      DistanceMeasure measure, int maxIterations, float m, double threshold) throws IOException,
+      ClassNotFoundException, InterruptedException {
+    Path clustersIn = new Path(output, "random-seeds");
+    RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure);
+    FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold,
+        true);
+    
+    loadClustersWritable(output);
+  }
+  
+  // Render sample points and superimpose the per-iteration cluster ellipses.
+  @Override
+  public void paint(Graphics g) {
+    Graphics2D g2 = (Graphics2D) g;
+    plotSampleData(g2);
+    plotClusters(g2);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
new file mode 100644
index 0000000..336d69e
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Displays the generated 2-d sample points together with the k-means
+ * clusters fitted to them over successive iterations.
+ */
+public class DisplayKMeans extends DisplayClustering {
+  
+  DisplayKMeans() {
+    initialize();
+    this.setTitle("k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
+  }
+  
+  /**
+   * Generates sample data, runs sequential k-means over it, and opens
+   * the display window.
+   */
+  public static void main(String[] args) throws Exception {
+    DistanceMeasure measure = new ManhattanDistanceMeasure();
+    Path samplesPath = new Path("samples");
+    Path outputPath = new Path("output");
+    Configuration conf = new Configuration();
+    HadoopUtil.delete(conf, samplesPath);
+    HadoopUtil.delete(conf, outputPath);
+    
+    // fixed seed so each run produces the same picture
+    RandomUtils.useTestSeed();
+    generateSamples();
+    writeSampleData(samplesPath);
+    
+    double convergenceDelta = 0.001;
+    int numClusters = 3;
+    int maxIterations = 10;
+    boolean runClusterer = true;
+    if (runClusterer) {
+      runSequentialKMeansClusterer(conf, samplesPath, outputPath, measure, numClusters, maxIterations,
+          convergenceDelta);
+    } else {
+      runSequentialKMeansClassifier(conf, samplesPath, outputPath, measure, numClusters, maxIterations,
+          convergenceDelta);
+    }
+    new DisplayKMeans();
+  }
+  
+  /** Seed one cluster per initial sample and iterate via ClusterIterator. */
+  private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output,
+      DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException {
+    List<Cluster> initialClusters = Lists.newArrayList();
+    for (int id = 0; id < numClusters; id++) {
+      initialClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(SAMPLE_DATA.get(id).get(), id, measure));
+    }
+    ClusterClassifier prior = new ClusterClassifier(initialClusters, new KMeansClusteringPolicy(convergenceDelta));
+    Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
+    prior.writeToSeqFiles(priorPath);
+    
+    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
+    loadClustersWritable(output);
+  }
+  
+  /** Seed random clusters and run the sequential KMeansDriver. */
+  private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
+    DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    Path clustersIn = new Path(output, "random-seeds");
+    RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure);
+    KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true);
+    loadClustersWritable(output);
+  }
+  
+  // Render sample points and superimpose the per-iteration cluster ellipses.
+  @Override
+  public void paint(Graphics g) {
+    Graphics2D g2 = (Graphics2D) g;
+    plotSampleData(g2);
+    plotClusters(g2);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
new file mode 100644
index 0000000..2b70749
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.Writer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+
+/**
+ * Displays the spectral k-means clustering of the generated 2-d sample
+ * points, coloring each point by its assigned cluster.
+ */
+public class DisplaySpectralKMeans extends DisplayClustering {
+
+  protected static final String SAMPLES = "samples";
+  protected static final String OUTPUT = "output";
+  protected static final String TEMP = "tmp";
+  protected static final String AFFINITIES = "affinities";
+
+  DisplaySpectralKMeans() {
+    initialize();
+    setTitle("Spectral k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
+  }
+
+  /**
+   * Generates samples, writes a dense pairwise-affinity file, runs the
+   * spectral k-means driver, and opens the display window.
+   */
+  public static void main(String[] args) throws Exception {
+    DistanceMeasure measure = new ManhattanDistanceMeasure();
+    Path samples = new Path(SAMPLES);
+    Path output = new Path(OUTPUT);
+    Path tempDir = new Path(TEMP);
+    Configuration conf = new Configuration();
+    HadoopUtil.delete(conf, samples);
+    HadoopUtil.delete(conf, output);
+
+    // fixed seed so each run produces the same picture
+    RandomUtils.useTestSeed();
+    DisplayClustering.generateSamples();
+    writeSampleData(samples);
+    Path affinities = new Path(output, AFFINITIES);
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    if (!fs.exists(output)) {
+      fs.mkdirs(output);
+    }
+
+    // Write the full n x n distance matrix as "i,j,distance" lines.
+    // NOTE(review): FileWriter writes via the local filesystem using the
+    // Path's string form, while the directory above was created through the
+    // Hadoop FileSystem API — this only lines up when fs is the local FS;
+    // confirm before running against HDFS.
+    try (Writer writer = new BufferedWriter(new FileWriter(affinities.toString()))){
+      for (int i = 0; i < SAMPLE_DATA.size(); i++) {
+        for (int j = 0; j < SAMPLE_DATA.size(); j++) {
+          writer.write(i + "," + j + ',' + measure.distance(SAMPLE_DATA.get(i).get(),
+              SAMPLE_DATA.get(j).get()) + '\n');
+        }
+      }
+    }
+
+    int maxIter = 10;
+    double convergenceDelta = 0.001;
+    SpectralKMeansDriver.run(new Configuration(), affinities, output, SAMPLE_DATA.size(), 3, measure,
+        convergenceDelta, maxIter, tempDir);
+    new DisplaySpectralKMeans();
+  }
+
+  // Color points by cluster membership instead of drawing cluster ellipses.
+  @Override
+  public void paint(Graphics g) {
+    plotClusteredSampleData((Graphics2D) g, new Path(new Path(OUTPUT), "kmeans_out"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt
new file mode 100644
index 0000000..470c16c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt
@@ -0,0 +1,22 @@
+The following classes can be run without parameters to generate a sample data set and 
+run the reference clustering implementations over them:
+
+DisplayClustering - generates 1100 samples from three symmetric distributions. This is the same 
+    data set that is used by the following clustering programs. It displays the points on a screen
+    and superimposes the model parameters that were used to generate the points. You can edit the
+    generateSamples() method to change the sample points used by these programs.
+    
+  * DisplayCanopy - uses Canopy clustering
+  * DisplayKMeans - uses k-Means clustering
+  * DisplayFuzzyKMeans - uses Fuzzy k-Means clustering
+  
+  * NOTE: some of these programs display the sample points and then superimpose all of the clusters
+    from each iteration. The last iteration's clusters are in bold red and the previous several are 
+    colored (orange, yellow, green, blue, violet) in order after which all earlier clusters are in
+    light grey. This helps to visualize how the clusters converge upon a solution over multiple
+    iterations.
+  * NOTE: by changing the parameter values (k, ALPHA_0, numIterations) and the display SIGNIFICANCE
+    you can obtain different results.
+    
+  
+    
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
new file mode 100644
index 0000000..c29cbc4
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.List;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+public class ClusterQualitySummarizer extends AbstractJob {
+  private String outputFile;
+
+  private PrintWriter fileOut;
+
+  private String trainFile;
+  private String testFile;
+  private String centroidFile;
+  private String centroidCompareFile;
+  private boolean mahoutKMeansFormat;
+  private boolean mahoutKMeansFormatCompare;
+
+  private DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure();
+
+  /**
+   * Print per-cluster distance summaries to stdout and to this job's CSV
+   * output writer.
+   *
+   * @param summarizers one summarizer per cluster
+   * @param type tag written into the CSV "is.train" column (e.g. "train"/"test")
+   */
+  public void printSummaries(List<OnlineSummarizer> summarizers, String type) {
+    printSummaries(summarizers, type, fileOut);
+  }
+
+  /**
+   * Print per-cluster distance summaries to stdout and, when
+   * {@code fileOut} is non-null, append one CSV row per cluster.
+   *
+   * @param summarizers one summarizer per cluster
+   * @param type tag written into the CSV "is.train" column (e.g. "train"/"test")
+   * @param fileOut CSV destination; may be null to skip file output
+   */
+  public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) {
+    double maxDistance = 0;
+    for (int i = 0; i < summarizers.size(); ++i) {
+      OnlineSummarizer summarizer = summarizers.get(i);
+      if (summarizer.getCount() > 1) {
+        maxDistance = Math.max(maxDistance, summarizer.getMax());
+        System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
+        if (fileOut != null) {
+          fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
+              summarizer.getSD(),
+              summarizer.getQuartile(0),
+              summarizer.getQuartile(1),
+              summarizer.getQuartile(2),
+              summarizer.getQuartile(3),
+              summarizer.getQuartile(4), summarizer.getCount(), type);
+        }
+      } else {
+        // Quartiles cannot be estimated from fewer than two points, so such
+        // clusters are reported but not summarized.
+        System.out.printf("Cluster %d has %d data point. Need at least 2 data points in a cluster for" +
+            " OnlineSummarizer.\n", i, summarizer.getCount());
+      }
+    }
+    System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
+  }
+
+  /**
+   * Parse arguments, read centroids and data points from sequence files,
+   * write per-cluster distance summaries as CSV, and print cluster-quality
+   * metrics (Dunn and Davies-Bouldin indexes) to stdout.
+   *
+   * @param args command-line arguments; see parseArgs for the option set
+   * @return 0 on success, -1 when argument parsing fails
+   * @throws IOException declared but handled internally; I/O errors are
+   *           printed rather than propagated
+   */
+  public int run(String[] args) throws IOException {
+    if (!parseArgs(args)) {
+      return -1;
+    }
+
+    Configuration conf = new Configuration();
+    try {
+      // CSV header for the per-cluster summary rows written below.
+      fileOut = new PrintWriter(new FileOutputStream(outputFile));
+      fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
+          + "distance.q4,count,is.train\n");
+
+      // Reading in the centroids (both pairs, if they exist). The on-disk
+      // format depends on whether the centroids came from Mahout KMeans
+      // (ClusterWritable) or StreamingKMeans (CentroidWritable).
+      List<Centroid> centroids;
+      List<Centroid> centroidsCompare = null;
+      if (mahoutKMeansFormat) {
+        SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
+            new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
+        centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
+      } else {
+        SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
+            new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
+        centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
+      }
+
+      // Optional second centroid set for side-by-side comparison.
+      if (centroidCompareFile != null) {
+        if (mahoutKMeansFormatCompare) {
+          SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
+              new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
+          centroidsCompare = Lists.newArrayList(
+              IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
+        } else {
+          SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
+              new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
+          centroidsCompare = Lists.newArrayList(
+              IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
+        }
+      }
+
+      // Reading in the "training" set.
+      SequenceFileDirValueIterable<VectorWritable> trainIterable =
+          new SequenceFileDirValueIterable<>(new Path(trainFile), PathType.GLOB, conf);
+      Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
+      Iterable<Vector> datapoints = trainDatapoints;
+
+      printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
+          new SquaredEuclideanDistanceMeasure()), "train");
+
+      // Also adding in the "test" set.
+      if (testFile != null) {
+        SequenceFileDirValueIterable<VectorWritable> testIterable =
+            new SequenceFileDirValueIterable<>(new Path(testFile), PathType.GLOB, conf);
+        Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
+
+        printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
+            new SquaredEuclideanDistanceMeasure()), "test");
+
+        datapoints = Iterables.concat(trainDatapoints, testDatapoints);
+      }
+
+      // At this point, all train/test CSVs have been written. We now compute quality metrics.
+      List<OnlineSummarizer> summaries =
+          ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
+      List<OnlineSummarizer> compareSummaries = null;
+      if (centroidsCompare != null) {
+        compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
+      }
+      // Dunn index: higher is better; Davies-Bouldin: lower is better.
+      System.out.printf("[Dunn Index] First: %f", ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
+      if (compareSummaries != null) {
+        System.out.printf(" Second: %f\n", ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
+      } else {
+        System.out.printf("\n");
+      }
+      System.out.printf("[Davies-Bouldin Index] First: %f",
+          ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
+      if (compareSummaries != null) {
+        System.out.printf(" Second: %f\n",
+          ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
+      } else {
+        System.out.printf("\n");
+      }
+    } catch (IOException e) {
+      System.out.println(e.getMessage());
+    } finally {
+      // Closes quietly even when fileOut was never assigned (it is null then).
+      Closeables.close(fileOut, false);
+    }
+    return 0;
+  }
+
+  private boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+        .withShortName("i")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+        .withDescription("where to get seq files with the vectors (training set)")
+        .create();
+
+    Option testInputFileOption = builder.withLongName("testInput")
+        .withShortName("itest")
+        .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
+        .withDescription("where to get seq files with the vectors (test set)")
+        .create();
+
+    Option centroidsFileOption = builder.withLongName("centroids")
+        .withShortName("c")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
+        .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
+        .create();
+
+    Option centroidsCompareFileOption = builder.withLongName("centroidsCompare")
+        .withShortName("cc")
+        .withRequired(false)
+        .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
+        .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or "
+            + "StreamingKMeansDriver)")
+        .create();
+
+    Option outputFileOption = builder.withLongName("output")
+        .withShortName("o")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+        .withDescription("where to dump the CSV file with the results")
+        .create();
+
+    Option mahoutKMeansFormatOption = builder.withLongName("mahoutkmeansformat")
+        .withShortName("mkm")
+        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
+        .create();
+
+    Option mahoutKMeansCompareFormatOption = builder.withLongName("mahoutkmeansformatCompare")
+        .withShortName("mkmc")
+        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
+        .create();
+
+    Group normalArgs = new GroupBuilder()
+        .withOption(help)
+        .withOption(inputFileOption)
+        .withOption(testInputFileOption)
+        .withOption(outputFileOption)
+        .withOption(centroidsFileOption)
+        .withOption(centroidsCompareFileOption)
+        .withOption(mahoutKMeansFormatOption)
+        .withOption(mahoutKMeansCompareFormatOption)
+        .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));
+
+    CommandLine cmdLine = parser.parseAndHelp(args);
+    if (cmdLine == null) {
+      return false;
+    }
+
+    trainFile = (String) cmdLine.getValue(inputFileOption);
+    if (cmdLine.hasOption(testInputFileOption)) {
+      testFile = (String) cmdLine.getValue(testInputFileOption);
+    }
+    centroidFile = (String) cmdLine.getValue(centroidsFileOption);
+    if (cmdLine.hasOption(centroidsCompareFileOption)) {
+      centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
+    }
+    outputFile = (String) cmdLine.getValue(outputFileOption);
+    if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
+      mahoutKMeansFormat = true;
+    }
+    if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
+      mahoutKMeansFormatCompare = true;
+    }
+    return true;
+  }
+
  // CLI entry point: delegates to run(), which parses arguments, reads centroid and
  // vector sequence files, and writes the cluster-quality summary CSV.
  public static void main(String[] args) throws IOException {
    new ClusterQualitySummarizer().run(args);
  }
+}


[26/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
new file mode 100644
index 0000000..da318d5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+/**
+ * Implementations of this interface determine the items that are considered relevant,
+ * and splits data into a training and test subset, for purposes of precision/recall
+ * tests as implemented by implementations of {@link RecommenderIRStatsEvaluator}.
+ */
public interface RelevantItemsDataSplitter {

  /**
   * During testing, relevant items are removed from a particular users' preferences,
   * and a model is built using this user's other preferences and all other users.
   *
   * @param userID             ID of the user whose relevant items are being selected
   * @param at                 Maximum number of items to be removed
   * @param relevanceThreshold Minimum strength of preference for an item to be considered
   *                           relevant
   * @param dataModel          source of the preference data
   * @return IDs of relevant items
   */
  FastIDSet getRelevantItemsIDs(long userID,
                                int at,
                                double relevanceThreshold,
                                DataModel dataModel) throws TasteException;

  /**
   * Adds a single user and all their preferences to the training model.
   *
   * @param userID          ID of user whose preferences we are trying to predict
   * @param relevantItemIDs IDs of items considered relevant to that user
   * @param trainingUsers   the database of training preferences to which we will
   *                        append the ones for otherUserID.
   * @param otherUserID     for whom we are adding preferences to the training model
   * @param dataModel       source of the preference data to copy from
   */
  void processOtherUser(long userID,
                        FastIDSet relevantItemIDs,
                        FastByIDMap<PreferenceArray> trainingUsers,
                        long otherUserID,
                        DataModel dataModel) throws TasteException;

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java
new file mode 100644
index 0000000..e70a675
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import com.google.common.primitives.Longs;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.mahout.math.Varint;
+
+/** A {@link WritableComparable} encapsulating two items. */
+public final class EntityEntityWritable implements WritableComparable<EntityEntityWritable>, Cloneable {
+  
+  private long aID;
+  private long bID;
+  
+  public EntityEntityWritable() {
+  // do nothing
+  }
+  
+  public EntityEntityWritable(long aID, long bID) {
+    this.aID = aID;
+    this.bID = bID;
+  }
+  
+  long getAID() {
+    return aID;
+  }
+  
+  long getBID() {
+    return bID;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    Varint.writeSignedVarLong(aID, out);
+    Varint.writeSignedVarLong(bID, out);
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    aID = Varint.readSignedVarLong(in);
+    bID = Varint.readSignedVarLong(in);
+  }
+  
+  @Override
+  public int compareTo(EntityEntityWritable that) {
+    int aCompare = compare(aID, that.getAID());
+    return aCompare == 0 ? compare(bID, that.getBID()) : aCompare;
+  }
+  
+  private static int compare(long a, long b) {
+    return a < b ? -1 : a > b ? 1 : 0;
+  }
+  
+  @Override
+  public int hashCode() {
+    return Longs.hashCode(aID) + 31 * Longs.hashCode(bID);
+  }
+  
+  @Override
+  public boolean equals(Object o) {
+    if (o instanceof EntityEntityWritable) {
+      EntityEntityWritable that = (EntityEntityWritable) o;
+      return aID == that.getAID() && bID == that.getBID();
+    }
+    return false;
+  }
+  
+  @Override
+  public String toString() {
+    return aID + "\t" + bID;
+  }
+
+  @Override
+  public EntityEntityWritable clone() {
+    return new EntityEntityWritable(aID, bID);
+  }
+  
+}
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
new file mode 100644
index 0000000..2aab63c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.VarLongWritable;
+
+/** A {@link org.apache.hadoop.io.Writable} encapsulating an item ID and a preference value. */
+public final class EntityPrefWritable extends VarLongWritable implements Cloneable {
+  
+  private float prefValue;
+  
+  public EntityPrefWritable() {
+    // do nothing
+  }
+  
+  public EntityPrefWritable(long itemID, float prefValue) {
+    super(itemID);
+    this.prefValue = prefValue;
+  }
+  
+  public EntityPrefWritable(EntityPrefWritable other) {
+    this(other.get(), other.getPrefValue());
+  }
+
+  public long getID() {
+    return get();
+  }
+
+  public float getPrefValue() {
+    return prefValue;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    super.write(out);
+    out.writeFloat(prefValue);
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    super.readFields(in);
+    prefValue = in.readFloat();
+  }
+
+  @Override
+  public int hashCode() {
+    return super.hashCode() ^ RandomUtils.hashFloat(prefValue);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof EntityPrefWritable)) {
+      return false;
+    }
+    EntityPrefWritable other = (EntityPrefWritable) o;
+    return get() == other.get() && prefValue == other.getPrefValue();
+  }
+
+  @Override
+  public String toString() {
+    return get() + "\t" + prefValue;
+  }
+
+  @Override
+  public EntityPrefWritable clone() {
+    return new EntityPrefWritable(get(), prefValue);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java
new file mode 100644
index 0000000..3de272d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Mutable variant of {@link RecommendedItem}
+ */
+public class MutableRecommendedItem implements RecommendedItem {
+
+  private long itemID;
+  private float value;
+
+  public MutableRecommendedItem() {}
+
+  public MutableRecommendedItem(long itemID, float value) {
+    this.itemID = itemID;
+    this.value = value;
+  }
+
+  @Override
+  public long getItemID() {
+    return itemID;
+  }
+
+  @Override
+  public float getValue() {
+    return value;
+  }
+
+  public void setItemID(long itemID) {
+    this.itemID = itemID;
+  }
+
+  public void set(long itemID, float value) {
+    this.itemID = itemID;
+    this.value = value;
+  }
+
+  public void capToMaxValue(float maxValue) {
+    if (value > maxValue) {
+      value = maxValue;
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "MutableRecommendedItem[item:" + itemID + ", value:" + value + ']';
+  }
+
+  @Override
+  public int hashCode() {
+    return (int) itemID ^ RandomUtils.hashFloat(value);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof MutableRecommendedItem)) {
+      return false;
+    }
+    RecommendedItem other = (RecommendedItem) o;
+    return itemID == other.getItemID() && value == other.getValue();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java
new file mode 100644
index 0000000..bc832aa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.math.Varint;
+
+/**
+ * A {@link Writable} which encapsulates a list of {@link RecommendedItem}s. This is the mapper (and reducer)
+ * output, and represents items recommended to a user. The first item is the one whose estimated preference is
+ * highest.
+ */
+public final class RecommendedItemsWritable implements Writable {
+
+  private List<RecommendedItem> recommended;
+  
+  public RecommendedItemsWritable() {
+  // do nothing
+  }
+  
+  public RecommendedItemsWritable(List<RecommendedItem> recommended) {
+    this.recommended = recommended;
+  }
+  
+  public List<RecommendedItem> getRecommendedItems() {
+    return recommended;
+  }
+
+  public void set(List<RecommendedItem> recommended) {
+    this.recommended = recommended;
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(recommended.size());
+    for (RecommendedItem item : recommended) {
+      Varint.writeSignedVarLong(item.getItemID(), out);
+      out.writeFloat(item.getValue());
+    }
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int size = in.readInt();
+    recommended = new ArrayList<>(size);
+    for (int i = 0; i < size; i++) {
+      long itemID = Varint.readSignedVarLong(in);
+      float value = in.readFloat();
+      RecommendedItem recommendedItem = new GenericRecommendedItem(itemID, value);
+      recommended.add(recommendedItem);
+    }
+  }
+  
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder(200);
+    result.append('[');
+    boolean first = true;
+    for (RecommendedItem item : recommended) {
+      if (first) {
+        first = false;
+      } else {
+        result.append(',');
+      }
+      result.append(String.valueOf(item.getItemID()));
+      result.append(':');
+      result.append(String.valueOf(item.getValue()));
+    }
+    result.append(']');
+    return result.toString();
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java
new file mode 100644
index 0000000..e3fab29
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import com.google.common.primitives.Longs;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+
+import java.util.regex.Pattern;
+
+/**
+ * Some helper methods for the hadoop-related stuff in org.apache.mahout.cf.taste
+ */
+public final class TasteHadoopUtils {
+
+  public static final int USER_ID_POS = 0;
+  public static final int ITEM_ID_POS = 1;
+
+  /** Standard delimiter of textual preference data */
+  private static final Pattern PREFERENCE_TOKEN_DELIMITER = Pattern.compile("[\t,]");
+
+  private TasteHadoopUtils() {}
+
+  /**
+   * Splits a preference data line into string tokens
+   */
+  public static String[] splitPrefTokens(CharSequence line) {
+    return PREFERENCE_TOKEN_DELIMITER.split(line);
+  }
+
+  /**
+   * Maps a long to an int with range of 0 to Integer.MAX_VALUE-1
+   */
+  public static int idToIndex(long id) {
+    return 0x7FFFFFFF & Longs.hashCode(id) % 0x7FFFFFFE;
+  }
+
+  public static int readID(String token, boolean usesLongIDs) {
+    return usesLongIDs ? idToIndex(Long.parseLong(token)) : Integer.parseInt(token);
+  }
+
+  /**
+   * Reads a binary mapping file
+   */
+  public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
+    OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap();
+    Path itemIDIndexPath = new Path(idIndexPathStr);
+    for (Pair<VarIntWritable,VarLongWritable> record
+         : new SequenceFileDirIterable<VarIntWritable,VarLongWritable>(itemIDIndexPath,
+                                                                       PathType.LIST,
+                                                                       PathFilters.partFilter(),
+                                                                       null,
+                                                                       true,
+                                                                       conf)) {
+      indexIDMap.put(record.getFirst().get(), record.getSecond().get());
+    }
+    return indexIDMap;
+  }
+
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java
new file mode 100644
index 0000000..fdb552e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
+import org.apache.mahout.math.VarLongWritable;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
/**
 * Base mapper that parses delimited text preference lines ({@code userID,itemID[,pref]})
 * into ID/preference pairs, optionally swapping the user/item roles.
 */
public abstract class ToEntityPrefsMapper extends
    Mapper<LongWritable,Text, VarLongWritable,VarLongWritable> {

  // NOTE(review): these keys concatenate the Class object's toString() with the suffix
  // (yielding e.g. "class org.apache...ToEntityPrefsMappertransposeUserItem"). Looks
  // accidental, but existing job configurations depend on the exact key -- confirm
  // before cleaning up.
  public static final String TRANSPOSE_USER_ITEM = ToEntityPrefsMapper.class + "transposeUserItem";
  public static final String RATING_SHIFT = ToEntityPrefsMapper.class + "shiftRatings";

  // Accepted field delimiters: tab or comma.
  private static final Pattern DELIMITER = Pattern.compile("[\t,]");

  private boolean booleanData;   // if true, emit item IDs only (no preference values)
  private boolean transpose;     // if true, swap user and item IDs
  private final boolean itemKey; // if true, the subclass keys output by item rather than user
  private float ratingShift;     // added to every explicitly-parsed preference value

  ToEntityPrefsMapper(boolean itemKey) {
    this.itemKey = itemKey;
  }

  @Override
  protected void setup(Context context) {
    Configuration jobConf = context.getConfiguration();
    booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    transpose = jobConf.getBoolean(TRANSPOSE_USER_ITEM, false);
    ratingShift = Float.parseFloat(jobConf.get(RATING_SHIFT, "0.0"));
  }

  @Override
  public void map(LongWritable key,
                  Text value,
                  Context context) throws IOException, InterruptedException {
    String[] tokens = DELIMITER.split(value.toString());
    long userID = Long.parseLong(tokens[0]);
    long itemID = Long.parseLong(tokens[1]);
    if (itemKey ^ transpose) {
      // If using items as keys, and not transposing items and users, then users are items!
      // Or if not using items as keys (users are, as usual), but transposing items and users,
      // then users are items! Confused?
      long temp = userID;
      userID = itemID;
      itemID = temp;
    }
    if (booleanData) {
      context.write(new VarLongWritable(userID), new VarLongWritable(itemID));
    } else {
      // A missing third token means an implicit preference of 1.0 (ratingShift not applied).
      float prefValue = tokens.length > 2 ? Float.parseFloat(tokens[2]) + ratingShift : 1.0f;
      context.write(new VarLongWritable(userID), new EntityPrefWritable(itemID, prefValue));
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java
new file mode 100644
index 0000000..f5f9574
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+/**
+ * <h1>Input</h1>
+ *
+ * <p>
+ * Intended for use with {@link org.apache.hadoop.mapreduce.lib.input.TextInputFormat};
+ * accepts line number / line pairs as
+ * {@link org.apache.hadoop.io.LongWritable}/{@link org.apache.hadoop.io.Text} pairs.
+ * </p>
+ *
+ * <p>
+ * Each line is assumed to be of the form {@code userID,itemID,preference}, or {@code userID,itemID}.
+ * </p>
+ *
+ * <h1>Output</h1>
+ *
+ * <p>
+ * Outputs the user ID as a {@link org.apache.mahout.math.VarLongWritable} mapped to the item ID and preference as a
+ * {@link EntityPrefWritable}.
+ * </p>
+ */
public final class ToItemPrefsMapper extends ToEntityPrefsMapper {

  public ToItemPrefsMapper() {
    super(false); // false: output is keyed by user ID, not item ID
  }
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java
new file mode 100644
index 0000000..8f563b0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+public class TopItemsQueue extends PriorityQueue<MutableRecommendedItem> {
+
+  private static final long SENTINEL_ID = Long.MIN_VALUE;
+
+  private final int maxSize;
+
+  public TopItemsQueue(int maxSize) {
+    super(maxSize);
+    this.maxSize = maxSize;
+  }
+
+  public List<RecommendedItem> getTopItems() {
+    List<RecommendedItem> recommendedItems = new ArrayList<>(maxSize);
+    while (size() > 0) {
+      MutableRecommendedItem topItem = pop();
+      // filter out "sentinel" objects necessary for maintaining an efficient priority queue
+      if (topItem.getItemID() != SENTINEL_ID) {
+        recommendedItems.add(topItem);
+      }
+    }
+    Collections.reverse(recommendedItems);
+    return recommendedItems;
+  }
+
+  @Override
+  protected boolean lessThan(MutableRecommendedItem one, MutableRecommendedItem two) {
+    return one.getValue() < two.getValue();
+  }
+
+  @Override
+  protected MutableRecommendedItem getSentinelObject() {
+    return new MutableRecommendedItem(SENTINEL_ID, Float.MIN_VALUE);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java
new file mode 100644
index 0000000..4bb95ae
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+
+final class ALS {
+
+  private ALS() {}
+
+  static Vector readFirstRow(Path dir, Configuration conf) throws IOException {
+    Iterator<VectorWritable> iterator = new SequenceFileDirValueIterator<>(dir, PathType.LIST,
+        PathFilters.partFilter(), null, true, conf);
+    return iterator.hasNext() ? iterator.next().get() : null;
+  }
+
+  public static OpenIntObjectHashMap<Vector> readMatrixByRowsFromDistributedCache(int numEntities,
+      Configuration conf) throws IOException {
+
+    IntWritable rowIndex = new IntWritable();
+    VectorWritable row = new VectorWritable();
+
+
+    OpenIntObjectHashMap<Vector> featureMatrix = numEntities > 0
+        ? new OpenIntObjectHashMap<Vector>(numEntities) : new OpenIntObjectHashMap<Vector>();
+
+    Path[] cachedFiles = HadoopUtil.getCachedFiles(conf);
+    LocalFileSystem localFs = FileSystem.getLocal(conf);
+
+    for (Path cachedFile : cachedFiles) {
+      try (SequenceFile.Reader reader = new SequenceFile.Reader(localFs.getConf(), SequenceFile.Reader.file(cachedFile))) {
+        while (reader.next(rowIndex, row)) {
+          featureMatrix.put(rowIndex.get(), row.get());
+        }
+      }
+    }
+
+    Preconditions.checkState(!featureMatrix.isEmpty(), "Feature matrix is empty");
+    return featureMatrix;
+  }
+
+  public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Configuration conf) {
+    OpenIntObjectHashMap<Vector> matrix = new OpenIntObjectHashMap<>();
+    for (Pair<IntWritable,VectorWritable> pair
+        : new SequenceFileDirIterable<IntWritable,VectorWritable>(dir, PathType.LIST, PathFilters.partFilter(), conf)) {
+      int rowIndex = pair.getFirst().get();
+      Vector row = pair.getSecond().get();
+      matrix.put(rowIndex, row);
+    }
+    return matrix;
+  }
+
+  public static Vector solveExplicit(VectorWritable ratingsWritable, OpenIntObjectHashMap<Vector> uOrM,
+    double lambda, int numFeatures) {
+    Vector ratings = ratingsWritable.get();
+
+    List<Vector> featureVectors = new ArrayList<>(ratings.getNumNondefaultElements());
+    for (Vector.Element e : ratings.nonZeroes()) {
+      int index = e.index();
+      featureVectors.add(uOrM.get(index));
+    }
+
+    return AlternatingLeastSquaresSolver.solve(featureVectors, ratings, lambda, numFeatures);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
new file mode 100644
index 0000000..b061a63
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.RandomUtils;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+/**
+ * <p>Split a recommendation dataset into a training and a test set</p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing one or more text files with the dataset</li>
+ * <li>--output (path): path where output should go</li>
+ * <li>--trainingPercentage (double): percentage of the data to use as training set (optional, default 0.9)</li>
+ * <li>--probePercentage (double): percentage of the data to use as probe set (optional, default 0.1)</li>
+ * </ol>
+ */
public class DatasetSplitter extends AbstractJob {

  // Configuration keys used to hand the split percentages and the selected
  // partition marker down to the mapper tasks.
  private static final String TRAINING_PERCENTAGE = DatasetSplitter.class.getName() + ".trainingPercentage";
  private static final String PROBE_PERCENTAGE = DatasetSplitter.class.getName() + ".probePercentage";
  private static final String PART_TO_USE = DatasetSplitter.class.getName() + ".partToUse";

  // Marker keys emitted by MarkPreferencesMapper: "T" = training set, "P" = probe set.
  private static final Text INTO_TRAINING_SET = new Text("T");
  private static final Text INTO_PROBE_SET = new Text("P");

  private static final double DEFAULT_TRAINING_PERCENTAGE = 0.9;
  private static final double DEFAULT_PROBE_PERCENTAGE = 0.1;

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new DatasetSplitter(), args);
  }

  /**
   * Runs three chained MapReduce jobs: (1) randomly tag every input line with a
   * training ("T") or probe ("P") marker, (2) extract the training lines into
   * {@code <output>/trainingSet}, (3) extract the probe lines into
   * {@code <output>/probeSet}.
   *
   * @return 0 on success, -1 when argument parsing or any of the jobs fails
   */
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("trainingPercentage", "t", "percentage of the data to use as training set (default: "
        + DEFAULT_TRAINING_PERCENTAGE + ')', String.valueOf(DEFAULT_TRAINING_PERCENTAGE));
    addOption("probePercentage", "p", "percentage of the data to use as probe set (default: "
        + DEFAULT_PROBE_PERCENTAGE + ')', String.valueOf(DEFAULT_PROBE_PERCENTAGE));

    Map<String,List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    double trainingPercentage = Double.parseDouble(getOption("trainingPercentage"));
    double probePercentage = Double.parseDouble(getOption("probePercentage"));
    String tempDir = getOption("tempDir");

    // Intermediate output of job 1; input of jobs 2 and 3.
    Path markedPrefs = new Path(tempDir, "markedPreferences");
    Path trainingSetPath = new Path(getOutputPath(), "trainingSet");
    Path probeSetPath = new Path(getOutputPath(), "probeSet");

    // Job 1: tag each line as training or probe according to the percentages.
    Job markPreferences = prepareJob(getInputPath(), markedPrefs, TextInputFormat.class, MarkPreferencesMapper.class,
        Text.class, Text.class, SequenceFileOutputFormat.class);
    markPreferences.getConfiguration().set(TRAINING_PERCENTAGE, String.valueOf(trainingPercentage));
    markPreferences.getConfiguration().set(PROBE_PERCENTAGE, String.valueOf(probePercentage));
    boolean succeeded = markPreferences.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    // Job 2: write out only the lines tagged "T".
    Job createTrainingSet = prepareJob(markedPrefs, trainingSetPath, SequenceFileInputFormat.class,
        WritePrefsMapper.class, NullWritable.class, Text.class, TextOutputFormat.class);
    createTrainingSet.getConfiguration().set(PART_TO_USE, INTO_TRAINING_SET.toString());
    succeeded = createTrainingSet.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    // Job 3: write out only the lines tagged "P".
    Job createProbeSet = prepareJob(markedPrefs, probeSetPath, SequenceFileInputFormat.class,
        WritePrefsMapper.class, NullWritable.class, Text.class, TextOutputFormat.class);
    createProbeSet.getConfiguration().set(PART_TO_USE, INTO_PROBE_SET.toString());
    succeeded = createProbeSet.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    return 0;
  }

  /**
   * Tags each input line with "T" or "P" by drawing a uniform random number.
   * Lines falling beyond trainingPercentage + probePercentage are dropped
   * (when the two percentages do not sum to 1).
   */
  static class MarkPreferencesMapper extends Mapper<LongWritable,Text,Text,Text> {

    private Random random;
    private double trainingBound;   // upper bound for the training bucket
    private double probeBound;      // cumulative upper bound for the probe bucket

    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
      random = RandomUtils.getRandom();
      trainingBound = Double.parseDouble(ctx.getConfiguration().get(TRAINING_PERCENTAGE));
      probeBound = trainingBound + Double.parseDouble(ctx.getConfiguration().get(PROBE_PERCENTAGE));
    }

    @Override
    protected void map(LongWritable key, Text text, Context ctx) throws IOException, InterruptedException {
      double randomValue = random.nextDouble();
      if (randomValue <= trainingBound) {
        ctx.write(INTO_TRAINING_SET, text);
      } else if (randomValue <= probeBound) {
        ctx.write(INTO_PROBE_SET, text);
      }
    }
  }

  /** Passes through only the lines whose marker key matches the configured partition. */
  static class WritePrefsMapper extends Mapper<Text,Text,NullWritable,Text> {

    private String partToUse;

    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
      partToUse = ctx.getConfiguration().get(PART_TO_USE);
    }

    @Override
    protected void map(Text key, Text text, Context ctx) throws IOException, InterruptedException {
      if (partToUse.equals(key.toString())) {
        ctx.write(NullWritable.get(), text);
      }
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
new file mode 100644
index 0000000..4e6aaf5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+
+/**
+ * <p>Measures the root-mean-squared error of a rating matrix factorization against a test set.</p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--output (path): path where output should go</li>
+ * <li>--pairs (path): path containing the test ratings, each line must be userID,itemID,rating</li>
+ * <li>--userFeatures (path): path to the user feature matrix</li>
+ * <li>--itemFeatures (path): path to the item feature matrix</li>
+ * </ol>
+ */
public class FactorizationEvaluator extends AbstractJob {

  // NOTE(review): keys are namespaced under RecommenderJob (not this class) —
  // presumably intentional so the mappers can be shared; confirm.
  private static final String USER_FEATURES_PATH = RecommenderJob.class.getName() + ".userFeatures";
  private static final String ITEM_FEATURES_PATH = RecommenderJob.class.getName() + ".itemFeatures";

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new FactorizationEvaluator(), args);
  }

  /**
   * Runs one MapReduce job that computes the prediction error for every test
   * rating, then aggregates the errors into a single RMSE value written to
   * {@code <output>/rmse.txt}.
   *
   * @return 0 on success, -1 when argument parsing or the job fails
   */
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOption("userFeatures", null, "path to the user feature matrix", true);
    addOption("itemFeatures", null, "path to the item feature matrix", true);
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
    addOutputOption();

    Map<String,List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    // Per-rating (rating - estimate) differences land here as DoubleWritables.
    Path errors = getTempPath("errors");

    Job predictRatings = prepareJob(getInputPath(), errors, TextInputFormat.class, PredictRatingsMapper.class,
        DoubleWritable.class, NullWritable.class, SequenceFileOutputFormat.class);

    Configuration conf = predictRatings.getConfiguration();
    conf.set(USER_FEATURES_PATH, getOption("userFeatures"));
    conf.set(ITEM_FEATURES_PATH, getOption("itemFeatures"));

    // getOption returns null when the flag is absent; parseBoolean(null) is false.
    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs"));
    if (usesLongIDs) {
      conf.set(ParallelALSFactorizationJob.USES_LONG_IDS, String.valueOf(true));
    }


    boolean succeeded = predictRatings.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    // Aggregate the per-rating errors into a single RMSE and write it out.
    FileSystem fs = FileSystem.get(getOutputPath().toUri(), getConf());
    FSDataOutputStream outputStream = fs.create(getOutputPath("rmse.txt"));
    try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charsets.UTF_8))){
      double rmse = computeRmse(errors);
      writer.write(String.valueOf(rmse));
    }
    return 0;
  }

  /** Computes sqrt(mean(error^2)) over all per-rating errors produced by the job. */
  private double computeRmse(Path errors) {
    RunningAverage average = new FullRunningAverage();
    for (Pair<DoubleWritable,NullWritable> entry
        : new SequenceFileDirIterable<DoubleWritable, NullWritable>(errors, PathType.LIST, PathFilters.logsCRCFilter(),
          getConf())) {
      DoubleWritable error = entry.getFirst();
      average.addDatum(error.get() * error.get());
    }

    return Math.sqrt(average.getAverage());
  }

  /**
   * Loads the full U and M feature matrices in setup(), then emits
   * (rating - u·m) for every test rating whose user and item both appear
   * in the factorization; unknown users/items are silently skipped.
   */
  public static class PredictRatingsMapper extends Mapper<LongWritable,Text,DoubleWritable,NullWritable> {

    private OpenIntObjectHashMap<Vector> U;   // user feature vectors, keyed by user index
    private OpenIntObjectHashMap<Vector> M;   // item feature vectors, keyed by item index

    private boolean usesLongIDs;

    // Reused across map() calls to avoid per-record allocation.
    private final DoubleWritable error = new DoubleWritable();

    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
      Configuration conf = ctx.getConfiguration();

      Path pathToU = new Path(conf.get(USER_FEATURES_PATH));
      Path pathToM = new Path(conf.get(ITEM_FEATURES_PATH));

      U = ALS.readMatrixByRows(pathToU, conf);
      M = ALS.readMatrixByRows(pathToM, conf);

      usesLongIDs = conf.getBoolean(ParallelALSFactorizationJob.USES_LONG_IDS, false);
    }

    @Override
    protected void map(LongWritable key, Text value, Context ctx) throws IOException, InterruptedException {

      // Each line is expected to be userID,itemID,rating.
      String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());

      int userID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.USER_ID_POS], usesLongIDs);
      int itemID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.ITEM_ID_POS], usesLongIDs);
      double rating = Double.parseDouble(tokens[2]);

      if (U.containsKey(userID) && M.containsKey(itemID)) {
        double estimate = U.get(userID).dot(M.get(itemID));
        error.set(rating - estimate);
        ctx.write(error, NullWritable.get());
      }
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java
new file mode 100644
index 0000000..d93e3a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import java.io.IOException;
+
+/**
+ * Multithreaded Mapper for {@link SharingMapper}s. Will call setupSharedInstance() once in the controlling thread
+ * before executing the mappers using a thread pool.
+ *
+ * @param <K1>
+ * @param <V1>
+ * @param <K2>
+ * @param <V2>
+ */
+public class MultithreadedSharingMapper<K1, V1, K2, V2> extends MultithreadedMapper<K1, V1, K2, V2> {
+
+  @Override
+  public void run(Context ctx) throws IOException, InterruptedException {
+    Class<Mapper<K1, V1, K2, V2>> mapperClass =
+        MultithreadedSharingMapper.getMapperClass((JobContext) ctx);
+    Preconditions.checkNotNull(mapperClass, "Could not find Multithreaded Mapper class.");
+
+    Configuration conf = ctx.getConfiguration();
+    // instantiate the mapper
+    Mapper<K1, V1, K2, V2> mapper1 = ReflectionUtils.newInstance(mapperClass, conf);
+    SharingMapper<K1, V1, K2, V2, ?> mapper = null;
+    if (mapper1 instanceof SharingMapper) {
+      mapper = (SharingMapper<K1, V1, K2, V2, ?>) mapper1;
+    }
+    Preconditions.checkNotNull(mapper, "Could not instantiate SharingMapper. Class was: %s",
+                               mapper1.getClass().getName());
+
+    // single threaded call to setup the sharing mapper
+    mapper.setupSharedInstance(ctx);
+
+    // multithreaded execution
+    super.run(ctx);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
new file mode 100644
index 0000000..2ce9b61
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
@@ -0,0 +1,414 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.mapreduce.MergeVectorsCombiner;
+import org.apache.mahout.common.mapreduce.MergeVectorsReducer;
+import org.apache.mahout.common.mapreduce.TransposeMapper;
+import org.apache.mahout.common.mapreduce.VectorSumCombiner;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.Vectors;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>MapReduce implementation of the two factorization algorithms described in
+ *
+ * <p>"Large-scale Parallel Collaborative Filtering for the Netflix Prize" available at
+ * http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf.</p>
+ *
+ * <p>"Collaborative Filtering for Implicit Feedback Datasets" available at
+ * http://research.yahoo.com/pub/2433</p>
+ *
+ * </p>
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing one or more text files with the dataset</li>
+ * <li>--output (path): path where output should go</li>
+ * <li>--lambda (double): regularization parameter to avoid overfitting</li>
+ * <li>--userFeatures (path): path to the user feature matrix</li>
+ * <li>--itemFeatures (path): path to the item feature matrix</li>
+ * <li>--numThreadsPerSolver (int): threads to use per solver mapper, (default: 1)</li>
+ * </ol>
+ */
+public class ParallelALSFactorizationJob extends AbstractJob {
+
+  private static final Logger log = LoggerFactory.getLogger(ParallelALSFactorizationJob.class);
+
+  static final String NUM_FEATURES = ParallelALSFactorizationJob.class.getName() + ".numFeatures";
+  static final String LAMBDA = ParallelALSFactorizationJob.class.getName() + ".lambda";
+  static final String ALPHA = ParallelALSFactorizationJob.class.getName() + ".alpha";
+  static final String NUM_ENTITIES = ParallelALSFactorizationJob.class.getName() + ".numEntities";
+
+  static final String USES_LONG_IDS = ParallelALSFactorizationJob.class.getName() + ".usesLongIDs";
+  static final String TOKEN_POS = ParallelALSFactorizationJob.class.getName() + ".tokenPos";
+
+  private boolean implicitFeedback;
+  private int numIterations;
+  private int numFeatures;
+  private double lambda;
+  private double alpha;
+  private int numThreadsPerSolver;
+
+  enum Stats { NUM_USERS }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new ParallelALSFactorizationJob(), args);
+  }
+
  /**
   * Parses the command line and drives the whole ALS pipeline as a sequence of
   * MapReduce jobs: (optionally) index long IDs, build the item-rating matrix A'
   * and the user-rating matrix A, seed an initial item-feature matrix M from the
   * per-item average ratings, then alternate solver runs that recompute U and M
   * for {@code numIterations} rounds.
   *
   * @return 0 on success, -1 if argument parsing fails or a prerequisite job fails
   */
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");

    Map<String,List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      // parseArguments already printed usage/help
      return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    lambda = Double.parseDouble(getOption("lambda"));
    alpha = Double.parseDouble(getOption("alpha"));
    implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));

    numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));

    /*
    * compute the factorization A = U M'
    *
    * where A (users x items) is the matrix of known ratings
    *           U (users x features) is the representation of users in the feature space
    *           M (items x features) is the representation of items in the feature space
    */

    // Optional preprocessing: hash long user/item IDs to ints and persist the
    // index so RecommenderJob can translate them back later.
    if (usesLongIDs) {
      Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
          MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
          VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
      mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
      mapUsers.waitForCompletion(true);

      Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
          MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
          VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
      mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
      mapItems.waitForCompletion(true);
    }

    /* create A' (one row of user ratings per item) */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(),
        TextInputFormat.class, ItemRatingVectorsMapper.class, IntWritable.class,
        VectorWritable.class, VectorSumReducer.class, IntWritable.class,
        VectorWritable.class, SequenceFileOutputFormat.class);
    itemRatings.setCombinerClass(VectorSumCombiner.class);
    itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
    boolean succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    /* create A by transposing A' */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(),
        TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class,
        IntWritable.class, VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    // Collapses A' into a single row vector holding each item's average rating.
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
        AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
        IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageItemRatings.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());

    int numItems = averageRatings.getNumNondefaultElements();
    // NUM_USERS is incremented once per user row by MergeUserVectorsReducer above.
    int numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();

    log.info("Found {} users and {} items", numUsers, numItems);

    /* create an initial M (written to pathToM(-1), read by the first solver run) */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
      /* broadcast M, read A row-wise, recompute U row-wise */
      // NOTE(review): currentIteration is 0-based, so this logs "0/n".."(n-1)/n" — confirm intended.
      log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
      runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1), currentIteration, "U",
          numItems);
      /* broadcast U, read A' row-wise, recompute M row-wise */
      log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
      runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration), currentIteration, "M",
          numUsers);
    }

    return 0;
  }
+
+  private void initializeM(Vector averageRatings) throws IOException {
+    Random random = RandomUtils.getRandom();
+
+    FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
+    try (SequenceFile.Writer writer =
+             new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"),
+                 IntWritable.class, VectorWritable.class)) {
+      IntWritable index = new IntWritable();
+      VectorWritable featureVector = new VectorWritable();
+
+      for (Vector.Element e : averageRatings.nonZeroes()) {
+        Vector row = new DenseVector(numFeatures);
+        row.setQuick(0, e.get());
+        for (int m = 1; m < numFeatures; m++) {
+          row.setQuick(m, random.nextDouble());
+        }
+        index.set(e.index());
+        featureVector.set(row);
+        writer.append(index, featureVector);
+      }
+    }
+  }
+
+  static class VectorSumReducer
+      extends Reducer<WritableComparable<?>, VectorWritable, WritableComparable<?>, VectorWritable> {
+
+    private final VectorWritable result = new VectorWritable();
+
+    @Override
+    protected void reduce(WritableComparable<?> key, Iterable<VectorWritable> values, Context ctx)
+      throws IOException, InterruptedException {
+      Vector sum = Vectors.sum(values.iterator());
+      result.set(new SequentialAccessSparseVector(sum));
+      ctx.write(key, result);
+    }
+  }
+
+  static class MergeUserVectorsReducer extends
+      Reducer<WritableComparable<?>,VectorWritable,WritableComparable<?>,VectorWritable> {
+
+    private final VectorWritable result = new VectorWritable();
+
+    @Override
+    public void reduce(WritableComparable<?> key, Iterable<VectorWritable> vectors, Context ctx)
+      throws IOException, InterruptedException {
+      Vector merged = VectorWritable.merge(vectors.iterator()).get();
+      result.set(new SequentialAccessSparseVector(merged));
+      ctx.write(key, result);
+      ctx.getCounter(Stats.NUM_USERS).increment(1);
+    }
+  }
+
+  static class ItemRatingVectorsMapper extends Mapper<LongWritable,Text,IntWritable,VectorWritable> {
+
+    private final IntWritable itemIDWritable = new IntWritable();
+    private final VectorWritable ratingsWritable = new VectorWritable(true);
+    private final Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+
+    private boolean usesLongIDs;
+
+    @Override
+    protected void setup(Context ctx) throws IOException, InterruptedException {
+      usesLongIDs = ctx.getConfiguration().getBoolean(USES_LONG_IDS, false);
+    }
+
+    @Override
+    protected void map(LongWritable offset, Text line, Context ctx) throws IOException, InterruptedException {
+      String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
+      int userID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.USER_ID_POS], usesLongIDs);
+      int itemID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.ITEM_ID_POS], usesLongIDs);
+      float rating = Float.parseFloat(tokens[2]);
+
+      ratings.setQuick(userID, rating);
+
+      itemIDWritable.set(itemID);
+      ratingsWritable.set(ratings);
+
+      ctx.write(itemIDWritable, ratingsWritable);
+
+      // prepare instance for reuse
+      ratings.setQuick(userID, 0.0d);
+    }
+  }
+
+  private void runSolver(Path ratings, Path output, Path pathToUorM, int currentIteration, String matrixName,
+                         int numEntities) throws ClassNotFoundException, IOException, InterruptedException {
+
+    // necessary for local execution in the same JVM only
+    SharingMapper.reset();
+
+    Class<? extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable>> solverMapperClassInternal;
+    String name;
+
+    if (implicitFeedback) {
+      solverMapperClassInternal = SolveImplicitFeedbackMapper.class;
+      name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
+          + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, implicit feedback)";
+    } else {
+      solverMapperClassInternal = SolveExplicitFeedbackMapper.class;
+      name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
+          + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, explicit feedback)";
+    }
+
+    Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, MultithreadedSharingMapper.class,
+        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, name);
+    Configuration solverConf = solverForUorI.getConfiguration();
+    solverConf.set(LAMBDA, String.valueOf(lambda));
+    solverConf.set(ALPHA, String.valueOf(alpha));
+    solverConf.setInt(NUM_FEATURES, numFeatures);
+    solverConf.set(NUM_ENTITIES, String.valueOf(numEntities));
+
+    FileSystem fs = FileSystem.get(pathToUorM.toUri(), solverConf);
+    FileStatus[] parts = fs.listStatus(pathToUorM, PathFilters.partFilter());
+    for (FileStatus part : parts) {
+      if (log.isDebugEnabled()) {
+        log.debug("Adding {} to distributed cache", part.getPath().toString());
+      }
+      DistributedCache.addCacheFile(part.getPath().toUri(), solverConf);
+    }
+
+    MultithreadedMapper.setMapperClass(solverForUorI, solverMapperClassInternal);
+    MultithreadedMapper.setNumberOfThreads(solverForUorI, numThreadsPerSolver);
+
+    boolean succeeded = solverForUorI.waitForCompletion(true);
+    if (!succeeded) {
+      throw new IllegalStateException("Job failed!");
+    }
+  }
+
+  static class AverageRatingMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
+
+    private final IntWritable firstIndex = new IntWritable(0);
+    private final Vector featureVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+    private final VectorWritable featureVectorWritable = new VectorWritable();
+
+    @Override
+    protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException {
+      RunningAverage avg = new FullRunningAverage();
+      for (Vector.Element e : v.get().nonZeroes()) {
+        avg.addDatum(e.get());
+      }
+
+      featureVector.setQuick(r.get(), avg.getAverage());
+      featureVectorWritable.set(featureVector);
+      ctx.write(firstIndex, featureVectorWritable);
+
+      // prepare instance for reuse
+      featureVector.setQuick(r.get(), 0.0d);
+    }
+  }
+
+  static class MapLongIDsMapper extends Mapper<LongWritable,Text,VarIntWritable,VarLongWritable> {
+
+    private int tokenPos;
+    private final VarIntWritable index = new VarIntWritable();
+    private final VarLongWritable idWritable = new VarLongWritable();
+
+    @Override
+    protected void setup(Context ctx) throws IOException, InterruptedException {
+      tokenPos = ctx.getConfiguration().getInt(TOKEN_POS, -1);
+      Preconditions.checkState(tokenPos >= 0);
+    }
+
+    @Override
+    protected void map(LongWritable key, Text line, Context ctx) throws IOException, InterruptedException {
+      String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
+
+      long id = Long.parseLong(tokens[tokenPos]);
+
+      index.set(TasteHadoopUtils.idToIndex(id));
+      idWritable.set(id);
+      ctx.write(index, idWritable);
+    }
+  }
+
+  static class IDMapReducer extends Reducer<VarIntWritable,VarLongWritable,VarIntWritable,VarLongWritable> {
+    @Override
+    protected void reduce(VarIntWritable index, Iterable<VarLongWritable> ids, Context ctx)
+      throws IOException, InterruptedException {
+      ctx.write(index, ids.iterator().next());
+    }
+  }
+
+  private Path pathToM(int iteration) {
+    return iteration == numIterations - 1 ? getOutputPath("M") : getTempPath("M-" + iteration);
+  }
+
+  private Path pathToU(int iteration) {
+    return iteration == numIterations - 1 ? getOutputPath("U") : getTempPath("U-" + iteration);
+  }
+
  /** Location of A' (item-keyed rating rows); kept in the temp directory. */
  private Path pathToItemRatings() {
    return getTempPath("itemRatings");
  }
+
  /**
   * Location of A (user-keyed rating rows). NOTE(review): unlike itemRatings this
   * lives under the output path rather than temp — presumably so downstream jobs
   * (e.g. RecommenderJob) can read it; confirm the asymmetry is intentional.
   */
  private Path pathToUserRatings() {
    return getOutputPath("userRatings");
  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java
new file mode 100644
index 0000000..6e7ea81
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.mahout.cf.taste.hadoop.MutableRecommendedItem;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.TopItemsQueue;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.IntObjectProcedure;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+import org.apache.mahout.math.set.OpenIntHashSet;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * a multithreaded mapper that loads the feature matrices U and M into memory. Afterwards it computes recommendations
+ * from these. Can be executed by a {@link MultithreadedSharingMapper}.
+ */
+public class PredictionMapper extends SharingMapper<IntWritable,VectorWritable,LongWritable,RecommendedItemsWritable,
+    Pair<OpenIntObjectHashMap<Vector>,OpenIntObjectHashMap<Vector>>> {
+
+  private int recommendationsPerUser;
+  private float maxRating;
+
+  private boolean usesLongIDs;
+  private OpenIntLongHashMap userIDIndex;
+  private OpenIntLongHashMap itemIDIndex;
+
+  private final LongWritable userIDWritable = new LongWritable();
+  private final RecommendedItemsWritable recommendations = new RecommendedItemsWritable();
+
+  @Override
+  Pair<OpenIntObjectHashMap<Vector>, OpenIntObjectHashMap<Vector>> createSharedInstance(Context ctx) {
+    Configuration conf = ctx.getConfiguration();
+    Path pathToU = new Path(conf.get(RecommenderJob.USER_FEATURES_PATH));
+    Path pathToM = new Path(conf.get(RecommenderJob.ITEM_FEATURES_PATH));
+
+    OpenIntObjectHashMap<Vector> U = ALS.readMatrixByRows(pathToU, conf);
+    OpenIntObjectHashMap<Vector> M = ALS.readMatrixByRows(pathToM, conf);
+
+    return new Pair<>(U, M);
+  }
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    Configuration conf = ctx.getConfiguration();
+    recommendationsPerUser = conf.getInt(RecommenderJob.NUM_RECOMMENDATIONS,
+        RecommenderJob.DEFAULT_NUM_RECOMMENDATIONS);
+    maxRating = Float.parseFloat(conf.get(RecommenderJob.MAX_RATING));
+
+    usesLongIDs = conf.getBoolean(ParallelALSFactorizationJob.USES_LONG_IDS, false);
+    if (usesLongIDs) {
+      userIDIndex = TasteHadoopUtils.readIDIndexMap(conf.get(RecommenderJob.USER_INDEX_PATH), conf);
+      itemIDIndex = TasteHadoopUtils.readIDIndexMap(conf.get(RecommenderJob.ITEM_INDEX_PATH), conf);
+    }
+  }
+
+  @Override
+  protected void map(IntWritable userIndexWritable, VectorWritable ratingsWritable, Context ctx)
+    throws IOException, InterruptedException {
+
+    Pair<OpenIntObjectHashMap<Vector>, OpenIntObjectHashMap<Vector>> uAndM = getSharedInstance();
+    OpenIntObjectHashMap<Vector> U = uAndM.getFirst();
+    OpenIntObjectHashMap<Vector> M = uAndM.getSecond();
+
+    Vector ratings = ratingsWritable.get();
+    int userIndex = userIndexWritable.get();
+    final OpenIntHashSet alreadyRatedItems = new OpenIntHashSet(ratings.getNumNondefaultElements());
+
+    for (Vector.Element e : ratings.nonZeroes()) {
+      alreadyRatedItems.add(e.index());
+    }
+
+    final TopItemsQueue topItemsQueue = new TopItemsQueue(recommendationsPerUser);
+    final Vector userFeatures = U.get(userIndex);
+
+    M.forEachPair(new IntObjectProcedure<Vector>() {
+      @Override
+      public boolean apply(int itemID, Vector itemFeatures) {
+        if (!alreadyRatedItems.contains(itemID)) {
+          double predictedRating = userFeatures.dot(itemFeatures);
+
+          MutableRecommendedItem top = topItemsQueue.top();
+          if (predictedRating > top.getValue()) {
+            top.set(itemID, (float) predictedRating);
+            topItemsQueue.updateTop();
+          }
+        }
+        return true;
+      }
+    });
+
+    List<RecommendedItem> recommendedItems = topItemsQueue.getTopItems();
+
+    if (!recommendedItems.isEmpty()) {
+
+      // cap predictions to maxRating
+      for (RecommendedItem topItem : recommendedItems) {
+        ((MutableRecommendedItem) topItem).capToMaxValue(maxRating);
+      }
+
+      if (usesLongIDs) {
+        long userID = userIDIndex.get(userIndex);
+        userIDWritable.set(userID);
+
+        for (RecommendedItem topItem : recommendedItems) {
+          // remap item IDs
+          long itemID = itemIDIndex.get((int) topItem.getItemID());
+          ((MutableRecommendedItem) topItem).setItemID(itemID);
+        }
+
+      } else {
+        userIDWritable.set(userIndex);
+      }
+
+      recommendations.set(recommendedItems);
+      ctx.write(userIDWritable, recommendations);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
new file mode 100644
index 0000000..679d227
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.common.AbstractJob;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * <p>Computes the top-N recommendations per user from a decomposition of the rating matrix</p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing the vectorized user ratings</li>
+ * <li>--output (path): path where output should go</li>
+ * <li>--numRecommendations (int): maximum number of recommendations per user (default: 10)</li>
+ * <li>--maxRating (double): maximum rating of an item</li>
+ * <li>--numThreads (int): threads to use per mapper, (default: 1)</li>
+ * </ol>
+ */
+public class RecommenderJob extends AbstractJob {
+
+  static final String NUM_RECOMMENDATIONS = RecommenderJob.class.getName() + ".numRecommendations";
+  static final String USER_FEATURES_PATH = RecommenderJob.class.getName() + ".userFeatures";
+  static final String ITEM_FEATURES_PATH = RecommenderJob.class.getName() + ".itemFeatures";
+  static final String MAX_RATING = RecommenderJob.class.getName() + ".maxRating";
+  static final String USER_INDEX_PATH = RecommenderJob.class.getName() + ".userIndex";
+  static final String ITEM_INDEX_PATH = RecommenderJob.class.getName() + ".itemIndex";
+
+  static final int DEFAULT_NUM_RECOMMENDATIONS = 10;
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new RecommenderJob(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOption("userFeatures", null, "path to the user feature matrix", true);
+    addOption("itemFeatures", null, "path to the item feature matrix", true);
+    addOption("numRecommendations", null, "number of recommendations per user",
+        String.valueOf(DEFAULT_NUM_RECOMMENDATIONS));
+    addOption("maxRating", null, "maximum rating available", true);
+    addOption("numThreads", null, "threads per mapper", String.valueOf(1));
+    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
+    addOption("userIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
+    addOption("itemIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
+    addOutputOption();
+
+    Map<String,List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+
+    Job prediction = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
+        MultithreadedSharingMapper.class, IntWritable.class, RecommendedItemsWritable.class, TextOutputFormat.class);
+    Configuration conf = prediction.getConfiguration();
+
+    int numThreads = Integer.parseInt(getOption("numThreads"));
+
+    conf.setInt(NUM_RECOMMENDATIONS, Integer.parseInt(getOption("numRecommendations")));
+    conf.set(USER_FEATURES_PATH, getOption("userFeatures"));
+    conf.set(ITEM_FEATURES_PATH, getOption("itemFeatures"));
+    conf.set(MAX_RATING, getOption("maxRating"));
+
+    boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs"));
+    if (usesLongIDs) {
+      conf.set(ParallelALSFactorizationJob.USES_LONG_IDS, String.valueOf(true));
+      conf.set(USER_INDEX_PATH, getOption("userIDIndex"));
+      conf.set(ITEM_INDEX_PATH, getOption("itemIDIndex"));
+    }
+
+    MultithreadedMapper.setMapperClass(prediction, PredictionMapper.class);
+    MultithreadedMapper.setNumberOfThreads(prediction, numThreads);
+
+    boolean succeeded = prediction.waitForCompletion(true);
+    if (!succeeded) {
+      return -1;
+    }
+
+    return 0;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java
new file mode 100644
index 0000000..9925807
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.IOException;
+
+/**
+ * Mapper class to be used by {@link MultithreadedSharingMapper}. Offers "global" before() and after() methods
+ * that will typically be used to set up static variables.
+ *
+ * Suitable for mappers that need large, read-only in-memory data to operate.
+ *
+ * @param <K1>
+ * @param <V1>
+ * @param <K2>
+ * @param <V2>
+ */
+public abstract class SharingMapper<K1,V1,K2,V2,S> extends Mapper<K1,V1,K2,V2> {
+
+  private static Object SHARED_INSTANCE;
+
+  /**
+   * Called before the multithreaded execution
+   *
+   * @param context mapper's context
+   */
+  abstract S createSharedInstance(Context context) throws IOException;
+
+  final void setupSharedInstance(Context context) throws IOException {
+    if (SHARED_INSTANCE == null) {
+      SHARED_INSTANCE = createSharedInstance(context);
+    }
+  }
+
+  final S getSharedInstance() {
+    return (S) SHARED_INSTANCE;
+  }
+
+  static void reset() {
+    SHARED_INSTANCE = null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java
new file mode 100644
index 0000000..2569918
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+
+import java.io.IOException;
+
+/** Solving mapper that can be safely executed using multiple threads */
+public class SolveExplicitFeedbackMapper
+    extends SharingMapper<IntWritable,VectorWritable,IntWritable,VectorWritable,OpenIntObjectHashMap<Vector>> {
+
+  private double lambda;
+  private int numFeatures;
+  private final VectorWritable uiOrmj = new VectorWritable();
+
+  @Override
+  OpenIntObjectHashMap<Vector> createSharedInstance(Context ctx) throws IOException {
+    Configuration conf = ctx.getConfiguration();
+    int numEntities = Integer.parseInt(conf.get(ParallelALSFactorizationJob.NUM_ENTITIES));
+    return ALS.readMatrixByRowsFromDistributedCache(numEntities, conf);
+  }
+
+  @Override
+  protected void setup(Mapper.Context ctx) throws IOException, InterruptedException {
+    lambda = Double.parseDouble(ctx.getConfiguration().get(ParallelALSFactorizationJob.LAMBDA));
+    numFeatures = ctx.getConfiguration().getInt(ParallelALSFactorizationJob.NUM_FEATURES, -1);
+    Preconditions.checkArgument(numFeatures > 0, "numFeatures must be greater then 0!");
+  }
+
+  @Override
+  protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
+    throws IOException, InterruptedException {
+    OpenIntObjectHashMap<Vector> uOrM = getSharedInstance();
+    uiOrmj.set(ALS.solveExplicit(ratingsWritable, uOrM, lambda, numFeatures));
+    ctx.write(userOrItemID, uiOrmj);
+  }
+
+}


[35/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
new file mode 100644
index 0000000..a99d54c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
@@ -0,0 +1,265 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collection;
+import java.util.Random;
+
+/**
+ * {@link Factorizer} based on Simon Funk's famous article <a href="http://sifter.org/~simon/journal/20061211.html">
+ * "Netflix Update: Try this at home"</a>.
+ *
+ * Attempts to be as memory efficient as possible, only iterating once through the
+ * {@link FactorizablePreferences} or {@link DataModel} while copying everything to primitive arrays.
+ * Learning works in place on these datastructures after that.
+ */
+public class ParallelArraysSGDFactorizer implements Factorizer {
+
+  public static final double DEFAULT_LEARNING_RATE = 0.005;
+  public static final double DEFAULT_PREVENT_OVERFITTING = 0.02;
+  public static final double DEFAULT_RANDOM_NOISE = 0.005;
+
+  private final int numFeatures;
+  private final int numIterations;
+  private final float minPreference;
+  private final float maxPreference;
+
+  private final Random random;
+  private final double learningRate;
+  private final double preventOverfitting;
+
+  // long entity IDs -> dense int indexes addressing the arrays below
+  private final FastByIDMap<Integer> userIDMapping;
+  private final FastByIDMap<Integer> itemIDMapping;
+
+  // factor matrices, indexed [entityIndex][feature]
+  private final double[][] userFeatures;
+  private final double[][] itemFeatures;
+
+  // one preference per slot across these parallel arrays (the class's namesake layout)
+  private final int[] userIndexes;
+  private final int[] itemIndexes;
+  private final float[] values;
+
+  private final double defaultValue;
+  private final double interval;
+  // per-preference partial dot product over the features already trained
+  private final double[] cachedEstimates;
+
+
+  private static final Logger log = LoggerFactory.getLogger(ParallelArraysSGDFactorizer.class);
+
+  /** Convenience constructor using the default learning rate, regularization and noise. */
+  public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations) {
+    this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, DEFAULT_LEARNING_RATE,
+        DEFAULT_PREVENT_OVERFITTING, DEFAULT_RANDOM_NOISE);
+  }
+
+  /** Adapts a {@link DataModel} and delegates to the main constructor. */
+  public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations, double learningRate,
+                                     double preventOverfitting, double randomNoise) {
+    this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, learningRate, preventOverfitting,
+        randomNoise);
+  }
+
+  /** Convenience constructor using the default learning rate, regularization and noise. */
+  public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePrefs, int numFeatures, int numIterations) {
+    this(factorizablePrefs, numFeatures, numIterations, DEFAULT_LEARNING_RATE, DEFAULT_PREVENT_OVERFITTING,
+        DEFAULT_RANDOM_NOISE);
+  }
+
+  /**
+   * Copies all preferences into primitive parallel arrays in a single pass and
+   * initializes both factor matrices to {@code defaultValue} plus small random noise.
+   *
+   * @param randomNoise scales the random perturbation applied to each initial feature value
+   */
+  public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePreferences, int numFeatures,
+      int numIterations, double learningRate, double preventOverfitting, double randomNoise) {
+
+    this.numFeatures = numFeatures;
+    this.numIterations = numIterations;
+    minPreference = factorizablePreferences.getMinPreference();
+    maxPreference = factorizablePreferences.getMaxPreference();
+
+    this.random = RandomUtils.getRandom();
+    this.learningRate = learningRate;
+    this.preventOverfitting = preventOverfitting;
+
+    int numUsers = factorizablePreferences.numUsers();
+    int numItems = factorizablePreferences.numItems();
+    int numPrefs = factorizablePreferences.numPreferences();
+
+    log.info("Mapping {} users...", numUsers);
+    userIDMapping = new FastByIDMap<>(numUsers);
+    int index = 0;
+    LongPrimitiveIterator userIterator = factorizablePreferences.getUserIDs();
+    while (userIterator.hasNext()) {
+      userIDMapping.put(userIterator.nextLong(), index++);
+    }
+
+    log.info("Mapping {} items", numItems);
+    itemIDMapping = new FastByIDMap<>(numItems);
+    index = 0;
+    LongPrimitiveIterator itemIterator = factorizablePreferences.getItemIDs();
+    while (itemIterator.hasNext()) {
+      itemIDMapping.put(itemIterator.nextLong(), index++);
+    }
+
+    this.userIndexes = new int[numPrefs];
+    this.itemIndexes = new int[numPrefs];
+    this.values = new float[numPrefs];
+    this.cachedEstimates = new double[numPrefs];
+
+    index = 0;
+    log.info("Loading {} preferences into memory", numPrefs);
+    RunningAverage average = new FullRunningAverage();
+    for (Preference preference : factorizablePreferences.getPreferences()) {
+      userIndexes[index] = userIDMapping.get(preference.getUserID());
+      itemIndexes[index] = itemIDMapping.get(preference.getItemID());
+      values[index] = preference.getValue();
+      cachedEstimates[index] = 0;
+
+      average.addDatum(preference.getValue());
+
+      index++;
+      if (index % 1000000 == 0) {
+        log.info("Processed {} preferences", index);
+      }
+    }
+    log.info("Processed {} preferences, done.", index);
+
+    double averagePreference = average.getAverage();
+    log.info("Average preference value is {}", averagePreference);
+
+    // seed features so that the full dot product starts near the observed average
+    double prefInterval = factorizablePreferences.getMaxPreference() - factorizablePreferences.getMinPreference();
+    defaultValue = Math.sqrt((averagePreference - prefInterval * 0.1) / numFeatures);
+    interval = prefInterval * 0.1 / numFeatures;
+
+    userFeatures = new double[numUsers][numFeatures];
+    itemFeatures = new double[numItems][numFeatures];
+
+    log.info("Initializing feature vectors...");
+    for (int feature = 0; feature < numFeatures; feature++) {
+      for (int userIndex = 0; userIndex < numUsers; userIndex++) {
+        userFeatures[userIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
+      }
+      for (int itemIndex = 0; itemIndex < numItems; itemIndex++) {
+        itemFeatures[itemIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
+      }
+    }
+  }
+
+  /**
+   * Trains features one at a time (Funk-style): each feature is trained for
+   * {@code numIterations} epochs over shuffled preferences, then its contribution
+   * is folded into {@link #cachedEstimates} before moving to the next feature.
+   */
+  @Override
+  public Factorization factorize() throws TasteException {
+    for (int feature = 0; feature < numFeatures; feature++) {
+      log.info("Shuffling preferences...");
+      shufflePreferences();
+      log.info("Starting training of feature {} ...", feature);
+      for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
+        if (currentIteration == numIterations - 1) {
+          // compute RMSE only on the last epoch, for logging
+          double rmse = trainingIterationWithRmse(feature);
+          log.info("Finished training feature {} with RMSE {}", feature, rmse);
+        } else {
+          trainingIteration(feature);
+        }
+      }
+      if (feature < numFeatures - 1) {
+        log.info("Updating cache...");
+        for (int index = 0; index < userIndexes.length; index++) {
+          cachedEstimates[index] = estimate(userIndexes[index], itemIndexes[index], feature, cachedEstimates[index],
+              false);
+        }
+      }
+    }
+    log.info("Factorization done");
+    return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
+  }
+
+  /** One SGD epoch over all preferences for a single feature. */
+  private void trainingIteration(int feature) {
+    for (int index = 0; index < userIndexes.length; index++) {
+      train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
+    }
+  }
+
+  /** Same as {@link #trainingIteration(int)} but also returns the epoch's RMSE. */
+  private double trainingIterationWithRmse(int feature) {
+    double rmse = 0.0;
+    for (int index = 0; index < userIndexes.length; index++) {
+      double error = train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
+      rmse += error * error;
+    }
+    return Math.sqrt(rmse / userIndexes.length);
+  }
+
+  /**
+   * Estimates a preference as cached partial sum + current feature's contribution.
+   * When {@code trailing}, adds an optimistic constant for the not-yet-trained
+   * features and clamps the result to the valid preference range.
+   */
+  private double estimate(int userIndex, int itemIndex, int feature, double cachedEstimate, boolean trailing) {
+    double sum = cachedEstimate;
+    sum += userFeatures[userIndex][feature] * itemFeatures[itemIndex][feature];
+    if (trailing) {
+      sum += (numFeatures - feature - 1) * (defaultValue + interval) * (defaultValue + interval);
+      if (sum > maxPreference) {
+        sum = maxPreference;
+      } else if (sum < minPreference) {
+        sum = minPreference;
+      }
+    }
+    return sum;
+  }
+
+  /**
+   * Performs one SGD step on a single (user, item) pair for the given feature
+   * and returns the prediction error before the update.
+   */
+  public double train(int userIndex, int itemIndex, int feature, double original, double cachedEstimate) {
+    double error = original - estimate(userIndex, itemIndex, feature, cachedEstimate, true);
+    double[] userVector = userFeatures[userIndex];
+    double[] itemVector = itemFeatures[itemIndex];
+
+    // NOTE(review): the item update reads userVector[feature] *after* it was just
+    // updated, not the pre-update value — confirm this sequential update is intended.
+    userVector[feature] += learningRate * (error * itemVector[feature] - preventOverfitting * userVector[feature]);
+    itemVector[feature] += learningRate * (error * userVector[feature] - preventOverfitting * itemVector[feature]);
+
+    return error;
+  }
+
+  /** In-place Fisher-Yates (Durstenfeld) shuffle over all four parallel arrays. */
+  protected void shufflePreferences() {
+    /* Durstenfeld shuffle */
+    for (int currentPos = userIndexes.length - 1; currentPos > 0; currentPos--) {
+      int swapPos = random.nextInt(currentPos + 1);
+      swapPreferences(currentPos, swapPos);
+    }
+  }
+
+  /** Swaps slot {@code posA} with slot {@code posB} across all parallel arrays. */
+  private void swapPreferences(int posA, int posB) {
+    int tmpUserIndex = userIndexes[posA];
+    int tmpItemIndex = itemIndexes[posA];
+    float tmpValue = values[posA];
+    double tmpEstimate = cachedEstimates[posA];
+
+    userIndexes[posA] = userIndexes[posB];
+    itemIndexes[posA] = itemIndexes[posB];
+    values[posA] = values[posB];
+    cachedEstimates[posA] = cachedEstimates[posB];
+
+    userIndexes[posB] = tmpUserIndex;
+    itemIndexes[posB] = tmpItemIndex;
+    values[posB] = tmpValue;
+    cachedEstimates[posB] = tmpEstimate;
+  }
+
+  /** No-op: this factorizer works on a one-time snapshot of the preferences. */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    // do nothing
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
new file mode 100644
index 0000000..5cce02d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.example.kddcup.track1.EstimateConverter;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * run an SVD factorization of the KDD track1 data.
+ *
+ * needs at least 6-7GB of memory, tested with -Xms6700M -Xmx6700M
+ *
+ */
+public final class Track1SVDRunner {
+
+  private static final Logger log = LoggerFactory.getLogger(Track1SVDRunner.class);
+
+  // utility class, not instantiable
+  private Track1SVDRunner() {
+  }
+
+  /**
+   * Factorizes the KDD Cup track-1 training data via SGD, logs the RMSE on the
+   * validation set, then writes one converted estimate byte per test preference
+   * to the result file.
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length != 2) {
+      System.err.println("Necessary arguments: <kddDataFileDirectory> <resultFile>");
+      return;
+    }
+
+    File dataFileDirectory = new File(args[0]);
+    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+    }
+
+    File resultFile = new File(args[1]);
+
+    /* the knobs to turn */
+    int numFeatures = 20;
+    int numIterations = 5;
+    double learningRate = 0.0001;
+    double preventOverfitting = 0.002;
+    double randomNoise = 0.0001;
+
+
+    KDDCupFactorizablePreferences factorizablePreferences =
+        new KDDCupFactorizablePreferences(KDDCupDataModel.getTrainingFile(dataFileDirectory));
+
+    Factorizer sgdFactorizer = new ParallelArraysSGDFactorizer(factorizablePreferences, numFeatures, numIterations,
+        learningRate, preventOverfitting, randomNoise);
+
+    Factorization factorization = sgdFactorizer.factorize();
+
+    // hold-out evaluation: accumulate squared errors over the validation file
+    log.info("Estimating validation preferences...");
+    int prefsProcessed = 0;
+    RunningAverage average = new FullRunningAverage();
+    for (Pair<PreferenceArray,long[]> validationPair
+        : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
+      for (Preference validationPref : validationPair.getFirst()) {
+        double estimate = estimatePreference(factorization, validationPref.getUserID(), validationPref.getItemID(),
+            factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
+        double error = validationPref.getValue() - estimate;
+        average.addDatum(error * error);
+        prefsProcessed++;
+        if (prefsProcessed % 100000 == 0) {
+          log.info("Computed {} estimations", prefsProcessed);
+        }
+      }
+    }
+    log.info("Computed {} estimations, done.", prefsProcessed);
+
+    double rmse = Math.sqrt(average.getAverage());
+    log.info("RMSE {}", rmse);
+
+    log.info("Estimating test preferences...");
+    OutputStream out = null;
+    try {
+      out = new BufferedOutputStream(new FileOutputStream(resultFile));
+
+      for (Pair<PreferenceArray,long[]> testPair
+          : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
+        for (Preference testPref : testPair.getFirst()) {
+          double estimate = estimatePreference(factorization, testPref.getUserID(), testPref.getItemID(),
+              factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
+          // contest format: a single byte per estimate, via EstimateConverter
+          byte result = EstimateConverter.convert(estimate, testPref.getUserID(), testPref.getItemID());
+          out.write(result);
+        }
+      }
+    } finally {
+      Closeables.close(out, false);
+    }
+    log.info("wrote estimates to {}, done.", resultFile.getAbsolutePath());
+  }
+
+  /**
+   * Dot product of the user and item feature vectors, clamped to
+   * [{@code minPreference}, {@code maxPreference}].
+   *
+   * @throws NoSuchUserException if the factorization has no features for the user
+   * @throws NoSuchItemException if the factorization has no features for the item
+   */
+  static double estimatePreference(Factorization factorization, long userID, long itemID, float minPreference,
+      float maxPreference) throws NoSuchUserException, NoSuchItemException {
+    double[] userFeatures = factorization.getUserFeatures(userID);
+    double[] itemFeatures = factorization.getItemFeatures(itemID);
+    double estimate = 0;
+    for (int feature = 0; feature < userFeatures.length; feature++) {
+      estimate += userFeatures[feature] * itemFeatures[feature];
+    }
+    if (estimate < minPreference) {
+      estimate = minPreference;
+    } else if (estimate > maxPreference) {
+      estimate = maxPreference;
+    }
+    return estimate;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
new file mode 100644
index 0000000..ce025a9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.similarity.AbstractItemSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * Item similarity that multiplies a collaborative-filtering signal
+ * (log-likelihood over the data model) with a content-based signal
+ * (track metadata from the data file directory).
+ */
+final class HybridSimilarity extends AbstractItemSimilarity {
+
+  private final ItemSimilarity cfSimilarity;
+  private final ItemSimilarity contentSimilarity;
+
+  HybridSimilarity(DataModel dataModel, File dataFileDirectory) throws IOException {
+    super(dataModel);
+    cfSimilarity = new LogLikelihoodSimilarity(dataModel);
+    contentSimilarity = new TrackItemSimilarity(dataFileDirectory);
+  }
+
+  /** Product of the content-based and CF similarities for the pair. */
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    return contentSimilarity.itemSimilarity(itemID1, itemID2) * cfSimilarity.itemSimilarity(itemID1, itemID2);
+  }
+
+  /** Bulk variant: element-wise product of the two similarity arrays. */
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    double[] result = contentSimilarity.itemSimilarities(itemID1, itemID2s);
+    double[] multipliers = cfSimilarity.itemSimilarities(itemID1, itemID2s);
+    for (int i = 0; i < result.length; i++) {
+      result[i] *= multipliers[i];
+    }
+    return result;
+  }
+
+  // NOTE(review): only the CF similarity is refreshed; contentSimilarity is
+  // presumably static file-based data — confirm it needs no refresh.
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    cfSimilarity.refresh(alreadyRefreshed);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
new file mode 100644
index 0000000..50fd35e
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.TreeMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Scores one user's six test items with the shared recommender and reports,
+ * per item, whether it made the user's top three estimates.
+ */
+final class Track2Callable implements Callable<UserResult> {
+
+  private static final Logger log = LoggerFactory.getLogger(Track2Callable.class);
+  // shared across all callables to log overall progress
+  private static final AtomicInteger COUNT = new AtomicInteger();
+
+  private final Recommender recommender;
+  private final PreferenceArray userTest;
+
+  Track2Callable(Recommender recommender, PreferenceArray userTest) {
+    this.recommender = recommender;
+    this.userTest = userTest;
+  }
+
+  /**
+   * @return a {@link UserResult} with one boolean per test item: true iff the
+   *         item is among this user's three highest estimated preferences
+   * @throws IllegalArgumentException if the user does not have exactly 6 test items
+   */
+  @Override
+  public UserResult call() throws TasteException {
+
+    int testSize = userTest.length();
+    if (testSize != 6) {
+      throw new IllegalArgumentException("Expecting 6 items for user but got " + userTest);
+    }
+    long userID = userTest.get(0).getUserID();
+    // descending order by estimate; NOTE(review): items with exactly equal
+    // estimates collapse to one map entry (later overwrites earlier) — confirm
+    // ties are acceptable to lose here.
+    TreeMap<Double,Long> estimateToItemID = new TreeMap<>(Collections.reverseOrder());
+
+    for (int i = 0; i < testSize; i++) {
+      long itemID = userTest.getItemID(i);
+      double estimate;
+      try {
+        estimate = recommender.estimatePreference(userID, itemID);
+      } catch (NoSuchItemException nsie) {
+        // OK in the sample data provided before the contest, should never happen otherwise
+        log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
+        continue;
+      }
+
+      if (!Double.isNaN(estimate)) {
+        estimateToItemID.put(estimate, itemID);
+      }
+    }
+
+    // values() iterates in key (estimate) descending order
+    Collection<Long> itemIDs = estimateToItemID.values();
+    List<Long> topThree = new ArrayList<>(itemIDs);
+    if (topThree.size() > 3) {
+      topThree = topThree.subList(0, 3);
+    } else if (topThree.size() < 3) {
+      log.warn("Unable to recommend three items for {}", userID);
+      // Some NaNs - just guess at the rest then
+      Collection<Long> newItemIDs = new HashSet<>(3);
+      newItemIDs.addAll(itemIDs);
+      int i = 0;
+      while (i < testSize && newItemIDs.size() < 3) {
+        newItemIDs.add(userTest.getItemID(i));
+        i++;
+      }
+      topThree = new ArrayList<>(newItemIDs);
+    }
+    if (topThree.size() != 3) {
+      throw new IllegalStateException();
+    }
+
+    boolean[] result = new boolean[testSize];
+    for (int i = 0; i < testSize; i++) {
+      result[i] = topThree.contains(userTest.getItemID(i));
+    }
+
+    if (COUNT.incrementAndGet() % 1000 == 0) {
+      log.info("Completed {} users", COUNT.get());
+    }
+
+    return new UserResult(userID, result);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
new file mode 100644
index 0000000..185a00d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefItemBasedRecommender;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * Track-2 recommender: wraps a {@link GenericBooleanPrefItemBasedRecommender}
+ * built on {@link HybridSimilarity} and delegates all {@link Recommender} calls.
+ */
+public final class Track2Recommender implements Recommender {
+
+  private final Recommender recommender;
+
+  /**
+   * @throws TasteException if the content-similarity data files cannot be read
+   */
+  public Track2Recommender(DataModel dataModel, File dataFileDirectory) throws TasteException {
+    // Change this to whatever you like!
+    ItemSimilarity similarity;
+    try {
+      similarity = new HybridSimilarity(dataModel, dataFileDirectory);
+    } catch (IOException ioe) {
+      throw new TasteException(ioe);
+    }
+    recommender = new GenericBooleanPrefItemBasedRecommender(dataModel, similarity);
+  }
+  
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+    return recommender.recommend(userID, howMany);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+    return recommend(userID, howMany, null, includeKnownItems);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+    return recommender.recommend(userID, howMany, rescorer, false);
+  }
+  
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
+  }
+  
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    return recommender.estimatePreference(userID, itemID);
+  }
+  
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    recommender.setPreference(userID, itemID, value);
+  }
+  
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    recommender.removePreference(userID, itemID);
+  }
+  
+  @Override
+  public DataModel getDataModel() {
+    return recommender.getDataModel();
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    recommender.refresh(alreadyRefreshed);
+  }
+  
+  // fixed copy-paste bug: previously reported itself as "Track1Recommender"
+  @Override
+  public String toString() {
+    return "Track2Recommender[recommender:" + recommender + ']';
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
new file mode 100644
index 0000000..09ade5d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+/**
+ * {@link RecommenderBuilder} for evaluation harnesses: builds a
+ * {@link Track2Recommender} from the supplied data model, which must be a
+ * {@link KDDCupDataModel} (its data file directory seeds the content similarity).
+ */
+final class Track2RecommenderBuilder implements RecommenderBuilder {
+  
+  @Override
+  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+    // cast is safe only for KDDCupDataModel inputs; other models will throw ClassCastException
+    return new Track2Recommender(dataModel, ((KDDCupDataModel) dataModel).getDataFileDirectory());
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
new file mode 100644
index 0000000..3cbb61c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+/**
+ * <p>Runs "track 2" of the KDD Cup competition using whatever recommender is inside {@link Track2Recommender}
+ * and attempts to output the result in the correct contest format.</p>
+ *
+ * <p>Run as: {@code Track2Runner [track 2 data file directory] [output file]}</p>
+ */
+public final class Track2Runner {
+
+  private static final Logger log = LoggerFactory.getLogger(Track2Runner.class);
+
+  private Track2Runner() {
+  }
+
+  public static void main(String[] args) throws Exception {
+
+    File dataFileDirectory = new File(args[0]);
+    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+    }
+
+    long start = System.currentTimeMillis();
+
+    KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
+    Track2Recommender recommender = new Track2Recommender(model, dataFileDirectory);
+
+    long end = System.currentTimeMillis();
+    log.info("Loaded model in {}s", (end - start) / 1000);
+    start = end;
+
+    Collection<Track2Callable> callables = new ArrayList<>();
+    for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
+      PreferenceArray userTest = tests.getFirst();
+      callables.add(new Track2Callable(recommender, userTest));
+    }
+
+    int cores = Runtime.getRuntime().availableProcessors();
+    log.info("Running on {} cores", cores);
+    ExecutorService executor = Executors.newFixedThreadPool(cores);
+    List<Future<UserResult>> futures = executor.invokeAll(callables);
+    executor.shutdown();
+
+    end = System.currentTimeMillis();
+    log.info("Ran recommendations in {}s", (end - start) / 1000);
+    start = end;
+
+    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
+      long lastUserID = Long.MIN_VALUE;
+      for (Future<UserResult> future : futures) {
+        UserResult result = future.get();
+        long userID = result.getUserID();
+        if (userID <= lastUserID) {
+          throw new IllegalStateException();
+        }
+        lastUserID = userID;
+        out.write(result.getResultBytes());
+      }
+    }
+
+    end = System.currentTimeMillis();
+    log.info("Wrote output in {}s", (end - start) / 1000);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
new file mode 100644
index 0000000..abd15f8
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.util.regex.Pattern;
+
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+
+final class TrackData {
+
+  private static final Pattern PIPE = Pattern.compile("\\|");
+  private static final String NO_VALUE = "None";
+  static final long NO_VALUE_ID = Long.MIN_VALUE;
+  private static final FastIDSet NO_GENRES = new FastIDSet();
+
+  private final long trackID;
+  private final long albumID;
+  private final long artistID;
+  private final FastIDSet genreIDs;
+
+  TrackData(CharSequence line) {
+    String[] tokens = PIPE.split(line);
+    trackID = Long.parseLong(tokens[0]);
+    albumID = parse(tokens[1]);
+    artistID = parse(tokens[2]);
+    if (tokens.length > 3) {
+      genreIDs = new FastIDSet(tokens.length - 3);
+      for (int i = 3; i < tokens.length; i++) {
+        genreIDs.add(Long.parseLong(tokens[i]));
+      }
+    } else {
+      genreIDs = NO_GENRES;
+    }
+  }
+
+  private static long parse(String value) {
+    return NO_VALUE.equals(value) ? NO_VALUE_ID : Long.parseLong(value);
+  }
+
+  public long getTrackID() {
+    return trackID;
+  }
+
+  public long getAlbumID() {
+    return albumID;
+  }
+
+  public long getArtistID() {
+    return artistID;
+  }
+
+  public FastIDSet getGenreIDs() {
+    return genreIDs;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
new file mode 100644
index 0000000..3012a84
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.iterator.FileLineIterable;
+
+final class TrackItemSimilarity implements ItemSimilarity {
+
+  private final FastByIDMap<TrackData> trackData;
+
+  TrackItemSimilarity(File dataFileDirectory) throws IOException {
+    trackData = new FastByIDMap<>();
+    for (String line : new FileLineIterable(KDDCupDataModel.getTrackFile(dataFileDirectory))) {
+      TrackData trackDatum = new TrackData(line);
+      trackData.put(trackDatum.getTrackID(), trackDatum);
+    }
+  }
+
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) {
+    if (itemID1 == itemID2) {
+      return 1.0;
+    }
+    TrackData data1 = trackData.get(itemID1);
+    TrackData data2 = trackData.get(itemID2);
+    if (data1 == null || data2 == null) {
+      return 0.0;
+    }
+
+    // Arbitrarily decide that same album means "very similar"
+    if (data1.getAlbumID() != TrackData.NO_VALUE_ID && data1.getAlbumID() == data2.getAlbumID()) {
+      return 0.9;
+    }
+    // ... and same artist means "fairly similar"
+    if (data1.getArtistID() != TrackData.NO_VALUE_ID && data1.getArtistID() == data2.getArtistID()) {
+      return 0.7;
+    }
+
+    // Tanimoto coefficient similarity based on genre, but maximum value of 0.25
+    FastIDSet genres1 = data1.getGenreIDs();
+    FastIDSet genres2 = data2.getGenreIDs();
+    if (genres1 == null || genres2 == null) {
+      return 0.0;
+    }
+    int intersectionSize = genres1.intersectionSize(genres2);
+    if (intersectionSize == 0) {
+      return 0.0;
+    }
+    int unionSize = genres1.size() + genres2.size() - intersectionSize;
+    return intersectionSize / (4.0 * unionSize);
+  }
+
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) {
+    int length = itemID2s.length;
+    double[] result = new double[length];
+    for (int i = 0; i < length; i++) {
+      result[i] = itemSimilarity(itemID1, itemID2s[i]);
+    }
+    return result;
+  }
+
+  @Override
+  public long[] allSimilarItemIDs(long itemID) {
+    FastIDSet allSimilarItemIDs = new FastIDSet();
+    LongPrimitiveIterator allItemIDs = trackData.keySetIterator();
+    while (allItemIDs.hasNext()) {
+      long possiblySimilarItemID = allItemIDs.nextLong();
+      if (!Double.isNaN(itemSimilarity(itemID, possiblySimilarItemID))) {
+        allSimilarItemIDs.add(possiblySimilarItemID);
+      }
+    }
+    return allSimilarItemIDs.toArray();
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    // do nothing
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
new file mode 100644
index 0000000..e554d10
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
/**
 * Holds one user's contest answer: the user ID plus the answer encoded as ASCII
 * '0'/'1' bytes, ready to be written to the output file. Exactly three of the
 * result flags must be true, per the contest rules.
 */
final class UserResult {

  private final long userID;
  private final byte[] resultBytes;

  UserResult(long userID, boolean[] result) {
    this.userID = userID;

    // The contest requires exactly 3 "highly rated" picks per user; reject anything else.
    int trueCount = 0;
    for (boolean flag : result) {
      if (flag) {
        trueCount++;
      }
    }
    if (trueCount != 3) {
      throw new IllegalStateException();
    }

    byte[] encoded = new byte[result.length];
    for (int i = 0; i < encoded.length; i++) {
      encoded[i] = result[i] ? (byte) '1' : (byte) '0';
    }
    resultBytes = encoded;
  }

  public long getUserID() {
    return userID;
  }

  public byte[] getResultBytes() {
    return resultBytes;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
new file mode 100644
index 0000000..22f122e
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.example.als.netflix;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/** converts the raw files provided by netflix to an appropriate input format */
+public final class NetflixDatasetConverter {
+
+  private static final Logger log = LoggerFactory.getLogger(NetflixDatasetConverter.class);
+
+  private static final Pattern SEPARATOR = Pattern.compile(",");
+  private static final String MOVIE_DENOTER = ":";
+  private static final String TAB = "\t";
+  private static final String NEWLINE = "\n";
+
+  private NetflixDatasetConverter() {
+  }
+
+  public static void main(String[] args) throws IOException {
+
+    if (args.length != 4) {
+      System.err.println("Usage: NetflixDatasetConverter /path/to/training_set/ /path/to/qualifying.txt "
+          + "/path/to/judging.txt /path/to/destination");
+      return;
+    }
+
+    String trainingDataDir = args[0];
+    String qualifyingTxt = args[1];
+    String judgingTxt = args[2];
+    Path outputPath = new Path(args[3]);
+
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
+
+    Preconditions.checkArgument(trainingDataDir != null, "Training Data location needs to be specified");
+    log.info("Creating training set at {}/trainingSet/ratings.tsv ...", outputPath);
+    try (BufferedWriter writer =
+             new BufferedWriter(
+                 new OutputStreamWriter(
+                     fs.create(new Path(outputPath, "trainingSet/ratings.tsv")), Charsets.UTF_8))){
+
+      int ratingsProcessed = 0;
+      for (File movieRatings : new File(trainingDataDir).listFiles()) {
+        try (FileLineIterator lines = new FileLineIterator(movieRatings)) {
+          boolean firstLineRead = false;
+          String movieID = null;
+          while (lines.hasNext()) {
+            String line = lines.next();
+            if (firstLineRead) {
+              String[] tokens = SEPARATOR.split(line);
+              String userID = tokens[0];
+              String rating = tokens[1];
+              writer.write(userID + TAB + movieID + TAB + rating + NEWLINE);
+              ratingsProcessed++;
+              if (ratingsProcessed % 1000000 == 0) {
+                log.info("{} ratings processed...", ratingsProcessed);
+              }
+            } else {
+              movieID = line.replaceAll(MOVIE_DENOTER, "");
+              firstLineRead = true;
+            }
+          }
+        }
+
+      }
+      log.info("{} ratings processed. done.", ratingsProcessed);
+    }
+
+    log.info("Reading probes...");
+    List<Preference> probes = new ArrayList<>(2817131);
+    long currentMovieID = -1;
+    for (String line : new FileLineIterable(new File(qualifyingTxt))) {
+      if (line.contains(MOVIE_DENOTER)) {
+        currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
+      } else {
+        long userID = Long.parseLong(SEPARATOR.split(line)[0]);
+        probes.add(new GenericPreference(userID, currentMovieID, 0));
+      }
+    }
+    log.info("{} probes read...", probes.size());
+
+    log.info("Reading ratings, creating probe set at {}/probeSet/ratings.tsv ...", outputPath);
+    try (BufferedWriter writer =
+             new BufferedWriter(new OutputStreamWriter(
+                 fs.create(new Path(outputPath, "probeSet/ratings.tsv")), Charsets.UTF_8))){
+      int ratingsProcessed = 0;
+      for (String line : new FileLineIterable(new File(judgingTxt))) {
+        if (line.contains(MOVIE_DENOTER)) {
+          currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
+        } else {
+          float rating = Float.parseFloat(SEPARATOR.split(line)[0]);
+          Preference pref = probes.get(ratingsProcessed);
+          Preconditions.checkState(pref.getItemID() == currentMovieID);
+          ratingsProcessed++;
+          writer.write(pref.getUserID() + TAB + pref.getItemID() + TAB + rating + NEWLINE);
+          if (ratingsProcessed % 1000000 == 0) {
+            log.info("{} ratings processed...", ratingsProcessed);
+          }
+        }
+      }
+      log.info("{} ratings processed. done.", ratingsProcessed);
+    }
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
new file mode 100644
index 0000000..8021d00
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute.example;
+
+import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.precompute.FileSimilarItemsWriter;
+import org.apache.mahout.cf.taste.impl.similarity.precompute.MultithreadedBatchItemSimilarities;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+import org.apache.mahout.cf.taste.similarity.precompute.BatchItemSimilarities;
+
+import java.io.File;
+
+/**
+ * Example that precomputes all item similarities of the Movielens1M dataset
+ *
+ * Usage: download movielens1M from http://www.grouplens.org/node/73 , unzip it and invoke this code with the path
+ * to the ratings.dat file as argument
+ *
+ */
+public final class BatchItemSimilaritiesGroupLens {
+
+  private BatchItemSimilaritiesGroupLens() {}
+
+  public static void main(String[] args) throws Exception {
+
+    if (args.length != 1) {
+      System.err.println("Need path to ratings.dat of the movielens1M dataset as argument!");
+      System.exit(-1);
+    }
+
+    File resultFile = new File(System.getProperty("java.io.tmpdir"), "similarities.csv");
+    if (resultFile.exists()) {
+      resultFile.delete();
+    }
+
+    DataModel dataModel = new GroupLensDataModel(new File(args[0]));
+    ItemBasedRecommender recommender = new GenericItemBasedRecommender(dataModel,
+        new LogLikelihoodSimilarity(dataModel));
+    BatchItemSimilarities batch = new MultithreadedBatchItemSimilarities(recommender, 5);
+
+    int numSimilarities = batch.computeItemSimilarities(Runtime.getRuntime().availableProcessors(), 1,
+        new FileSimilarItemsWriter(resultFile));
+
+    System.out.println("Computed " + numSimilarities + " similarities for " + dataModel.getNumItems() + " items "
+        + "and saved them to " + resultFile.getAbsolutePath());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
new file mode 100644
index 0000000..7ee9b17
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute.example;
+
+import com.google.common.io.Files;
+import com.google.common.io.InputSupplier;
+import com.google.common.io.Resources;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.net.URL;
+import java.util.regex.Pattern;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
+import org.apache.mahout.common.iterator.FileLineIterable;
+
+public final class GroupLensDataModel extends FileDataModel {
+  
+  private static final String COLON_DELIMTER = "::";
+  private static final Pattern COLON_DELIMITER_PATTERN = Pattern.compile(COLON_DELIMTER);
+  
+  public GroupLensDataModel() throws IOException {
+    this(readResourceToTempFile("/org/apache/mahout/cf/taste/example/grouplens/ratings.dat"));
+  }
+  
+  /**
+   * @param ratingsFile GroupLens ratings.dat file in its native format
+   * @throws IOException if an error occurs while reading or writing files
+   */
+  public GroupLensDataModel(File ratingsFile) throws IOException {
+    super(convertGLFile(ratingsFile));
+  }
+  
+  private static File convertGLFile(File originalFile) throws IOException {
+    // Now translate the file; remove commas, then convert "::" delimiter to comma
+    File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "ratings.txt");
+    if (resultFile.exists()) {
+      resultFile.delete();
+    }
+    try (Writer writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8)){
+      for (String line : new FileLineIterable(originalFile, false)) {
+        int lastDelimiterStart = line.lastIndexOf(COLON_DELIMTER);
+        if (lastDelimiterStart < 0) {
+          throw new IOException("Unexpected input format on line: " + line);
+        }
+        String subLine = line.substring(0, lastDelimiterStart);
+        String convertedLine = COLON_DELIMITER_PATTERN.matcher(subLine).replaceAll(",");
+        writer.write(convertedLine);
+        writer.write('\n');
+      }
+    } catch (IOException ioe) {
+      resultFile.delete();
+      throw ioe;
+    }
+    return resultFile;
+  }
+
+  public static File readResourceToTempFile(String resourceName) throws IOException {
+    InputSupplier<? extends InputStream> inSupplier;
+    try {
+      URL resourceURL = Resources.getResource(GroupLensDataModel.class, resourceName);
+      inSupplier = Resources.newInputStreamSupplier(resourceURL);
+    } catch (IllegalArgumentException iae) {
+      File resourceFile = new File("src/main/java" + resourceName);
+      inSupplier = Files.newInputStreamSupplier(resourceFile);
+    }
+    File tempFile = File.createTempFile("taste", null);
+    tempFile.deleteOnExit();
+    Files.copy(inSupplier, tempFile);
+    return tempFile;
+  }
+
+  @Override
+  public String toString() {
+    return "GroupLensDataModel";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
new file mode 100644
index 0000000..5cec51c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import com.google.common.collect.ConcurrentHashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.SimpleDateFormat;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Locale;
+import java.util.Random;
+
+public final class NewsgroupHelper {
+  
+  private static final SimpleDateFormat[] DATE_FORMATS = {
+    new SimpleDateFormat("", Locale.ENGLISH),
+    new SimpleDateFormat("MMM-yyyy", Locale.ENGLISH),
+    new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH)
+  };
+
+  public static final int FEATURES = 10000;
+  // 1997-01-15 00:01:00 GMT
+  private static final long DATE_REFERENCE = 853286460;
+  private static final long MONTH = 30 * 24 * 3600;
+  private static final long WEEK = 7 * 24 * 3600;
+  
+  private final Random rand = RandomUtils.getRandom();  
+  private final Analyzer analyzer = new StandardAnalyzer();
+  private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
+  private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
+  
+  public FeatureVectorEncoder getEncoder() {
+    return encoder;
+  }
+  
+  public FeatureVectorEncoder getBias() {
+    return bias;
+  }
+  
+  public Random getRandom() {
+    return rand;
+  }
+
+  public Vector encodeFeatureVector(File file, int actual, int leakType, Multiset<String> overallCounts)
+    throws IOException {
+    long date = (long) (1000 * (DATE_REFERENCE + actual * MONTH + 1 * WEEK * rand.nextDouble()));
+    Multiset<String> words = ConcurrentHashMultiset.create();
+
+    try (BufferedReader reader = Files.newReader(file, Charsets.UTF_8)) {
+      String line = reader.readLine();
+      Reader dateString = new StringReader(DATE_FORMATS[leakType % 3].format(new Date(date)));
+      countWords(analyzer, words, dateString, overallCounts);
+      while (line != null && !line.isEmpty()) {
+        boolean countHeader = (
+                line.startsWith("From:") || line.startsWith("Subject:")
+                        || line.startsWith("Keywords:") || line.startsWith("Summary:")) && leakType < 6;
+        do {
+          Reader in = new StringReader(line);
+          if (countHeader) {
+            countWords(analyzer, words, in, overallCounts);
+          }
+          line = reader.readLine();
+        } while (line != null && line.startsWith(" "));
+      }
+      if (leakType < 3) {
+        countWords(analyzer, words, reader, overallCounts);
+      }
+    }
+
+    Vector v = new RandomAccessSparseVector(FEATURES);
+    bias.addToVector("", 1, v);
+    for (String word : words.elementSet()) {
+      encoder.addToVector(word, Math.log1p(words.count(word)), v);
+    }
+
+    return v;
+  }
+
+  public static void countWords(Analyzer analyzer,
+                                 Collection<String> words,
+                                 Reader in,
+                                 Multiset<String> overallCounts) throws IOException {
+    TokenStream ts = analyzer.tokenStream("text", in);
+    ts.addAttribute(CharTermAttribute.class);
+    ts.reset();
+    while (ts.incrementToken()) {
+      String s = ts.getAttribute(CharTermAttribute.class).toString();
+      words.add(s);
+    }
+    overallCounts.addAll(words);
+    ts.end();
+    Closeables.close(ts, true);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
new file mode 100644
index 0000000..16e9d80
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+/**
+ * Convert the labels created by the {@link org.apache.mahout.utils.email.MailProcessor} to one consumable
+ * by the classifiers
+ */
+public class PrepEmailMapper extends Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
+
+  private static final Pattern DASH_DOT = Pattern.compile("-|\\.");
+  private static final Pattern SLASH = Pattern.compile("\\/");
+
+  private boolean useListName = false; //if true, use the project name and the list name in label creation
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    useListName = Boolean.parseBoolean(context.getConfiguration().get(PrepEmailVectorsDriver.USE_LIST_NAME));
+  }
+
+  @Override
+  protected void map(WritableComparable<?> key, VectorWritable value, Context context)
+    throws IOException, InterruptedException {
+    String input = key.toString();
+    ///Example: /cocoon.apache.org/dev/200307.gz/001401c3414f$8394e160$1e01a8c0@WRPO
+    String[] splits = SLASH.split(input);
+    //we need the first two splits;
+    if (splits.length >= 3) {
+      StringBuilder bldr = new StringBuilder();
+      bldr.append(escape(splits[1]));
+      if (useListName) {
+        bldr.append('_').append(escape(splits[2]));
+      }
+      context.write(new Text(bldr.toString()), value);
+    }
+
+  }
+  
+  private static String escape(CharSequence value) {
+    return DASH_DOT.matcher(value).replaceAll("_").toLowerCase(Locale.ENGLISH);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
new file mode 100644
index 0000000..da6e613
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+public class PrepEmailReducer extends Reducer<Text, VectorWritable, Text, VectorWritable> {
+
+  private long maxItemsPerLabel = 10000;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    maxItemsPerLabel = Long.parseLong(context.getConfiguration().get(PrepEmailVectorsDriver.ITEMS_PER_CLASS));
+  }
+
+  @Override
+  protected void reduce(Text key, Iterable<VectorWritable> values, Context context)
+    throws IOException, InterruptedException {
+    //TODO: support randomization?  Likely not needed due to the SplitInput utility which does random selection
+    long i = 0;
+    Iterator<VectorWritable> iterator = values.iterator();
+    while (i < maxItemsPerLabel && iterator.hasNext()) {
+      context.write(key, iterator.next());
+      i++;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
new file mode 100644
index 0000000..8fba739
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.email;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.VectorWritable;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Convert the labels generated by {@link org.apache.mahout.text.SequenceFilesFromMailArchives} and
+ * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles} to ones consumable by the classifiers. We do this
+ * here b/c if it is done in the creation of sparse vectors, the Reducer collapses all the vectors.
+ */
+public class PrepEmailVectorsDriver extends AbstractJob {
+
+  public static final String ITEMS_PER_CLASS = "itemsPerClass";
+  public static final String USE_LIST_NAME = "USE_LIST_NAME";
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new PrepEmailVectorsDriver(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption("maxItemsPerLabel", "mipl", "The maximum number of items per label.  Can be useful for making the "
+        + "training sets the same size", String.valueOf(100000));
+    addOption(buildOption("useListName", "ul", "Use the name of the list as part of the label.  If not set, then "
+        + "just use the project name", false, false, "false"));
+    Map<String,List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+    Job convertJob = prepareJob(input, output, SequenceFileInputFormat.class, PrepEmailMapper.class, Text.class,
+        VectorWritable.class, PrepEmailReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+    convertJob.getConfiguration().set(ITEMS_PER_CLASS, getOption("maxItemsPerLabel"));
+    convertJob.getConfiguration().set(USE_LIST_NAME, String.valueOf(hasOption("useListName")));
+
+    boolean succeeded = convertJob.waitForCompletion(true);
+    return succeeded ? 0 : -1;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
new file mode 100644
index 0000000..9c0ef56
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
@@ -0,0 +1,277 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import com.google.common.io.Resources;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.math.Matrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * This class implements a sample program that uses a pre-tagged training data
+ * set to train an HMM model as a POS tagger. The training data is automatically
+ * downloaded from the following URL:
+ * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt It then
+ * trains an HMM Model using supervised learning and tests the model on the
+ * following test data set:
+ * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt Further
+ * details regarding the data files can be found at
+ * http://flexcrfs.sourceforge.net/#Case_Study
+ */
public final class PosTagger {

  private static final Logger log = LoggerFactory.getLogger(PosTagger.class);

  // Splits the single-space-separated columns of the CoNLL data files.
  private static final Pattern SPACE = Pattern.compile(" ");
  // Splits on runs of spaces when tokenizing a free-form sentence.
  private static final Pattern SPACES = Pattern.compile("[ ]+");

  /**
   * No public constructors for utility classes.
   */
  private PosTagger() {
    // nothing to do here really.
  }

  /**
   * Model trained in the example.
   */
  private static HmmModel taggingModel;

  /**
   * Map for storing the IDs for the POS tags (hidden states)
   */
  private static Map<String, Integer> tagIDs;

  /**
   * Counter for the next assigned POS tag ID The value of 0 is reserved for
   * "unknown POS tag"
   */
  // NOTE(review): unlike nextWordId below, this field is NOT initialized to 1,
  // so the first real POS tag is assigned ID 0 even though the javadoc says 0
  // is reserved for "unknown POS tag". The "nextTagId - 1" in trainModel's log
  // line likewise assumes a starting value of 1. Confirm whether this should
  // read "private static int nextTagId = 1;".
  private static int nextTagId;

  /**
   * Map for storing the IDs for observed words (observed states)
   */
  private static Map<String, Integer> wordIDs;

  /**
   * Counter for the next assigned word ID The value of 0 is reserved for
   * "unknown word"
   */
  private static int nextWordId = 1; // 0 is reserved for "unknown word"

  /**
   * Used for storing a list of POS tags of read sentences.
   */
  private static List<int[]> hiddenSequences;

  /**
   * Used for storing a list of word tags of read sentences.
   */
  private static List<int[]> observedSequences;

  /**
   * number of read lines
   */
  private static int readLines;

  /**
   * Given an URL, this function fetches the data file, parses it, assigns POS
   * Tag/word IDs and fills the hiddenSequences/observedSequences lists with
   * data from those files. The data is expected to be in the following format
   * (one word per line): word pos-tag np-tag sentences are closed with the .
   * pos tag
   *
   * <p>Resets the static {@code hiddenSequences}/{@code observedSequences}/
   * {@code readLines} state on every call; this class is not thread-safe.</p>
   *
   * @param url       Where the data file is stored
   * @param assignIDs Should IDs for unknown words/tags be assigned? (Needed for
   *                  training data, not needed for test data)
   * @throws IOException in case data file cannot be read.
   */
  private static void readFromURL(String url, boolean assignIDs) throws IOException {
    // initialize the data structure
    hiddenSequences = new LinkedList<>();
    observedSequences = new LinkedList<>();
    readLines = 0;

    // now read line by line of the input file
    List<Integer> observedSequence = new LinkedList<>();
    List<Integer> hiddenSequence = new LinkedList<>();

    for (String line :Resources.readLines(new URL(url), Charsets.UTF_8)) {
      if (line.isEmpty()) {
        // new sentence starts
        int[] observedSequenceArray = new int[observedSequence.size()];
        int[] hiddenSequenceArray = new int[hiddenSequence.size()];
        for (int i = 0; i < observedSequence.size(); ++i) {
          observedSequenceArray[i] = observedSequence.get(i);
          hiddenSequenceArray[i] = hiddenSequence.get(i);
        }
        // now register those arrays
        hiddenSequences.add(hiddenSequenceArray);
        observedSequences.add(observedSequenceArray);
        // and reset the linked lists
        observedSequence.clear();
        hiddenSequence.clear();
        continue;
      }
      readLines++;
      // we expect the format [word] [POS tag] [NP tag]
      String[] tags = SPACE.split(line);
      // when analyzing the training set, assign IDs
      if (assignIDs) {
        if (!wordIDs.containsKey(tags[0])) {
          wordIDs.put(tags[0], nextWordId++);
        }
        if (!tagIDs.containsKey(tags[1])) {
          tagIDs.put(tags[1], nextTagId++);
        }
      }
      // determine the IDs
      Integer wordID = wordIDs.get(tags[0]);
      Integer tagID = tagIDs.get(tags[1]);
      // now construct the current sequence
      // unseen words/tags map to the reserved ID 0 ("unknown")
      if (wordID == null) {
        observedSequence.add(0);
      } else {
        observedSequence.add(wordID);
      }

      if (tagID == null) {
        hiddenSequence.add(0);
      } else {
        hiddenSequence.add(tagID);
      }
    }

    // if there is still something in the pipe, register it
    // (handles files that do not end with a blank line)
    if (!observedSequence.isEmpty()) {
      int[] observedSequenceArray = new int[observedSequence.size()];
      int[] hiddenSequenceArray = new int[hiddenSequence.size()];
      for (int i = 0; i < observedSequence.size(); ++i) {
        observedSequenceArray[i] = observedSequence.get(i);
        hiddenSequenceArray[i] = hiddenSequence.get(i);
      }
      // now register those arrays
      hiddenSequences.add(hiddenSequenceArray);
      observedSequences.add(observedSequenceArray);
    }
  }

  /**
   * Downloads and parses the training file, trains a supervised HMM into
   * {@link #taggingModel}, then rewrites the "unknown word" emission column so
   * the NNP tag gets a higher emission probability than the other tags, and
   * re-normalizes the model.
   *
   * @param trainingURL location of the pre-tagged training data
   * @throws IOException if the training data cannot be fetched or read
   */
  private static void trainModel(String trainingURL) throws IOException {
    tagIDs = new HashMap<>(44); // we expect 44 distinct tags
    wordIDs = new HashMap<>(19122); // we expect 19122
    // distinct words
    log.info("Reading and parsing training data file from URL: {}", trainingURL);
    long start = System.currentTimeMillis();
    readFromURL(trainingURL, true);
    long end = System.currentTimeMillis();
    double duration = (end - start) / 1000.0;
    log.info("Parsing done in {} seconds!", duration);
    log.info("Read {} lines containing {} sentences with a total of {} distinct words and {} distinct POS tags.",
             readLines, hiddenSequences.size(), nextWordId - 1, nextTagId - 1);
    start = System.currentTimeMillis();
    taggingModel = HmmTrainer.trainSupervisedSequence(nextTagId, nextWordId,
        hiddenSequences, observedSequences, 0.05);
    // we have to adjust the model a bit,
    // since we assume a higher probability that a given unknown word is NNP
    // than anything else
    Matrix emissions = taggingModel.getEmissionMatrix();
    for (int i = 0; i < taggingModel.getNrOfHiddenStates(); ++i) {
      emissions.setQuick(i, 0, 0.1 / taggingModel.getNrOfHiddenStates());
    }
    // NOTE(review): throws NullPointerException if the training data contains
    // no "NNP" tag — presumably acceptable for this example; confirm.
    int nnptag = tagIDs.get("NNP");
    emissions.setQuick(nnptag, 0, 1 / (double) taggingModel.getNrOfHiddenStates());
    // re-normalize the emission probabilities
    HmmUtils.normalizeModel(taggingModel);
    // now register the names
    taggingModel.registerHiddenStateNames(tagIDs);
    taggingModel.registerOutputStateNames(wordIDs);
    end = System.currentTimeMillis();
    duration = (end - start) / 1000.0;
    log.info("Trained HMM models in {} seconds!", duration);
  }

  /**
   * Downloads and parses the test file (without assigning new IDs), decodes
   * each observed sequence with the trained model, and logs the per-token
   * error rate against the expected tags.
   *
   * @param testingURL location of the pre-tagged test data
   * @throws IOException if the test data cannot be fetched or read
   */
  private static void testModel(String testingURL) throws IOException {
    log.info("Reading and parsing test data file from URL: {}", testingURL);
    long start = System.currentTimeMillis();
    readFromURL(testingURL, false);
    long end = System.currentTimeMillis();
    double duration = (end - start) / 1000.0;
    log.info("Parsing done in {} seconds!", duration);
    log.info("Read {} lines containing {} sentences.", readLines, hiddenSequences.size());

    start = System.currentTimeMillis();
    int errorCount = 0;
    int totalCount = 0;
    for (int i = 0; i < observedSequences.size(); ++i) {
      // fetch the viterbi path as the POS tag for this observed sequence
      int[] posEstimate = HmmEvaluator.decode(taggingModel, observedSequences.get(i), false);
      // compare with the expected
      int[] posExpected = hiddenSequences.get(i);
      for (int j = 0; j < posExpected.length; ++j) {
        totalCount++;
        if (posEstimate[j] != posExpected[j]) {
          errorCount++;
        }
      }
    }
    end = System.currentTimeMillis();
    duration = (end - start) / 1000.0;
    log.info("POS tagged test file in {} seconds!", duration);
    double errorRate = (double) errorCount / totalCount;
    log.info("Tagged the test file with an error rate of: {}", errorRate);
  }

  /**
   * POS-tags a single sentence with the trained model.
   *
   * @param sentence sentence to tag; punctuation is isolated before tokenizing
   * @return decoded tag names, one per token of the sentence
   */
  private static List<String> tagSentence(String sentence) {
    // first, we need to isolate all punctuation characters, so that they
    // can be recognized
    sentence = sentence.replaceAll("[,.!?:;\"]", " $0 ");
    sentence = sentence.replaceAll("''", " '' ");
    // now we tokenize the sentence
    String[] tokens = SPACES.split(sentence);
    // now generate the observed sequence
    int[] observedSequence = HmmUtils.encodeStateSequence(taggingModel, Arrays.asList(tokens), true, 0);
    // POS tag this observedSequence
    int[] hiddenSequence = HmmEvaluator.decode(taggingModel, observedSequence, false);
    // and now decode the tag names
    return HmmUtils.decodeStateSequence(taggingModel, hiddenSequence, false, null);
  }

  /**
   * Trains on the CoNLL2000 training set, evaluates on its test set, and tags
   * one example sentence. Requires network access to the data URLs.
   */
  public static void main(String[] args) throws IOException {
    // generate the model from URL
    trainModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt");
    testModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt");
    // tag an exemplary sentence
    String test = "McDonalds is a huge company with many employees .";
    // NOTE(review): testWords is split on single spaces while tagSentence
    // re-tokenizes after isolating punctuation; the two align here only
    // because the example sentence already has its punctuation space-separated.
    String[] testWords = SPACE.split(test);
    List<String> posTags = tagSentence(test);
    for (int i = 0; i < posTags.size(); ++i) {
      log.info("{}[{}]", testWords[i], posTags.get(i));
    }
  }

}


[11/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java
new file mode 100644
index 0000000..a1cd3e0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java
@@ -0,0 +1,488 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Class containing several algorithms used to train a Hidden Markov Model. The
+ * three main algorithms are: supervised learning, unsupervised Viterbi and
+ * unsupervised Baum-Welch.
+ */
+public final class HmmTrainer {
+
+  /**
+   * No public constructor for utility classes.
+   */
+  private HmmTrainer() {
+    // nothing to do here really.
+  }
+
+  /**
+   * Create an supervised initial estimate of an HMM Model based on a sequence
+   * of observed and hidden states.
+   *
+   * @param nrOfHiddenStates The total number of hidden states
+   * @param nrOfOutputStates The total number of output states
+   * @param observedSequence Integer array containing the observed sequence
+   * @param hiddenSequence   Integer array containing the hidden sequence
+   * @param pseudoCount      Value that is assigned to non-occurring transitions to avoid zero
+   *                         probabilities.
+   * @return An initial model using the estimated parameters
+   */
+  public static HmmModel trainSupervised(int nrOfHiddenStates, int nrOfOutputStates, int[] observedSequence,
+      int[] hiddenSequence, double pseudoCount) {
+    // make sure the pseudo count is not zero
+    pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount;
+
+    // initialize the parameters
+    DenseMatrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
+    DenseMatrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
+    // assign a small initial probability that is larger than zero, so
+    // unseen states will not get a zero probability
+    transitionMatrix.assign(pseudoCount);
+    emissionMatrix.assign(pseudoCount);
+    // given no prior knowledge, we have to assume that all initial hidden
+    // states are equally likely
+    DenseVector initialProbabilities = new DenseVector(nrOfHiddenStates);
+    initialProbabilities.assign(1.0 / nrOfHiddenStates);
+
+    // now loop over the sequences to count the number of transitions
+    countTransitions(transitionMatrix, emissionMatrix, observedSequence,
+        hiddenSequence);
+
+    // make sure that probabilities are normalized
+    for (int i = 0; i < nrOfHiddenStates; i++) {
+      // compute sum of probabilities for current row of transition matrix
+      double sum = 0;
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        sum += transitionMatrix.getQuick(i, j);
+      }
+      // normalize current row of transition matrix
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) / sum);
+      }
+      // compute sum of probabilities for current row of emission matrix
+      sum = 0;
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        sum += emissionMatrix.getQuick(i, j);
+      }
+      // normalize current row of emission matrix
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        emissionMatrix.setQuick(i, j, emissionMatrix.getQuick(i, j) / sum);
+      }
+    }
+
+    // return a new model using the parameter estimations
+    return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
+  }
+
+  /**
+   * Function that counts the number of state->state and state->output
+   * transitions for the given observed/hidden sequence.
+   *
+   * @param transitionMatrix transition matrix to use.
+   * @param emissionMatrix emission matrix to use for counting.
+   * @param observedSequence observation sequence to use.
+   * @param hiddenSequence sequence of hidden states to use.
+   */
+  private static void countTransitions(Matrix transitionMatrix,
+                                       Matrix emissionMatrix, int[] observedSequence, int[] hiddenSequence) {
+    emissionMatrix.setQuick(hiddenSequence[0], observedSequence[0],
+        emissionMatrix.getQuick(hiddenSequence[0], observedSequence[0]) + 1);
+    for (int i = 1; i < observedSequence.length; ++i) {
+      transitionMatrix
+          .setQuick(hiddenSequence[i - 1], hiddenSequence[i], transitionMatrix
+              .getQuick(hiddenSequence[i - 1], hiddenSequence[i]) + 1);
+      emissionMatrix.setQuick(hiddenSequence[i], observedSequence[i],
+          emissionMatrix.getQuick(hiddenSequence[i], observedSequence[i]) + 1);
+    }
+  }
+
+  /**
+   * Create an supervised initial estimate of an HMM Model based on a number of
+   * sequences of observed and hidden states.
+   *
+   * @param nrOfHiddenStates The total number of hidden states
+   * @param nrOfOutputStates The total number of output states
+   * @param hiddenSequences Collection of hidden sequences to use for training
+   * @param observedSequences Collection of observed sequences to use for training associated with hidden sequences.
+   * @param pseudoCount      Value that is assigned to non-occurring transitions to avoid zero
+   *                         probabilities.
+   * @return An initial model using the estimated parameters
+   */
+  public static HmmModel trainSupervisedSequence(int nrOfHiddenStates,
+                                                 int nrOfOutputStates, Collection<int[]> hiddenSequences,
+                                                 Collection<int[]> observedSequences, double pseudoCount) {
+
+    // make sure the pseudo count is not zero
+    pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount;
+
+    // initialize parameters
+    DenseMatrix transitionMatrix = new DenseMatrix(nrOfHiddenStates,
+        nrOfHiddenStates);
+    DenseMatrix emissionMatrix = new DenseMatrix(nrOfHiddenStates,
+        nrOfOutputStates);
+    DenseVector initialProbabilities = new DenseVector(nrOfHiddenStates);
+
+    // assign pseudo count to avoid zero probabilities
+    transitionMatrix.assign(pseudoCount);
+    emissionMatrix.assign(pseudoCount);
+    initialProbabilities.assign(pseudoCount);
+
+    // now loop over the sequences to count the number of transitions
+    Iterator<int[]> hiddenSequenceIt = hiddenSequences.iterator();
+    Iterator<int[]> observedSequenceIt = observedSequences.iterator();
+    while (hiddenSequenceIt.hasNext() && observedSequenceIt.hasNext()) {
+      // fetch the current set of sequences
+      int[] hiddenSequence = hiddenSequenceIt.next();
+      int[] observedSequence = observedSequenceIt.next();
+      // increase the count for initial probabilities
+      initialProbabilities.setQuick(hiddenSequence[0], initialProbabilities
+          .getQuick(hiddenSequence[0]) + 1);
+      countTransitions(transitionMatrix, emissionMatrix, observedSequence,
+          hiddenSequence);
+    }
+
+    // make sure that probabilities are normalized
+    double isum = 0; // sum of initial probabilities
+    for (int i = 0; i < nrOfHiddenStates; i++) {
+      isum += initialProbabilities.getQuick(i);
+      // compute sum of probabilities for current row of transition matrix
+      double sum = 0;
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        sum += transitionMatrix.getQuick(i, j);
+      }
+      // normalize current row of transition matrix
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) / sum);
+      }
+      // compute sum of probabilities for current row of emission matrix
+      sum = 0;
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        sum += emissionMatrix.getQuick(i, j);
+      }
+      // normalize current row of emission matrix
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        emissionMatrix.setQuick(i, j, emissionMatrix.getQuick(i, j) / sum);
+      }
+    }
+    // normalize the initial probabilities
+    for (int i = 0; i < nrOfHiddenStates; ++i) {
+      initialProbabilities.setQuick(i, initialProbabilities.getQuick(i) / isum);
+    }
+
+    // return a new model using the parameter estimates
+    return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
+  }
+
+  /**
+   * Iteratively train the parameters of the given initial model wrt to the
+   * observed sequence using Viterbi training.
+   *
+   * @param initialModel     The initial model that gets iterated
+   * @param observedSequence The sequence of observed states
+   * @param pseudoCount      Value that is assigned to non-occurring transitions to avoid zero
+   *                         probabilities.
+   * @param epsilon          Convergence criteria
+   * @param maxIterations    The maximum number of training iterations
+   * @param scaled           Use Log-scaled implementation, this is computationally more
+   *                         expensive but offers better numerical stability for large observed
+   *                         sequences
+   * @return The iterated model
+   */
+  public static HmmModel trainViterbi(HmmModel initialModel,
+                                      int[] observedSequence, double pseudoCount, double epsilon,
+                                      int maxIterations, boolean scaled) {
+
+    // make sure the pseudo count is not zero; a zero pseudo count would pin
+    // never-observed transitions/emissions at probability 0 forever
+    pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount;
+
+    // allocate space for iteration models; initialModel itself is never mutated
+    HmmModel lastIteration = initialModel.clone();
+    HmmModel iteration = initialModel.clone();
+
+    // allocate space for Viterbi path calculation
+    // (phi/delta are scratch buffers for HmmAlgorithms.viterbiAlgorithm —
+    // presumably backpointers and path scores; see that method's contract)
+    int[] viterbiPath = new int[observedSequence.length];
+    int[][] phi = new int[observedSequence.length - 1][initialModel
+        .getNrOfHiddenStates()];
+    double[][] delta = new double[observedSequence.length][initialModel
+        .getNrOfHiddenStates()];
+
+    // now run the Viterbi training iteration
+    for (int i = 0; i < maxIterations; ++i) {
+      // compute the Viterbi path for the current model estimate
+      HmmAlgorithms.viterbiAlgorithm(viterbiPath, delta, phi, lastIteration,
+          observedSequence, scaled);
+      // Viterbi iteration uses the viterbi path to update
+      // the probabilities
+      Matrix emissionMatrix = iteration.getEmissionMatrix();
+      Matrix transitionMatrix = iteration.getTransitionMatrix();
+
+      // first, assign the pseudo count to every entry so counting starts
+      // from a small positive floor instead of zero
+      emissionMatrix.assign(pseudoCount);
+      transitionMatrix.assign(pseudoCount);
+
+      // now count the transitions observed along the Viterbi path
+      countTransitions(transitionMatrix, emissionMatrix, observedSequence,
+          viterbiPath);
+
+      // and normalize the counts into row-stochastic probabilities, in place
+      for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
+        double sum = 0;
+        // normalize the rows of the transition matrix
+        for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
+          sum += transitionMatrix.getQuick(j, k);
+        }
+        for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
+          transitionMatrix
+              .setQuick(j, k, transitionMatrix.getQuick(j, k) / sum);
+        }
+        // normalize the rows of the emission matrix
+        sum = 0;
+        for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
+          sum += emissionMatrix.getQuick(j, k);
+        }
+        for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
+          emissionMatrix.setQuick(j, k, emissionMatrix.getQuick(j, k) / sum);
+        }
+      }
+      // check for convergence (distance between consecutive models < epsilon)
+      if (checkConvergence(lastIteration, iteration, epsilon)) {
+        break;
+      }
+      // overwrite the last iterated model by the new iteration
+      lastIteration.assign(iteration);
+    }
+    // we are done :)
+    return iteration;
+  }
+
+  /**
+   * Iteratively train the parameters of the given initial model wrt the
+   * observed sequence using Baum-Welch training.
+   *
+   * @param initialModel     The initial model that gets iterated
+   * @param observedSequence The sequence of observed states
+   * @param epsilon          Convergence criteria
+   * @param maxIterations    The maximum number of training iterations
+   * @param scaled           Use log-scaled implementations of forward/backward algorithm. This
+   *                         is computationally more expensive, but offers better numerical
+   *                         stability for long output sequences.
+   * @return The iterated model
+   */
+  public static HmmModel trainBaumWelch(HmmModel initialModel,
+                                        int[] observedSequence, double epsilon, int maxIterations, boolean scaled) {
+    // allocate space for the iterations; initialModel itself is never mutated
+    HmmModel lastIteration = initialModel.clone();
+    HmmModel iteration = initialModel.clone();
+
+    // allocate space for baum-welch factors (reused across iterations):
+    // alpha = forward factors, beta = backward factors, one row per time step
+    int hiddenCount = initialModel.getNrOfHiddenStates();
+    int visibleCount = observedSequence.length;
+    Matrix alpha = new DenseMatrix(visibleCount, hiddenCount);
+    Matrix beta = new DenseMatrix(visibleCount, hiddenCount);
+
+    // now run the baum Welch training iteration
+    for (int it = 0; it < maxIterations; ++it) {
+      // fetch emission and transition matrix of current iteration
+      Vector initialProbabilities = iteration.getInitialProbabilities();
+      Matrix emissionMatrix = iteration.getEmissionMatrix();
+      Matrix transitionMatrix = iteration.getTransitionMatrix();
+
+      // compute forward and backward factors for the current model
+      HmmAlgorithms.forwardAlgorithm(alpha, iteration, observedSequence, scaled);
+      HmmAlgorithms.backwardAlgorithm(beta, iteration, observedSequence, scaled);
+
+      // re-estimate the model parameters in place (E+M step)
+      if (scaled) {
+        logScaledBaumWelch(observedSequence, iteration, alpha, beta);
+      } else {
+        unscaledBaumWelch(observedSequence, iteration, alpha, beta);
+      }
+      // normalize transition/emission probabilities
+      // and normalize the probabilities
+      double isum = 0;
+      for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
+        double sum = 0;
+        // normalize the rows of the transition matrix
+        for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
+          sum += transitionMatrix.getQuick(j, k);
+        }
+        for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
+          transitionMatrix
+              .setQuick(j, k, transitionMatrix.getQuick(j, k) / sum);
+        }
+        // normalize the rows of the emission matrix
+        sum = 0;
+        for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
+          sum += emissionMatrix.getQuick(j, k);
+        }
+        for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
+          emissionMatrix.setQuick(j, k, emissionMatrix.getQuick(j, k) / sum);
+        }
+        // normalization parameter for initial probabilities
+        isum += initialProbabilities.getQuick(j);
+      }
+      // normalize initial probabilities
+      for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+        initialProbabilities.setQuick(i, initialProbabilities.getQuick(i)
+            / isum);
+      }
+      // check for convergence (distance between consecutive models < epsilon)
+      if (checkConvergence(lastIteration, iteration, epsilon)) {
+        break;
+      }
+      // overwrite the last iterated model by the new iteration
+      lastIteration.assign(iteration);
+    }
+    // we are done :)
+    return iteration;
+  }
+
+  /**
+   * One Baum-Welch re-estimation step in linear (unscaled) probability space.
+   * Overwrites the given model's initial, transition and emission
+   * probabilities in place with unnormalized estimates; the caller
+   * (trainBaumWelch) normalizes them afterwards.
+   *
+   * @param observedSequence sequence of observed output-state IDs
+   * @param iteration        model updated in place with the new estimates
+   * @param alpha            forward factors from HmmAlgorithms.forwardAlgorithm (unscaled)
+   * @param beta             backward factors from HmmAlgorithms.backwardAlgorithm (unscaled)
+   */
+  private static void unscaledBaumWelch(int[] observedSequence, HmmModel iteration, Matrix alpha, Matrix beta) {
+    Vector initialProbabilities = iteration.getInitialProbabilities();
+    Matrix emissionMatrix = iteration.getEmissionMatrix();
+    Matrix transitionMatrix = iteration.getTransitionMatrix();
+    double modelLikelihood = HmmEvaluator.modelLikelihood(alpha, false);
+
+    // initial probabilities: alpha * beta at t = 0 (normalized by caller)
+    for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+      initialProbabilities.setQuick(i, alpha.getQuick(0, i)
+          * beta.getQuick(0, i));
+    }
+
+    // recompute transition probabilities
+    for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+      for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
+        double temp = 0;
+        for (int t = 0; t < observedSequence.length - 1; ++t) {
+          temp += alpha.getQuick(t, i)
+              * emissionMatrix.getQuick(j, observedSequence[t + 1])
+              * beta.getQuick(t + 1, j);
+        }
+        // reads and overwrites transitionMatrix(i, j) — relies on the old
+        // value still being in place at this point of the loop
+        transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j)
+            * temp / modelLikelihood);
+      }
+    }
+    // recompute emission probabilities
+    for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+      for (int j = 0; j < iteration.getNrOfOutputStates(); ++j) {
+        double temp = 0;
+        for (int t = 0; t < observedSequence.length; ++t) {
+          // delta tensor: only time steps that actually emitted j contribute
+          if (observedSequence[t] == j) {
+            temp += alpha.getQuick(t, i) * beta.getQuick(t, i);
+          }
+        }
+        emissionMatrix.setQuick(i, j, temp / modelLikelihood);
+      }
+    }
+  }
+
+  /**
+   * One Baum-Welch re-estimation step in log space, for numerical stability
+   * on long sequences. Overwrites the given model's initial, transition and
+   * emission probabilities in place with unnormalized estimates; the caller
+   * (trainBaumWelch) normalizes them afterwards.
+   *
+   * @param observedSequence sequence of observed output-state IDs
+   * @param iteration        model updated in place with the new estimates
+   * @param alpha            log-scaled forward factors from HmmAlgorithms.forwardAlgorithm
+   * @param beta             log-scaled backward factors from HmmAlgorithms.backwardAlgorithm
+   */
+  private static void logScaledBaumWelch(int[] observedSequence, HmmModel iteration, Matrix alpha, Matrix beta) {
+    Vector initialProbabilities = iteration.getInitialProbabilities();
+    Matrix emissionMatrix = iteration.getEmissionMatrix();
+    Matrix transitionMatrix = iteration.getTransitionMatrix();
+    double modelLikelihood = HmmEvaluator.modelLikelihood(alpha, true);
+
+    // initial probabilities: exp(log alpha + log beta) at t = 0
+    for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+      initialProbabilities.setQuick(i, Math.exp(alpha.getQuick(0, i) + beta.getQuick(0, i)));
+    }
+
+    // recompute transition probabilities
+    for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+      for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
+        double sum = Double.NEGATIVE_INFINITY; // log(0)
+        for (int t = 0; t < observedSequence.length - 1; ++t) {
+          double temp = alpha.getQuick(t, i)
+              + Math.log(emissionMatrix.getQuick(j, observedSequence[t + 1]))
+              + beta.getQuick(t + 1, j);
+          if (temp > Double.NEGATIVE_INFINITY) {
+            // handle 0-probabilities: log-add, sum = log(exp(sum) + exp(temp))
+            // NOTE(review): this pivots on temp rather than max(sum, temp), so
+            // exp(sum - temp) can overflow when sum >> temp — confirm intended
+            sum = temp + Math.log1p(Math.exp(sum - temp));
+          }
+        }
+        transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j)
+            * Math.exp(sum - modelLikelihood));
+      }
+    }
+    // recompute emission probabilities
+    for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+      for (int j = 0; j < iteration.getNrOfOutputStates(); ++j) {
+        double sum = Double.NEGATIVE_INFINITY; // log(0)
+        for (int t = 0; t < observedSequence.length; ++t) {
+          // delta tensor: only time steps that actually emitted j contribute
+          if (observedSequence[t] == j) {
+            double temp = alpha.getQuick(t, i) + beta.getQuick(t, i);
+            if (temp > Double.NEGATIVE_INFINITY) {
+              // handle 0-probabilities (same log-add as above)
+              sum = temp + Math.log1p(Math.exp(sum - temp));
+            }
+          }
+        }
+        emissionMatrix.setQuick(i, j, Math.exp(sum - modelLikelihood));
+      }
+    }
+  }
+
+  /**
+   * Check convergence of two HMM models by computing a simple distance
+   * between their emission / transition matrices: the sum of the Frobenius
+   * distances of the two matrix pairs.
+   *
+   * @param oldModel Old HMM Model
+   * @param newModel New HMM Model
+   * @param epsilon  Convergence Factor
+   * @return true if training converged to a stable state.
+   */
+  private static boolean checkConvergence(HmmModel oldModel, HmmModel newModel,
+                                          double epsilon) {
+    int hiddenStates = oldModel.getNrOfHiddenStates();
+    int outputStates = oldModel.getNrOfOutputStates();
+    // Frobenius distance between old and new transition matrices
+    Matrix oldTransitions = oldModel.getTransitionMatrix();
+    Matrix newTransitions = newModel.getTransitionMatrix();
+    double squaredSum = 0;
+    for (int row = 0; row < hiddenStates; ++row) {
+      for (int col = 0; col < hiddenStates; ++col) {
+        double delta = oldTransitions.getQuick(row, col)
+            - newTransitions.getQuick(row, col);
+        squaredSum += delta * delta;
+      }
+    }
+    double distance = Math.sqrt(squaredSum);
+    // add the Frobenius distance between old and new emission matrices
+    Matrix oldEmissions = oldModel.getEmissionMatrix();
+    Matrix newEmissions = newModel.getEmissionMatrix();
+    squaredSum = 0;
+    for (int row = 0; row < hiddenStates; ++row) {
+      for (int col = 0; col < outputStates; ++col) {
+        double delta = oldEmissions.getQuick(row, col)
+            - newEmissions.getQuick(row, col);
+        squaredSum += delta * delta;
+      }
+    }
+    distance += Math.sqrt(squaredSum);
+    // converged once the combined distance drops below epsilon :)
+    return distance < epsilon;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java
new file mode 100644
index 0000000..e710816
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SparseMatrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * A collection of utilities for handling HMMModel objects.
+ */
+public final class HmmUtils {
+
+  /**
+   * No public constructor for utility classes.
+   */
+  private HmmUtils() {
+    // nothing to do here really.
+  }
+
+  /**
+   * Compute the cumulative transition probability matrix for the given HMM
+   * model. Matrix where each row i is the cumulative distribution of the
+   * transition probability distribution for hidden state i.
+   *
+   * @param model The HMM model for which the cumulative transition matrix should be
+   *              computed
+   * @return The computed cumulative transition matrix.
+   */
+  public static Matrix getCumulativeTransitionMatrix(HmmModel model) {
+    // fetch the needed parameters from the model
+    int hiddenStates = model.getNrOfHiddenStates();
+    Matrix transitionMatrix = model.getTransitionMatrix();
+    // now compute the cumulative transition matrix
+    Matrix resultMatrix = new DenseMatrix(hiddenStates, hiddenStates);
+    for (int i = 0; i < hiddenStates; ++i) {
+      double sum = 0;
+      for (int j = 0; j < hiddenStates; ++j) {
+        sum += transitionMatrix.get(i, j);
+        resultMatrix.set(i, j, sum);
+      }
+      resultMatrix.set(i, hiddenStates - 1, 1.0);
+      // make sure the last
+      // state has always a
+      // cumulative
+      // probability of
+      // exactly 1.0
+    }
+    return resultMatrix;
+  }
+
+  /**
+   * Compute the cumulative output probability matrix for the given HMM model.
+   * Matrix where each row i is the cumulative distribution of the output
+   * probability distribution for hidden state i.
+   *
+   * @param model The HMM model for which the cumulative output matrix should be
+   *              computed
+   * @return The computed cumulative output matrix.
+   */
+  public static Matrix getCumulativeOutputMatrix(HmmModel model) {
+    // fetch the needed parameters from the model
+    int hiddenStates = model.getNrOfHiddenStates();
+    int outputStates = model.getNrOfOutputStates();
+    Matrix outputMatrix = model.getEmissionMatrix();
+    // now compute the cumulative output matrix
+    Matrix resultMatrix = new DenseMatrix(hiddenStates, outputStates);
+    for (int i = 0; i < hiddenStates; ++i) {
+      double sum = 0;
+      for (int j = 0; j < outputStates; ++j) {
+        sum += outputMatrix.get(i, j);
+        resultMatrix.set(i, j, sum);
+      }
+      resultMatrix.set(i, outputStates - 1, 1.0);
+      // make sure the last
+      // output state has
+      // always a cumulative
+      // probability of 1.0
+    }
+    return resultMatrix;
+  }
+
+  /**
+   * Compute the cumulative distribution of the initial hidden state
+   * probabilities for the given HMM model.
+   *
+   * @param model The HMM model for which the cumulative initial state probabilities
+   *              should be computed
+   * @return The computed cumulative initial state probability vector.
+   */
+  public static Vector getCumulativeInitialProbabilities(HmmModel model) {
+    // fetch the needed parameters from the model
+    int hiddenStates = model.getNrOfHiddenStates();
+    Vector initialProbabilities = model.getInitialProbabilities();
+    // now compute the cumulative output matrix
+    Vector resultVector = new DenseVector(initialProbabilities.size());
+    double sum = 0;
+    for (int i = 0; i < hiddenStates; ++i) {
+      sum += initialProbabilities.get(i);
+      resultVector.set(i, sum);
+    }
+    resultVector.set(hiddenStates - 1, 1.0); // make sure the last initial
+    // hidden state probability
+    // has always a cumulative
+    // probability of 1.0
+    return resultVector;
+  }
+
+  /**
+   * Validates an HMM model set
+   *
+   * @param model model to sanity check.
+   */
+  public static void validate(HmmModel model) {
+    if (model == null) {
+      return; // empty models are valid
+    }
+
+    /*
+     * The number of hidden states is positive.
+     */
+    Preconditions.checkArgument(model.getNrOfHiddenStates() > 0,
+      "Error: The number of hidden states has to be greater than 0");
+    
+    /*
+     * The number of output states is positive.
+     */
+    Preconditions.checkArgument(model.getNrOfOutputStates() > 0,
+      "Error: The number of output states has to be greater than 0!");
+
+    /*
+     * The size of the vector of initial probabilities is equal to the number of
+     * the hidden states. Each initial probability is non-negative. The sum of
+     * initial probabilities is equal to 1.
+     */
+    Preconditions.checkArgument(model.getInitialProbabilities() != null
+      && model.getInitialProbabilities().size() == model.getNrOfHiddenStates(),
+      "Error: The vector of initial probabilities is not initialized!");
+    
+    double sum = 0;
+    for (int i = 0; i < model.getInitialProbabilities().size(); i++) {
+      Preconditions.checkArgument(model.getInitialProbabilities().get(i) >= 0,
+        "Error: Initial probability of state %d is negative", i);
+      sum += model.getInitialProbabilities().get(i);
+    }
+    Preconditions.checkArgument(Math.abs(sum - 1) <= 0.00001,
+                                "Error: Initial probabilities do not add up to 1");
+    /*
+     * The row size of the output matrix is equal to the number of the hidden
+     * states. The column size is equal to the number of output states. Each
+     * probability of the matrix is non-negative. The sum of each row is equal
+     * to 1.
+     */
+    Preconditions.checkNotNull(model.getEmissionMatrix(), "Error: The output state matrix is not initialized!");
+    Preconditions.checkArgument(model.getEmissionMatrix().numRows() == model.getNrOfHiddenStates()
+        && model.getEmissionMatrix().numCols() == model.getNrOfOutputStates(),
+        "Error: The output state matrix is not of the form nrOfHiddenStates x nrOfOutputStates");
+    for (int i = 0; i < model.getEmissionMatrix().numRows(); i++) {
+      sum = 0;
+      for (int j = 0; j < model.getEmissionMatrix().numCols(); j++) {
+        Preconditions.checkArgument(model.getEmissionMatrix().get(i, j) >= 0,
+            "The output state probability from hidden state " + i + " to output state " + j + " is negative");
+        sum += model.getEmissionMatrix().get(i, j);
+      }
+      Preconditions.checkArgument(Math.abs(sum - 1) <= 0.00001,
+        "Error: The output state probabilities for hidden state %d don't add up to 1", i);
+    }
+
+    /*
+     * The size of both dimension of the transition matrix is equal to the
+     * number of the hidden states. Each probability of the matrix is
+     * non-negative. The sum of each row in transition matrix is equal to 1.
+     */
+    Preconditions.checkArgument(model.getTransitionMatrix() != null,
+      "Error: The hidden state matrix is not initialized!");
+    Preconditions.checkArgument(model.getTransitionMatrix().numRows() == model.getNrOfHiddenStates()
+      && model.getTransitionMatrix().numCols() == model.getNrOfHiddenStates(),
+      "Error: The output state matrix is not of the form nrOfHiddenStates x nrOfHiddenStates");
+    for (int i = 0; i < model.getTransitionMatrix().numRows(); i++) {
+      sum = 0;
+      for (int j = 0; j < model.getTransitionMatrix().numCols(); j++) {
+        Preconditions.checkArgument(model.getTransitionMatrix().get(i, j) >= 0,
+          "Error: The transition probability from hidden state %d to hidden state %d is negative", i, j);
+        sum += model.getTransitionMatrix().get(i, j);
+      }
+      Preconditions.checkArgument(Math.abs(sum - 1) <= 0.00001,
+        "Error: The transition probabilities for hidden state " + i + " don't add up to 1.");
+    }
+  }
+
+  /**
+   * Encodes a given collection of state names by the corresponding state IDs
+   * registered in a given model.
+   *
+   * @param model        Model to provide the encoding for
+   * @param sequence     Collection of state names
+   * @param observed     If set, the sequence is encoded as a sequence of observed states,
+   *                     else it is encoded as sequence of hidden states
+   * @param defaultValue The default value in case a state is not known
+   * @return integer array containing the encoded state IDs
+   */
+  public static int[] encodeStateSequence(HmmModel model,
+                                          Collection<String> sequence, boolean observed, int defaultValue) {
+    int[] encoded = new int[sequence.size()];
+    Iterator<String> seqIter = sequence.iterator();
+    for (int i = 0; i < sequence.size(); ++i) {
+      String nextState = seqIter.next();
+      int nextID;
+      if (observed) {
+        nextID = model.getOutputStateID(nextState);
+      } else {
+        nextID = model.getHiddenStateID(nextState);
+      }
+      // if the ID is -1, use the default value
+      encoded[i] = nextID < 0 ? defaultValue : nextID;
+    }
+    return encoded;
+  }
+
+  /**
+   * Decodes a given collection of state IDs into the corresponding state names
+   * registered in a given model.
+   *
+   * @param model        model to use for retrieving state names
+   * @param sequence     int array of state IDs
+   * @param observed     If set, the sequence is encoded as a sequence of observed states,
+   *                     else it is encoded as sequence of hidden states
+   * @param defaultValue The default value in case a state is not known
+   * @return list containing the decoded state names
+   */
+  public static List<String> decodeStateSequence(HmmModel model,
+                                                 int[] sequence,
+                                                 boolean observed,
+                                                 String defaultValue) {
+    List<String> decoded = new ArrayList<>(sequence.length);
+    for (int position : sequence) {
+      String nextState;
+      if (observed) {
+        nextState = model.getOutputStateName(position);
+      } else {
+        nextState = model.getHiddenStateName(position);
+      }
+      // if null was returned, use the default value
+      decoded.add(nextState == null ? defaultValue : nextState);
+    }
+    return decoded;
+  }
+
+  /**
+   * Function used to normalize the probabilities of a given HMM model
+   *
+   * @param model model to normalize
+   */
+  public static void normalizeModel(HmmModel model) {
+    Vector ip = model.getInitialProbabilities();
+    Matrix emission = model.getEmissionMatrix();
+    Matrix transition = model.getTransitionMatrix();
+    // check normalization for all probabilities
+    double isum = 0;
+    for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+      isum += ip.getQuick(i);
+      double sum = 0;
+      for (int j = 0; j < model.getNrOfHiddenStates(); ++j) {
+        sum += transition.getQuick(i, j);
+      }
+      if (sum != 1.0) {
+        for (int j = 0; j < model.getNrOfHiddenStates(); ++j) {
+          transition.setQuick(i, j, transition.getQuick(i, j) / sum);
+        }
+      }
+      sum = 0;
+      for (int j = 0; j < model.getNrOfOutputStates(); ++j) {
+        sum += emission.getQuick(i, j);
+      }
+      if (sum != 1.0) {
+        for (int j = 0; j < model.getNrOfOutputStates(); ++j) {
+          emission.setQuick(i, j, emission.getQuick(i, j) / sum);
+        }
+      }
+    }
+    if (isum != 1.0) {
+      for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+        ip.setQuick(i, ip.getQuick(i) / isum);
+      }
+    }
+  }
+
+  /**
+   * Method to reduce the size of an HMMmodel by converting the models
+   * DenseMatrix/DenseVectors to sparse implementations and setting every value
+   * < threshold to 0
+   *
+   * @param model model to truncate
+   * @param threshold minimum value a model entry must have to be retained.
+   * @return Truncated model
+   */
+  public static HmmModel truncateModel(HmmModel model, double threshold) {
+    Vector ip = model.getInitialProbabilities();
+    Matrix em = model.getEmissionMatrix();
+    Matrix tr = model.getTransitionMatrix();
+    // allocate the sparse data structures
+    RandomAccessSparseVector sparseIp = new RandomAccessSparseVector(model
+        .getNrOfHiddenStates());
+    SparseMatrix sparseEm = new SparseMatrix(model.getNrOfHiddenStates(), model.getNrOfOutputStates());
+    SparseMatrix sparseTr = new SparseMatrix(model.getNrOfHiddenStates(), model.getNrOfHiddenStates());
+    // now transfer the values
+    for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+      double value = ip.getQuick(i);
+      if (value > threshold) {
+        sparseIp.setQuick(i, value);
+      }
+      for (int j = 0; j < model.getNrOfHiddenStates(); ++j) {
+        value = tr.getQuick(i, j);
+        if (value > threshold) {
+          sparseTr.setQuick(i, j, value);
+        }
+      }
+
+      for (int j = 0; j < model.getNrOfOutputStates(); ++j) {
+        value = em.getQuick(i, j);
+        if (value > threshold) {
+          sparseEm.setQuick(i, j, value);
+        }
+      }
+    }
+    // create a new model
+    HmmModel sparseModel = new HmmModel(sparseTr, sparseEm, sparseIp);
+    // normalize the model
+    normalizeModel(sparseModel);
+    // register the names
+    sparseModel.registerHiddenStateNames(model.getHiddenStateNames());
+    sparseModel.registerOutputStateNames(model.getOutputStateNames());
+    // and return
+    return sparseModel;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java
new file mode 100644
index 0000000..d0ae9c2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Utils for serializing Writable parts of HmmModel (that means without hidden state names and so on)
+ */
+final class LossyHmmSerializer {
+
+  private LossyHmmSerializer() {
+  }
+
+  /**
+   * Writes the Writable parts of the model: emission matrix, then transition
+   * matrix, then the initial probability vector.
+   */
+  static void serialize(HmmModel model, DataOutput output) throws IOException {
+    MatrixWritable matrixWritable = new MatrixWritable(model.getEmissionMatrix());
+    matrixWritable.write(output);
+    // reuse the same writable for the transition matrix
+    matrixWritable.set(model.getTransitionMatrix());
+    matrixWritable.write(output);
+
+    new VectorWritable(model.getInitialProbabilities()).write(output);
+  }
+
+  /**
+   * Reads a model back in the exact order serialize() wrote it:
+   * emission matrix, transition matrix, initial probability vector.
+   */
+  static HmmModel deserialize(DataInput input) throws IOException {
+    MatrixWritable matrixReader = new MatrixWritable();
+    matrixReader.readFields(input);
+    Matrix emission = matrixReader.get();
+
+    matrixReader.readFields(input);
+    Matrix transition = matrixReader.get();
+
+    VectorWritable vectorReader = new VectorWritable();
+    vectorReader.readFields(input);
+    Vector initial = vectorReader.get();
+
+    return new HmmModel(transition, emission, initial);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java
new file mode 100644
index 0000000..02baef1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.CommandLineUtil;
+
+/**
+ * Command-line tool for generating random sequences by given HMM
+ */
+public final class RandomSequenceGenerator {
+
+  private RandomSequenceGenerator() {
+  }
+
+  public static void main(String[] args) throws IOException {
+    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+
+    Option outputOption = optionBuilder.withLongName("output").
+      withDescription("Output file with sequence of observed states").
+      withShortName("o").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("path").create()).withRequired(false).create();
+
+    Option modelOption = optionBuilder.withLongName("model").
+      withDescription("Path to serialized HMM model").
+      withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("path").create()).withRequired(true).create();
+
+    Option lengthOption = optionBuilder.withLongName("length").
+      withDescription("Length of generated sequence").
+      withShortName("l").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Group optionGroup = new GroupBuilder().
+      withOption(outputOption).withOption(modelOption).withOption(lengthOption).
+      withName("Options").create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(optionGroup);
+      CommandLine commandLine = parser.parse(args);
+
+      String output = (String) commandLine.getValue(outputOption);
+
+      String modelPath = (String) commandLine.getValue(modelOption);
+
+      int length = Integer.parseInt((String) commandLine.getValue(lengthOption));
+
+      //reading serialized HMM
+      HmmModel model;
+      try (DataInputStream modelStream = new DataInputStream(new FileInputStream(modelPath))){
+        model = LossyHmmSerializer.deserialize(modelStream);
+      }
+
+      //generating observations
+      int[] observations = HmmEvaluator.predict(model, length, System.currentTimeMillis());
+
+      //writing output
+      try (PrintWriter writer =
+               new PrintWriter(new OutputStreamWriter(new FileOutputStream(output), Charsets.UTF_8), true)){
+        for (int observation : observations) {
+          writer.print(observation);
+          writer.print(' ');
+        }
+      }
+    } catch (OptionException e) {
+      CommandLineUtil.printHelp(optionGroup);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java
new file mode 100644
index 0000000..317237d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+
+/**
+ * Command-line tool for Viterbi evaluating
+ */
+public final class ViterbiEvaluator {
+
+  private ViterbiEvaluator() {
+  }
+
+  public static void main(String[] args) throws IOException {
+    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+
+    Option inputOption = DefaultOptionCreator.inputOption().create();
+
+    Option outputOption = DefaultOptionCreator.outputOption().create();
+
+    Option modelOption = optionBuilder.withLongName("model").
+      withDescription("Path to serialized HMM model").
+      withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("path").create()).withRequired(true).create();
+
+    Option likelihoodOption = optionBuilder.withLongName("likelihood").
+      withDescription("Compute likelihood of observed sequence").
+      withShortName("l").withRequired(false).create();
+
+    Group optionGroup = new GroupBuilder().withOption(inputOption).
+      withOption(outputOption).withOption(modelOption).withOption(likelihoodOption).
+      withName("Options").create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(optionGroup);
+      CommandLine commandLine = parser.parse(args);
+
+      String input = (String) commandLine.getValue(inputOption);
+      String output = (String) commandLine.getValue(outputOption);
+
+      String modelPath = (String) commandLine.getValue(modelOption);
+
+      boolean computeLikelihood = commandLine.hasOption(likelihoodOption);
+
+      //reading serialized HMM
+      ;
+      HmmModel model;
+      try (DataInputStream modelStream = new DataInputStream(new FileInputStream(modelPath))) {
+        model = LossyHmmSerializer.deserialize(modelStream);
+      }
+
+      //reading observations
+      List<Integer> observations = new ArrayList<>();
+      try (Scanner scanner = new Scanner(new FileInputStream(input), "UTF-8")) {
+        while (scanner.hasNextInt()) {
+          observations.add(scanner.nextInt());
+        }
+      }
+
+      int[] observationsArray = new int[observations.size()];
+      for (int i = 0; i < observations.size(); ++i) {
+        observationsArray[i] = observations.get(i);
+      }
+
+      //decoding
+      int[] hiddenStates = HmmEvaluator.decode(model, observationsArray, true);
+
+      //writing output
+      try (PrintWriter writer =
+               new PrintWriter(new OutputStreamWriter(new FileOutputStream(output), Charsets.UTF_8), true)) {
+        for (int hiddenState : hiddenStates) {
+          writer.print(hiddenState);
+          writer.print(' ');
+        }
+      }
+
+      if (computeLikelihood) {
+        System.out.println("Likelihood: " + HmmEvaluator.modelLikelihood(model, observationsArray, true));
+      }
+    } catch (OptionException e) {
+      CommandLineUtil.printHelp(optionGroup);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java
new file mode 100644
index 0000000..0b2c41b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
+
+import com.google.common.base.Preconditions;
+
/**
 * Generic definition of a 1 of n logistic regression classifier that returns probabilities in
 * response to a feature vector.  This classifier uses 1 of n-1 coding where the 0-th category
 * is not stored explicitly.
 * <p/>
 * Provides the SGD based algorithm for learning a logistic regression, but omits all
 * annealing of learning rates.  Any extension of this abstract class must define the overall
 * and per-term annealing for themselves.
 */
public abstract class AbstractOnlineLogisticRegression extends AbstractVectorClassifier implements OnlineLearner {
  // coefficients for the classification.  This is a dense matrix
  // that is (numCategories-1) x numFeatures
  protected Matrix beta;

  // number of categories we are classifying.  This should be the number of rows of beta plus one.
  protected int numCategories;

  // current training step; compared against updateSteps to find out how stale a coefficient is
  protected int step;

  // information about how long since coefficient rows were updated.  This allows lazy regularization.
  protected Vector updateSteps;

  // information about how many updates we have had on a location.  This allows per-term
  // annealing a la confidence weighted learning.
  protected Vector updateCounts;

  // weight of the prior on beta
  private double lambda = 1.0e-5;
  protected PriorFunction prior;

  // can we ignore any further regularization when doing classification?
  private boolean sealed;

  // by default we don't do any fancy training
  private Gradient gradient = new DefaultGradient();

  /**
   * Chainable configuration option.
   *
   * @param lambda New value of lambda, the weighting factor for the prior distribution.
   * @return This, so other configurations can be chained.
   */
  public AbstractOnlineLogisticRegression lambda(double lambda) {
    this.lambda = lambda;
    return this;
  }

  /**
   * Computes the inverse link function, by default the logistic link function.
   *
   * @param v The output of the linear combination in a GLM.  Note that the value
   *          of v is disturbed.
   * @return A version of v with the link function applied.
   */
  public static Vector link(Vector v) {
    double max = v.maxValue();
    if (max >= 40) {
      // if max > 40, we subtract the large offset first
      // the size of the max means that 1+sum(exp(v)) = sum(exp(v)) to within round-off
      v.assign(Functions.minus(max)).assign(Functions.EXP);
      return v.divide(v.norm(1));
    } else {
      v.assign(Functions.EXP);
      return v.divide(1 + v.norm(1));
    }
  }

  /**
   * Computes the binomial logistic inverse link function (the sigmoid), i.e. 1/(1+exp(-r)).
   * The two branches are algebraically equivalent; picking by sign of r keeps the exp()
   * argument non-positive, which avoids floating-point overflow.
   *
   * @param r The value to transform.
   * @return The logistic sigmoid of r, a probability in (0, 1).
   */
  public static double link(double r) {
    if (r < 0.0) {
      double s = Math.exp(r);
      return s / (1.0 + s);
    } else {
      double s = Math.exp(-r);
      return 1.0 / (1.0 + s);
    }
  }

  /**
   * Computes the n-1 raw scores (linear combinations) for the instance, without applying
   * the link function.  Pending lazy regularization is applied first.
   */
  @Override
  public Vector classifyNoLink(Vector instance) {
    // apply pending regularization to whichever coefficients matter
    regularize(instance);
    return beta.times(instance);
  }

  // Raw (un-linked) score for the binary case; note this does NOT apply pending
  // regularization itself — callers such as classifyScalar() do that first.
  public double classifyScalarNoLink(Vector instance) {
    return beta.viewRow(0).dot(instance);
  }

  /**
   * Returns n-1 probabilities, one for each category but the 0-th.  The probability of the 0-th
   * category is 1 - sum(this result).
   *
   * @param instance A vector of features to be classified.
   * @return A vector of probabilities, one for each of the first n-1 categories.
   */
  @Override
  public Vector classify(Vector instance) {
    return link(classifyNoLink(instance));
  }

  /**
   * Returns a single scalar probability in the case where we have two categories.  Using this
   * method avoids an extra vector allocation as opposed to calling classify() or an extra two
   * vector allocations relative to classifyFull().
   *
   * @param instance The vector of features to be classified.
   * @return The probability of the first of two categories.
   * @throws IllegalArgumentException If the classifier doesn't have two categories.
   */
  @Override
  public double classifyScalar(Vector instance) {
    Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");

    // apply pending regularization to whichever coefficients matter
    regularize(instance);

    // result is a vector with one element so we can just use dot product
    return link(classifyScalarNoLink(instance));
  }

  /**
   * Performs one SGD update using the supplied training example.  Regularization is applied
   * lazily (only to the coefficients this instance touches) before the gradient step.
   */
  @Override
  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
    // a new example may change coefficients, so the model is no longer "sealed"
    unseal();

    double learningRate = currentLearningRate();

    // push coefficients back to zero based on the prior
    regularize(instance);

    // update each row of coefficients according to result
    // (local 'gradient' is the per-category gradient vector from the configured Gradient strategy,
    // distinct from the 'gradient' field it is computed by)
    Vector gradient = this.gradient.apply(groupKey, actual, instance, this);
    for (int i = 0; i < numCategories - 1; i++) {
      double gradientBase = gradient.get(i);

      // then we apply the gradientBase to the resulting element.
      for (Element updateLocation : instance.nonZeroes()) {
        int j = updateLocation.index();

        double newValue = beta.getQuick(i, j) + gradientBase * learningRate * perTermLearningRate(j) * instance.get(j);
        beta.setQuick(i, j, newValue);
      }
    }

    // remember that these elements got updated
    for (Element element : instance.nonZeroes()) {
      int j = element.index();
      updateSteps.setQuick(j, getStep());
      updateCounts.incrementQuick(j, 1);
    }
    nextStep();

  }

  @Override
  public void train(long trackingKey, int actual, Vector instance) {
    train(trackingKey, null, actual, instance);
  }

  @Override
  public void train(int actual, Vector instance) {
    train(0, null, actual, instance);
  }

  /**
   * Applies any regularization that has been deferred for the coefficients touched by this
   * instance.  Each touched column is aged by the prior once per training step it missed.
   * No-op before training has started (updateSteps == null) or once the model is sealed.
   */
  public void regularize(Vector instance) {
    if (updateSteps == null || isSealed()) {
      return;
    }

    // anneal learning rate
    double learningRate = currentLearningRate();

    // here we lazily apply the prior to make up for our neglect
    for (int i = 0; i < numCategories - 1; i++) {
      for (Element updateLocation : instance.nonZeroes()) {
        int j = updateLocation.index();
        double missingUpdates = getStep() - updateSteps.get(j);
        if (missingUpdates > 0) {
          double rate = getLambda() * learningRate * perTermLearningRate(j);
          double newValue = prior.age(beta.get(i, j), missingUpdates, rate);
          beta.set(i, j, newValue);
          updateSteps.set(j, getStep());
        }
      }
    }
  }

  // these two abstract methods are how extensions can modify the basic learning behavior of this object.

  /** Per-feature learning-rate multiplier for column j; subclasses define the annealing. */
  public abstract double perTermLearningRate(int j);

  /** Overall learning rate at the current step; subclasses define the annealing. */
  public abstract double currentLearningRate();

  public void setPrior(PriorFunction prior) {
    this.prior = prior;
  }

  public void setGradient(Gradient gradient) {
    this.gradient = gradient;
  }

  public PriorFunction getPrior() {
    return prior;
  }

  // Returns the coefficient matrix; calling close() first flushes all pending
  // regularization so the returned coefficients are fully up to date (and the model sealed).
  public Matrix getBeta() {
    close();
    return beta;
  }

  public void setBeta(int i, int j, double betaIJ) {
    beta.set(i, j, betaIJ);
  }

  @Override
  public int numCategories() {
    return numCategories;
  }

  public int numFeatures() {
    return beta.numCols();
  }

  public double getLambda() {
    return lambda;
  }

  public int getStep() {
    return step;
  }

  protected void nextStep() {
    step++;
  }

  public boolean isSealed() {
    return sealed;
  }

  protected void unseal() {
    sealed = false;
  }

  // Forces regularization of every coefficient by pretending an instance touched all columns.
  private void regularizeAll() {
    Vector all = new DenseVector(beta.numCols());
    all.assign(1);
    regularize(all);
  }

  /**
   * Flushes all pending regularization and seals the model so classification can skip
   * further regularization work.  Idempotent until the next call to train().
   */
  @Override
  public void close() {
    if (!sealed) {
      step++;
      regularizeAll();
      sealed = true;
    }
  }

  /**
   * Copies the learned state (coefficients and update bookkeeping) from another model with
   * the same number of target categories.
   */
  public void copyFrom(AbstractOnlineLogisticRegression other) {
    // number of categories we are classifying.  This should be the number of rows of beta plus one.
    Preconditions.checkArgument(numCategories == other.numCategories,
            "Can't copy unless number of target categories is the same");

    beta.assign(other.beta);

    step = other.step;

    updateSteps.assign(other.updateSteps);
    updateCounts.assign(other.updateCounts);
  }

  // A model is valid iff no coefficient is NaN or infinite (k counts the bad entries).
  public boolean validModel() {
    double k = beta.aggregate(Functions.PLUS, new DoubleFunction() {
      @Override
      public double apply(double v) {
        return Double.isNaN(v) || Double.isInfinite(v) ? 1 : 0;
      }
    });
    return k < 1;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
new file mode 100644
index 0000000..24e5798
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
@@ -0,0 +1,586 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.ep.EvolutionaryProcess;
+import org.apache.mahout.ep.Mapping;
+import org.apache.mahout.ep.Payload;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.stats.OnlineAuc;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.concurrent.ExecutionException;
+
+/**
+ * This is a meta-learner that maintains a pool of ordinary
+ * {@link org.apache.mahout.classifier.sgd.OnlineLogisticRegression} learners. Each
+ * member of the pool has different learning rates.  Whichever of the learners in the pool falls
+ * behind in terms of average log-likelihood will be tossed out and replaced with variants of the
+ * survivors.  This will let us automatically derive an annealing schedule that optimizes learning
+ * speed.  Since on-line learners tend to be IO bound anyway, it doesn't cost as much as it might
+ * seem that it would to maintain multiple learners in memory.  Doing this adaptation on-line as we
+ * learn also decreases the number of learning rate parameters required and replaces the normal
+ * hyper-parameter search.
+ * <p/>
+ * One wrinkle is that the pool of learners that we maintain is actually a pool of
+ * {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} which themselves contain several OnlineLogisticRegression
+ * objects.  These pools allow estimation
+ * of performance on the fly even if we make many passes through the data.  This does, however,
+ * increase the cost of training since if we are using 5-fold cross-validation, each vector is used
+ * 4 times for training and once for classification.  If this becomes a problem, then we should
+ * probably use a 2-way unbalanced train/test split rather than full cross validation.  With the
+ * current default settings, we have 100 learners running.  This is better than the alternative of
+ * running hundreds of training passes to find good hyper-parameters because we only have to parse
+ * and feature-ize our inputs once.  If you already have good hyper-parameters, then you might
+ * prefer to just run one CrossFoldLearner with those settings.
+ * <p/>
+ * The fitness used here is AUC.  Another alternative would be to try log-likelihood, but it is much
+ * easier to get bogus values of log-likelihood than with AUC and the results seem to accord pretty
+ * well.  It would be nice to allow the fitness function to be pluggable. This use of AUC means that
+ * AdaptiveLogisticRegression is mostly suited for binary target variables. This will be fixed
+ * before long by extending OnlineAuc to handle non-binary cases or by using a different fitness
+ * value in non-binary cases.
+ */
+public class AdaptiveLogisticRegression implements OnlineLearner, Writable {
+  public static final int DEFAULT_THREAD_COUNT = 20;
+  public static final int DEFAULT_POOL_SIZE = 20;
+  private static final int SURVIVORS = 2;
+
+  private int record;
+  private int cutoff = 1000;
+  private int minInterval = 1000;
+  private int maxInterval = 1000;
+  private int currentStep = 1000;
+  private int bufferSize = 1000;
+
+  private List<TrainingExample> buffer = new ArrayList<>();
+  private EvolutionaryProcess<Wrapper, CrossFoldLearner> ep;
+  private State<Wrapper, CrossFoldLearner> best;
+  private int threadCount = DEFAULT_THREAD_COUNT;
+  private int poolSize = DEFAULT_POOL_SIZE;
+  private State<Wrapper, CrossFoldLearner> seed;
+  private int numFeatures;
+
+  private boolean freezeSurvivors = true;
+
+  private static final Logger log = LoggerFactory.getLogger(AdaptiveLogisticRegression.class);
+
+  public AdaptiveLogisticRegression() {}
+
+  /**
+   * Uses {@link #DEFAULT_THREAD_COUNT} and {@link #DEFAULT_POOL_SIZE}
+   * @param numCategories The number of categories (labels) to train on
+   * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
+   * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
+   *
+   * @see #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)
+   */
+  public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) {
+    this(numCategories, numFeatures, prior, DEFAULT_THREAD_COUNT, DEFAULT_POOL_SIZE);
+  }
+
+  /**
+   *
+   * @param numCategories The number of categories (labels) to train on
+   * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
+   * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
+   * @param threadCount The number of threads to use for training
+   * @param poolSize The number of {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} to use.
+   */
+  public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior, int threadCount,
+      int poolSize) {
+    this.numFeatures = numFeatures;
+    this.threadCount = threadCount;
+    this.poolSize = poolSize;
+    seed = new State<>(new double[2], 10);
+    Wrapper w = new Wrapper(numCategories, numFeatures, prior);
+    seed.setPayload(w);
+
+    Wrapper.setMappings(seed);
+    seed.setPayload(w);
+    setPoolSize(this.poolSize);
+  }
+
+  @Override
+  public void train(int actual, Vector instance) {
+    train(record, null, actual, instance);
+  }
+
+  @Override
+  public void train(long trackingKey, int actual, Vector instance) {
+    train(trackingKey, null, actual, instance);
+  }
+
+  @Override
+  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+    record++;
+
+    buffer.add(new TrainingExample(trackingKey, groupKey, actual, instance));
+    //don't train until we have enough examples
+    if (buffer.size() > bufferSize) {
+      trainWithBufferedExamples();
+    }
+  }
+
+  private void trainWithBufferedExamples() {
+    try {
+      this.best = ep.parallelDo(new EvolutionaryProcess.Function<Payload<CrossFoldLearner>>() {
+        @Override
+        public double apply(Payload<CrossFoldLearner> z, double[] params) {
+          Wrapper x = (Wrapper) z;
+          for (TrainingExample example : buffer) {
+            x.train(example);
+          }
+          if (x.getLearner().validModel()) {
+            if (x.getLearner().numCategories() == 2) {
+              return x.wrapped.auc();
+            } else {
+              return x.wrapped.logLikelihood();
+            }
+          } else {
+            return Double.NaN;
+          }
+        }
+      });
+    } catch (InterruptedException e) {
+      // ignore ... shouldn't happen
+      log.warn("Ignoring exception", e);
+    } catch (ExecutionException e) {
+      throw new IllegalStateException(e.getCause());
+    }
+    buffer.clear();
+
+    if (record > cutoff) {
+      cutoff = nextStep(record);
+
+      // evolve based on new fitness
+      ep.mutatePopulation(SURVIVORS);
+
+      if (freezeSurvivors) {
+        // now grossly hack the top survivors so they stick around.  Set their
+        // mutation rates small and also hack their learning rate to be small
+        // as well.
+        for (State<Wrapper, CrossFoldLearner> state : ep.getPopulation().subList(0, SURVIVORS)) {
+          Wrapper.freeze(state);
+        }
+      }
+    }
+
+  }
+
+  public int nextStep(int recordNumber) {
+    int stepSize = stepSize(recordNumber, 2.6);
+    if (stepSize < minInterval) {
+      stepSize = minInterval;
+    }
+
+    if (stepSize > maxInterval) {
+      stepSize = maxInterval;
+    }
+
+    int newCutoff = stepSize * (recordNumber / stepSize + 1);
+    if (newCutoff < cutoff + currentStep) {
+      newCutoff = cutoff + currentStep;
+    } else {
+      this.currentStep = stepSize;
+    }
+    return newCutoff;
+  }
+
+  public static int stepSize(int recordNumber, double multiplier) {
+    int[] bumps = {1, 2, 5};
+    double log = Math.floor(multiplier * Math.log10(recordNumber));
+    int bump = bumps[(int) log % bumps.length];
+    int scale = (int) Math.pow(10, Math.floor(log / bumps.length));
+
+    return bump * scale;
+  }
+
+  @Override
+  public void close() {
+    trainWithBufferedExamples();
+    try {
+      ep.parallelDo(new EvolutionaryProcess.Function<Payload<CrossFoldLearner>>() {
+        @Override
+        public double apply(Payload<CrossFoldLearner> payload, double[] params) {
+          CrossFoldLearner learner = ((Wrapper) payload).getLearner();
+          learner.close();
+          return learner.logLikelihood();
+        }
+      });
+    } catch (InterruptedException e) {
+      log.warn("Ignoring exception", e);
+    } catch (ExecutionException e) {
+      throw new IllegalStateException(e);
+    } finally {
+      ep.close();
+    }
+  }
+
+  /**
+   * How often should the evolutionary optimization of learning parameters occur?
+   *
+   * @param interval Number of training examples to use in each epoch of optimization.
+   */
+  public void setInterval(int interval) {
+    setInterval(interval, interval);
+  }
+
+  /**
+   * Starts optimization using the shorter interval and progresses to the longer using the specified
+   * number of steps per decade.  Note that values < 200 are not accepted.  Values even that small
+   * are unlikely to be useful.
+   *
+   * @param minInterval The minimum epoch length for the evolutionary optimization
+   * @param maxInterval The maximum epoch length
+   */
+  public void setInterval(int minInterval, int maxInterval) {
+    this.minInterval = Math.max(200, minInterval);
+    this.maxInterval = Math.max(200, maxInterval);
+    this.cutoff = minInterval * (record / minInterval + 1);
+    this.currentStep = minInterval;
+    bufferSize = Math.min(minInterval, bufferSize);
+  }
+
+  /** Sets the number of members in the evolutionary population and rebuilds the optimizer. */
+  public final void setPoolSize(int poolSize) {
+    this.poolSize = poolSize;
+    setupOptimizer(poolSize);
+  }
+
+  /** Sets the number of threads used by the evolutionary optimizer and rebuilds it. */
+  public void setThreadCount(int threadCount) {
+    this.threadCount = threadCount;
+    setupOptimizer(poolSize);
+  }
+
+  /**
+   * Sets the AUC evaluator on the seed learner, then rebuilds the optimizer so the
+   * change propagates to the population derived from the seed.
+   */
+  public void setAucEvaluator(OnlineAuc auc) {
+    seed.getPayload().setAucEvaluator(auc);
+    setupOptimizer(poolSize);
+  }
+
+  // Recreates the evolutionary process from the current seed/threadCount/poolSize settings.
+  private void setupOptimizer(int poolSize) {
+    ep = new EvolutionaryProcess<>(threadCount, poolSize, seed);
+  }
+
+  /**
+   * Returns the size of the internal feature vector.  Note that this is not the same as the number
+   * of distinct features, especially if feature hashing is being used.
+   *
+   * @return The internal feature vector size.
+   */
+  public int numFeatures() {
+    return numFeatures;
+  }
+
+  /**
+   * What is the AUC for the current best member of the population.  If no member is best, usually
+   * because we haven't done any training yet, then the result is set to NaN.
+   *
+   * @return The AUC of the best member of the population or NaN if we can't figure that out.
+   */
+  public double auc() {
+    if (best == null) {
+      return Double.NaN;
+    } else {
+      Wrapper payload = best.getPayload();
+      return payload.getLearner().auc();
+    }
+  }
+
+  /** @return The best member of the population so far, or null if none has been chosen yet. */
+  public State<Wrapper, CrossFoldLearner> getBest() {
+    return best;
+  }
+
+  /** Sets the best member of the population. */
+  public void setBest(State<Wrapper, CrossFoldLearner> best) {
+    this.best = best;
+  }
+
+  /** @return The count of training examples seen so far. */
+  public int getRecord() {
+    return record;
+  }
+
+  /** Sets the count of training examples seen so far. */
+  public void setRecord(int record) {
+    this.record = record;
+  }
+
+  /** @return The minimum epoch length for the evolutionary optimization. */
+  public int getMinInterval() {
+    return minInterval;
+  }
+
+  /** @return The maximum epoch length for the evolutionary optimization. */
+  public int getMaxInterval() {
+    return maxInterval;
+  }
+
+  /** @return The number of target categories, as reported by the seed learner. */
+  public int getNumCategories() {
+    return seed.getPayload().getLearner().numCategories();
+  }
+
+  /** @return The prior (regularization) function used by the seed learner. */
+  public PriorFunction getPrior() {
+    return seed.getPayload().getLearner().getPrior();
+  }
+
+  /** Replaces the buffer of examples collected but not yet used for training. */
+  public void setBuffer(List<TrainingExample> buffer) {
+    this.buffer = buffer;
+  }
+
+  /** @return The buffer of examples collected but not yet used for training. */
+  public List<TrainingExample> getBuffer() {
+    return buffer;
+  }
+
+  /** @return The evolutionary process used to optimize learning parameters. */
+  public EvolutionaryProcess<Wrapper, CrossFoldLearner> getEp() {
+    return ep;
+  }
+
+  /** Sets the evolutionary process used to optimize learning parameters. */
+  public void setEp(EvolutionaryProcess<Wrapper, CrossFoldLearner> ep) {
+    this.ep = ep;
+  }
+
+  /** @return The seed state from which the population is derived. */
+  public State<Wrapper, CrossFoldLearner> getSeed() {
+    return seed;
+  }
+
+  /** Sets the seed state from which the population is derived. */
+  public void setSeed(State<Wrapper, CrossFoldLearner> seed) {
+    this.seed = seed;
+  }
+
+  // Bean-style duplicate of numFeatures(); presumably kept for tooling/serialization
+  // conventions — confirm before removing.
+  public int getNumFeatures() {
+    return numFeatures;
+  }
+
+  /**
+   * Sets the averaging window on the seed learner, then rebuilds the optimizer so the
+   * setting propagates to the population derived from the seed.
+   */
+  public void setAveragingWindow(int averagingWindow) {
+    seed.getPayload().getLearner().setWindowSize(averagingWindow);
+    setupOptimizer(poolSize);
+  }
+
+  // NOTE(review): appears to control whether Wrapper.freeze is applied to surviving
+  // population members during training — confirm against the training loop (not visible here).
+  public void setFreezeSurvivors(boolean freezeSurvivors) {
+    this.freezeSurvivors = freezeSurvivors;
+  }
+
+  /**
+   * Provides a shim between the EP optimization stuff and the CrossFoldLearner.  The most important
+   * interface has to do with the parameters of the optimization.  These are taken from the double[]
+   * params in the following order <ul> <li> regularization constant lambda <li> learningRate </ul>.
+   * All other parameters are set in such a way so as to defeat annealing to the extent possible.
+   * This lets the evolutionary algorithm handle the annealing.
+   * <p/>
+   * Note that per coefficient annealing is still done and no optimization of the per coefficient
+   * offset is done.
+   */
+  public static class Wrapper implements Payload<CrossFoldLearner> {
+    // The learner whose hyper-parameters are being tuned by the evolutionary process.
+    private CrossFoldLearner wrapped;
+
+    // No-arg constructor used by copy() and by Writable deserialization (readFields).
+    public Wrapper() {
+    }
+
+    public Wrapper(int numCategories, int numFeatures, PriorFunction prior) {
+      // First argument is the number of folds used by CrossFoldLearner.
+      wrapped = new CrossFoldLearner(5, numCategories, numFeatures, prior);
+    }
+
+    @Override
+    public Wrapper copy() {
+      Wrapper r = new Wrapper();
+      r.wrapped = wrapped.copy();
+      return r;
+    }
+
+    /**
+     * Applies optimization parameters to the wrapped learner: params[0] is lambda and
+     * params[1] is the learning rate (value ranges are defined by setMappings()).
+     */
+    @Override
+    public void update(double[] params) {
+      int i = 0;
+      wrapped.lambda(params[i++]);
+      wrapped.learningRate(params[i]);
+
+      // Defeat the learner's own annealing so the evolutionary algorithm controls it.
+      wrapped.stepOffset(1);
+      wrapped.alpha(1);
+      wrapped.decayExponent(0);
+    }
+
+    /**
+     * Freezes a state so that it learns and evolves only very slowly.  Note that params[1]
+     * holds the learning rate in the optimizer's internal (log-mapped, per setMappings)
+     * space, so subtracting 10 reduces the effective rate by many orders of magnitude.
+     */
+    public static void freeze(State<Wrapper, CrossFoldLearner> s) {
+      // radically decrease learning rate
+      double[] params = s.getParams();
+      params[1] -= 10;
+
+      // and cause evolution to hold (almost)
+      s.setOmni(s.getOmni() / 20);
+      double[] step = s.getStep();
+      for (int i = 0; i < step.length; i++) {
+        step[i] /= 20;
+      }
+    }
+
+    // Declares the search ranges for the two optimized parameters; indices must match update().
+    public static void setMappings(State<Wrapper, CrossFoldLearner> x) {
+      int i = 0;
+      // set the range for regularization (lambda)
+      x.setMap(i++, Mapping.logLimit(1.0e-8, 0.1));
+      // set the range for learning rate (mu)
+      x.setMap(i, Mapping.logLimit(1.0e-8, 1));
+    }
+
+    public void train(TrainingExample example) {
+      wrapped.train(example.getKey(), example.getGroupKey(), example.getActual(), example.getInstance());
+    }
+
+    public CrossFoldLearner getLearner() {
+      return wrapped;
+    }
+
+    @Override
+    public String toString() {
+      return String.format(Locale.ENGLISH, "auc=%.2f", wrapped.auc());
+    }
+
+    public void setAucEvaluator(OnlineAuc auc) {
+      wrapped.setAucEvaluator(auc);
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+      wrapped.write(out);
+    }
+
+    @Override
+    public void readFields(DataInput input) throws IOException {
+      wrapped = new CrossFoldLearner();
+      wrapped.readFields(input);
+    }
+  }
+
+  /** A single training observation: (key, optional group key, target category, feature vector). */
+  public static class TrainingExample implements Writable {
+    private long key;
+    private String groupKey;  // may be null; serialized behind a presence flag
+    private int actual;
+    private Vector instance;
+
+    // No-arg constructor used only for Writable deserialization via readFields().
+    private TrainingExample() {
+    }
+
+    public TrainingExample(long key, String groupKey, int actual, Vector instance) {
+      this.key = key;
+      this.groupKey = groupKey;
+      this.actual = actual;
+      this.instance = instance;
+    }
+
+    public long getKey() {
+      return key;
+    }
+
+    public int getActual() {
+      return actual;
+    }
+
+    public Vector getInstance() {
+      return instance;
+    }
+
+    public String getGroupKey() {
+      return groupKey;
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+      out.writeLong(key);
+      // groupKey is nullable, so write a presence flag before the (optional) value.
+      if (groupKey != null) {
+        out.writeBoolean(true);
+        out.writeUTF(groupKey);
+      } else {
+        out.writeBoolean(false);
+      }
+      out.writeInt(actual);
+      VectorWritable.writeVector(out, instance, true);
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException {
+      // Must mirror write(): key, optional groupKey behind a flag, actual, instance.
+      key = in.readLong();
+      if (in.readBoolean()) {
+        groupKey = in.readUTF();
+      }
+      actual = in.readInt();
+      instance = VectorWritable.readVector(in);
+    }
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    // Serialization order here must be mirrored exactly by readFields(DataInput).
+    out.writeInt(record);
+    out.writeInt(cutoff);
+    out.writeInt(minInterval);
+    out.writeInt(maxInterval);
+    out.writeInt(currentStep);
+    out.writeInt(bufferSize);
+
+    // The buffer is written as a count followed by the examples themselves.
+    out.writeInt(buffer.size());
+    for (TrainingExample example : buffer) {
+      example.write(out);
+    }
+
+    ep.write(out);
+
+    best.write(out);
+
+    out.writeInt(threadCount);
+    out.writeInt(poolSize);
+    seed.write(out);
+    out.writeInt(numFeatures);
+
+    out.writeBoolean(freezeSurvivors);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    // Field order must mirror write(DataOutput) exactly.
+    record = in.readInt();
+    cutoff = in.readInt();
+    minInterval = in.readInt();
+    maxInterval = in.readInt();
+    currentStep = in.readInt();
+    bufferSize = in.readInt();
+
+    int n = in.readInt();
+    // Presize with the known element count to avoid incremental re-allocation.
+    buffer = new ArrayList<>(n);
+    for (int i = 0; i < n; i++) {
+      TrainingExample example = new TrainingExample();
+      example.readFields(in);
+      buffer.add(example);
+    }
+
+    ep = new EvolutionaryProcess<>();
+    ep.readFields(in);
+
+    best = new State<>();
+    best.readFields(in);
+
+    threadCount = in.readInt();
+    poolSize = in.readInt();
+    seed = new State<>();
+    seed.readFields(in);
+
+    numFeatures = in.readInt();
+    freezeSurvivors = in.readBoolean();
+  }
+}
+


[04/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java
new file mode 100644
index 0000000..0f6f7f2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java
@@ -0,0 +1,493 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.neighborhood.BruteSearch;
+import org.apache.mahout.math.neighborhood.ProjectionSearch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Driver for streaming k-means clustering.  Runs a StreamingKMeans pass over the input vectors
+ * (as mappers, or as one thread per input path when run sequentially) and then a BallKMeans
+ * pass over the intermediate centroids to produce the final clusters.
+ */
+public final class StreamingKMeansDriver extends AbstractJob {
+  /**
+   * Streaming KMeans options
+   */
+  /**
+   * The number of clusters that Mappers will use should be \(O(k log n)\) where k is the number of clusters
+   * to get at the end and n is the number of points to cluster. This doesn't need to be exact.
+   * It will be adjusted at runtime.
+   */
+  public static final String ESTIMATED_NUM_MAP_CLUSTERS = "estimatedNumMapClusters";
+  /**
+   * The initial estimated distance cutoff between two points for forming new clusters.
+   * @see org.apache.mahout.clustering.streaming.cluster.StreamingKMeans
+   * Defaults to 10e-6.
+   */
+  public static final String ESTIMATED_DISTANCE_CUTOFF = "estimatedDistanceCutoff";
+
+  /**
+   * Ball KMeans options
+   */
+  /**
+   * After mapping finishes, we get an intermediate set of vectors that represent approximate
+   * clusterings of the data from each Mapper. These can be clustered by the Reducer using
+   * BallKMeans in memory. This variable is the maximum number of iterations in the final
+   * BallKMeans algorithm.
+   * Defaults to 10.
+   */
+  public static final String MAX_NUM_ITERATIONS = "maxNumIterations";
+  /**
+   * The "ball" aspect of ball k-means means that only the closest points to the centroid will actually be used
+   * for updating. The fraction of the points to be used is those points whose distance to the center is within
+   * trimFraction * distance to the closest other center.
+   * Defaults to 0.9.
+   */
+  public static final String TRIM_FRACTION = "trimFraction";
+  /**
+   * Whether to use k-means++ initialization or random initialization of the seed centroids.
+   * Essentially, k-means++ provides better clusters, but takes longer, whereas random initialization takes less
+   * time, but produces worse clusters, and tends to fail more often and needs multiple runs to compare to
+   * k-means++. If set, uses randomInit.
+   * @see org.apache.mahout.clustering.streaming.cluster.BallKMeans
+   */
+  public static final String RANDOM_INIT = "randomInit";
+  /**
+   * Whether to correct the weights of the centroids after the clustering is done. The weights end up being wrong
+   * because of the trimFraction and possible train/test splits. In some cases, especially in a pipeline, having
+   * an accurate count of the weights is useful. If set, ignores the final weights.
+   */
+  public static final String IGNORE_WEIGHTS = "ignoreWeights";
+  /**
+   * The percentage of points that go into the "test" set when evaluating BallKMeans runs in the reducer.
+   */
+  public static final String TEST_PROBABILITY = "testProbability";
+  /**
+   * The number of BallKMeans runs to perform in the reducer when selecting the best final clustering.
+   */
+  public static final String NUM_BALLKMEANS_RUNS = "numBallKMeansRuns";
+
+  /**
+   Searcher options
+   */
+  /**
+   * The Searcher class when performing nearest neighbor search in StreamingKMeans.
+   * Defaults to ProjectionSearch.
+   */
+  public static final String SEARCHER_CLASS_OPTION = "searcherClass";
+  /**
+   * The number of projections to use when using a projection searcher like ProjectionSearch or
+   * FastProjectionSearch. Projection searches work by projecting all the vectors onto a set of
+   * basis vectors and searching for the projected query in that totally ordered set. This
+   * however can produce false positives (vectors that are closer when projected than they
+   * actually are).
+   * So, there must be more than one projection vector in the basis. This variable is the number
+   * of vectors in a basis.
+   * Defaults to 3.
+   */
+  public static final String NUM_PROJECTIONS_OPTION = "numProjections";
+  /**
+   * When using approximate searches (anything that's not BruteSearch),
+   * more than just the seemingly closest element must be considered. This variable has different
+   * meanings depending on the actual Searcher class used but is a measure of how many candidates
+   * will be considered.
+   * See the ProjectionSearch, FastProjectionSearch, LocalitySensitiveHashSearch classes for more
+   * details.
+   * Defaults to 2.
+   */
+  public static final String SEARCH_SIZE_OPTION = "searchSize";
+
+  /**
+   * Whether to run another pass of StreamingKMeans on the reducer's points before BallKMeans. On some data sets
+   * with a large number of mappers, the intermediate number of clusters passed to the reducer is too large to
+   * fit into memory directly, hence the option to collapse the clusters further with StreamingKMeans.
+   */
+  public static final String REDUCE_STREAMING_KMEANS = "reduceStreamingKMeans";
+
+  private static final Logger log = LoggerFactory.getLogger(StreamingKMeansDriver.class);
+
+  /** Sentinel meaning "no cutoff given; estimate it from the data". */
+  public static final float INVALID_DISTANCE_CUTOFF = -1;
+
+  @Override
+  public int run(String[] args) throws Exception {
+    // Standard options for any Mahout job.
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.overwriteOption().create());
+
+    // The number of clusters to create for the data.
+    addOption(DefaultOptionCreator.numClustersOption().withDescription(
+        "The k in k-Means. Approximately this many clusters will be generated.").create());
+
+    // StreamingKMeans (mapper) options
+    // There will be k final clusters, but in the Map phase to get a good approximation of the data, O(k log n)
+    // clusters are needed. Since n is the number of data points and not knowable until reading all the vectors,
+    // provide a decent estimate.
+    addOption(ESTIMATED_NUM_MAP_CLUSTERS, "km", "The estimated number of clusters to use for the "
+        + "Map phase of the job when running StreamingKMeans. This should be around k * log(n), "
+        + "where k is the final number of clusters and n is the total number of data points to "
+        + "cluster.", String.valueOf(1));
+
+    addOption(ESTIMATED_DISTANCE_CUTOFF, "e", "The initial estimated distance cutoff between two "
+        + "points for forming new clusters. If no value is given, it's estimated from the data set",
+        String.valueOf(INVALID_DISTANCE_CUTOFF));
+
+    // BallKMeans (reducer) options
+    addOption(MAX_NUM_ITERATIONS, "mi", "The maximum number of iterations to run for the "
+        + "BallKMeans algorithm used by the reducer. If no value is given, defaults to 10.", String.valueOf(10));
+
+    addOption(TRIM_FRACTION, "tf", "The 'ball' aspect of ball k-means means that only the closest points "
+        + "to the centroid will actually be used for updating. The fraction of the points to be used is those "
+        + "points whose distance to the center is within trimFraction * distance to the closest other center. "
+        + "If no value is given, defaults to 0.9.", String.valueOf(0.9));
+
+    addFlag(RANDOM_INIT, "ri", "Whether to use k-means++ initialization or random initialization "
+        + "of the seed centroids. Essentially, k-means++ provides better clusters, but takes longer, whereas random "
+        + "initialization takes less time, but produces worse clusters, and tends to fail more often and needs "
+        + "multiple runs to compare to k-means++. If set, uses the random initialization.");
+
+    addFlag(IGNORE_WEIGHTS, "iw", "Whether to correct the weights of the centroids after the clustering is done. "
+        + "The weights end up being wrong because of the trimFraction and possible train/test splits. In some cases, "
+        + "especially in a pipeline, having an accurate count of the weights is useful. If set, ignores the final "
+        + "weights");
+
+    addOption(TEST_PROBABILITY, "testp", "A double value between 0 and 1 that represents the percentage of "
+        + "points to be used for 'testing' different clustering runs in the final BallKMeans "
+        + "step. If no value is given, defaults to 0.1", String.valueOf(0.1));
+
+    addOption(NUM_BALLKMEANS_RUNS, "nbkm", "Number of BallKMeans runs to use at the end to try to cluster the "
+        + "points. If no value is given, defaults to 4", String.valueOf(4));
+
+    // Nearest neighbor search options
+    // The distance measure used for computing the distance between two points. Generally, the
+    // SquaredEuclideanDistance is used for clustering problems (it's equivalent to CosineDistance for normalized
+    // vectors).
+    // WARNING! You can use any metric but most of the literature is for the squared euclidean distance.
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+
+    // The default searcher should be something more efficient that BruteSearch (ProjectionSearch, ...). See
+    // o.a.m.math.neighborhood.*
+    addOption(SEARCHER_CLASS_OPTION, "sc", "The type of searcher to be used when performing nearest "
+        + "neighbor searches. Defaults to ProjectionSearch.", ProjectionSearch.class.getCanonicalName());
+
+    // In the original paper, the authors used 1 projection vector.
+    addOption(NUM_PROJECTIONS_OPTION, "np", "The number of projections considered in estimating the "
+        + "distances between vectors. Only used when the distance measure requested is either "
+        + "ProjectionSearch or FastProjectionSearch. If no value is given, defaults to 3.", String.valueOf(3));
+
+    // Fixed: the help text previously said "actually computer" and claimed a default of 1,
+    // while the actual default supplied below is 2.
+    addOption(SEARCH_SIZE_OPTION, "s", "In more efficient searches (non BruteSearch), "
+        + "not all distances are calculated for determining the nearest neighbors. The number of "
+        + "elements whose distances from the query vector is actually computed is proportional to "
+        + "searchSize. If no value is given, defaults to 2.", String.valueOf(2));
+
+    addFlag(REDUCE_STREAMING_KMEANS, "rskm", "There might be too many intermediate clusters from the mapper "
+        + "to fit into memory, so the reducer can run another pass of StreamingKMeans to collapse them down to a "
+        + "fewer clusters");
+
+    addOption(DefaultOptionCreator.methodOption().create());
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+    Path output = getOutputPath();
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+    configureOptionsForWorkers();
+    run(getConf(), getInputPath(), output);
+    return 0;
+  }
+
+  /**
+   * Reads the parsed command-line options, validates them and copies them into the job
+   * Configuration so the workers (mappers/reducers or threads) can read them.
+   */
+  private void configureOptionsForWorkers() throws ClassNotFoundException {
+    log.info("Starting to configure options for workers");
+
+    String method = getOption(DefaultOptionCreator.METHOD_OPTION);
+
+    int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+
+    // StreamingKMeans
+    int estimatedNumMapClusters = Integer.parseInt(getOption(ESTIMATED_NUM_MAP_CLUSTERS));
+    float estimatedDistanceCutoff = Float.parseFloat(getOption(ESTIMATED_DISTANCE_CUTOFF));
+
+    // BallKMeans
+    int maxNumIterations = Integer.parseInt(getOption(MAX_NUM_ITERATIONS));
+    float trimFraction = Float.parseFloat(getOption(TRIM_FRACTION));
+    boolean randomInit = hasOption(RANDOM_INIT);
+    boolean ignoreWeights = hasOption(IGNORE_WEIGHTS);
+    float testProbability = Float.parseFloat(getOption(TEST_PROBABILITY));
+    int numBallKMeansRuns = Integer.parseInt(getOption(NUM_BALLKMEANS_RUNS));
+
+    // Nearest neighbor search
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    String searcherClass = getOption(SEARCHER_CLASS_OPTION);
+
+    // Get more parameters depending on the kind of search class we're working with. BruteSearch
+    // doesn't need anything else.
+    // LocalitySensitiveHashSearch and ProjectionSearches need searchSize.
+    // ProjectionSearches also need the number of projections.
+    boolean getSearchSize = false;
+    boolean getNumProjections = false;
+    if (!searcherClass.equals(BruteSearch.class.getName())) {
+      getSearchSize = true;
+      getNumProjections = true;
+    }
+
+    // The search size to use. This is quite fuzzy and might end up not being configurable at all.
+    int searchSize = 0;
+    if (getSearchSize) {
+      searchSize = Integer.parseInt(getOption(SEARCH_SIZE_OPTION));
+    }
+
+    // The number of projections to use. This is only useful in projection searches which
+    // project the vectors on multiple basis vectors to get distance estimates that are faster to
+    // calculate.
+    int numProjections = 0;
+    if (getNumProjections) {
+      numProjections = Integer.parseInt(getOption(NUM_PROJECTIONS_OPTION));
+    }
+
+    boolean reduceStreamingKMeans = hasOption(REDUCE_STREAMING_KMEANS);
+
+    configureOptionsForWorkers(getConf(), numClusters,
+        /* StreamingKMeans */
+        estimatedNumMapClusters,  estimatedDistanceCutoff,
+        /* BallKMeans */
+        maxNumIterations, trimFraction, randomInit, ignoreWeights, testProbability, numBallKMeansRuns,
+        /* Searcher */
+        measureClass, searcherClass,  searchSize, numProjections,
+        method,
+        reduceStreamingKMeans);
+  }
+
+  /**
+   * Checks the parameters for a StreamingKMeans job and prepares a Configuration with them.
+   *
+   * @param conf the Configuration to populate
+   * @param numClusters k, the number of clusters at the end
+   * @param estimatedNumMapClusters O(k log n), the number of clusters requested from each mapper
+   * @param estimatedDistanceCutoff an estimate of the minimum distance that separates two clusters (can be smaller and
+   *                                will be increased dynamically)
+   * @param maxNumIterations the maximum number of iterations of BallKMeans
+   * @param trimFraction the fraction of the points to be considered in updating a ball k-means
+   * @param randomInit whether to initialize the ball k-means seeds randomly
+   * @param ignoreWeights whether to ignore the invalid final ball k-means weights
+   * @param testProbability the percentage of vectors assigned to the test set for selecting the best final centers
+   * @param numBallKMeansRuns the number of BallKMeans runs in the reducer that determine the centroids to return
+   *                          (clusters are computed for the training set and the error is computed on the test set)
+   * @param measureClass string, name of the distance measure class; theory works for Euclidean-like distances
+   * @param searcherClass string, name of the searcher that will be used for nearest neighbor search
+   * @param searchSize the number of closest neighbors to look at for selecting the closest one in approximate nearest
+   *                   neighbor searches
+   * @param numProjections the number of projected vectors to use for faster searching (only useful for ProjectionSearch
+   *                       or FastProjectionSearch); @see org.apache.mahout.math.neighborhood.ProjectionSearch
+   * @param method the execution method requested (sequential or mapreduce)
+   * @param reduceStreamingKMeans whether the reducer should run another StreamingKMeans pass before BallKMeans
+   * @throws ClassNotFoundException if the measure or searcher class cannot be loaded
+   */
+  public static void configureOptionsForWorkers(Configuration conf,
+                                                int numClusters,
+                                                /* StreamingKMeans */
+                                                int estimatedNumMapClusters, float estimatedDistanceCutoff,
+                                                /* BallKMeans */
+                                                int maxNumIterations, float trimFraction, boolean randomInit,
+                                                boolean ignoreWeights, float testProbability, int numBallKMeansRuns,
+                                                /* Searcher */
+                                                String measureClass, String searcherClass,
+                                                int searchSize, int numProjections,
+                                                String method,
+                                                boolean reduceStreamingKMeans) throws ClassNotFoundException {
+    // Checking preconditions for the parameters.
+    Preconditions.checkArgument(numClusters > 0,
+        "Invalid number of clusters requested: " + numClusters + ". Must be: numClusters > 0!");
+
+    // StreamingKMeans
+    Preconditions.checkArgument(estimatedNumMapClusters > numClusters, "Invalid number of estimated map "
+        + "clusters; There must be more than the final number of clusters (k log n vs k)");
+    Preconditions.checkArgument(estimatedDistanceCutoff == INVALID_DISTANCE_CUTOFF || estimatedDistanceCutoff > 0,
+        "estimatedDistanceCutoff must be equal to -1 or must be greater than 0!");
+
+    // BallKMeans
+    Preconditions.checkArgument(maxNumIterations > 0, "Must have at least one BallKMeans iteration");
+    Preconditions.checkArgument(trimFraction > 0, "trimFraction must be positive");
+    Preconditions.checkArgument(testProbability >= 0 && testProbability < 1, "test probability is not in the "
+        + "interval [0, 1)");
+    Preconditions.checkArgument(numBallKMeansRuns > 0, "numBallKMeans cannot be negative");
+
+    // Searcher
+    if (!searcherClass.contains("Brute")) {
+      // These tests only make sense when a relevant searcher is being used.
+      Preconditions.checkArgument(searchSize > 0, "Invalid searchSize. Must be positive.");
+      if (searcherClass.contains("Projection")) {
+        Preconditions.checkArgument(numProjections > 0, "Invalid numProjections. Must be positive");
+      }
+    }
+
+    // Setting the parameters in the Configuration.
+    conf.setInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, numClusters);
+    /* StreamingKMeans */
+    conf.setInt(ESTIMATED_NUM_MAP_CLUSTERS, estimatedNumMapClusters);
+    if (estimatedDistanceCutoff != INVALID_DISTANCE_CUTOFF) {
+      conf.setFloat(ESTIMATED_DISTANCE_CUTOFF, estimatedDistanceCutoff);
+    }
+    /* BallKMeans */
+    conf.setInt(MAX_NUM_ITERATIONS, maxNumIterations);
+    conf.setFloat(TRIM_FRACTION, trimFraction);
+    conf.setBoolean(RANDOM_INIT, randomInit);
+    conf.setBoolean(IGNORE_WEIGHTS, ignoreWeights);
+    conf.setFloat(TEST_PROBABILITY, testProbability);
+    conf.setInt(NUM_BALLKMEANS_RUNS, numBallKMeansRuns);
+    /* Searcher */
+    // Checks if the measureClass is available, throws exception otherwise.
+    Class.forName(measureClass);
+    conf.set(DefaultOptionCreator.DISTANCE_MEASURE_OPTION, measureClass);
+    // Checks if the searcherClass is available, throws exception otherwise.
+    Class.forName(searcherClass);
+    conf.set(SEARCHER_CLASS_OPTION, searcherClass);
+    conf.setInt(SEARCH_SIZE_OPTION, searchSize);
+    conf.setInt(NUM_PROJECTIONS_OPTION, numProjections);
+    conf.set(DefaultOptionCreator.METHOD_OPTION, method);
+
+    conf.setBoolean(REDUCE_STREAMING_KMEANS, reduceStreamingKMeans);
+
+    log.info("Parameters are: [k] numClusters {}; "
+        + "[SKM] estimatedNumMapClusters {}; estimatedDistanceCutoff {} "
+        + "[BKM] maxNumIterations {}; trimFraction {}; randomInit {}; ignoreWeights {}; "
+        + "testProbability {}; numBallKMeansRuns {}; "
+        + "[S] measureClass {}; searcherClass {}; searcherSize {}; numProjections {}; "
+        + "method {}; reduceStreamingKMeans {}", numClusters, estimatedNumMapClusters, estimatedDistanceCutoff,
+        maxNumIterations, trimFraction, randomInit, ignoreWeights, testProbability, numBallKMeansRuns,
+        measureClass, searcherClass, searchSize, numProjections, method, reduceStreamingKMeans);
+  }
+
+  /**
+   * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
+   * cluster the input vectors.
+   *
+   * @param input the directory pathname for input points.
+   * @param output the directory pathname for output points.
+   * @return 0 on success, -1 on failure.
+   */
+  public static int run(Configuration conf, Path input, Path output)
+      throws IOException, InterruptedException, ClassNotFoundException, ExecutionException {
+    log.info("Starting StreamingKMeans clustering for vectors in {}; results are output to {}",
+        input.toString(), output.toString());
+
+    if (conf.get(DefaultOptionCreator.METHOD_OPTION,
+        DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
+      return runSequentially(conf, input, output);
+    } else {
+      return runMapReduce(conf, input, output);
+    }
+  }
+
+  /**
+   * Runs the whole clustering in the local JVM: one StreamingKMeans thread per input path,
+   * then BallKMeans over the merged intermediate centroids.
+   */
+  private static int runSequentially(Configuration conf, Path input, Path output)
+    throws IOException, ExecutionException, InterruptedException {
+    long start = System.currentTimeMillis();
+    // Run StreamingKMeans step in parallel by spawning 1 thread per input path to process.
+    ExecutorService pool = Executors.newCachedThreadPool();
+    List<Future<Iterable<Centroid>>> intermediateCentroidFutures = new ArrayList<>();
+    for (FileStatus status : HadoopUtil.listStatus(FileSystem.get(conf), input, PathFilters.logsCRCFilter())) {
+      intermediateCentroidFutures.add(pool.submit(new StreamingKMeansThread(status.getPath(), conf)));
+    }
+    log.info("Finished running Mappers");
+    // Merge the resulting "mapper" centroids.
+    List<Centroid> intermediateCentroids = new ArrayList<>();
+    for (Future<Iterable<Centroid>> futureIterable : intermediateCentroidFutures) {
+      for (Centroid centroid : futureIterable.get()) {
+        intermediateCentroids.add(centroid);
+      }
+    }
+    pool.shutdown();
+    pool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
+    log.info("Finished StreamingKMeans");
+    SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf,
+        new Path(output, "part-r-00000"), IntWritable.class, CentroidWritable.class);
+    try {
+      int numCentroids = 0;
+      // Run BallKMeans on the intermediate centroids.
+      for (Vector finalVector : StreamingKMeansReducer.getBestCentroids(intermediateCentroids, conf)) {
+        Centroid finalCentroid = (Centroid) finalVector;
+        writer.append(new IntWritable(numCentroids++), new CentroidWritable(finalCentroid));
+      }
+    } finally {
+      // Close even if BallKMeans throws so the output file handle is not leaked.
+      writer.close();
+    }
+    long end = System.currentTimeMillis();
+    log.info("Finished BallKMeans. Took {} seconds.", (end - start) / 1000.0);
+    return 0;
+  }
+
+  /**
+   * Runs the clustering as a MapReduce job with a single reducer that collects and
+   * clusters the intermediate centroids in memory.
+   */
+  public static int runMapReduce(Configuration conf, Path input, Path output)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    // Prepare Job for submission.
+    Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
+        StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class,
+        StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class, SequenceFileOutputFormat.class,
+        conf);
+    job.setJobName(HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), job,
+        StreamingKMeansMapper.class, StreamingKMeansReducer.class));
+
+    // There is only one reducer so that the intermediate centroids get collected on one
+    // machine and are clustered in memory to get the right number of clusters.
+    job.setNumReduceTasks(1);
+
+    // Set the JAR (so that the required libraries are available) and run.
+    job.setJarByClass(StreamingKMeansDriver.class);
+
+    // Run job!
+    long start = System.currentTimeMillis();
+    if (!job.waitForCompletion(true)) {
+      return -1;
+    }
+    long end = System.currentTimeMillis();
+
+    log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms", output.toString(), end - start);
+    return 0;
+  }
+
+  /**
+   * Constructor to be used by the ToolRunner.
+   */
+  private StreamingKMeansDriver() {}
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new StreamingKMeansDriver(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java
new file mode 100644
index 0000000..f12a876
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.cluster.StreamingKMeans;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+
+public class StreamingKMeansMapper extends Mapper<Writable, VectorWritable, IntWritable, CentroidWritable> {
+  private static final int NUM_ESTIMATE_POINTS = 1000;
+
+  /**
+   * The clusterer object used to cluster the points received by this mapper online.
+   */
+  private StreamingKMeans clusterer;
+
+  /**
+   * Number of points clustered so far.
+   */
+  private int numPoints = 0;
+
+  private boolean estimateDistanceCutoff = false;
+
+  private List<Centroid> estimatePoints;
+
+  @Override
+  public void setup(Context context) {
+    // At this point the configuration received from the Driver is assumed to be valid.
+    // No other checks are made.
+    Configuration conf = context.getConfiguration();
+    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
+    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
+    double estimatedDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
+        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);
+    if (estimatedDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
+      estimateDistanceCutoff = true;
+      estimatePoints = new ArrayList<>();
+    }
+    // There is no way of estimating the distance cutoff unless we have some data.
+    clusterer = new StreamingKMeans(searcher, numClusters, estimatedDistanceCutoff);
+  }
+
+  private void clusterEstimatePoints() {
+    clusterer.setDistanceCutoff(ClusteringUtils.estimateDistanceCutoff(
+        estimatePoints, clusterer.getDistanceMeasure()));
+    clusterer.cluster(estimatePoints);
+    estimateDistanceCutoff = false;
+  }
+
+  @Override
+  public void map(Writable key, VectorWritable point, Context context) {
+    Centroid centroid = new Centroid(numPoints++, point.get(), 1);
+    if (estimateDistanceCutoff) {
+      if (numPoints < NUM_ESTIMATE_POINTS) {
+        estimatePoints.add(centroid);
+      } else if (numPoints == NUM_ESTIMATE_POINTS) {
+        clusterEstimatePoints();
+      }
+    } else {
+      clusterer.cluster(centroid);
+    }
+  }
+
+  @Override
+  public void cleanup(Context context) throws IOException, InterruptedException {
+    // We should cluster the points at the end if they haven't yet been clustered.
+    if (estimateDistanceCutoff) {
+      clusterEstimatePoints();
+    }
+    // Reindex the centroids before passing them to the reducer.
+    clusterer.reindexCentroids();
+    // All outputs have the same key to go to the same final reducer.
+    for (Centroid centroid : clusterer) {
+      context.write(new IntWritable(0), new CentroidWritable(centroid));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java
new file mode 100644
index 0000000..2b78acc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.streaming.cluster.BallKMeans;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class StreamingKMeansReducer extends Reducer<IntWritable, CentroidWritable, IntWritable, CentroidWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(StreamingKMeansReducer.class);
+
+  /**
+   * Configuration for the MapReduce job.
+   */
+  private Configuration conf;
+
+  @Override
+  public void setup(Context context) {
+    // At this point the configuration received from the Driver is assumed to be valid.
+    // No other checks are made.
+    conf = context.getConfiguration();
+  }
+
+  @Override
+  public void reduce(IntWritable key, Iterable<CentroidWritable> centroids,
+                     Context context) throws IOException, InterruptedException {
+    List<Centroid> intermediateCentroids;
+    // There might be too many intermediate centroids to fit into memory, in which case, we run another pass
+    // of StreamingKMeans to collapse the clusters further.
+    if (conf.getBoolean(StreamingKMeansDriver.REDUCE_STREAMING_KMEANS, false)) {
+      intermediateCentroids = Lists.newArrayList(
+          new StreamingKMeansThread(Iterables.transform(centroids, new Function<CentroidWritable, Centroid>() {
+            @Override
+            public Centroid apply(CentroidWritable input) {
+              Preconditions.checkNotNull(input);
+              return input.getCentroid().clone();
+            }
+          }), conf).call());
+    } else {
+      intermediateCentroids = centroidWritablesToList(centroids);
+    }
+
+    int index = 0;
+    for (Vector centroid : getBestCentroids(intermediateCentroids, conf)) {
+      context.write(new IntWritable(index), new CentroidWritable((Centroid) centroid));
+      ++index;
+    }
+  }
+
+  public static List<Centroid> centroidWritablesToList(Iterable<CentroidWritable> centroids) {
+    // A new list must be created because Hadoop iterators mutate the contents of the Writable in
+    // place, without allocating new references when iterating through the centroids Iterable.
+    return Lists.newArrayList(Iterables.transform(centroids, new Function<CentroidWritable, Centroid>() {
+      @Override
+      public Centroid apply(CentroidWritable input) {
+        Preconditions.checkNotNull(input);
+        return input.getCentroid().clone();
+      }
+    }));
+  }
+
+  public static Iterable<Vector> getBestCentroids(List<Centroid> centroids, Configuration conf) {
+
+    if (log.isInfoEnabled()) {
+      log.info("Number of Centroids: {}", centroids.size());
+    }
+
+    int numClusters = conf.getInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, 1);
+    int maxNumIterations = conf.getInt(StreamingKMeansDriver.MAX_NUM_ITERATIONS, 10);
+    float trimFraction = conf.getFloat(StreamingKMeansDriver.TRIM_FRACTION, 0.9f);
+    boolean kMeansPlusPlusInit = !conf.getBoolean(StreamingKMeansDriver.RANDOM_INIT, false);
+    boolean correctWeights = !conf.getBoolean(StreamingKMeansDriver.IGNORE_WEIGHTS, false);
+    float testProbability = conf.getFloat(StreamingKMeansDriver.TEST_PROBABILITY, 0.1f);
+    int numRuns = conf.getInt(StreamingKMeansDriver.NUM_BALLKMEANS_RUNS, 3);
+
+    BallKMeans ballKMeansCluster = new BallKMeans(StreamingKMeansUtilsMR.searcherFromConfiguration(conf),
+        numClusters, maxNumIterations, trimFraction, kMeansPlusPlusInit, correctWeights, testProbability, numRuns);
+    return ballKMeansCluster.cluster(centroids);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java
new file mode 100644
index 0000000..24cc1db
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.cluster.StreamingKMeans;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class StreamingKMeansThread implements Callable<Iterable<Centroid>> {
+  private static final Logger log = LoggerFactory.getLogger(StreamingKMeansThread.class);
+
+  private static final int NUM_ESTIMATE_POINTS = 1000;
+
+  private final Configuration conf;
+  private final Iterable<Centroid> dataPoints;
+
+  public StreamingKMeansThread(Path input, Configuration conf) {
+    this(StreamingKMeansUtilsMR.getCentroidsFromVectorWritable(
+        new SequenceFileValueIterable<VectorWritable>(input, false, conf)), conf);
+  }
+
+  public StreamingKMeansThread(Iterable<Centroid> dataPoints, Configuration conf) {
+    this.dataPoints = dataPoints;
+    this.conf = conf;
+  }
+
+  @Override
+  public Iterable<Centroid> call() {
+    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
+    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
+    double estimateDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
+        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);
+
+    Iterator<Centroid> dataPointsIterator = dataPoints.iterator();
+
+    if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
+      List<Centroid> estimatePoints = new ArrayList<>(NUM_ESTIMATE_POINTS);
+      while (dataPointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) {
+        Centroid centroid = dataPointsIterator.next();
+        estimatePoints.add(centroid);
+      }
+
+      if (log.isInfoEnabled()) {
+        log.info("Estimated Points: {}", estimatePoints.size());
+      }
+      estimateDistanceCutoff = ClusteringUtils.estimateDistanceCutoff(estimatePoints, searcher.getDistanceMeasure());
+    }
+
+    StreamingKMeans streamingKMeans = new StreamingKMeans(searcher, numClusters, estimateDistanceCutoff);
+
+    // datapointsIterator could be empty if no estimate distance was initially provided
+    // hence creating the iterator again here for the clustering
+    if (!dataPointsIterator.hasNext()) {
+      dataPointsIterator = dataPoints.iterator();
+    }
+
+    while (dataPointsIterator.hasNext()) {
+      streamingKMeans.cluster(dataPointsIterator.next());
+    }
+
+    streamingKMeans.reindexCentroids();
+    return streamingKMeans;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java
new file mode 100644
index 0000000..f00cf56
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.neighborhood.BruteSearch;
+import org.apache.mahout.math.neighborhood.FastProjectionSearch;
+import org.apache.mahout.math.neighborhood.LocalitySensitiveHashSearch;
+import org.apache.mahout.math.neighborhood.ProjectionSearch;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+
+public final class StreamingKMeansUtilsMR {
+
+  private StreamingKMeansUtilsMR() {
+  }
+
+  /**
+   * Instantiates a searcher from a given configuration.
+   * @param conf the configuration
+   * @return the instantiated searcher
+   * @throws RuntimeException if the distance measure class cannot be instantiated
+   * @throws IllegalStateException if an unknown searcher class was requested
+   */
+  public static UpdatableSearcher searcherFromConfiguration(Configuration conf) {
+    DistanceMeasure distanceMeasure;
+    String distanceMeasureClass = conf.get(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    try {
+      distanceMeasure = (DistanceMeasure) Class.forName(distanceMeasureClass).getConstructor().newInstance();
+    } catch (Exception e) {
+      throw new RuntimeException("Failed to instantiate distanceMeasure", e);
+    }
+
+    int numProjections =  conf.getInt(StreamingKMeansDriver.NUM_PROJECTIONS_OPTION, 20);
+    int searchSize =  conf.getInt(StreamingKMeansDriver.SEARCH_SIZE_OPTION, 10);
+
+    String searcherClass = conf.get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION);
+
+    if (searcherClass.equals(BruteSearch.class.getName())) {
+      return ClassUtils.instantiateAs(searcherClass, UpdatableSearcher.class,
+          new Class[]{DistanceMeasure.class}, new Object[]{distanceMeasure});
+    } else if (searcherClass.equals(FastProjectionSearch.class.getName())
+        || searcherClass.equals(ProjectionSearch.class.getName())) {
+      return ClassUtils.instantiateAs(searcherClass, UpdatableSearcher.class,
+          new Class[]{DistanceMeasure.class, int.class, int.class},
+          new Object[]{distanceMeasure, numProjections, searchSize});
+    } else if (searcherClass.equals(LocalitySensitiveHashSearch.class.getName())) {
+      return ClassUtils.instantiateAs(searcherClass, LocalitySensitiveHashSearch.class,
+          new Class[]{DistanceMeasure.class, int.class},
+          new Object[]{distanceMeasure, searchSize});
+    } else {
+      throw new IllegalStateException("Unknown class instantiation requested");
+    }
+  }
+
+  /**
+   * Returns an Iterable of centroids from an Iterable of VectorWritables by creating a new Centroid containing
+   * a RandomAccessSparseVector as a delegate for each VectorWritable.
+   * @param inputIterable VectorWritable Iterable to get Centroids from
+   * @return the new Centroids
+   */
+  public static Iterable<Centroid> getCentroidsFromVectorWritable(Iterable<VectorWritable> inputIterable) {
+    return Iterables.transform(inputIterable, new Function<VectorWritable, Centroid>() {
+      private int numVectors = 0;
+      @Override
+      public Centroid apply(VectorWritable input) {
+        Preconditions.checkNotNull(input);
+        return new Centroid(numVectors++, new RandomAccessSparseVector(input.get()), 1);
+      }
+    });
+  }
+
+  /**
+   * Returns an Iterable of Centroid from an Iterable of Vector by either casting each Vector to Centroid (if the
+   * instance extends Centroid) or create a new Centroid based on that Vector.
+   * The implicit expectation is that the input will not have interleaving types of vectors. Otherwise, the numbering
+   * of new Centroids will become invalid.
+   * @param input Iterable of Vectors to cast
+   * @return the new Centroids
+   */
+  public static Iterable<Centroid> castVectorsToCentroids(Iterable<Vector> input) {
+    return Iterables.transform(input, new Function<Vector, Centroid>() {
+      private int numVectors = 0;
+      @Override
+      public Centroid apply(Vector input) {
+        Preconditions.checkNotNull(input);
+        if (input instanceof Centroid) {
+          return (Centroid) input;
+        } else {
+          return new Centroid(numVectors++, input, 1);
+        }
+      }
+    });
+  }
+
+  /**
+   * Writes centroids to a sequence file.
+   * @param centroids the centroids to write.
+   * @param path the path of the output file.
+   * @param conf the configuration for the HDFS to write the file to.
+   * @throws java.io.IOException
+   */
+  public static void writeCentroidsToSequenceFile(Iterable<Centroid> centroids, Path path, Configuration conf)
+    throws IOException {
+    try (SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf,
+        path, IntWritable.class, CentroidWritable.class)) {
+      int i = 0;
+      for (Centroid centroid : centroids) {
+        writer.append(new IntWritable(i++), new CentroidWritable(centroid));
+      }
+    }
+  }
+
+  public static void writeVectorsToSequenceFile(Iterable<? extends Vector> datapoints, Path path, Configuration conf)
+    throws IOException {
+    try (SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf,
+        path, IntWritable.class, VectorWritable.class)){
+      int i = 0;
+      for (Vector vector : datapoints) {
+        writer.append(new IntWritable(i++), new VectorWritable(vector));
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
new file mode 100644
index 0000000..d7ca554
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+
+public class ResplitSequenceFiles {
+
+  private String inputFile;
+  private String outputFileBase;
+  private int numSplits;
+
+  private Configuration conf;
+  private FileSystem fs;
+
+  private ResplitSequenceFiles() {}
+
+  private void writeSplit(Iterator<Pair<Writable, Writable>> inputIterator,
+                          int numSplit, int numEntriesPerSplit) throws IOException {
+    SequenceFile.Writer splitWriter = null;
+    for (int j = 0; j < numEntriesPerSplit; ++j) {
+      Pair<Writable, Writable> item = inputIterator.next();
+      if (splitWriter == null) {
+        splitWriter = SequenceFile.createWriter(fs, conf,
+            new Path(outputFileBase + "-" + numSplit), item.getFirst().getClass(), item.getSecond().getClass());
+      }
+      splitWriter.append(item.getFirst(), item.getSecond());
+    }
+    if (splitWriter != null) {
+      splitWriter.close();
+    }
+  }
+
+  private void run(PrintWriter printWriter) throws IOException {
+    conf = new Configuration();
+    SequenceFileDirIterable<Writable, Writable> inputIterable = new
+        SequenceFileDirIterable<>(new Path(inputFile), PathType.LIST, conf);
+    fs = FileSystem.get(conf);
+
+    int numEntries = Iterables.size(inputIterable);
+    int numEntriesPerSplit = numEntries / numSplits;
+    int numEntriesLastSplit = numEntriesPerSplit + numEntries - numEntriesPerSplit * numSplits;
+    Iterator<Pair<Writable, Writable>> inputIterator = inputIterable.iterator();
+
+    printWriter.printf("Writing %d splits\n", numSplits);
+    for (int i = 0; i < numSplits - 1; ++i) {
+      printWriter.printf("Writing split %d\n", i);
+      writeSplit(inputIterator, i, numEntriesPerSplit);
+    }
+    printWriter.printf("Writing split %d\n", numSplits - 1);
+    writeSplit(inputIterator, numSplits - 1, numEntriesLastSplit);
+  }
+
+  private boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+        .withShortName("i")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+        .withDescription("what the base folder for sequence files is (they all must have the same key/value type")
+        .create();
+
+    Option outputFileOption = builder.withLongName("output")
+        .withShortName("o")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+        .withDescription("the base name of the file split that the files will be split it; the i'th split has the "
+            + "suffix -i")
+        .create();
+
+    Option numSplitsOption = builder.withLongName("numSplits")
+        .withShortName("ns")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("numSplits").withMaximum(1).create())
+        .withDescription("how many splits to use for the given files")
+        .create();
+
+    Group normalArgs = new GroupBuilder()
+        .withOption(help)
+        .withOption(inputFileOption)
+        .withOption(outputFileOption)
+        .withOption(numSplitsOption)
+        .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = (String) cmdLine.getValue(inputFileOption);
+    outputFileBase = (String) cmdLine.getValue(outputFileOption);
+    numSplits = Integer.parseInt((String) cmdLine.getValue(numSplitsOption));
+    return true;
+  }
+
+  public static void main(String[] args) throws IOException {
+    ResplitSequenceFiles runner = new ResplitSequenceFiles();
+    if (runner.parseArgs(args)) {
+      runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java
new file mode 100644
index 0000000..11bc34a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown;
+
+import java.io.File;
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Contains list of all internal paths used in top down clustering.
+ */
+public final class PathDirectory {
+
+  public static final String TOP_LEVEL_CLUSTER_DIRECTORY = "topLevelCluster";
+  public static final String POST_PROCESS_DIRECTORY = "clusterPostProcessed";
+  public static final String CLUSTERED_POINTS_DIRECTORY = "clusteredPoints";
+  public static final String BOTTOM_LEVEL_CLUSTER_DIRECTORY = "bottomLevelCluster";
+
+  private PathDirectory() {
+  }
+
+  /**
+   * All output of top level clustering is stored in output directory/topLevelCluster.
+   * 
+   * @param output
+   *          the output path of clustering.
+   * @return The top level Cluster Directory.
+   */
+  public static Path getTopLevelClusterPath(Path output) {
+    return new Path(output + File.separator + TOP_LEVEL_CLUSTER_DIRECTORY);
+  }
+  
+  /**
+   * The output of top level clusters is post processed and kept in this path.
+   * 
+   * @param outputPathProvidedByUser
+   *          the output path of clustering.
+   * @return the path where the output of top level cluster post processor is kept.
+   */
+  public static Path getClusterPostProcessorOutputDirectory(Path outputPathProvidedByUser) {
+    return new Path(outputPathProvidedByUser + File.separator + POST_PROCESS_DIRECTORY);
+  }
+  
+  /**
+   * The top level clustered points before post processing is generated here.
+   * 
+   * @param output
+   *          the output path of clustering.
+   * @return the clustered points directory
+   */
+  public static Path getClusterOutputClusteredPoints(Path output) {
+    return new Path(output + File.separator + CLUSTERED_POINTS_DIRECTORY + File.separator, "*");
+  }
+  
+  /**
+   * Each cluster produced by top level clustering is processed in output/"bottomLevelCluster"/clusterId.
+   * 
+   * @param output
+   * @param clusterId
+   * @return the bottom level clustering path.
+   */
+  public static Path getBottomLevelClusterPath(Path output, String clusterId) {
+    return new Path(output + File.separator + BOTTOM_LEVEL_CLUSTER_DIRECTORY + File.separator + clusterId);
+  }
+  
+  /**
+   * Each clusters path name is its clusterId. The vectors reside in separate files inside it.
+   * 
+   * @param clusterPostProcessorOutput
+   *          the path of cluster post processor output.
+   * @param clusterId
+   *          the id of the cluster.
+   * @return the cluster path for cluster id.
+   */
+  public static Path getClusterPathForClusterId(Path clusterPostProcessorOutput, String clusterId) {
+    return new Path(clusterPostProcessorOutput + File.separator + clusterId);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java
new file mode 100644
index 0000000..d0563fd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Reads the number of clusters produced by the clustering algorithm.
+ */
+public final class ClusterCountReader {
+
+  private ClusterCountReader() {
+  }
+
+  /**
+   * Reads the number of clusters present by reading the clusters-*-final file.
+   *
+   * @param clusterOutputPath The output path provided to the clustering algorithm.
+   * @param conf              The hadoop configuration.
+   * @return the number of final clusters.
+   */
+  public static int getNumberOfClusters(Path clusterOutputPath, Configuration conf) throws IOException {
+    FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+    FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+    int numberOfClusters = 0;
+    Iterator<?> it = new SequenceFileDirValueIterator<>(clusterFiles[0].getPath(),
+            PathType.LIST,
+            PathFilters.partFilter(),
+            null,
+            true,
+            conf);
+    while (it.hasNext()) {
+      it.next();
+      numberOfClusters++;
+    }
+    return numberOfClusters;
+  }
+
+  /**
+   * Generates a list of all cluster ids by reading the clusters-*-final file.
+   *
+   * @param clusterOutputPath The output path provided to the clustering algorithm.
+   * @param conf              The hadoop configuration.
+   * @return An ArrayList containing the final cluster ids.
+   */
+  public static Map<Integer, Integer> getClusterIDs(Path clusterOutputPath, Configuration conf, boolean keyIsClusterId)
+    throws IOException {
+    Map<Integer, Integer> clusterIds = new HashMap<>();
+    FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+    FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+    //System.out.println("LOOK HERE: " + clusterOutputPath);
+    Iterator<ClusterWritable> it = new SequenceFileDirValueIterator<>(clusterFiles[0].getPath(),
+            PathType.LIST,
+            PathFilters.partFilter(),
+            null,
+            true,
+            conf);
+    int i = 0;
+    while (it.hasNext()) {
+      Integer key;
+      Integer value;
+      if (keyIsClusterId) { // key is the cluster id, value is i, the index we will use
+        key = it.next().getValue().getId();
+        value = i;
+      } else {
+        key = i;
+        value = it.next().getValue().getId();
+      }
+      clusterIds.put(key, value);
+      i++;
+    }
+    return clusterIds;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java
new file mode 100644
index 0000000..ded76ad
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Writer;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.IOUtils;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * This class reads the output of any clustering algorithm, and, creates separate directories for different
+ * clusters. Each cluster directory's name is its clusterId. Each and every point is written in the cluster
+ * directory associated with that point.
+ * <p/>
+ * This class incorporates a sequential algorithm and is appropriate for use for data which has been clustered
+ * sequentially.
+ * <p/>
+ * The sequential and non sequential version, both are being used from {@link ClusterOutputPostProcessorDriver}.
+ */
+public final class ClusterOutputPostProcessor {
+
+  private Path clusteredPoints;
+  private final FileSystem fileSystem;
+  private final Configuration conf;
+  private final Path clusterPostProcessorOutput;
+  private final Map<String, Path> postProcessedClusterDirectories = new HashMap<>();
+  private long uniqueVectorId = 0L;
+  private final Map<String, SequenceFile.Writer> writersForClusters;
+
+  public ClusterOutputPostProcessor(Path clusterOutputToBeProcessed,
+                                    Path output,
+                                    Configuration hadoopConfiguration) throws IOException {
+    this.clusterPostProcessorOutput = output;
+    this.clusteredPoints = PathDirectory.getClusterOutputClusteredPoints(clusterOutputToBeProcessed);
+    this.conf = hadoopConfiguration;
+    this.writersForClusters = new HashMap<>();
+    fileSystem = clusteredPoints.getFileSystem(conf);
+  }
+
+  /**
+   * This method takes the clustered points output by the clustering algorithms as input and writes them into
+   * their respective clusters.
+   */
+  public void process() throws IOException {
+    createPostProcessDirectory();
+    for (Pair<?, WeightedVectorWritable> record
+        : new SequenceFileDirIterable<Writable, WeightedVectorWritable>(clusteredPoints, PathType.GLOB, PathFilters.partFilter(),
+                                                                        null, false, conf)) {
+      String clusterId = record.getFirst().toString().trim();
+      putVectorInRespectiveCluster(clusterId, record.getSecond());
+    }
+    IOUtils.close(writersForClusters.values());
+    writersForClusters.clear();
+  }
+
+  /**
+   * Creates the directory to put post processed clusters.
+   */
+  private void createPostProcessDirectory() throws IOException {
+    if (!fileSystem.exists(clusterPostProcessorOutput)
+            && !fileSystem.mkdirs(clusterPostProcessorOutput)) {
+      throw new IOException("Error creating cluster post processor directory");
+    }
+  }
+
+  /**
+   * Finds out the cluster directory of the vector and writes it into the specified cluster.
+   */
+  private void putVectorInRespectiveCluster(String clusterId, WeightedVectorWritable point) throws IOException {
+    Writer writer = findWriterForVector(clusterId);
+    postProcessedClusterDirectories.put(clusterId,
+            PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId));
+    writeVectorToCluster(writer, point);
+  }
+
+  /**
+   * Finds out the path in cluster where the point is supposed to be written.
+   */
+  private Writer findWriterForVector(String clusterId) throws IOException {
+    Path clusterDirectory = PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId);
+    Writer writer = writersForClusters.get(clusterId);
+    if (writer == null) {
+      Path pathToWrite = new Path(clusterDirectory, new Path("part-m-0"));
+      writer = new Writer(fileSystem, conf, pathToWrite, LongWritable.class, VectorWritable.class);
+      writersForClusters.put(clusterId, writer);
+    }
+    return writer;
+  }
+
+  /**
+   * Writes vector to the cluster directory.
+   */
+  private void writeVectorToCluster(Writer writer, WeightedVectorWritable point) throws IOException {
+    writer.append(new LongWritable(uniqueVectorId++), new VectorWritable(point.getVector()));
+    writer.sync();
+  }
+
+  /**
+   * @return the set of all post processed cluster paths.
+   */
+  public Map<String, Path> getPostProcessedClusterDirectories() {
+    return postProcessedClusterDirectories;
+  }
+
+  public void setClusteredPoints(Path clusteredPoints) {
+    this.clusteredPoints = clusteredPoints;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java
new file mode 100644
index 0000000..82a3071
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java
@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+/**
+ * Post processes the output of clustering algorithms and groups them into respective clusters. Ideal to be
+ * used for top down clustering. It can also be used if the clustering output needs to be grouped into their
+ * respective clusters.
+ */
+public final class ClusterOutputPostProcessorDriver extends AbstractJob {
+
+  /**
+   * CLI to run clustering post processor. The input to post processor is the ouput path specified to the
+   * clustering.
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.methodOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+    Path input = getInputPath();
+    Path output = getOutputPath();
+
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+            DefaultOptionCreator.SEQUENTIAL_METHOD);
+    run(input, output, runSequential);
+    return 0;
+
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new ClusterOutputPostProcessorDriver(), args);
+  }
+
+  /**
+   * Post processes the output of clustering algorithms and groups them into respective clusters. Each
+   * cluster's vectors are written into a directory named after its clusterId.
+   *
+   * @param input         The output path provided to the clustering algorithm, whose would be post processed. Hint: The
+   *                      path of the directory containing clusters-*-final and clusteredPoints.
+   * @param output        The post processed data would be stored at this path.
+   * @param runSequential If set to true, post processes it sequentially, else, uses. MapReduce. Hint: If the clustering
+   *                      was done sequentially, make it sequential, else vice versa.
+   */
+  public static void run(Path input, Path output, boolean runSequential) throws IOException,
+          InterruptedException,
+          ClassNotFoundException {
+    if (runSequential) {
+      postProcessSeq(input, output);
+    } else {
+      Configuration conf = new Configuration();
+      postProcessMR(conf, input, output);
+      movePartFilesToRespectiveDirectories(conf, output);
+    }
+
+  }
+
+  /**
+   * Process Sequentially. Reads the vectors one by one, and puts them into respective directory, named after
+   * their clusterId.
+   *
+   * @param input  The output path provided to the clustering algorithm, whose would be post processed. Hint : The
+   *               path of the directory containing clusters-*-final and clusteredPoints.
+   * @param output The post processed data would be stored at this path.
+   */
+  private static void postProcessSeq(Path input, Path output) throws IOException {
+    ClusterOutputPostProcessor clusterOutputPostProcessor = new ClusterOutputPostProcessor(input, output,
+            new Configuration());
+    clusterOutputPostProcessor.process();
+  }
+
+  /**
+   * Process as a map reduce job. The numberOfReduceTasks is set to the number of clusters present in the
+   * output. So that each cluster's vector is written in its own part file.
+   *
+   * @param conf   The hadoop configuration.
+   * @param input  The output path provided to the clustering algorithm, whose would be post processed. Hint : The
+   *               path of the directory containing clusters-*-final and clusteredPoints.
+   * @param output The post processed data would be stored at this path.
+   */
+  private static void postProcessMR(Configuration conf, Path input, Path output) throws IOException,
+          InterruptedException,
+          ClassNotFoundException {
+    System.out.println("WARNING: If you are running in Hadoop local mode, please use the --sequential option, "
+        + "as the MapReduce option will not work properly");
+    int numberOfClusters = ClusterCountReader.getNumberOfClusters(input, conf);
+    conf.set("clusterOutputPath", input.toString());
+    Job job = new Job(conf, "ClusterOutputPostProcessor Driver running over input: " + input);
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    job.setMapperClass(ClusterOutputPostProcessorMapper.class);
+    job.setMapOutputKeyClass(IntWritable.class);
+    job.setMapOutputValueClass(VectorWritable.class);
+    job.setReducerClass(ClusterOutputPostProcessorReducer.class);
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(VectorWritable.class);
+    job.setNumReduceTasks(numberOfClusters);
+    job.setJarByClass(ClusterOutputPostProcessorDriver.class);
+
+    FileInputFormat.addInputPath(job, new Path(input, new Path("clusteredPoints")));
+    FileOutputFormat.setOutputPath(job, output);
+    if (!job.waitForCompletion(true)) {
+      throw new InterruptedException("ClusterOutputPostProcessor Job failed processing " + input);
+    }
+  }
+
+  /**
+   * The mapreduce version of the post processor writes different clusters into different part files. This
+   * method reads the part files and moves them into directories named after their clusterIds.
+   *
+   * @param conf   The hadoop configuration.
+   * @param output The post processed data would be stored at this path.
+   */
+  private static void movePartFilesToRespectiveDirectories(Configuration conf, Path output) throws IOException {
+    FileSystem fileSystem = output.getFileSystem(conf);
+    for (FileStatus fileStatus : fileSystem.listStatus(output, PathFilters.partFilter())) {
+      SequenceFileIterator<Writable, Writable> it =
+              new SequenceFileIterator<>(fileStatus.getPath(), true, conf);
+      if (it.hasNext()) {
+        renameFile(it.next().getFirst(), fileStatus, conf);
+      }
+      it.close();
+    }
+  }
+
+  /**
+   * Using @FileSystem rename method to move the file.
+   */
+  private static void renameFile(Writable key, FileStatus fileStatus, Configuration conf) throws IOException {
+    Path path = fileStatus.getPath();
+    FileSystem fileSystem = path.getFileSystem(conf);
+    Path subDir = new Path(key.toString());
+    Path renameTo = new Path(path.getParent(), subDir);
+    fileSystem.mkdirs(renameTo);
+    fileSystem.rename(path, renameTo);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java
new file mode 100644
index 0000000..6834362
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * Mapper for post processing cluster output.
+ */
+public class ClusterOutputPostProcessorMapper extends
+        Mapper<IntWritable, WeightedVectorWritable, IntWritable, VectorWritable> {
+
+  private Map<Integer, Integer> newClusterMappings;
+  private VectorWritable outputVector;
+
+  //read the current cluster ids, and populate the cluster mapping hash table
+  @Override
+  public void setup(Context context) throws IOException {
+    Configuration conf = context.getConfiguration();
+    //this give the clusters-x-final directory where the cluster ids can be read
+    Path clusterOutputPath = new Path(conf.get("clusterOutputPath"));
+    //we want the key to be the cluster id, the value to be the index
+    newClusterMappings = ClusterCountReader.getClusterIDs(clusterOutputPath, conf, true);
+    outputVector = new VectorWritable();
+  }
+
+  @Override
+  public void map(IntWritable key, WeightedVectorWritable val, Context context)
+    throws IOException, InterruptedException {
+    // by pivoting on the cluster mapping value, we can make sure that each unique cluster goes to it's own reducer,
+    // since they are numbered from 0 to k-1, where k is the number of clusters
+    outputVector.set(val.getVector());
+    context.write(new IntWritable(newClusterMappings.get(key.get())), outputVector);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java
new file mode 100644
index 0000000..58dada4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * Reducer for post processing cluster output.
+ */
+public class ClusterOutputPostProcessorReducer
+    extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+  private Map<Integer, Integer> reverseClusterMappings;
+
+  //read the current cluster ids, and populate the hash cluster mapping hash table
+  @Override
+  public void setup(Context context) throws IOException {
+    Configuration conf = context.getConfiguration();
+    Path clusterOutputPath = new Path(conf.get("clusterOutputPath"));
+    //we want to the key to be the index, the value to be the cluster id
+    reverseClusterMappings = ClusterCountReader.getClusterIDs(clusterOutputPath, conf, false);
+  }
+
+  /**
+   * The key is the remapped cluster id and the values contains the vectors in that cluster.
+   */
+  @Override
+  protected void reduce(IntWritable key, Iterable<VectorWritable> values, Context context) throws IOException,
+          InterruptedException {
+    //remap the cluster back to its original id
+    //and then output the vectors with their correct
+    //cluster id.
+    IntWritable outKey = new IntWritable(reverseClusterMappings.get(key.get()));
+    System.out.println(outKey + " this: " + this);
+    for (VectorWritable value : values) {
+      context.write(outKey, value);
+    }
+  }
+
+}


[48/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/country.txt b/community/mahout-mr/examples/bin/resources/country.txt
deleted file mode 100644
index 6a22091..0000000
--- a/community/mahout-mr/examples/bin/resources/country.txt
+++ /dev/null
@@ -1,229 +0,0 @@
-Afghanistan
-Albania
-Algeria
-American Samoa
-Andorra
-Angola
-Anguilla
-Antigua and Barbuda
-Argentina
-Armenia
-Aruba
-Australia
-Austria
-Azerbaijan
-Bahamas
-Bangladesh
-Barbados
-Belarus
-Belgium
-Belize
-Benin
-Bermuda
-Bhutan
-Bolivia
-Bosnia and Herzegovina
-Botswana
-Bouvet Island
-Brazil
-British Indian Ocean Territory
-Brunei Darussalam
-Bulgaria
-Burkina Faso
-Burundi
-Cambodia
-Cameroon
-Canada
-Cape Verde
-Cayman Islands
-Central African Republic
-Chad
-Chile
-China
-Christmas Island
-Cocos  Islands
-Colombia
-Comoros
-Congo
-Cook Islands
-Costa Rica
-Croatia
-Côte d'Ivoire
-Cuba
-Cyprus
-Czech Republic
-Djibouti
-Dominica
-Dominican Republic
-Ecuador
-Egypt
-El Salvador
-Equatorial Guinea
-Eritrea
-Estonia
-Ethiopia
-Falkland Islands 
-Faroe Islands
-Fiji
-Finland
-France
-French Guiana
-French Polynesia
-French Southern Territories
-Gabon
-Georgia
-Germany
-Ghana
-Gibraltar
-Greece
-Greenland
-Grenada
-Guadeloupe
-Guam
-Guatemala
-Guernsey
-Guinea
-Guinea-Bissau
-Guyana
-Haiti
-Honduras
-Hong Kong
-Hungary
-Iceland
-India
-Indonesia
-Iran
-Iraq
-Ireland
-Isle of Man
-Israel
-Italy
-Japan
-Jersey
-Jordan
-Kazakhstan
-Kenya
-Kiribati
-Korea
-Kuwait
-Kyrgyzstan
-Latvia
-Lebanon
-Lesotho
-Liberia
-Liechtenstein
-Lithuania
-Luxembourg
-Macedonia
-Madagascar
-Malawi
-Malaysia
-Maldives
-Mali
-Malta
-Marshall Islands
-Martinique
-Mauritania
-Mauritius
-Mayotte
-Mexico
-Micronesia
-Moldova
-Monaco
-Mongolia
-Montenegro
-Montserrat
-Morocco
-Mozambique
-Myanmar
-Namibia
-Nauru
-Nepal
-Netherlands
-Netherlands Antilles
-New Caledonia
-New Zealand
-Nicaragua
-Niger
-Nigeria
-Niue
-Norfolk Island
-Northern Mariana Islands
-Norway
-Oman
-Pakistan
-Palau
-Palestinian Territory
-Panama
-Papua New Guinea
-Paraguay
-Peru
-Philippines
-Pitcairn
-Poland
-Portugal
-Puerto Rico
-Qatar
-Réunion
-Russian Federation
-Rwanda
-Saint Barthélemy
-Saint Helena
-Saint Kitts and Nevis
-Saint Lucia
-Saint Martin 
-Saint Pierre and Miquelon
-Saint Vincent and the Grenadines
-Samoa
-San Marino
-Sao Tome and Principe
-Saudi Arabia
-Senegal
-Serbia
-Seychelles
-Sierra Leone
-Singapore
-Slovakia
-Slovenia
-Solomon Islands
-Somalia
-South Africa
-South Georgia and the South Sandwich Islands
-Spain
-Sri Lanka
-Sudan
-Suriname
-Svalbard and Jan Mayen
-Swaziland
-Sweden
-Switzerland
-Syrian Arab Republic
-Taiwan
-Tanzania
-Thailand
-Timor-Leste
-Togo
-Tokelau
-Tonga
-Trinidad and Tobago
-Tunisia
-Turkey
-Turkmenistan
-Turks and Caicos Islands
-Tuvalu
-Ukraine
-United Arab Emirates
-United Kingdom
-United States
-United States Minor Outlying Islands
-Uruguay
-Uzbekistan
-Vanuatu
-Vatican 
-Venezuela
-Vietnam
-Virgin Islands
-Wallis and Futuna
-Yemen
-Zambia
-Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/country10.txt b/community/mahout-mr/examples/bin/resources/country10.txt
deleted file mode 100644
index 97a63e1..0000000
--- a/community/mahout-mr/examples/bin/resources/country10.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-Australia
-Austria
-Bahamas
-Canada
-Colombia
-Cuba
-Panama
-Pakistan
-United Kingdom
-Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/country2.txt b/community/mahout-mr/examples/bin/resources/country2.txt
deleted file mode 100644
index f4b4f61..0000000
--- a/community/mahout-mr/examples/bin/resources/country2.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-United States
-United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/donut-test.csv b/community/mahout-mr/examples/bin/resources/donut-test.csv
deleted file mode 100644
index 46ea564..0000000
--- a/community/mahout-mr/examples/bin/resources/donut-test.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","xx","xy","yy","c","a","b"
-0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
-0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
-0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
-0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
-0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
-0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
-0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
-0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
-0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
-0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
-0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
-0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
-0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
-0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
-0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
-0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
-0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
-0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
-0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
-0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
-0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
-0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
-0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
-0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
-0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
-0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
-0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
-0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
-0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
-0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
-0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
-0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
-0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
-0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
-0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
-0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
-0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
-0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
-0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
-0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/donut.csv b/community/mahout-mr/examples/bin/resources/donut.csv
deleted file mode 100644
index 33ba3b7..0000000
--- a/community/mahout-mr/examples/bin/resources/donut.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
-0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
-0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
-0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
-0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
-0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
-0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
-0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
-0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
-0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
-0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
-0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
-0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
-0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
-0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
-0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
-0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
-0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
-0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
-0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
-0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
-0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
-0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
-0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
-0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
-0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
-0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
-0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
-0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
-0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
-0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
-0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
-0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
-0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
-0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
-0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
-0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
-0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
-0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
-0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
-0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/test-data.csv b/community/mahout-mr/examples/bin/resources/test-data.csv
deleted file mode 100644
index ab683cd..0000000
--- a/community/mahout-mr/examples/bin/resources/test-data.csv
+++ /dev/null
@@ -1,61 +0,0 @@
-"V1","V2","V3","V4","V5","V6","V7","V8","y"
-1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
-1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
-1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
-1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
-1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
-1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
-1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
-1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
-1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
-1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
-1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
-1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
-1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
-1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
-1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
-1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
-1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
-1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
-1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
-1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
-1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
-1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
-1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
-1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
-1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
-1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
-1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
-1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
-1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
-1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
-1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
-1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
-1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
-1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
-1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
-1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
-1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
-1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
-1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
-1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
-1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
-1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
-1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
-1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
-1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
-1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
-1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
-1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
-1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
-1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
-1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
-1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
-1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
-1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
-1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
-1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
-1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
-1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
-1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
-1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/set-dfs-commands.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/set-dfs-commands.sh b/community/mahout-mr/examples/bin/set-dfs-commands.sh
deleted file mode 100755
index 0ee5fe1..0000000
--- a/community/mahout-mr/examples/bin/set-dfs-commands.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-#   
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# 
-# 
-# Requires $HADOOP_HOME to be set.
-#
-# Figures out the major version of Hadoop we're using and sets commands
-# for dfs commands
-#
-# Run by each example script.
-
-# Find a hadoop shell
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-  HADOOP="${HADOOP_HOME}/bin/hadoop"
-  if [ ! -e $HADOOP ]; then
-    echo "Can't find hadoop in $HADOOP, exiting"
-    exit 1
-  fi
-fi
-
-# Check Hadoop version
-v=`${HADOOP_HOME}/bin/hadoop version | egrep "Hadoop [0-9]+.[0-9]+.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'`
-
-if [ $v -eq "1" -o $v -eq "0" ]
-then
-  echo "Discovered Hadoop v0 or v1."
-  export DFS="${HADOOP_HOME}/bin/hadoop dfs"
-  export DFSRM="$DFS -rmr -skipTrash"
-elif [ $v -eq "2" ]
-then
-  echo "Discovered Hadoop v2."
-  export DFS="${HADOOP_HOME}/bin/hdfs dfs"
-  export DFSRM="$DFS -rm -r -skipTrash"
-else
-  echo "Can't determine Hadoop version."
-  exit 1
-fi
-echo "Setting dfs command to $DFS, dfs rm to $DFSRM."
-
-export HVERSION=$v 

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/pom.xml b/community/mahout-mr/examples/pom.xml
deleted file mode 100644
index 28a5795..0000000
--- a/community/mahout-mr/examples/pom.xml
+++ /dev/null
@@ -1,199 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.mahout</groupId>
-    <artifactId>mahout-mr</artifactId>
-    <version>0.14.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>mr-examples</artifactId>
-  <name>Mahout Examples</name>
-  <description>Scalable machine learning library examples</description>
-
-  <packaging>jar</packaging>
-  <properties>
-    <mahout.skip.example>false</mahout.skip.example>
-  </properties>
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>copy-dependencies</id>
-            <phase>package</phase>
-            <goals>
-              <goal>copy-dependencies</goal>
-            </goals>
-            <configuration>
-              <!-- configure the plugin here -->
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-
-      <!-- create examples hadoop job jar -->
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-assembly-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>job</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <skipAssembly>${mahout.skip.example}</skipAssembly>
-              <descriptors>
-                <descriptor>src/main/assembly/job.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-
-
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-remote-resources-plugin</artifactId>
-        <configuration>
-          <appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory>
-          <resourceBundles>
-            <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
-          </resourceBundles>
-          <supplementalModels>
-            <supplementalModel>supplemental-models.xml</supplementalModel>
-          </supplementalModels>
-        </configuration>
-      </plugin>
-
-      <plugin>
-        <artifactId>maven-source-plugin</artifactId>
-      </plugin>
-
-      <plugin>
-        <groupId>org.mortbay.jetty</groupId>
-        <artifactId>maven-jetty-plugin</artifactId>
-        <version>6.1.26</version>
-      </plugin>
-    </plugins>
-
-  </build>
-
-  <dependencies>
-
-    <!-- our modules -->
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-hdfs</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-mr</artifactId>
-    </dependency>
-   <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-hdfs</artifactId>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-mr</artifactId>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-math</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-math</artifactId>
-      <type>test-jar</type>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-integration</artifactId>
-    </dependency>
-
-    <dependency>
-        <groupId>org.apache.lucene</groupId>
-        <artifactId>lucene-benchmark</artifactId>
-    </dependency>
-    <dependency>
-        <groupId>org.apache.lucene</groupId>
-        <artifactId>lucene-analyzers-common</artifactId>
-    </dependency>
-
-    <dependency>
-      <groupId>com.carrotsearch.randomizedtesting</groupId>
-      <artifactId>randomizedtesting-runner</artifactId>
-    </dependency>
-
-    <dependency>
-      <groupId>org.easymock</groupId>
-      <artifactId>easymock</artifactId>
-    </dependency>
-
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-    </dependency>
-
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-api</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>jcl-over-slf4j</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>commons-logging</groupId>
-      <artifactId>commons-logging</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>log4j</groupId>
-      <artifactId>log4j</artifactId>
-    </dependency>
-
-  </dependencies>
-
-  <profiles>
-    <profile>
-      <id>release.prepare</id>
-      <properties>
-        <mahout.skip.example>true</mahout.skip.example>
-      </properties>
-    </profile>
-  </profiles>
-</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/assembly/job.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/assembly/job.xml b/community/mahout-mr/examples/src/main/assembly/job.xml
deleted file mode 100644
index 0c41f3d..0000000
--- a/community/mahout-mr/examples/src/main/assembly/job.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<assembly
-  xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
-  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
-    http://maven.apache.org/xsd/assembly-1.1.0.xsd">
-  <id>job</id>
-  <formats>
-   <format>jar</format>
-  </formats>
-  <includeBaseDirectory>false</includeBaseDirectory>
-  <dependencySets>
-    <dependencySet>
-      <unpack>true</unpack>
-      <unpackOptions>
-        <!-- MAHOUT-1126 -->
-        <excludes>
-          <exclude>META-INF/LICENSE</exclude>
-        </excludes>
-      </unpackOptions>
-      <scope>runtime</scope>
-      <outputDirectory>/</outputDirectory>
-      <useTransitiveFiltering>true</useTransitiveFiltering>
-      <excludes>
-        <exclude>org.apache.hadoop:hadoop-core</exclude>
-      </excludes>
-    </dependencySet>
-  </dependencySets>
-</assembly>
-  
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
deleted file mode 100644
index 6392b9f..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example;
-
-import java.io.File;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-
-/**
- * This class provides a common implementation for parsing input parameters for
- * all taste examples. Currently they only need the path to the recommendations
- * file as input.
- * 
- * The class is safe to be used in threaded contexts.
- */
-public final class TasteOptionParser {
-  
-  private TasteOptionParser() {
-  }
-  
-  /**
-   * Parse the given command line arguments.
-   * @param args the arguments as given to the application.
-   * @return the input file if a file was given on the command line, null otherwise.
-   */
-  public static File getRatings(String[] args) throws OptionException {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-    
-    Option inputOpt = obuilder.withLongName("input").withRequired(false).withShortName("i")
-        .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
-        .withDescription("The Path for input data directory.").create();
-    
-    Option helpOpt = DefaultOptionCreator.helpOption();
-    
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create();
-    
-    Parser parser = new Parser();
-    parser.setGroup(group);
-    CommandLine cmdLine = parser.parse(args);
-    
-    if (cmdLine.hasOption(helpOpt)) {
-      CommandLineUtil.printHelp(group);
-      return null;
-    }
-
-    return cmdLine.hasOption(inputOpt) ? new File(cmdLine.getValue(inputOpt).toString()) : null;
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
deleted file mode 100644
index c908e5b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
-import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.UserSimilarity;
-
-import java.util.Collection;
-import java.util.List;
-
-/**
- * A simple {@link Recommender} implemented for the Book Crossing demo.
- * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
- */
-public final class BookCrossingBooleanRecommender implements Recommender {
-
-  private final Recommender recommender;
-
-  public BookCrossingBooleanRecommender(DataModel bcModel) throws TasteException {
-    UserSimilarity similarity = new CachingUserSimilarity(new LogLikelihoodSimilarity(bcModel), bcModel);
-    UserNeighborhood neighborhood =
-        new NearestNUserNeighborhood(10, Double.NEGATIVE_INFINITY, similarity, bcModel, 1.0);
-    recommender = new GenericBooleanPrefUserBasedRecommender(bcModel, neighborhood, similarity);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
-    return recommender.recommend(userID, howMany);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
-    return recommend(userID, howMany, null, includeKnownItems);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, false);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
-    throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
-  }
-  
-  @Override
-  public float estimatePreference(long userID, long itemID) throws TasteException {
-    return recommender.estimatePreference(userID, itemID);
-  }
-
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    recommender.setPreference(userID, itemID, value);
-  }
-
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    recommender.removePreference(userID, itemID);
-  }
-
-  @Override
-  public DataModel getDataModel() {
-    return recommender.getDataModel();
-  }
-
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    recommender.refresh(alreadyRefreshed);
-  }
-
-  @Override
-  public String toString() {
-    return "BookCrossingBooleanRecommender[recommender:" + recommender + ']';
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
deleted file mode 100644
index 2219bce..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class BookCrossingBooleanRecommenderBuilder implements RecommenderBuilder {
-
-  @Override
-  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
-    return new BookCrossingBooleanRecommender(dataModel);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
deleted file mode 100644
index b9814c7..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.commons.cli2.OptionException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.IRStatistics;
-import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
-import org.apache.mahout.cf.taste.example.TasteOptionParser;
-import org.apache.mahout.cf.taste.impl.eval.GenericRecommenderIRStatsEvaluator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-
-public final class BookCrossingBooleanRecommenderEvaluatorRunner {
-
-  private static final Logger log = LoggerFactory.getLogger(BookCrossingBooleanRecommenderEvaluatorRunner.class);
-
-  private BookCrossingBooleanRecommenderEvaluatorRunner() {
-    // do nothing
-  }
-
-  public static void main(String... args) throws IOException, TasteException, OptionException {
-    RecommenderIRStatsEvaluator evaluator = new GenericRecommenderIRStatsEvaluator();
-    File ratingsFile = TasteOptionParser.getRatings(args);
-    DataModel model =
-        ratingsFile == null ? new BookCrossingDataModel(true) : new BookCrossingDataModel(ratingsFile, true);
-
-    IRStatistics evaluation = evaluator.evaluate(
-        new BookCrossingBooleanRecommenderBuilder(),
-        new BookCrossingDataModelBuilder(),
-        model,
-        null,
-        3,
-        Double.NEGATIVE_INFINITY,
-        1.0);
-
-    log.info(String.valueOf(evaluation));
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
deleted file mode 100644
index 3e2f8b5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.regex.Pattern;
-
-import com.google.common.base.Charsets;
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;
-import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
-import org.apache.mahout.common.iterator.FileLineIterable;
-
-/**
- * See <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip">download</a> for
- * data needed by this class. The BX-Book-Ratings.csv file is needed.
- */
-public final class BookCrossingDataModel extends FileDataModel {
-
-  private static final Pattern NON_DIGIT_SEMICOLON_PATTERN = Pattern.compile("[^0-9;]");
-
-  public BookCrossingDataModel(boolean ignoreRatings) throws IOException {
-    this(GroupLensDataModel.readResourceToTempFile(
-             "/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"),
-         ignoreRatings);
-  }
-  
-  /**
-   * @param ratingsFile BookCrossing ratings file in its native format
-   * @throws IOException if an error occurs while reading or writing files
-   */
-  public BookCrossingDataModel(File ratingsFile, boolean ignoreRatings) throws IOException {
-    super(convertBCFile(ratingsFile, ignoreRatings));
-  }
-  
-  private static File convertBCFile(File originalFile, boolean ignoreRatings) throws IOException {
-    if (!originalFile.exists()) {
-      throw new FileNotFoundException(originalFile.toString());
-    }
-    File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "taste.bookcrossing.txt");
-    resultFile.delete();
-    Writer writer = null;
-    try {
-      writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8);
-      for (String line : new FileLineIterable(originalFile, true)) {
-        // 0 ratings are basically "no rating", ignore them (thanks h.9000)
-        if (line.endsWith("\"0\"")) {
-          continue;
-        }
-        // Delete replace anything that isn't numeric, or a semicolon delimiter. Make comma the delimiter.
-        String convertedLine = NON_DIGIT_SEMICOLON_PATTERN.matcher(line)
-            .replaceAll("").replace(';', ',');
-        // If this means we deleted an entire ID -- few cases like that -- skip the line
-        if (convertedLine.contains(",,")) {
-          continue;
-        }
-        if (ignoreRatings) {
-          // drop rating
-          convertedLine = convertedLine.substring(0, convertedLine.lastIndexOf(','));
-        }
-        writer.write(convertedLine);
-        writer.write('\n');
-      }
-      writer.flush();
-    } catch (IOException ioe) {
-      resultFile.delete();
-      throw ioe;
-    } finally {
-      Closeables.close(writer, false);
-    }
-    return resultFile;
-  }
-  
-  @Override
-  public String toString() {
-    return "BookCrossingDataModel";
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
deleted file mode 100644
index 9ec2eaf..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.eval.DataModelBuilder;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-
-final class BookCrossingDataModelBuilder implements DataModelBuilder {
-
-  @Override
-  public DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData) {
-    return new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(trainingData));
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
deleted file mode 100644
index c06ca2f..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
-import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.UserSimilarity;
-
-/**
- * A simple {@link Recommender} implemented for the Book Crossing demo.
- * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
- */
-public final class BookCrossingRecommender implements Recommender {
-
-  private final Recommender recommender;
-
-  public BookCrossingRecommender(DataModel bcModel) throws TasteException {
-    UserSimilarity similarity = new CachingUserSimilarity(new EuclideanDistanceSimilarity(bcModel), bcModel);
-    UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, 0.2, similarity, bcModel, 0.2);
-    recommender = new GenericUserBasedRecommender(bcModel, neighborhood, similarity);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
-    return recommender.recommend(userID, howMany);
-  }
-
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
-    return recommend(userID, howMany, null, includeKnownItems);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, false);
-  }
-  
-  @Override
-  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
-    throws TasteException {
-    return recommender.recommend(userID, howMany, rescorer, false);
-  }
-  
-  @Override
-  public float estimatePreference(long userID, long itemID) throws TasteException {
-    return recommender.estimatePreference(userID, itemID);
-  }
-  
-  @Override
-  public void setPreference(long userID, long itemID, float value) throws TasteException {
-    recommender.setPreference(userID, itemID, value);
-  }
-  
-  @Override
-  public void removePreference(long userID, long itemID) throws TasteException {
-    recommender.removePreference(userID, itemID);
-  }
-  
-  @Override
-  public DataModel getDataModel() {
-    return recommender.getDataModel();
-  }
-  
-  @Override
-  public void refresh(Collection<Refreshable> alreadyRefreshed) {
-    recommender.refresh(alreadyRefreshed);
-  }
-  
-  @Override
-  public String toString() {
-    return "BookCrossingRecommender[recommender:" + recommender + ']';
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
deleted file mode 100644
index bb6d3e1..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class BookCrossingRecommenderBuilder implements RecommenderBuilder {
-  
-  @Override
-  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
-    return new BookCrossingRecommender(dataModel);
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
deleted file mode 100644
index 97074d2..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.commons.cli2.OptionException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
-import org.apache.mahout.cf.taste.example.TasteOptionParser;
-import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class BookCrossingRecommenderEvaluatorRunner {
-  
-  private static final Logger log = LoggerFactory.getLogger(BookCrossingRecommenderEvaluatorRunner.class);
-  
-  private BookCrossingRecommenderEvaluatorRunner() {
-    // do nothing
-  }
-  
-  public static void main(String... args) throws IOException, TasteException, OptionException {
-    RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();
-    File ratingsFile = TasteOptionParser.getRatings(args);
-    DataModel model =
-        ratingsFile == null ? new BookCrossingDataModel(false) : new BookCrossingDataModel(ratingsFile, false);
-
-    double evaluation = evaluator.evaluate(new BookCrossingRecommenderBuilder(),
-      null,
-      model,
-      0.9,
-      0.3);
-    log.info(String.valueOf(evaluation));
-  }
-  
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
deleted file mode 100644
index 9244fe3..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
+++ /dev/null
@@ -1,9 +0,0 @@
-Code works with BookCrossing data set, which is not included in this distribution but is downloadable from
-http://www.informatik.uni-freiburg.de/~cziegler/BX/
-
-Data set originated from:
-
-Improving Recommendation Lists Through Topic Diversification,
- Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen;
- Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan.
- To appear.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
deleted file mode 100644
index 033daa2..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Writable;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-
-import java.io.IOException;
-import java.util.regex.Pattern;
-
-/**
- * Static helpers shared by the email recommendation example jobs:
- * address normalization, dictionary loading from the DistributedCache,
- * and parsing of message-id reference lists.
- */
-public final class EmailUtility {
-
-  // String constants used as Configuration/job parameter key names by the
-  // email example jobs (e.g. SEPARATOR is read in FromEmailToDictionaryMapper).
-  public static final String SEPARATOR = "separator";
-  public static final String MSG_IDS_PREFIX = "msgIdsPrefix";
-  public static final String FROM_PREFIX = "fromPrefix";
-  public static final String MSG_ID_DIMENSION = "msgIdDim";
-  public static final String FROM_INDEX = "fromIdx";
-  public static final String REFS_INDEX = "refsIdx";
-  // Shared empty result so parseReferences never returns null.
-  private static final String[] EMPTY = new String[0];
-  // Characters/sequences stripped from addresses: mailto:, angle/square brackets,
-  // and the quoted-printable soft-break artifact "=20".
-  private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20");
-  private static final Pattern ANGLE_BRACES = Pattern.compile("<|>");
-  private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+");
-  // Matches all-whitespace (or empty) strings; used to detect blank addresses.
-  public static final Pattern WHITESPACE = Pattern.compile("\\s*");
-
-  // Utility class: not instantiable.
-  private EmailUtility() {
-  }
-
-  /**
-   * Strip off some spurious characters that make it harder to dedup
-   */
-  public static String cleanUpEmailAddress(CharSequence address) {
-    //do some cleanup to normalize some things, like: Key: karthik ananth <ka...@gmail.com>: Value: 178
-    //Key: karthik ananth [mailto:karthik.jcecs@gmail.com]=20: Value: 179
-    //TODO: is there more to clean up here?
-    return ADDRESS_CLEANUP.matcher(address).replaceAll("");
-  }
-
-  /**
-   * Populates the given term-to-id dictionaries from sequence files found in the
-   * DistributedCache. Each cached file is dispatched by filename prefix: files
-   * starting with {@code fromPrefix} fill {@code fromDictionary}, files starting
-   * with {@code msgIdPrefix} fill {@code msgIdDictionary}; other files are ignored.
-   */
-  public static void loadDictionaries(Configuration conf, String fromPrefix,
-                                      OpenObjectIntHashMap<String> fromDictionary,
-                                      String msgIdPrefix,
-                                      OpenObjectIntHashMap<String> msgIdDictionary) throws IOException {
-
-    Path[] localFiles = HadoopUtil.getCachedFiles(conf);
-    FileSystem fs = FileSystem.getLocal(conf);
-    for (Path dictionaryFile : localFiles) {
-
-      // key is word value is id
-
-      OpenObjectIntHashMap<String> dictionary = null;
-      if (dictionaryFile.getName().startsWith(fromPrefix)) {
-        dictionary = fromDictionary;
-      } else if (dictionaryFile.getName().startsWith(msgIdPrefix)) {
-        dictionary = msgIdDictionary;
-      }
-      if (dictionary != null) {
-        dictionaryFile = fs.makeQualified(dictionaryFile);
-        for (Pair<Writable, IntWritable> record
-            : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
-          dictionary.put(record.getFirst().toString(), record.getSecond().get());
-        }
-      }
-    }
-
-  }
-
-  /**
-   * Splits a raw references string (e.g. an email References header) into
-   * individual ids, stripping any angle brackets. Returns an empty array
-   * (never null) when the input is null or empty.
-   */
-  public static String[] parseReferences(CharSequence rawRefs) {
-    String[] splits;
-    if (rawRefs != null && rawRefs.length() > 0) {
-      splits = SPACE_OR_CLOSE_ANGLE.split(rawRefs);
-      for (int i = 0; i < splits.length; i++) {
-        splits[i] = ANGLE_BRACES.matcher(splits[i]).replaceAll("");
-      }
-    } else {
-      splits = EMPTY;
-    }
-    return splits;
-  }
-
-  // Hadoop counters incremented when a record lacks a message id or from address.
-  public enum Counters {
-    NO_MESSAGE_ID, NO_FROM_ADDRESS
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
deleted file mode 100644
index 5cd308d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VarIntWritable;
-
-import java.io.IOException;
-
-/**
- *  Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
- */
-public final class FromEmailToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
-
-  // Delimiter between the from-address and the rest of the value; read from
-  // the job Configuration under EmailUtility.SEPARATOR.
-  private String separator;
-
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    super.setup(context);
-    separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
-  }
-
-  /**
-   * Emits (cleaned from-address, 1) for each record; increments the
-   * NO_FROM_ADDRESS counter when the separator is missing or the cleaned
-   * address is blank.
-   */
-  @Override
-  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
-    //From is in the value
-    String valStr = value.toString();
-    int idx = valStr.indexOf(separator);
-    if (idx == -1) {
-      context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
-    } else {
-      String full = valStr.substring(0, idx);
-      //do some cleanup to normalize some things, like: Key: karthik ananth <ka...@gmail.com>: Value: 178
-      //Key: karthik ananth [mailto:karthik.jcecs@gmail.com]=20: Value: 179
-      //TODO: is there more to clean up here?
-      full = EmailUtility.cleanUpEmailAddress(full);
-
-      // A whitespace-only address after cleanup counts as missing.
-      if (EmailUtility.WHITESPACE.matcher(full).matches()) {
-        context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
-      } else {
-        context.write(new Text(full), new VarIntWritable(1));
-      }
-    }
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
deleted file mode 100644
index 72fcde9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.math.VarIntWritable;
-
-import java.io.IOException;
-
-/**
- * Key: the string id
- * Value: the count
- * Out Key: the string id
- * Out Value: the sum of the counts
- */
-public final class MailToDictionaryReducer extends Reducer<Text, VarIntWritable, Text, VarIntWritable> {
-
-  /** Sums the per-mapper counts for each string id and emits one total per key. */
-  @Override
-  protected void reduce(Text key, Iterable<VarIntWritable> values, Context context)
-    throws IOException, InterruptedException {
-    int sum = 0;
-    for (VarIntWritable value : values) {
-      sum += value.get();
-    }
-    // Copy the key: Hadoop reuses the Text instance passed to reduce().
-    context.write(new Text(key), new VarIntWritable(sum));
-  }
-}


[14/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java
new file mode 100644
index 0000000..32d7b5c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java
@@ -0,0 +1,333 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * Base class for Mapred DecisionForest builders. Takes care of storing the parameters common to the mapred
+ * implementations.<br>
+ * The child classes must implement at least :
+ * <ul>
+ * <li>void configureJob(Job) : to further configure the job before its launch; and</li>
+ * <li>DecisionForest parseOutput(Job, PredictionCallback) : in order to convert the job outputs into a
+ * DecisionForest and its corresponding oob predictions</li>
+ * </ul>
+ * 
+ */
+@Deprecated
+public abstract class Builder {
+  
+  private static final Logger log = LoggerFactory.getLogger(Builder.class);
+  
+  // Parameters shared by every map-reduce forest builder; conf is a private
+  // copy so the set*() calls below cannot mutate the caller's Configuration.
+  private final TreeBuilder treeBuilder;
+  private final Path dataPath;
+  private final Path datasetPath;
+  private final Long seed;
+  private final Configuration conf;
+  private String outputDirName = "output";
+  
+  protected Builder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) {
+    this.treeBuilder = treeBuilder;
+    this.dataPath = dataPath;
+    this.datasetPath = datasetPath;
+    this.seed = seed;
+    // defensive copy (see field comment)
+    this.conf = new Configuration(conf);
+  }
+  
+  protected Path getDataPath() {
+    return dataPath;
+  }
+  
+  /**
+   * Return the value of "mapred.map.tasks".
+   * 
+   * @param conf
+   *          configuration
+   * @return number of map tasks, or -1 if the property is unset
+   */
+  public static int getNumMaps(Configuration conf) {
+    return conf.getInt("mapred.map.tasks", -1);
+  }
+
+  /**
+   * Used only for DEBUG purposes. if false, the mappers doesn't output anything, so the builder has nothing
+   * to process
+   * 
+   * @param conf
+   *          configuration
+   * @return true if the builder has to return output. false otherwise
+   */
+  protected static boolean isOutput(Configuration conf) {
+    return conf.getBoolean("debug.mahout.rf.output", true);
+  }
+  
+  /**
+   * Returns the random seed
+   * 
+   * @param conf
+   *          configuration
+   * @return null if no seed is available
+   */
+  public static Long getRandomSeed(Configuration conf) {
+    String seed = conf.get("mahout.rf.random.seed");
+    if (seed == null) {
+      return null;
+    }
+    
+    return Long.valueOf(seed);
+  }
+  
+  /**
+   * Sets the random seed value
+   * 
+   * @param conf
+   *          configuration
+   * @param seed
+   *          random seed
+   */
+  private static void setRandomSeed(Configuration conf, long seed) {
+    conf.setLong("mahout.rf.random.seed", seed);
+  }
+  
+  /**
+   * Returns the TreeBuilder deserialized from the configuration, or null if none was set.
+   */
+  public static TreeBuilder getTreeBuilder(Configuration conf) {
+    String string = conf.get("mahout.rf.treebuilder");
+    if (string == null) {
+      return null;
+    }
+    
+    return StringUtils.fromString(string);
+  }
+  
+  // Serializes the TreeBuilder into the configuration so mappers can recover it.
+  private static void setTreeBuilder(Configuration conf, TreeBuilder treeBuilder) {
+    conf.set("mahout.rf.treebuilder", StringUtils.toString(treeBuilder));
+  }
+  
+  /**
+   * Get the number of trees for the map-reduce job.
+   * 
+   * @param conf
+   *          configuration
+   * @return number of trees to build, or -1 if the property is unset
+   */
+  public static int getNbTrees(Configuration conf) {
+    return conf.getInt("mahout.rf.nbtrees", -1);
+  }
+  
+  /**
+   * Set the number of trees to grow for the map-reduce job
+   * 
+   * @param conf
+   *          configuration
+   * @param nbTrees
+   *          number of trees to build
+   * @throws IllegalArgumentException
+   *           if (nbTrees <= 0)
+   */
+  public static void setNbTrees(Configuration conf, int nbTrees) {
+    Preconditions.checkArgument(nbTrees > 0, "nbTrees should be greater than 0");
+
+    conf.setInt("mahout.rf.nbtrees", nbTrees);
+  }
+  
+  /**
+   * Sets the Output directory name, will be creating in the working directory
+   * 
+   * @param name
+   *          output dir. name
+   */
+  public void setOutputDirName(String name) {
+    outputDirName = name;
+  }
+  
+  /**
+   * Output Directory name
+   * 
+   * @param conf
+   *          configuration
+   * @return output dir. path (%WORKING_DIRECTORY%/OUTPUT_DIR_NAME%)
+   * @throws IOException
+   *           if we cannot get the default FileSystem
+   */
+  protected Path getOutputPath(Configuration conf) throws IOException {
+    // the output directory is accessed only by this class, so use the default
+    // file system
+    FileSystem fs = FileSystem.get(conf);
+    return new Path(fs.getWorkingDirectory(), outputDirName);
+  }
+  
+  /**
+   * Helper method. Get a path from the DistributedCache
+   * 
+   * @param conf
+   *          configuration
+   * @param index
+   *          index of the path in the DistributedCache files
+   * @return path from the DistributedCache
+   * @throws IOException
+   *           if no path is found
+   */
+  public static Path getDistributedCacheFile(Configuration conf, int index) throws IOException {
+    Path[] files = HadoopUtil.getCachedFiles(conf);
+    
+    if (files.length <= index) {
+      throw new IOException("path not found in the DistributedCache");
+    }
+    
+    return files[index];
+  }
+  
+  /**
+   * Helper method. Load a Dataset stored in the DistributedCache
+   * 
+   * @param conf
+   *          configuration
+   * @return loaded Dataset
+   * @throws IOException
+   *           if we cannot retrieve the Dataset path from the DistributedCache, or the Dataset could not be
+   *           loaded
+   */
+  public static Dataset loadDataset(Configuration conf) throws IOException {
+    // by convention the dataset is the first cached file (see build())
+    Path datasetPath = getDistributedCacheFile(conf, 0);
+    
+    return Dataset.load(conf, datasetPath);
+  }
+  
+  /**
+   * Used by the inheriting classes to configure the job
+   * 
+   *
+   * @param job
+   *          Hadoop's Job
+   * @throws IOException
+   *           if anything goes wrong while configuring the job
+   */
+  protected abstract void configureJob(Job job) throws IOException;
+  
+  /**
+   * Sequential implementation should override this method to simulate the job execution
+   * 
+   * @param job
+   *          Hadoop's job
+   * @return true is the job succeeded
+   */
+  protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
+    return job.waitForCompletion(true);
+  }
+  
+  /**
+   * Parse the output files to extract the trees and pass the predictions to the callback
+   * 
+   * @param job
+   *          Hadoop's job
+   * @return Built DecisionForest
+   * @throws IOException
+   *           if anything goes wrong while parsing the output
+   */
+  protected abstract DecisionForest parseOutput(Job job) throws IOException;
+  
+  /**
+   * Runs the whole build: stores the parameters in the configuration, ships the
+   * dataset through the DistributedCache, launches the job (via the subclass's
+   * configureJob/runJob), then parses the job output into a DecisionForest and
+   * deletes the intermediate output directory.
+   *
+   * @param nbTrees
+   *          total number of trees to build
+   * @return the built forest, or null if the job failed or output is disabled (debug mode)
+   * @throws IOException
+   *           if the output path already exists, or the job/output handling fails
+   */
+  public DecisionForest build(int nbTrees)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    
+    Path outputPath = getOutputPath(conf);
+    FileSystem fs = outputPath.getFileSystem(conf);
+    
+    // check the output
+    if (fs.exists(outputPath)) {
+      throw new IOException("Output path already exists : " + outputPath);
+    }
+    
+    if (seed != null) {
+      setRandomSeed(conf, seed);
+    }
+    setNbTrees(conf, nbTrees);
+    setTreeBuilder(conf, treeBuilder);
+    
+    // put the dataset into the DistributedCache
+    DistributedCache.addCacheFile(datasetPath.toUri(), conf);
+    
+    Job job = new Job(conf, "decision forest builder");
+    
+    log.debug("Configuring the job...");
+    configureJob(job);
+    
+    log.debug("Running the job...");
+    if (!runJob(job)) {
+      log.error("Job failed!");
+      return null;
+    }
+    
+    if (isOutput(conf)) {
+      log.debug("Parsing the output...");
+      DecisionForest forest = parseOutput(job);
+      HadoopUtil.delete(conf, outputPath);
+      return forest;
+    }
+    
+    return null;
+  }
+  
+  /**
+   * sort the splits into order based on size, so that the biggest go first.<br>
+   * This is the same code used by Hadoop's JobClient.
+   * 
+   * @param splits
+   *          input splits
+   */
+  public static void sortSplits(InputSplit[] splits) {
+    Arrays.sort(splits, new Comparator<InputSplit>() {
+      @Override
+      public int compare(InputSplit a, InputSplit b) {
+        try {
+          // descending order: returning 1 when left < right puts larger splits first
+          long left = a.getLength();
+          long right = b.getLength();
+          if (left == right) {
+            return 0;
+          } else if (left < right) {
+            return 1;
+          } else {
+            return -1;
+          }
+        } catch (IOException ie) {
+          throw new IllegalStateException("Problem getting input split size", ie);
+        } catch (InterruptedException ie) {
+          throw new IllegalStateException("Problem getting input split size", ie);
+        }
+      }
+    });
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java
new file mode 100644
index 0000000..1a35cfe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Mapreduce implementation that classifies the Input data using a previousely built decision forest
+ */
+@Deprecated
+public class Classifier {
+
+  private static final Logger log = LoggerFactory.getLogger(Classifier.class);
+
+  // Inputs to the classification job: the serialized forest, the data to
+  // classify, and the dataset descriptor.
+  private final Path forestPath;
+  private final Path inputPath;
+  private final Path datasetPath;
+  private final Configuration conf;
+  private final Path outputPath; // path that will contain the final output of the classifier
+  private final Path mappersOutputPath; // mappers will output here
+  // filled by parseOutput(): rows of {correct label, prediction}
+  private double[][] results;
+  
+  public double[][] getResults() {
+    return results;
+  }
+
+  public Classifier(Path forestPath,
+                    Path inputPath,
+                    Path datasetPath,
+                    Path outputPath,
+                    Configuration conf) {
+    this.forestPath = forestPath;
+    this.inputPath = inputPath;
+    this.datasetPath = datasetPath;
+    this.outputPath = outputPath;
+    this.conf = conf;
+
+    mappersOutputPath = new Path(outputPath, "mappers");
+  }
+
+  // Map-only job: CMapper does all the work, output goes to mappersOutputPath.
+  private void configureJob(Job job) throws IOException {
+
+    job.setJarByClass(Classifier.class);
+
+    FileInputFormat.setInputPaths(job, inputPath);
+    FileOutputFormat.setOutputPath(job, mappersOutputPath);
+
+    job.setOutputKeyClass(DoubleWritable.class);
+    job.setOutputValueClass(Text.class);
+
+    job.setMapperClass(CMapper.class);
+    job.setNumReduceTasks(0); // no reducers
+
+    job.setInputFormatClass(CTextInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+  }
+
+  /**
+   * Ships the dataset and forest through the DistributedCache, runs the
+   * classification job, parses the mapper output into per-input-file prediction
+   * files (and the results array), then deletes the intermediate mapper output.
+   *
+   * @throws IOException if the output path already exists or I/O fails
+   * @throws IllegalStateException if the Hadoop job fails
+   */
+  public void run() throws IOException, ClassNotFoundException, InterruptedException {
+    FileSystem fs = FileSystem.get(conf);
+
+    // check the output
+    if (fs.exists(outputPath)) {
+      throw new IOException("Output path already exists : " + outputPath);
+    }
+
+    log.info("Adding the dataset to the DistributedCache");
+    // put the dataset into the DistributedCache
+    DistributedCache.addCacheFile(datasetPath.toUri(), conf);
+
+    log.info("Adding the decision forest to the DistributedCache");
+    DistributedCache.addCacheFile(forestPath.toUri(), conf);
+
+    Job job = new Job(conf, "decision forest classifier");
+
+    log.info("Configuring the job...");
+    configureJob(job);
+
+    log.info("Running the job...");
+    if (!job.waitForCompletion(true)) {
+      throw new IllegalStateException("Job failed!");
+    }
+
+    parseOutput(job);
+
+    HadoopUtil.delete(conf, mappersOutputPath);
+  }
+
+  /**
+   * Extract the prediction for each mapper and write them in the corresponding output file. 
+   * The name of the output file is based on the name of the corresponding input file.
+   * Will compute the ConfusionMatrix if necessary.
+   */
+  private void parseOutput(JobContext job) throws IOException {
+    Configuration conf = job.getConfiguration();
+    FileSystem fs = mappersOutputPath.getFileSystem(conf);
+
+    Path[] outfiles = DFUtils.listOutputFiles(fs, mappersOutputPath);
+
+    // read all the output
+    List<double[]> resList = new ArrayList<>();
+    for (Path path : outfiles) {
+      FSDataOutputStream ofile = null;
+      try {
+        for (Pair<DoubleWritable,Text> record : new SequenceFileIterable<DoubleWritable,Text>(path, true, conf)) {
+          double key = record.getFirst().get();
+          String value = record.getSecond().toString();
+          if (ofile == null) {
+            // this is the first value, it contains the name of the input file
+            ofile = fs.create(new Path(outputPath, value).suffix(".out"));
+          } else {
+            // The key contains the correct label of the data. The value contains a prediction
+            ofile.writeChars(value); // write the prediction
+            ofile.writeChar('\n');
+
+            resList.add(new double[]{key, Double.valueOf(value)});
+          }
+        }
+      } finally {
+        // false: propagate any IOException raised while closing
+        Closeables.close(ofile, false);
+      }
+    }
+    results = new double[resList.size()][2];
+    resList.toArray(results);
+  }
+
+  /**
+   * TextInputFormat that does not split the input files. This ensures that each input file is processed by one single
+   * mapper.
+   */
+  private static class CTextInputFormat extends TextInputFormat {
+    @Override
+    protected boolean isSplitable(JobContext jobContext, Path path) {
+      return false;
+    }
+  }
+  
+  public static class CMapper extends Mapper<LongWritable, Text, DoubleWritable, Text> {
+
+    /** used to convert input values to data instances */
+    private DataConverter converter;
+    private DecisionForest forest;
+    private final Random rng = RandomUtils.getRandom();
+    // true until the first record: the first emitted pair carries the input file name
+    private boolean first = true;
+    private final Text lvalue = new Text();
+    private Dataset dataset;
+    private final DoubleWritable lkey = new DoubleWritable();
+
+    @Override
+    protected void setup(Context context) throws IOException, InterruptedException {
+      super.setup(context);
+
+      Configuration conf = context.getConfiguration();
+
+      // cached files in the order run() added them: [0] dataset, [1] forest
+      Path[] files = HadoopUtil.getCachedFiles(conf);
+
+      if (files.length < 2) {
+        throw new IOException("not enough paths in the DistributedCache");
+      }
+      dataset = Dataset.load(conf, files[0]);
+      converter = new DataConverter(dataset);
+
+      forest = DecisionForest.load(conf, files[1]);
+      if (forest == null) {
+        throw new InterruptedException("DecisionForest not found!");
+      }
+    }
+
+    @Override
+    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+      if (first) {
+        // emit the input file name first so parseOutput() can name the output file
+        FileSplit split = (FileSplit) context.getInputSplit();
+        Path path = split.getPath(); // current split path
+        lvalue.set(path.getName());
+        lkey.set(key.get());
+        context.write(lkey, lvalue);
+
+        first = false;
+      }
+
+      String line = value.toString();
+      if (!line.isEmpty()) {
+        // emit (correct label, predicted label) for every non-empty line
+        Instance instance = converter.convert(line);
+        double prediction = forest.classify(dataset, rng, instance);
+        lkey.set(dataset.getLabel(instance));
+        lvalue.set(Double.toString(prediction));
+        context.write(lkey, lvalue);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java
new file mode 100644
index 0000000..4d0f3f1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Dataset;
+
+import java.io.IOException;
+
+/**
+ * Base class for Mapred mappers. Loads common parameters from the job
+ */
+@Deprecated
+public class MapredMapper<KEYIN,VALUEIN,KEYOUT,VALUEOUT> extends Mapper<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
+  
+  private boolean noOutput;
+  
+  private TreeBuilder treeBuilder;
+  
+  private Dataset dataset;
+  
+  /**
+   * 
+   * @return whether the mapper does estimate and output predictions
+   */
+  protected boolean isOutput() {
+    return !noOutput;
+  }
+  
+  protected TreeBuilder getTreeBuilder() {
+    return treeBuilder;
+  }
+  
+  protected Dataset getDataset() {
+    return dataset;
+  }
+  
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    
+    Configuration conf = context.getConfiguration();
+    
+    configure(!Builder.isOutput(conf), Builder.getTreeBuilder(conf), Builder
+        .loadDataset(conf));
+  }
+  
+  /**
+   * Useful for testing
+   */
+  protected void configure(boolean noOutput, TreeBuilder treeBuilder, Dataset dataset) {
+    Preconditions.checkArgument(treeBuilder != null, "TreeBuilder not found in the Job parameters");
+    this.noOutput = noOutput;
+    this.treeBuilder = treeBuilder;
+    this.dataset = dataset;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java
new file mode 100644
index 0000000..56cabb2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.node.Node;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Used by various implementation to return the results of a build.<br>
+ * Contains a grown tree and its oob predictions.
+ */
+@Deprecated
+public class MapredOutput implements Writable, Cloneable {
+
+  /** grown tree; may be null (e.g. default constructor before readFields) */
+  private Node tree;
+
+  /** oob predictions; may be null (see {@link #MapredOutput(Node)}) */
+  private int[] predictions;
+
+  public MapredOutput() {
+  }
+
+  public MapredOutput(Node tree, int[] predictions) {
+    this.tree = tree;
+    this.predictions = predictions;
+  }
+
+  public MapredOutput(Node tree) {
+    this(tree, null);
+  }
+
+  public Node getTree() {
+    return tree;
+  }
+
+  int[] getPredictions() {
+    return predictions;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    // both fields are optional: a leading boolean flags whether each is present
+    boolean readTree = in.readBoolean();
+    if (readTree) {
+      tree = Node.read(in);
+    }
+
+    boolean readPredictions = in.readBoolean();
+    if (readPredictions) {
+      predictions = DFUtils.readIntArray(in);
+    }
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeBoolean(tree != null);
+    if (tree != null) {
+      tree.write(out);
+    }
+
+    out.writeBoolean(predictions != null);
+    if (predictions != null) {
+      DFUtils.writeArray(out, predictions);
+    }
+  }
+
+  @Override
+  public MapredOutput clone() {
+    return new MapredOutput(tree, predictions);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof MapredOutput)) {
+      return false;
+    }
+
+    MapredOutput mo = (MapredOutput) obj;
+
+    return ((tree == null && mo.getTree() == null) || (tree != null && tree.equals(mo.getTree())))
+        && Arrays.equals(predictions, mo.getPredictions());
+  }
+
+  @Override
+  public int hashCode() {
+    // predictions may be null; the previous enhanced-for loop threw a
+    // NullPointerException in that case while equals() handled it fine.
+    // Arrays.hashCode is null-safe and keeps hashCode consistent with equals.
+    int hashCode = tree == null ? 1 : tree.hashCode();
+    return 31 * hashCode + Arrays.hashCode(predictions);
+  }
+
+  @Override
+  public String toString() {
+    return "{" + tree + " | " + Arrays.toString(predictions) + '}';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java
new file mode 100644
index 0000000..86d4404
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.inmem;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+
+/**
+ * MapReduce implementation where each mapper loads a full copy of the data in-memory. The forest trees are
+ * split across all the mappers
+ */
+@Deprecated
+public class InMemBuilder extends Builder {
+  
+  public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) {
+    super(treeBuilder, dataPath, datasetPath, seed, conf);
+  }
+  
+  public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath) {
+    this(treeBuilder, dataPath, datasetPath, null, new Configuration());
+  }
+  
+  /**
+   * Configures the map-only job: InMemInputFormat drives how many trees each
+   * mapper grows, and the data itself travels via the DistributedCache.
+   */
+  @Override
+  protected void configureJob(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+    
+    job.setJarByClass(InMemBuilder.class);
+    
+    FileOutputFormat.setOutputPath(job, getOutputPath(conf));
+    
+    // put the data in the DistributedCache
+    DistributedCache.addCacheFile(getDataPath().toUri(), conf);
+    
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(MapredOutput.class);
+    
+    job.setMapperClass(InMemMapper.class);
+    job.setNumReduceTasks(0); // no reducers
+    
+    job.setInputFormatClass(InMemInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    
+  }
+  
+  /**
+   * Reads every sequence-file part produced by the mappers and assembles the
+   * (tree id, MapredOutput) records into a single DecisionForest.
+   */
+  @Override
+  protected DecisionForest parseOutput(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+    
+    Map<Integer,MapredOutput> output = new HashMap<>();
+    
+    Path outputPath = getOutputPath(conf);
+    FileSystem fs = outputPath.getFileSystem(conf);
+    
+    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
+    
+    // import the InMemOutputs
+    for (Path path : outfiles) {
+      for (Pair<IntWritable,MapredOutput> record : new SequenceFileIterable<IntWritable,MapredOutput>(path, conf)) {
+        output.put(record.getFirst().get(), record.getSecond());
+      }
+    }
+    
+    return processOutput(output);
+  }
+  
+  /**
+   * Process the output, extracting the trees
+   */
+  private static DecisionForest processOutput(Map<Integer,MapredOutput> output) {
+    List<Node> trees = new ArrayList<>();
+    
+    for (Map.Entry<Integer,MapredOutput> entry : output.entrySet()) {
+      MapredOutput value = entry.getValue();
+      trees.add(value.getTree());
+    }
+    
+    return new DecisionForest(trees);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java
new file mode 100644
index 0000000..c3b2fa3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java
@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.inmem;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Custom InputFormat that generates InputSplits given the desired number of trees.<br>
+ * each input split contains a subset of the trees.<br>
+ * The number of splits is equal to the number of requested splits
+ */
+@Deprecated
+public class InMemInputFormat extends InputFormat<IntWritable,NullWritable> {
+
+  // was previously bound to InMemInputSplit.class, which mislabelled every
+  // log line emitted from this class
+  private static final Logger log = LoggerFactory.getLogger(InMemInputFormat.class);
+
+  private Random rng;
+
+  private Long seed;
+
+  private boolean isSingleSeed;
+
+  /**
+   * Used for DEBUG purposes only. if true and a seed is available, all the mappers use the same seed, thus
+   * all the mapper should take the same time to build their trees.
+   */
+  private static boolean isSingleSeed(Configuration conf) {
+    return conf.getBoolean("debug.mahout.rf.single.seed", false);
+  }
+
+  @Override
+  public RecordReader<IntWritable,NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
+    throws IOException, InterruptedException {
+    Preconditions.checkArgument(split instanceof InMemInputSplit);
+    return new InMemRecordReader((InMemInputSplit) split);
+  }
+
+  @Override
+  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
+    Configuration conf = context.getConfiguration();
+    // NOTE(review): defaults to -1 when "mapred.map.tasks" is unset, which would
+    // make splitSize negative below -- presumably the builder always sets it;
+    // confirm with callers before hardening
+    int numSplits = conf.getInt("mapred.map.tasks", -1);
+
+    return getSplits(conf, numSplits);
+  }
+
+  /**
+   * Partitions the requested number of trees across {@code numSplits} splits,
+   * assigning each split its own seed derived from the configured random seed.
+   */
+  public List<InputSplit> getSplits(Configuration conf, int numSplits) {
+    int nbTrees = Builder.getNbTrees(conf);
+    int splitSize = nbTrees / numSplits;
+
+    seed = Builder.getRandomSeed(conf);
+    isSingleSeed = isSingleSeed(conf);
+
+    if (rng != null && seed != null) {
+      log.warn("getSplits() was called more than once and the 'seed' is set, "
+                                + "this can lead to non-repeatable behavior");
+    }
+
+    rng = seed == null || isSingleSeed ? null : RandomUtils.getRandom(seed);
+
+    int id = 0;
+
+    List<InputSplit> splits = new ArrayList<>(numSplits);
+
+    for (int index = 0; index < numSplits - 1; index++) {
+      splits.add(new InMemInputSplit(id, splitSize, nextSeed()));
+      id += splitSize;
+    }
+
+    // take care of the remainder
+    splits.add(new InMemInputSplit(id, nbTrees - id, nextSeed()));
+
+    return splits;
+  }
+
+  /**
+   * @return the seed for the next InputSplit
+   */
+  private Long nextSeed() {
+    if (seed == null) {
+      return null;
+    } else if (isSingleSeed) {
+      return seed;
+    } else {
+      return rng.nextLong();
+    }
+  }
+
+  public static class InMemRecordReader extends RecordReader<IntWritable,NullWritable> {
+
+    private final InMemInputSplit split;
+    private int pos;
+    private IntWritable key;
+    private NullWritable value;
+
+    public InMemRecordReader(InMemInputSplit split) {
+      this.split = split;
+    }
+
+    @Override
+    public float getProgress() throws IOException {
+      return pos == 0 ? 0.0f : (float) (pos - 1) / split.nbTrees;
+    }
+
+    @Override
+    public IntWritable getCurrentKey() throws IOException, InterruptedException {
+      return key;
+    }
+
+    @Override
+    public NullWritable getCurrentValue() throws IOException, InterruptedException {
+      return value;
+    }
+
+    @Override
+    public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException {
+      key = new IntWritable();
+      value = NullWritable.get();
+    }
+
+    @Override
+    public boolean nextKeyValue() throws IOException, InterruptedException {
+      // keys are the tree ids of this split: firstId, firstId+1, ...
+      if (pos < split.nbTrees) {
+        key.set(split.firstId + pos);
+        pos++;
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void close() throws IOException {
+    }
+
+  }
+
+  /**
+   * Custom InputSplit that indicates how many trees are built by each mapper
+   */
+  public static class InMemInputSplit extends InputSplit implements Writable {
+
+    private static final String[] NO_LOCATIONS = new String[0];
+
+    /** Id of the first tree of this split */
+    private int firstId;
+
+    private int nbTrees;
+
+    /** random seed for this split; null when no seed was configured */
+    private Long seed;
+
+    public InMemInputSplit() { }
+
+    public InMemInputSplit(int firstId, int nbTrees, Long seed) {
+      this.firstId = firstId;
+      this.nbTrees = nbTrees;
+      this.seed = seed;
+    }
+
+    /**
+     * @return the Id of the first tree of this split
+     */
+    public int getFirstId() {
+      return firstId;
+    }
+
+    /**
+     * @return the number of trees
+     */
+    public int getNbTrees() {
+      return nbTrees;
+    }
+
+    /**
+     * @return the random seed or null if no seed is available
+     */
+    public Long getSeed() {
+      return seed;
+    }
+
+    @Override
+    public long getLength() throws IOException {
+      return nbTrees;
+    }
+
+    @Override
+    public String[] getLocations() throws IOException {
+      return NO_LOCATIONS;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (!(obj instanceof InMemInputSplit)) {
+        return false;
+      }
+
+      InMemInputSplit split = (InMemInputSplit) obj;
+
+      if (firstId != split.firstId || nbTrees != split.nbTrees) {
+        return false;
+      }
+      if (seed == null) {
+        return split.seed == null;
+      } else {
+        return seed.equals(split.seed);
+      }
+
+    }
+
+    @Override
+    public int hashCode() {
+      return firstId + nbTrees + (seed == null ? 0 : seed.intValue());
+    }
+
+    @Override
+    public String toString() {
+      return String.format(Locale.ENGLISH, "[firstId:%d, nbTrees:%d, seed:%d]", firstId, nbTrees, seed);
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException {
+      firstId = in.readInt();
+      nbTrees = in.readInt();
+      // the seed is optional: a boolean flag tells whether one follows
+      boolean isSeed = in.readBoolean();
+      seed = isSeed ? in.readLong() : null;
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+      out.writeInt(firstId);
+      out.writeInt(nbTrees);
+      out.writeBoolean(seed != null);
+      if (seed != null) {
+        out.writeLong(seed);
+      }
+    }
+
+    public static InMemInputSplit read(DataInput in) throws IOException {
+      InMemInputSplit split = new InMemInputSplit();
+      split.readFields(in);
+      return split;
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java
new file mode 100644
index 0000000..2fc67ba
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.inmem;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.mahout.classifier.df.Bagging;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataLoader;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredMapper;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.mapreduce.inmem.InMemInputFormat.InMemInputSplit;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * In-memory mapper that grows the trees using a full copy of the data loaded in-memory. The number of trees
+ * to grow is determined by the current InMemInputSplit.
+ */
+@Deprecated
+public class InMemMapper extends MapredMapper<IntWritable,NullWritable,IntWritable,MapredOutput> {
+  
+  private static final Logger log = LoggerFactory.getLogger(InMemMapper.class);
+  
+  // bagging engine built once in setup() from the full in-memory data copy
+  private Bagging bagging;
+  
+  // lazily initialised from the split's seed on the first map() call
+  private Random rng;
+
+  /**
+   * Load the training data from the second DistributedCache file (index 1).
+   */
+  private static Data loadData(Configuration conf, Dataset dataset) throws IOException {
+    Path dataPath = Builder.getDistributedCacheFile(conf, 1);
+    FileSystem fs = FileSystem.get(dataPath.toUri(), conf);
+    return DataLoader.loadData(dataset, fs, dataPath);
+  }
+  
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    
+    Configuration conf = context.getConfiguration();
+    
+    log.info("Loading the data...");
+    Data data = loadData(conf, getDataset());
+    log.info("Data loaded : {} instances", data.size());
+    
+    bagging = new Bagging(getTreeBuilder(), data);
+  }
+  
+  @Override
+  protected void map(IntWritable key,
+                     NullWritable value,
+                     Context context) throws IOException, InterruptedException {
+    // the value is always NullWritable, only the key (tree id) matters
+    map(key, context);
+  }
+  
+  /**
+   * Grows one tree for the given tree id and, unless output is disabled,
+   * writes it out wrapped in a MapredOutput.
+   */
+  void map(IntWritable key, Context context) throws IOException, InterruptedException {
+    
+    initRandom((InMemInputSplit) context.getInputSplit());
+    
+    log.debug("Building...");
+    Node tree = bagging.build(rng);
+    
+    if (isOutput()) {
+      log.debug("Outputing...");
+      MapredOutput mrOut = new MapredOutput(tree);
+      
+      context.write(key, mrOut);
+    }
+  }
+  
+  /**
+   * Initialises the RNG from the split's seed (random seed when none is set).
+   */
+  void initRandom(InMemInputSplit split) {
+    if (rng == null) { // first execution of this mapper
+      Long seed = split.getSeed();
+      log.debug("Initialising rng with seed : {}", seed);
+      rng = seed == null ? RandomUtils.getRandom() : RandomUtils.getRandom(seed);
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java
new file mode 100644
index 0000000..61e65e8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * <h2>In-memory mapreduce implementation of Random Decision Forests</h2>
+ *
+ * <p>Each mapper is responsible for growing a number of trees with a whole copy of the dataset loaded in memory,
+ * it uses the reference implementation's code to build each tree and estimate the oob error.</p>
+ *
+ * <p>The dataset is distributed to the slave nodes using the {@link org.apache.hadoop.filecache.DistributedCache}.
+ * A custom {@link org.apache.hadoop.mapreduce.InputFormat}
+ * ({@link org.apache.mahout.classifier.df.mapreduce.inmem.InMemInputFormat}) is configured with the
+ * desired number of trees and generates a number of {@link org.apache.hadoop.mapreduce.InputSplit}s
+ * equal to the configured number of maps.</p>
+ *
+ * <p>There is no need for reducers: each map outputs the trees it built and, for each tree, the labels the
+ * tree predicted for each out-of-bag instance. This step has to be done in the mapper because only there we
+ * know which instances are out-of-bag.</p>
+ *
+ * <p>The Forest builder ({@link org.apache.mahout.classifier.df.mapreduce.inmem.InMemBuilder}) is responsible
+ * for configuring and launching the job.
+ * At the end of the job it parses the output files and builds the corresponding
+ * {@link org.apache.mahout.classifier.df.DecisionForest}.</p>
+ */
+package org.apache.mahout.classifier.df.mapreduce.inmem;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
new file mode 100644
index 0000000..9236af3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.partial;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Builds a random forest using partial data. Each mapper uses only the data given by its InputSplit
+ */
+@Deprecated
+public class PartialBuilder extends Builder {
+
+  private static final Logger log = LoggerFactory.getLogger(PartialBuilder.class);
+
+  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) {
+    this(treeBuilder, dataPath, datasetPath, seed, new Configuration());
+  }
+  
+  public PartialBuilder(TreeBuilder treeBuilder,
+                        Path dataPath,
+                        Path datasetPath,
+                        Long seed,
+                        Configuration conf) {
+    super(treeBuilder, dataPath, datasetPath, seed, conf);
+  }
+
+  /**
+   * Configures the map-only job and records the actual number of input splits
+   * in "mapred.map.tasks" so downstream code knows how many mappers ran.
+   */
+  @Override
+  protected void configureJob(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+    
+    job.setJarByClass(PartialBuilder.class);
+    
+    FileInputFormat.setInputPaths(job, getDataPath());
+    FileOutputFormat.setOutputPath(job, getOutputPath(conf));
+    
+    job.setOutputKeyClass(TreeID.class);
+    job.setOutputValueClass(MapredOutput.class);
+    
+    job.setMapperClass(Step1Mapper.class);
+    job.setNumReduceTasks(0); // no reducers
+    
+    job.setInputFormatClass(TextInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+    // For this implementation to work, mapred.map.tasks needs to be set to the actual
+    // number of mappers Hadoop will use:
+    TextInputFormat inputFormat = new TextInputFormat();
+    List<?> splits = inputFormat.getSplits(job);
+    if (splits == null || splits.isEmpty()) {
+      log.warn("Unable to compute number of splits?");
+    } else {
+      int numSplits = splits.size();
+      log.info("Setting mapred.map.tasks = {}", numSplits);
+      conf.setInt("mapred.map.tasks", numSplits);
+    }
+  }
+  
+  /**
+   * Collects the trees emitted by all mappers into a single DecisionForest.
+   */
+  @Override
+  protected DecisionForest parseOutput(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+    
+    int numTrees = Builder.getNbTrees(conf);
+    
+    Path outputPath = getOutputPath(conf);
+    
+    TreeID[] keys = new TreeID[numTrees];
+    Node[] trees = new Node[numTrees];
+        
+    processOutput(job, outputPath, keys, trees);
+    
+    return new DecisionForest(Arrays.asList(trees));
+  }
+  
+  /**
+   * Processes the output from the output path.<br>
+   * 
+   * @param outputPath
+   *          directory that contains the output of the job
+   * @param keys
+   *          can be null
+   * @param trees
+   *          can be null
+   * @throws java.io.IOException
+   */
+  protected static void processOutput(JobContext job,
+                                      Path outputPath,
+                                      TreeID[] keys,
+                                      Node[] trees) throws IOException {
+    Preconditions.checkArgument(keys == null && trees == null || keys != null && trees != null,
+        "if keys is null, trees should also be null");
+    Preconditions.checkArgument(keys == null || keys.length == trees.length, "keys.length != trees.length");
+
+    Configuration conf = job.getConfiguration();
+
+    FileSystem fs = outputPath.getFileSystem(conf);
+
+    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
+
+    // read all the outputs
+    int index = 0;
+    for (Path path : outfiles) {
+      for (Pair<TreeID,MapredOutput> record : new SequenceFileIterable<TreeID, MapredOutput>(path, conf)) {
+        TreeID key = record.getFirst();
+        MapredOutput value = record.getSecond();
+        if (keys != null) {
+          keys[index] = key;
+        }
+        if (trees != null) {
+          trees[index] = value.getTree();
+        }
+        index++;
+      }
+    }
+
+    // make sure we got all the keys/values
+    if (keys != null && index != keys.length) {
+      throw new IllegalStateException("Some key/values are missing from the output");
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java
new file mode 100644
index 0000000..9474236
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.partial;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.classifier.df.Bagging;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredMapper;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * First step of the Partial Data Builder. Builds the trees using the data available in the InputSplit.
+ * Predict the oob classes for each tree in its growing partition (input split).
+ */
+@Deprecated
+public class Step1Mapper extends MapredMapper<LongWritable,Text,TreeID,MapredOutput> {
+  
+  private static final Logger log = LoggerFactory.getLogger(Step1Mapper.class);
+  
+  /** used to convert input values to data instances */
+  private DataConverter converter;
+  
+  /** random-numbers generator; seeded from the job configuration when a seed is available */
+  private Random rng;
+  
+  /** number of trees to be built by this mapper */
+  private int nbTrees;
+  
+  /** id of the first tree built by this mapper; tree ids are contiguous across partitions */
+  private int firstTreeId;
+  
+  /** mapper's partition */
+  private int partition;
+  
+  /** will contain all instances of this mapper's split */
+  private final List<Instance> instances = new ArrayList<>();
+  
+  public int getFirstTreeId() {
+    return firstTreeId;
+  }
+  
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Configuration conf = context.getConfiguration();
+    
+    // "mapred.task.partition" identifies this mapper's input split (-1 if absent,
+    // which configure() rejects)
+    configure(Builder.getRandomSeed(conf), conf.getInt("mapred.task.partition", -1),
+      Builder.getNumMaps(conf), Builder.getNbTrees(conf));
+  }
+  
+  /**
+   * Useful when testing
+   * 
+   * @param seed
+   *          random seed from the job configuration, or null to use an unseeded generator
+   * @param partition
+   *          current mapper inputSplit partition
+   * @param numMapTasks
+   *          number of running map tasks
+   * @param numTrees
+   *          total number of trees in the forest
+   */
+  protected void configure(Long seed, int partition, int numMapTasks, int numTrees) {
+    converter = new DataConverter(getDataset());
+    
+    // prepare random-numbers generator
+    log.debug("seed : {}", seed);
+    if (seed == null) {
+      rng = RandomUtils.getRandom();
+    } else {
+      rng = RandomUtils.getRandom(seed);
+    }
+    
+    // mapper's partition
+    Preconditions.checkArgument(partition >= 0, "Wrong partition ID: " + partition + ". Partition must be >= 0!");
+    this.partition = partition;
+    
+    // compute number of trees to build
+    nbTrees = nbTrees(numMapTasks, numTrees, partition);
+    
+    // compute first tree id: the trees of all preceding partitions come first
+    firstTreeId = 0;
+    for (int p = 0; p < partition; p++) {
+      firstTreeId += nbTrees(numMapTasks, numTrees, p);
+    }
+    
+    log.debug("partition : {}", partition);
+    log.debug("nbTrees : {}", nbTrees);
+    log.debug("firstTreeId : {}", firstTreeId);
+  }
+  
+  /**
+   * Compute the number of trees for a given partition. The first partitions may be longer
+   * than the rest because of the remainder.
+   * 
+   * @param numMaps
+   *          total number of maps (partitions)
+   * @param numTrees
+   *          total number of trees to build
+   * @param partition
+   *          partition to compute the number of trees for
+   * @return number of trees assigned to the given partition
+   */
+  public static int nbTrees(int numMaps, int numTrees, int partition) {
+    int treesPerMapper = numTrees / numMaps;
+    int remainder = numTrees - numMaps * treesPerMapper;
+    // the first 'remainder' partitions each get one extra tree
+    return treesPerMapper + (partition < remainder ? 1 : 0);
+  }
+  
+  @Override
+  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+    // only collect the instances here; the trees are grown in cleanup() once
+    // the whole split has been read
+    instances.add(converter.convert(value.toString()));
+  }
+  
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    // prepare the data
+    log.debug("partition: {} numInstances: {}", partition, instances.size());
+    
+    Data data = new Data(getDataset(), instances);
+    Bagging bagging = new Bagging(getTreeBuilder(), data);
+    
+    TreeID key = new TreeID();
+    
+    log.debug("Building {} trees", nbTrees);
+    for (int treeId = 0; treeId < nbTrees; treeId++) {
+      log.debug("Building tree number : {}", treeId);
+      
+      Node tree = bagging.build(rng);
+      
+      // key encodes (partition, global tree id)
+      key.set(partition, firstTreeId + treeId);
+      
+      if (isOutput()) {
+        MapredOutput emOut = new MapredOutput(tree);
+        context.write(key, emOut);
+      }
+
+      // report liveness to the framework between (possibly long) tree builds
+      context.progress();
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java
new file mode 100644
index 0000000..c296061
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.partial;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.io.LongWritable;
+
+/**
+ * Indicates both the tree and the data partition used to grow the tree.
+ *
+ * <p>Both values are packed into the single {@code long} inherited from
+ * {@link LongWritable}: {@code partition * MAX_TREEID + treeId}. The tree id
+ * must therefore stay below {@link #MAX_TREEID}; otherwise the encoded
+ * partition would be silently corrupted, so {@link #set(int, int)} validates
+ * both components.</p>
+ */
+@Deprecated
+public class TreeID extends LongWritable implements Cloneable {
+  
+  /** exclusive upper bound for tree ids; also the radix of the (partition, treeId) encoding */
+  public static final int MAX_TREEID = 100000;
+  
+  public TreeID() { }
+  
+  public TreeID(int partition, int treeId) {
+    // validation is performed by set()
+    set(partition, treeId);
+  }
+  
+  /**
+   * Encodes the given partition and tree id into this writable.
+   *
+   * @param partition data partition (InputSplit's index); must be {@code >= 0}
+   * @param treeId tree id; must be in {@code [0, MAX_TREEID)}
+   * @throws IllegalArgumentException if either argument is out of range
+   */
+  public void set(int partition, int treeId) {
+    Preconditions.checkArgument(partition >= 0, "Wrong partition: " + partition + ". Partition must be >= 0!");
+    Preconditions.checkArgument(treeId >= 0, "Wrong treeId: " + treeId + ". TreeId must be >= 0!");
+    Preconditions.checkArgument(treeId < MAX_TREEID,
+        "Wrong treeId: " + treeId + ". TreeId must be < MAX_TREEID (" + MAX_TREEID + ")!");
+    set((long) partition * MAX_TREEID + treeId);
+  }
+  
+  /**
+   * Data partition (InputSplit's index) that was used to grow the tree
+   */
+  public int partition() {
+    return (int) (get() / MAX_TREEID);
+  }
+  
+  /** @return the tree id component of the encoded value */
+  public int treeId() {
+    return (int) (get() % MAX_TREEID);
+  }
+  
+  @Override
+  public TreeID clone() {
+    return new TreeID(partition(), treeId());
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java
new file mode 100644
index 0000000..e621c91
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java
@@ -0,0 +1,16 @@
+/**
+ * <h2>Partial-data mapreduce implementation of Random Decision Forests</h2>
+ *
+ * <p>The builder splits the data, using a FileInputSplit, among the mappers.
+ * Building the forest and estimating the oob error takes two job steps.</p>
+ *
+ * <p>In the first step, each mapper is responsible for growing a number of trees with its partition's data,
+ * loading the data instances in its {@code map()} function, then building the trees in the {@code close()} method. It
+ * uses the reference implementation's code to build each tree and estimate the oob error.</p>
+ *
+ * <p>The second step is needed when estimating the oob error. Each mapper loads all the trees that do not
+ * belong to its own partition (trees that were not built using the partition's data) and uses them to classify the
+ * partition's data instances. The data instances are loaded in the {@code map()} method and the classification
+ * is performed in the {@code close()} method.</p>
+ */
+package org.apache.mahout.classifier.df.mapreduce.partial;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java
new file mode 100644
index 0000000..1f91842
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+/**
+ * Decision-tree node that splits on a categorical attribute: every known
+ * value of the attribute owns its own child subtree.
+ */
+@Deprecated
+public class CategoricalNode extends Node {
+
+  /** index of the categorical attribute this node splits on */
+  private int attr;
+  /** attribute values; parallel to {@link #children} */
+  private double[] values;
+  /** one subtree per entry of {@link #values} */
+  private Node[] children;
+  
+  public CategoricalNode() {
+  }
+  
+  public CategoricalNode(int attr, double[] values, Node[] children) {
+    this.attr = attr;
+    this.values = values;
+    this.children = children;
+  }
+  
+  @Override
+  public double classify(Instance instance) {
+    int branch = ArrayUtils.indexOf(values, instance.get(attr));
+    // an attribute value never seen during training has no subtree,
+    // so the label cannot be predicted
+    return branch < 0 ? Double.NaN : children[branch].classify(instance);
+  }
+  
+  @Override
+  public long maxDepth() {
+    long deepest = 0;
+    for (Node child : children) {
+      deepest = Math.max(deepest, child.maxDepth());
+    }
+    return deepest + 1;
+  }
+  
+  @Override
+  public long nbNodes() {
+    long total = 1; // this node
+    for (Node child : children) {
+      total += child.nbNodes();
+    }
+    return total;
+  }
+  
+  @Override
+  protected Type getType() {
+    return Type.CATEGORICAL;
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj instanceof CategoricalNode) {
+      CategoricalNode other = (CategoricalNode) obj;
+      return attr == other.attr
+          && Arrays.equals(values, other.values)
+          && Arrays.equals(children, other.children);
+    }
+    return false;
+  }
+  
+  @Override
+  public int hashCode() {
+    // 31 * h + term accumulation over the split values, then the subtrees
+    int hash = attr;
+    for (double value : values) {
+      hash = 31 * hash + (int) Double.doubleToLongBits(value);
+    }
+    for (Node child : children) {
+      hash = 31 * hash + child.hashCode();
+    }
+    return hash;
+  }
+  
+  @Override
+  protected String getString() {
+    StringBuilder sb = new StringBuilder();
+    for (Node child : children) {
+      sb.append(child).append(',');
+    }
+    return sb.toString();
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    attr = in.readInt();
+    values = DFUtils.readDoubleArray(in);
+    children = DFUtils.readNodeArray(in);
+  }
+  
+  @Override
+  protected void writeNode(DataOutput out) throws IOException {
+    out.writeInt(attr);
+    DFUtils.writeArray(out, values);
+    DFUtils.writeArray(out, children);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java
new file mode 100644
index 0000000..3360bb5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Terminal node of a decision tree: always predicts a fixed label.
+ */
+@Deprecated
+public class Leaf extends Node {
+  /** tolerance used when comparing labels in {@link #equals(Object)} */
+  private static final double EPSILON = 1.0e-6;
+
+  private double label;
+  
+  Leaf() { }
+  
+  public Leaf(double label) {
+    this.label = label;
+  }
+  
+  @Override
+  public double classify(Instance instance) {
+    // a leaf ignores the instance and always returns its label
+    return label;
+  }
+  
+  @Override
+  public long maxDepth() {
+    return 1;
+  }
+  
+  @Override
+  public long nbNodes() {
+    return 1;
+  }
+  
+  @Override
+  protected Type getType() {
+    return Type.LEAF;
+  }
+  
+  /**
+   * Two leaves are equal when their labels differ by less than {@code EPSILON}.
+   * NOTE(review): this tolerance-based equality is not strictly consistent with
+   * {@link #hashCode()}, which hashes the exact bit pattern of the label —
+   * confirm callers never store nearly-equal leaves in hash-based collections.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj instanceof Leaf) {
+      return Math.abs(((Leaf) obj).label - label) < EPSILON;
+    }
+    return false;
+  }
+  
+  @Override
+  public int hashCode() {
+    long labelBits = Double.doubleToLongBits(label);
+    return (int) (labelBits ^ (labelBits >>> 32));
+  }
+  
+  @Override
+  protected String getString() {
+    return "";
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    label = in.readDouble();
+  }
+  
+  @Override
+  protected void writeNode(DataOutput out) throws IOException {
+    out.writeDouble(label);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java
new file mode 100644
index 0000000..73d516d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Represents an abstract node of a decision tree
+ */
+@Deprecated
+public abstract class Node implements Writable {
+  
+  /**
+   * Node kinds. The ordinal is written to the wire by {@link #write(DataOutput)},
+   * so the declaration order of these constants must not change.
+   */
+  protected enum Type {
+    LEAF,
+    NUMERICAL,
+    CATEGORICAL
+  }
+  
+  /**
+   * predicts the label for the instance
+   * 
+   * @return the predicted label, or {@code Double.NaN} if the label cannot be
+   *         predicted (e.g. a categorical value unseen during training)
+   */
+  public abstract double classify(Instance instance);
+  
+  /**
+   * @return the total number of nodes of the tree
+   */
+  public abstract long nbNodes();
+  
+  /**
+   * @return the maximum depth of the tree
+   */
+  public abstract long maxDepth();
+  
+  /** @return the concrete kind of this node, used as the serialization type tag */
+  protected abstract Type getType();
+  
+  /**
+   * Deserializes a node (and, recursively, its subtree) previously written by
+   * {@link #write(DataOutput)}: a type ordinal followed by the node's own fields.
+   */
+  public static Node read(DataInput in) throws IOException {
+    Type type = Type.values()[in.readInt()];
+    Node node;
+    
+    switch (type) {
+      case LEAF:
+        node = new Leaf();
+        break;
+      case NUMERICAL:
+        node = new NumericalNode();
+        break;
+      case CATEGORICAL:
+        node = new CategoricalNode();
+        break;
+      default:
+        throw new IllegalStateException("This implementation is not currently supported");
+    }
+    
+    node.readFields(in);
+    
+    return node;
+  }
+  
+  @Override
+  public final String toString() {
+    return getType() + ":" + getString() + ';';
+  }
+  
+  /** @return the subclass-specific part of {@link #toString()} */
+  protected abstract String getString();
+  
+  @Override
+  public final void write(DataOutput out) throws IOException {
+    out.writeInt(getType().ordinal());
+    writeNode(out);
+  }
+  
+  /** Writes the subclass-specific fields; the type tag is already written by {@link #write(DataOutput)}. */
+  protected abstract void writeNode(DataOutput out) throws IOException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java
new file mode 100644
index 0000000..aa02089
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Represents a node that splits using a numerical attribute
+ */
+@Deprecated
+public class NumericalNode extends Node {
+  /** numerical attribute to split for */
+  private int attr;
+  
+  /** split value */
+  private double split;
+  
+  /** child node when attribute's value < split value */
+  private Node loChild;
+  
+  /** child node when attribute's value >= split value */
+  private Node hiChild;
+  
+  public NumericalNode() { } 
+  
+  public NumericalNode(int attr, double split, Node loChild, Node hiChild) {
+    this.attr = attr;
+    this.split = split;
+    this.loChild = loChild;
+    this.hiChild = hiChild;
+  }
+  
+  @Override
+  public double classify(Instance instance) {
+    if (instance.get(attr) < split) {
+      return loChild.classify(instance);
+    } else {
+      return hiChild.classify(instance);
+    }
+  }
+  
+  @Override
+  public long maxDepth() {
+    return 1 + Math.max(loChild.maxDepth(), hiChild.maxDepth());
+  }
+  
+  @Override
+  public long nbNodes() {
+    return 1 + loChild.nbNodes() + hiChild.nbNodes();
+  }
+  
+  @Override
+  protected Type getType() {
+    return Type.NUMERICAL;
+  }
+  
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof NumericalNode)) {
+      return false;
+    }
+    
+    NumericalNode node = (NumericalNode) obj;
+    
+    return attr == node.attr && split == node.split && loChild.equals(node.loChild) && hiChild.equals(node.hiChild);
+  }
+  
+  @Override
+  public int hashCode() {
+    return attr + (int) Double.doubleToLongBits(split) + loChild.hashCode() + hiChild.hashCode();
+  }
+  
+  @Override
+  protected String getString() {
+    return loChild.toString() + ',' + hiChild.toString();
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    attr = in.readInt();
+    split = in.readDouble();
+    loChild = Node.read(in);
+    hiChild = Node.read(in);
+  }
+  
+  @Override
+  protected void writeNode(DataOutput out) throws IOException {
+    out.writeInt(attr);
+    out.writeDouble(split);
+    loChild.write(out);
+    hiChild.write(out);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java
new file mode 100644
index 0000000..7ef907e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.ref;
+
+import org.apache.mahout.classifier.df.Bagging;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.node.Node;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Builds a Random Decision Forest sequentially (no MapReduce), growing each
+ * tree with the given TreeBuilder over bagged samples of the training data.
+ */
+@Deprecated
+public class SequentialBuilder {
+  
+  private static final Logger log = LoggerFactory.getLogger(SequentialBuilder.class);
+  
+  /** random-numbers generator shared by all tree builds */
+  private final Random rng;
+  
+  /** produces one tree per call from a bootstrap sample of the data */
+  private final Bagging bagging;
+  
+  /**
+   * Constructor
+   * 
+   * @param rng
+   *          random-numbers generator
+   * @param treeBuilder
+   *          tree builder
+   * @param data
+   *          training data
+   */
+  public SequentialBuilder(Random rng, TreeBuilder treeBuilder, Data data) {
+    this.rng = rng;
+    this.bagging = new Bagging(treeBuilder, data);
+  }
+  
+  /**
+   * Grows the forest.
+   *
+   * @param nbTrees number of trees to build
+   * @return the resulting forest
+   */
+  public DecisionForest build(int nbTrees) {
+    List<Node> forest = new ArrayList<>();
+    
+    int built = 0;
+    while (built < nbTrees) {
+      forest.add(bagging.build(rng));
+      built++;
+      logProgress((float) built / nbTrees);
+    }
+    
+    return new DecisionForest(forest);
+  }
+  
+  /** Logs progress whenever the completed percentage is a multiple of ten. */
+  private static void logProgress(float progress) {
+    int percent = (int) (progress * 100);
+    if (percent % 10 == 0) {
+      log.info("Building {}%", percent);
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java
new file mode 100644
index 0000000..3f1cfdf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+
+import java.util.Arrays;
+
+/**
+ * Default, not optimized, implementation of IgSplit.
+ *
+ * <p>Not thread-safe: {@code entropy()} reuses a lazily allocated label-count
+ * buffer between calls.</p>
+ */
+@Deprecated
+public class DefaultIgSplit extends IgSplit {
+  
+  /** used by entropy(): reusable label-count buffer, lazily sized to the dataset's label count */
+  private int[] counts;
+  
+  /**
+   * Computes the best split for the given attribute. For a numerical attribute every distinct
+   * value is tried as a candidate split point and the one with the highest information gain
+   * wins; for a categorical attribute the gain of splitting on all its values is returned.
+   */
+  @Override
+  public Split computeSplit(Data data, int attr) {
+    if (data.getDataset().isNumerical(attr)) {
+      double[] values = data.values(attr);
+      double bestIg = -1;
+      double bestSplit = 0.0;
+      
+      for (double value : values) {
+        double ig = numericalIg(data, attr, value);
+        if (ig > bestIg) {
+          bestIg = ig;
+          bestSplit = value;
+        }
+      }
+      
+      return new Split(attr, bestIg, bestSplit);
+    } else {
+      double ig = categoricalIg(data, attr);
+      
+      return new Split(attr, ig);
+    }
+  }
+  
+  /**
+   * Computes the Information Gain for a CATEGORICAL attribute
+   *
+   * @param data data to evaluate
+   * @param attr categorical attribute index
+   * @return H(Y) - H(Y|X), the entropy reduction achieved by splitting on attr
+   */
+  double categoricalIg(Data data, int attr) {
+    double[] values = data.values(attr);
+    double hy = entropy(data); // H(Y)
+    double hyx = 0.0; // H(Y|X)
+    double invDataSize = 1.0 / data.size();
+    
+    // H(Y|X) is the subset-size-weighted average of the per-value entropies
+    for (double value : values) {
+      Data subset = data.subset(Condition.equals(attr, value));
+      hyx += subset.size() * invDataSize * entropy(subset);
+    }
+    
+    return hy - hyx;
+  }
+  
+  /**
+   * Computes the Information Gain for a NUMERICAL attribute given a splitting value
+   *
+   * @param data data to evaluate
+   * @param attr numerical attribute index
+   * @param split candidate split value; instances go LO when value < split, HI otherwise
+   * @return entropy reduction achieved by splitting at the given value
+   */
+  double numericalIg(Data data, int attr, double split) {
+    double hy = entropy(data);
+    double invDataSize = 1.0 / data.size();
+    
+    // LO subset
+    Data subset = data.subset(Condition.lesser(attr, split));
+    hy -= subset.size() * invDataSize * entropy(subset);
+    
+    // HI subset
+    subset = data.subset(Condition.greaterOrEquals(attr, split));
+    hy -= subset.size() * invDataSize * entropy(subset);
+    
+    return hy;
+  }
+  
+  /**
+   * Computes the Entropy (in bits — hence the division by LOG2).
+   *
+   * NOTE(review): an empty Data would make invDataSize infinite; callers here
+   * appear to always pass non-empty (sub)sets — confirm before reusing elsewhere.
+   */
+  protected double entropy(Data data) {
+    double invDataSize = 1.0 / data.size();
+    
+    if (counts == null) {
+      counts = new int[data.getDataset().nblabels()];
+    }
+    
+    Arrays.fill(counts, 0);
+    data.countLabels(counts);
+    
+    double entropy = 0.0;
+    for (int label = 0; label < data.getDataset().nblabels(); label++) {
+      int count = counts[label];
+      if (count == 0) {
+        continue; // otherwise we get a NaN
+      }
+      double p = count * invDataSize;
+      entropy += -p * Math.log(p) / LOG2;
+    }
+    
+    return entropy;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java
new file mode 100644
index 0000000..aff94e1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.mahout.classifier.df.data.Data;
+
+/**
+ * Computes the best split using the Information Gain measure.
+ */
+@Deprecated
+public abstract class IgSplit {
+  
+  /** Natural log of 2, used to convert {@code Math.log()} results to base-2 entropy. */
+  static final double LOG2 = Math.log(2.0);
+  
+  /**
+   * Computes the best split for the given attribute.
+   *
+   * @param data data over which candidate splits are evaluated
+   * @param attr index of the attribute to split on
+   * @return the best {@link Split} found for {@code attr}
+   */
+  public abstract Split computeSplit(Data data, int attr);
+  
+}


[05/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java
new file mode 100644
index 0000000..56cb237
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * <p>Given a DistributedRowMatrix, this job normalizes each row to unit
+ * vector length. If the input is a matrix U, and the output is a matrix
+ * W, the job follows:</p>
+ *
+ * <p>{@code v_ij = u_ij / sqrt(sum_j(u_ij * u_ij))}</p>
+ */
+public final class UnitVectorizerJob {
+
+  // Utility class: not instantiable.
+  private UnitVectorizerJob() {
+  }
+
+  /**
+   * Runs a map-only Hadoop job that L2-normalizes every row of the input matrix.
+   *
+   * @param input  path to SequenceFile rows keyed by {@code IntWritable} with
+   *               {@code VectorWritable} values
+   * @param output path where the normalized rows are written in the same format
+   * @throws IllegalStateException if the Hadoop job does not complete successfully
+   */
+  public static void runJob(Path input, Path output)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    
+    Configuration conf = new Configuration();
+    Job job = new Job(conf, "UnitVectorizerJob");
+    
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(VectorWritable.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    job.setMapperClass(UnitVectorizerMapper.class);
+    job.setNumReduceTasks(0); // map-only: each row is normalized independently
+    
+    FileInputFormat.addInputPath(job, input);
+    FileOutputFormat.setOutputPath(job, output);
+
+    job.setJarByClass(UnitVectorizerJob.class);
+
+    boolean succeeded = job.waitForCompletion(true);
+    if (!succeeded) {
+      throw new IllegalStateException("Job failed!");
+    }
+  }
+  
+  /** Emits each input row vector normalized to unit length under the L2 norm. */
+  public static class UnitVectorizerMapper
+    extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+    
+    @Override
+    protected void map(IntWritable row, VectorWritable vector, Context context) 
+      throws IOException, InterruptedException {
+      // normalize(2) divides by the Euclidean norm: v_ij = u_ij / sqrt(sum_j(u_ij^2))
+      context.write(row, new VectorWritable(vector.get().normalize(2)));
+    }
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java
new file mode 100644
index 0000000..4ec8149
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * This class handles reading and writing vectors to the Hadoop
+ * distributed cache. Created as a result of Eigencuts' liberal use
+ * of such functionality, but available to any algorithm requiring it.
+ */
+public final class VectorCache {
+
+  private static final Logger log = LoggerFactory.getLogger(VectorCache.class);
+
+  // Utility class: not instantiable.
+  private VectorCache() {
+  }
+
+  /**
+   * Writes a single (key, vector) pair to a SequenceFile at {@code output} and
+   * registers that file with the {@link DistributedCache} in {@code conf}.
+   *
+   * @param key           SequenceFile key
+   * @param vector        Vector to save, to be wrapped as VectorWritable
+   * @param output        destination path for the single-entry SequenceFile
+   * @param conf          configuration whose DistributedCache file list is set to {@code output}
+   * @param overwritePath if true, any existing data at {@code output} is deleted first
+   * @param deleteOnExit  if true, {@code output} is scheduled for deletion when the
+   *                      FileSystem is closed
+   */
+  public static void save(Writable key,
+                          Vector vector,
+                          Path output,
+                          Configuration conf,
+                          boolean overwritePath,
+                          boolean deleteOnExit) throws IOException {
+
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    output = fs.makeQualified(output);
+    if (overwritePath) {
+      HadoopUtil.delete(conf, output);
+    }
+
+    // set the cache
+    DistributedCache.setCacheFiles(new URI[]{output.toUri()}, conf);
+
+    // set up the writer
+    // NOTE(review): the writer is declared with IntWritable as the key class, but the
+    // key parameter is any Writable — append() would fail for a non-IntWritable key; confirm.
+    try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output,
+        IntWritable.class, VectorWritable.class)){
+      writer.append(key, new VectorWritable(vector));
+    }
+
+    if (deleteOnExit) {
+      fs.deleteOnExit(output);
+    }
+  }
+
+  /**
+   * Calls the save() method, setting the cache to overwrite any previous
+   * Path and to delete the path after exiting
+   */
+  public static void save(Writable key, Vector vector, Path output, Configuration conf) throws IOException {
+    save(key, vector, output, conf, true, true);
+  }
+
+  /**
+   * Loads the vector from the single file registered in the {@link DistributedCache}.
+   *
+   * @throws IOException if the cache does not contain exactly one file
+   */
+  public static Vector load(Configuration conf) throws IOException {
+    Path[] files = HadoopUtil.getCachedFiles(conf);
+
+    if (files.length != 1) {
+      throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ')');
+    }
+
+    if (log.isInfoEnabled()) {
+      log.info("Files are: {}", Arrays.toString(files));
+    }
+    return load(conf, files[0]);
+  }
+
+  /**
+   * Loads the first Vector stored in the SequenceFile at the specified path.
+   * NOTE(review): an empty file makes {@code iterator.next()} throw rather than
+   * return null — confirm no caller relies on a null return here.
+   */
+  public static Vector load(Configuration conf, Path input) throws IOException {
+    log.info("Loading vector from: {}", input);
+    try (SequenceFileValueIterator<VectorWritable> iterator =
+             new SequenceFileValueIterator<>(input, true, conf)){
+      return iterator.next().get();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java
new file mode 100644
index 0000000..c42ab70
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+
+/**
+ * <p>This class handles the three-way multiplication of the diagonal matrix
+ * and the Markov transition matrix inherent in the Eigencuts algorithm.
+ * The equation takes the form:</p>
+ *
+ * {@code W = D^(1/2) * M * D^(1/2)}
+ *
+ * <p>Since the diagonal matrix D has only n non-zero elements, it is represented
+ * as a dense vector in this job, rather than a full n-by-n matrix. This job
+ * performs the multiplications and returns the new DRM.
+ */
+public final class VectorMatrixMultiplicationJob {
+
+  // Utility class: not instantiable.
+  private VectorMatrixMultiplicationJob() {
+  }
+
+  /**
+   * Invokes the job.
+   * @param markovPath Path to the markov DRM's sequence files
+   */
+  public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    
+    return runJob(markovPath, diag, outputPath, new Path(outputPath, "tmp"));
+  }
+
+  /**
+   * Runs a map-only job computing {@code W = D^(1/2) * M * D^(1/2)}.
+   *
+   * @param markovPath path to the Markov transition matrix M (sequence files)
+   * @param diag       diagonal of D, shipped to mappers via the distributed cache
+   * @param outputPath where the resulting matrix W is written
+   * @param tmpPath    temporary path handed to the returned DRM
+   * @return the product as a {@link DistributedRowMatrix}
+   * @throws IllegalStateException if the Hadoop job does not complete successfully
+   */
+  public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
+    throws IOException, ClassNotFoundException, InterruptedException {
+
+    // set up the serialization of the diagonal vector
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
+    markovPath = fs.makeQualified(markovPath);
+    outputPath = fs.makeQualified(outputPath);
+    // the cached vector lives in a sibling directory of the output, not inside it
+    Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
+    VectorCache.save(new IntWritable(Keys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);
+
+    // set up the job itself
+    Job job = new Job(conf, "VectorMatrixMultiplication");
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(VectorWritable.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    job.setMapperClass(VectorMatrixMultiplicationMapper.class);
+    job.setNumReduceTasks(0);
+
+    FileInputFormat.addInputPath(job, markovPath);
+    FileOutputFormat.setOutputPath(job, outputPath);
+
+    job.setJarByClass(VectorMatrixMultiplicationJob.class);
+
+    boolean succeeded = job.waitForCompletion(true);
+    if (!succeeded) {
+      throw new IllegalStateException("Job failed!");
+    }
+
+    // build the resulting DRM from the results
+    return new DistributedRowMatrix(outputPath, tmpPath,
+        diag.size(), diag.size());
+  }
+  
+  /**
+   * Scales each matrix entry m_ij by sqrt(d_ii) * sqrt(d_jj), where d is the
+   * diagonal vector loaded from the distributed cache during setup.
+   */
+  public static class VectorMatrixMultiplicationMapper
+    extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+    
+    private Vector diagonal;
+    
+    @Override
+    protected void setup(Context context) throws IOException, InterruptedException {
+      // read in the diagonal vector from the distributed cache
+      super.setup(context);
+      Configuration config = context.getConfiguration();
+      diagonal = VectorCache.load(config);
+      if (diagonal == null) {
+        throw new IOException("No vector loaded from cache!");
+      }
+      if (!(diagonal instanceof DenseVector)) {
+        // convert to a dense representation for the repeated random access in map()
+        diagonal = new DenseVector(diagonal);
+      }
+    }
+    
+    @Override
+    protected void map(IntWritable key, VectorWritable row, Context ctx) 
+      throws IOException, InterruptedException {
+      
+      // mutate the row in place: m_ij -> sqrt(d_ii) * m_ij * sqrt(d_jj), then emit it
+      for (Vector.Element e : row.get().all()) {
+        double dii = Functions.SQRT.apply(diagonal.get(key.get()));
+        double djj = Functions.SQRT.apply(diagonal.get(e.index()));
+        double mij = e.get();
+        e.set(dii * mij * djj);
+      }
+      ctx.write(key, row);
+    }
+    
+    /**
+     * Performs the setup of the Mapper. Used by unit tests.
+     * @param diag the diagonal vector to use in place of the distributed-cache copy
+     */
+    void setup(Vector diag) {
+      this.diagonal = diag;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java
new file mode 100644
index 0000000..0d70cac
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Represents a vertex within the affinity graph for Eigencuts: a (row, column, value)
+ * entry plus an optional type tag, serializable via Hadoop's {@link Writable} contract.
+ */
+public class VertexWritable implements Writable {
+  
+  /** the row */
+  private int i;
+  
+  /** the column */
+  private int j;
+  
+  /** the value at this vertex */
+  private double value;
+  
+  /** an extra type delimiter, can probably be null.
+   *  NOTE(review): write() calls writeUTF(type), which throws NPE for a null
+   *  string — confirm whether null is actually ever serialized. */
+  private String type;
+  
+  /** No-arg constructor required for Writable deserialization. */
+  public VertexWritable() {
+  }
+
+  public VertexWritable(int i, int j, double v, String t) {
+    this.i = i;
+    this.j = j;
+    this.value = v;
+    this.type = t;
+  }
+  
+  public int getRow() {
+    return i;
+  }
+  
+  public void setRow(int i) {
+    this.i = i;
+  }
+  
+  public int getCol() {
+    return j;
+  }
+  
+  public void setCol(int j) { 
+    this.j = j;
+  }
+  
+  public double getValue() {
+    return value;
+  }
+  
+  public void setValue(double v) {
+    this.value = v;
+  }
+  
+  public String getType() {
+    return type;
+  }
+  
+  public void setType(String t) {
+    this.type = t;
+  }
+  
+  /** Deserializes the fields in the exact order written by {@link #write(DataOutput)}. */
+  @Override
+  public void readFields(DataInput arg0) throws IOException {
+    this.i = arg0.readInt();
+    this.j = arg0.readInt();
+    this.value = arg0.readDouble();
+    this.type = arg0.readUTF();
+  }
+
+  /** Serializes row, column, value, then the type tag. */
+  @Override
+  public void write(DataOutput arg0) throws IOException {
+    arg0.writeInt(i);
+    arg0.writeInt(j);
+    arg0.writeDouble(value);
+    arg0.writeUTF(type);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java
new file mode 100644
index 0000000..3ce94dc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral.kmeans;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, select k vectors and write them to the
+ * output file as a {@link org.apache.mahout.clustering.kmeans.Kluster} representing the initial centroid to use. The
+ * selection criterion is the row with the maximum absolute value in each respective column.
+ */
+public final class EigenSeedGenerator {
+
+  private static final Logger log = LoggerFactory.getLogger(EigenSeedGenerator.class);
+
+  /** Configuration key for the number of clusters. */
+  public static final String K = "k";
+
+  // Utility class: not instantiable.
+  private EigenSeedGenerator() {}
+
+  /**
+   * Scans the eigenvector rows under {@code input} and, for each column index, keeps the row
+   * whose entry in that column has the largest absolute value. Each chosen row is written as
+   * a seed {@link Kluster} for that column's cluster.
+   *
+   * @param conf    Hadoop configuration used for filesystem access
+   * @param input   file or directory of SequenceFile&lt;Writable, VectorWritable&gt; rows
+   * @param output  directory where the "part-eigenSeed" SequenceFile of seeds is written
+   * @param k       expected number of clusters; used to pre-size the tracking maps
+   * @param measure distance measure stored on each seed Kluster
+   * @return path to the written seed file
+   */
+  public static Path buildFromEigens(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
+      throws IOException {
+    // delete the output directory
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    HadoopUtil.delete(conf, output);
+    Path outFile = new Path(output, "part-eigenSeed");
+    boolean newFile = fs.createNewFile(outFile);
+    if (newFile) {
+      Path inputPathPattern;
+
+      // a directory input is expanded to all files directly inside it
+      if (fs.getFileStatus(input).isDir()) {
+        inputPathPattern = new Path(input, "*");
+      } else {
+        inputPathPattern = input;
+      }
+
+      FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
+      // maximum |value| seen so far in each column
+      Map<Integer,Double> maxEigens = new HashMap<>(k);
+      // key text and seed cluster of the row currently holding that maximum
+      Map<Integer,Text> chosenTexts = new HashMap<>(k);
+      Map<Integer,ClusterWritable> chosenClusters = new HashMap<>(k);
+
+      for (FileStatus fileStatus : inputFiles) {
+        if (!fileStatus.isDir()) {
+          for (Pair<Writable,VectorWritable> record : new SequenceFileIterable<Writable,VectorWritable>(
+              fileStatus.getPath(), true, conf)) {
+            Writable key = record.getFirst();
+            VectorWritable value = record.getSecond();
+
+            for (Vector.Element e : value.get().nonZeroes()) {
+              int index = e.index();
+              double v = Math.abs(e.get());
+
+              // keep the row with the largest |value| in this column so far
+              if (!maxEigens.containsKey(index) || v > maxEigens.get(index)) {
+                maxEigens.put(index, v);
+                Text newText = new Text(key.toString());
+                chosenTexts.put(index, newText);
+                Kluster newCluster = new Kluster(value.get(), index, measure);
+                newCluster.observe(value.get(), 1);
+                ClusterWritable clusterWritable = new ClusterWritable();
+                clusterWritable.setValue(newCluster);
+                chosenClusters.put(index, clusterWritable);
+              }
+            }
+          }
+        }
+      }
+
+      // write one (key text, seed cluster) pair per winning column
+      try (SequenceFile.Writer writer =
+               SequenceFile.createWriter(fs, conf, outFile, Text.class, ClusterWritable.class)){
+        for (Integer key : maxEigens.keySet()) {
+          writer.append(chosenTexts.get(key), chosenClusters.get(key));
+        }
+        log.info("EigenSeedGenerator:: Wrote {} Klusters to {}", chosenTexts.size(), outFile);
+      }
+    }
+
+    return outFile;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
new file mode 100644
index 0000000..427de91
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
@@ -0,0 +1,243 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral.kmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.spectral.AffinityMatrixInputJob;
+import org.apache.mahout.clustering.spectral.MatrixDiagonalizeJob;
+import org.apache.mahout.clustering.spectral.UnitVectorizerJob;
+import org.apache.mahout.clustering.spectral.VectorMatrixMultiplicationJob;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.apache.mahout.math.hadoop.stochasticsvd.SSVDSolver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Performs spectral k-means clustering on the top k eigenvectors of the input affinity matrix.
+ */
+public class SpectralKMeansDriver extends AbstractJob {
+  private static final Logger log = LoggerFactory.getLogger(SpectralKMeansDriver.class);
+
+  public static final int REDUCERS = 10;
+  public static final int BLOCKHEIGHT = 30000;
+  public static final int OVERSAMPLING = 15;
+  public static final int POWERITERS = 0;
+
+  /**
+   * CLI entry point; delegates option parsing and execution to {@link #run(String[])}
+   * via Hadoop's {@link ToolRunner}.
+   */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new SpectralKMeansDriver(), args);
+  }
+
+  /**
+   * Parses command-line options and dispatches to the static
+   * {@code run(...)} with the parsed parameters.
+   */
+  @Override
+  public int run(String[] arg0) throws Exception {
+
+    Configuration conf = getConf();
+    // declare the options recognized by this driver
+    addInputOption();
+    addOutputOption();
+    addOption("dimensions", "d", "Square dimensions of affinity matrix", true);
+    addOption("clusters", "k", "Number of clusters and top eigenvectors", true);
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.convergenceOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addFlag("usessvd", "ssvd", "Uses SSVD as the eigensolver. Default is the Lanczos solver.");
+    addOption("reduceTasks", "t", "Number of reducers for SSVD", String.valueOf(REDUCERS));
+    addOption("outerProdBlockHeight", "oh", "Block height of outer products for SSVD", String.valueOf(BLOCKHEIGHT));
+    addOption("oversampling", "p", "Oversampling parameter for SSVD", String.valueOf(OVERSAMPLING));
+    addOption("powerIter", "q", "Additional power iterations for SSVD", String.valueOf(POWERITERS));
+
+    Map<String, List<String>> parsedArgs = parseArguments(arg0);
+    if (parsedArgs == null) {
+      // parsing failed or help was requested — presumably already reported by
+      // parseArguments; confirm that exiting with 0 here is intentional
+      return 0;
+    }
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      // wipe any previous temp and output data before re-running
+      HadoopUtil.delete(conf, getTempPath());
+      HadoopUtil.delete(conf, getOutputPath());
+    }
+    int numDims = Integer.parseInt(getOption("dimensions"));
+    int clusters = Integer.parseInt(getOption("clusters"));
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+
+    // SSVD tuning parameters (all defaulted above)
+    Path tempdir = new Path(getOption("tempDir"));
+    int reducers = Integer.parseInt(getOption("reduceTasks"));
+    int blockheight = Integer.parseInt(getOption("outerProdBlockHeight"));
+    int oversampling = Integer.parseInt(getOption("oversampling"));
+    int poweriters = Integer.parseInt(getOption("powerIter"));
+    run(conf, input, output, numDims, clusters, measure, convergenceDelta, maxIterations, tempdir, reducers,
+        blockheight, oversampling, poweriters);
+
+    return 0;
+  }
+
+  /**
+   * Convenience overload of the full {@code run(...)} that uses the default SSVD tuning
+   * parameters ({@link #REDUCERS}, {@link #BLOCKHEIGHT}, {@link #OVERSAMPLING},
+   * {@link #POWERITERS}).
+   */
+  public static void run(Configuration conf, Path input, Path output, int numDims, int clusters,
+                         DistanceMeasure measure, double convergenceDelta, int maxIterations, Path tempDir)
+      throws IOException, InterruptedException, ClassNotFoundException {
+    run(conf, input, output, numDims, clusters, measure, convergenceDelta, maxIterations, tempDir, REDUCERS,
+        BLOCKHEIGHT, OVERSAMPLING, POWERITERS);
+  }
+
+  /**
+   * Run the Spectral KMeans clustering on the supplied arguments
+   *
+   * @param conf
+   *          the Configuration to be used
+   * @param input
+   *          the Path to the input tuples directory
+   * @param output
+   *          the Path to the output directory
+   * @param numDims
+   *          the int number of dimensions of the affinity matrix
+   * @param clusters
+   *          the int number of eigenvectors and thus clusters to produce
+   * @param measure
+   *          the DistanceMeasure for the k-Means calculations
+   * @param convergenceDelta
+   *          the double convergence delta for the k-Means calculations
+   * @param maxIterations
+   *          the int maximum number of iterations for the k-Means calculations
+   * @param tempDir
+   *          Temporary directory for intermediate calculations
+   * @param numReducers
+   *          Number of reducers
+   * @param blockHeight
+   * @param oversampling
+   * @param poweriters
+   */
+  public static void run(Configuration conf, Path input, Path output, int numDims, int clusters,
+                         DistanceMeasure measure, double convergenceDelta, int maxIterations, Path tempDir,
+                         int numReducers, int blockHeight, int oversampling, int poweriters)
+      throws IOException, InterruptedException, ClassNotFoundException {
+
+    // Start from a clean slate: all intermediate results live under tempDir.
+    HadoopUtil.delete(conf, tempDir);
+    Path outputCalc = new Path(tempDir, "calculations");
+    Path outputTmp = new Path(tempDir, "temporary");
+
+    // Take in the raw CSV text file and split it ourselves,
+    // creating our own SequenceFiles for the matrices to read later
+    // (similar to the style of syntheticcontrol.canopy.InputMapper)
+    Path affSeqFiles = new Path(outputCalc, "seqfile");
+    AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);
+
+    // Construct the affinity matrix using the newly-created sequence files
+    DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles, new Path(outputTmp, "afftmp"), numDims, numDims);
+
+    Configuration depConf = new Configuration(conf);
+    A.setConf(depConf);
+
+    // Construct the diagonal matrix D (represented as a vector)
+    Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
+
+    // Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
+    // NOTE(review): new Path(outputCalc, outputCalc) nests the calculations directory under
+    // itself; this looks like it was meant to be a distinct temp path — confirm against
+    // VectorMatrixMultiplicationJob's expectations before changing.
+    DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(affSeqFiles, D, new Path(outputCalc, "laplacian"),
+        new Path(outputCalc, outputCalc));
+    L.setConf(depConf);
+
+    Path data;
+
+    // SSVD requires an array of Paths to function. So we pass in an array of length one
+    Path[] LPath = new Path[1];
+    LPath[0] = L.getRowPath();
+
+    Path SSVDout = new Path(outputCalc, "SSVD");
+
+    SSVDSolver solveIt = new SSVDSolver(depConf, LPath, SSVDout, blockHeight, clusters, oversampling, numReducers);
+
+    // Only the left singular vectors (U) are needed for spectral clustering.
+    solveIt.setComputeV(false);
+    solveIt.setComputeU(true);
+    solveIt.setOverwrite(true);
+    solveIt.setQ(poweriters);
+    // solveIt.setBroadcast(false);
+    solveIt.run();
+    data = new Path(solveIt.getUPath());
+
+    // Normalize the rows of Wt to unit length
+    // normalize is important because it reduces the occurrence of two unique clusters combining into one
+    Path unitVectors = new Path(outputCalc, "unitvectors");
+
+    UnitVectorizerJob.runJob(data, unitVectors);
+
+    DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
+    Wt.setConf(depConf);
+    data = Wt.getRowPath();
+
+    // Generate initial clusters using EigenSeedGenerator which picks rows as centroids if that row contains max
+    // eigen value in that column
+    Path initialclusters = EigenSeedGenerator.buildFromEigens(conf, data,
+        new Path(output, Cluster.INITIAL_CLUSTERS_DIR), clusters, measure);
+
+    // Run the KMeansDriver
+    Path answer = new Path(output, "kmeans_out");
+    KMeansDriver.run(conf, data, initialclusters, answer, convergenceDelta, maxIterations, true, 0.0, false);
+
+    // Restore name to id mapping and read through the cluster assignments
+    Path mappingPath = new Path(new Path(conf.get("hadoop.tmp.dir")), "generic_input_mapping");
+    List<String> mapping = new ArrayList<>();
+    FileSystem fs = FileSystem.get(mappingPath.toUri(), conf);
+    if (fs.exists(mappingPath)) {
+      // Close the reader deterministically — the original leaked it on every call.
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, mappingPath, conf);
+      try {
+        Text mappingValue = new Text();
+        IntWritable mappingIndex = new IntWritable();
+        while (reader.next(mappingIndex, mappingValue)) {
+          mapping.add(mappingValue.toString());
+        }
+      } finally {
+        reader.close();
+      }
+      HadoopUtil.delete(conf, mappingPath);
+    } else {
+      log.warn("generic input mapping file not found!");
+    }
+
+    // Log each point's cluster assignment, using the restored name mapping when available.
+    Path clusteredPointsPath = new Path(answer, "clusteredPoints");
+    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
+    int id = 0;
+    for (Pair<IntWritable, WeightedVectorWritable> record :
+        new SequenceFileIterable<IntWritable, WeightedVectorWritable>(inputPath, conf)) {
+      if (!mapping.isEmpty()) {
+        log.info("{}: {}", mapping.get(id++), record.getFirst().get());
+      } else {
+        log.info("{}: {}", id++, record.getFirst().get());
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java
new file mode 100644
index 0000000..25806fe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java
@@ -0,0 +1,456 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.cluster;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.WeightedVector;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.apache.mahout.math.random.Multinomial;
+import org.apache.mahout.math.random.WeightedThing;
+
+/**
+ * Implements a ball k-means algorithm for weighted vectors with probabilistic seeding similar to k-means++.
+ * The idea is that k-means++ gives good starting clusters and ball k-means can tune up the final result very nicely
+ * in only a few passes (or even in a single iteration for well-clusterable data).
+ *
+ * A good reference for this class of algorithms is "The Effectiveness of Lloyd-Type Methods for the k-Means Problem"
+ * by Rafail Ostrovsky, Yuval Rabani, Leonard J. Schulman and Chaitanya Swamy.  The code here uses the seeding strategy
+ * as described in section 4.1.1 of that paper and the ball k-means step as described in section 4.2.  We support
+ * multiple iterations in contrast to the algorithm described in the paper.
+ */
+public class BallKMeans implements Iterable<Centroid> {
+  /**
+   * The searcher containing the centroids.
+   */
+  private final UpdatableSearcher centroids;
+
+  /**
+   * The number of clusters to cluster the data into.
+   */
+  private final int numClusters;
+
+  /**
+   * The maximum number of iterations of the algorithm to run waiting for the cluster assignments
+   * to stabilize. If there are no changes in cluster assignment earlier, we can finish early.
+   */
+  private final int maxNumIterations;
+
+  /**
+   * When deciding which points to include in the new centroid calculation,
+   * it's preferable to exclude outliers since it increases the rate of convergence.
+   * So, we calculate the distance from each cluster to its closest neighboring cluster. When
+   * evaluating the points assigned to a cluster, we compare the distance between the centroid to
+   * the point with the distance between the centroid and its closest centroid neighbor
+   * multiplied by this trimFraction. If the distance between the centroid and the point is
+   * greater, we consider it an outlier and we don't use it.
+   */
+  private final double trimFraction;
+
+  /**
+   * Selecting the initial centroids is the most important part of the ball k-means clustering. Poor choices, like two
+   * centroids in the same actual cluster result in a low-quality final result.
+   * k-means++ initialization yields good quality clusters, especially when using BallKMeans after StreamingKMeans as
+   * the points have weights.
+   * Simple, random selection of the points based on their weights is faster but sometimes fails to produce the
+   * desired number of clusters.
+   * This field is true if the initialization should be done with k-means++.
+   */
+  private final boolean kMeansPlusPlusInit;
+
+  /**
+   * When using trimFraction, the weight of each centroid will not be the sum of the weights of
+   * the vectors assigned to that cluster because outliers are not used to compute the updated
+   * centroid.
+   * So, the total weight is probably wrong. This can be fixed by doing another pass over the
+   * data points and adjusting the weights of each centroid. This doesn't update the coordinates
+   * of the centroids, but is useful if the weights matter.
+   */
+  private final boolean correctWeights;
+
+  /**
+   * When running multiple ball k-means passes to get the one with the smallest total cost, can compute the
+   * overall cost, using all the points for clustering, or reserve a fraction of them, testProbability in a test set.
+   * The cost is the sum of the distances between each point and its corresponding centroid.
+   * We then use this set of points to compute the total cost on. We're therefore trying to select the clustering
+   * that best describes the underlying distribution of the clusters.
+   * This field is the probability of assigning a given point to the test set. If this is 0, the cost will be computed
+   * on the entire set of points.
+   */
+  private final double testProbability;
+
+  /**
+   * Whether or not testProbability > 0, i.e., there exists a non-empty 'test' set.
+   */
+  private final boolean splitTrainTest;
+
+  /**
+   * How many k-means runs to have. If there's more than one run, we compute the cost of each clustering as described
+   * above and select the clustering that minimizes the cost.
+   * Multiple runs are a lot more useful when using the random initialization. With kmeans++, 1-2 runs are enough and
+   * more runs are not likely to help quality much.
+   */
+  private final int numRuns;
+
+  /**
+   * Random object to sample values from.
+   */
+  private final Random random;
+
+  /**
+   * Convenience constructor: k-means++ seeding, trimFraction 0.9, weight correction on,
+   * no test set and a single run.
+   */
+  public BallKMeans(UpdatableSearcher searcher, int numClusters, int maxNumIterations) {
+    // By default, the trimFraction is 0.9, k-means++ is used, the weights will be corrected at the end,
+    // there will be 0 points in the test set and 1 run.
+    this(searcher, numClusters, maxNumIterations, 0.9, true, true, 0.0, 1);
+  }
+
+  /**
+   * Convenience constructor for multiple runs: trimFraction 0.9, weight correction on,
+   * and 10% of the points reserved as a test set for cost comparison across runs.
+   */
+  public BallKMeans(UpdatableSearcher searcher, int numClusters, int maxNumIterations,
+                    boolean kMeansPlusPlusInit, int numRuns) {
+    // By default, the trimFraction is 0.9, k-means++ is used, the weights will be corrected at the end,
+    // there will be 10% of the points in the test set.
+    this(searcher, numClusters, maxNumIterations, 0.9, kMeansPlusPlusInit, true, 0.1, numRuns);
+  }
+
+  /**
+   * Full constructor; validates all arguments and takes ownership of the (empty) searcher,
+   * which will be populated with the final centroids.
+   *
+   * @throws IllegalArgumentException if the searcher is non-empty, any count is non-positive,
+   *         trimFraction is non-positive, or testProbability is outside [0, 1).
+   */
+  public BallKMeans(UpdatableSearcher searcher, int numClusters, int maxNumIterations,
+                    double trimFraction, boolean kMeansPlusPlusInit, boolean correctWeights,
+                    double testProbability, int numRuns) {
+    Preconditions.checkArgument(searcher.size() == 0, "Searcher must be empty initially to populate with centroids");
+    Preconditions.checkArgument(numClusters > 0, "The requested number of clusters must be positive");
+    Preconditions.checkArgument(maxNumIterations > 0, "The maximum number of iterations must be positive");
+    Preconditions.checkArgument(trimFraction > 0, "The trim fraction must be positive");
+    Preconditions.checkArgument(testProbability >= 0 && testProbability < 1, "The testProbability must be in [0, 1)");
+    Preconditions.checkArgument(numRuns > 0, "There has to be at least one run");
+
+    this.centroids = searcher;
+    this.numClusters = numClusters;
+    this.maxNumIterations = maxNumIterations;
+
+    this.trimFraction = trimFraction;
+    this.kMeansPlusPlusInit = kMeansPlusPlusInit;
+    this.correctWeights = correctWeights;
+
+    this.testProbability = testProbability;
+    this.splitTrainTest = testProbability > 0;
+    this.numRuns = numRuns;
+
+    this.random = RandomUtils.getRandom();
+  }
+
+  /**
+   * Splits the datapoints into a (train, test) pair according to testProbability.
+   * Note: shuffles the caller's list IN PLACE before splitting, and the returned lists are
+   * sublist views backed by that list.
+   *
+   * @return a Pair of (training points, test points); the test list is empty when testProbability == 0.
+   */
+  public Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> splitTrainTest(
+      List<? extends WeightedVector> datapoints) {
+    // If there will be no points assigned to the test set, return now.
+    if (testProbability == 0) {
+      return new Pair<List<? extends WeightedVector>, List<? extends WeightedVector>>(datapoints,
+          new ArrayList<WeightedVector>());
+    }
+
+    int numTest = (int) (testProbability * datapoints.size());
+    Preconditions.checkArgument(numTest > 0 && numTest < datapoints.size(),
+        "Must have nonzero number of training and test vectors. Asked for %.1f %% of %d vectors for test",
+        testProbability * 100, datapoints.size());
+
+    Collections.shuffle(datapoints);
+    return new Pair<List<? extends WeightedVector>, List<? extends WeightedVector>>(
+        datapoints.subList(numTest, datapoints.size()), datapoints.subList(0, numTest));
+  }
+
+  /**
+   * Clusters the datapoints in the list doing either random seeding of the centroids or k-means++.
+   *
+   * @param datapoints the points to be clustered.
+   * @return an UpdatableSearcher with the resulting clusters.
+   */
+  public UpdatableSearcher cluster(List<? extends WeightedVector> datapoints) {
+    Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> trainTestSplit = splitTrainTest(datapoints);
+    List<Vector> bestCentroids = new ArrayList<>();
+    double cost = Double.POSITIVE_INFINITY;
+    double bestCost = Double.POSITIVE_INFINITY;
+    for (int i = 0; i < numRuns; ++i) {
+      centroids.clear();
+      if (kMeansPlusPlusInit) {
+        // Use k-means++ to set initial centroids.
+        initializeSeedsKMeansPlusPlus(trainTestSplit.getFirst());
+      } else {
+        // Randomly select the initial centroids.
+        initializeSeedsRandomly(trainTestSplit.getFirst());
+      }
+      // Do k-means iterations with trimmed mean computation (aka ball k-means).
+      if (numRuns > 1) {
+        // If the clustering is successful (there are no zero-weight centroids).
+        iterativeAssignment(trainTestSplit.getFirst());
+        // Compute the cost of the clustering and possibly save the centroids.
+        // NOTE(review): this ternary looks inverted relative to the testProbability Javadoc —
+        // when a test set exists (splitTrainTest == true) the cost is computed over ALL
+        // datapoints, and when it doesn't, over the empty test list (always 0). Confirm whether
+        // "splitTrainTest ? trainTestSplit.getSecond() : datapoints" was intended.
+        cost = ClusteringUtils.totalClusterCost(
+            splitTrainTest ? datapoints : trainTestSplit.getSecond(), centroids);
+        if (cost < bestCost) {
+          bestCost = cost;
+          bestCentroids.clear();
+          Iterables.addAll(bestCentroids, centroids);
+        }
+      } else {
+        // If there is only going to be one run, the cost doesn't need to be computed, so we just return the clustering.
+        iterativeAssignment(datapoints);
+        return centroids;
+      }
+    }
+    if (bestCost == Double.POSITIVE_INFINITY) {
+      throw new RuntimeException("No valid clustering was found");
+    }
+    // Restore the best centroids if the last run wasn't the winner.
+    if (cost != bestCost) {
+      centroids.clear();
+      centroids.addAll(bestCentroids);
+    }
+    // Fold the held-out test points' weights back into their closest centroids so the final
+    // weights account for every point, not just the training set.
+    if (correctWeights) {
+      for (WeightedVector testDatapoint : trainTestSplit.getSecond()) {
+        WeightedVector closest = (WeightedVector) centroids.searchFirst(testDatapoint, false).getValue();
+        closest.setWeight(closest.getWeight() + testDatapoint.getWeight());
+      }
+    }
+    return centroids;
+  }
+
+  /**
+   * Selects some of the original points randomly with probability proportional to their weights. This is much
+   * less sophisticated than the kmeans++ approach, however it is faster and, when coupled with multiple runs,
+   * can still produce reasonable seeds.
+   *
+   * The side effect of this method is to fill the centroids structure itself.
+   *
+   * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
+   */
+  private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) {
+    int numDatapoints = datapoints.size();
+    double totalWeight = 0;
+    for (WeightedVector datapoint : datapoints) {
+      totalWeight += datapoint.getWeight();
+    }
+    // Sample indices without replacement, weighted by each point's share of the total weight.
+    Multinomial<Integer> seedSelector = new Multinomial<>();
+    for (int i = 0; i < numDatapoints; ++i) {
+      seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight);
+    }
+    for (int i = 0; i < numClusters; ++i) {
+      int sample = seedSelector.sample();
+      seedSelector.delete(sample);
+      Centroid centroid = new Centroid(datapoints.get(sample));
+      centroid.setIndex(i);
+      centroids.add(centroid);
+    }
+  }
+
+  /**
+   * Selects some of the original points according to the k-means++ algorithm.  The basic idea is that
+   * points are selected with probability proportional to their distance from any selected point.  In
+   * this version, points have weights which multiply their likelihood of being selected.  This is the
+   * same as if there were as many copies of the same point as indicated by the weight.
+   *
+   * This is pretty expensive, but it vastly improves the quality and convergences of the k-means algorithm.
+   * The basic idea can be made much faster by only processing a random subset of the original points.
+   * In the context of streaming k-means, the total number of possible seeds will be about k log n so this
+   * selection will cost O(k^2 (log n)^2) which isn't much worse than the random sampling idea.  At
+   * n = 10^9, the cost of this initialization will be about 10x worse than a reasonable random sampling
+   * implementation.
+   *
+   * The side effect of this method is to fill the centroids structure itself.
+   *
+   * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
+   */
+  private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) {
+    Preconditions.checkArgument(datapoints.size() > 1, "Must have at least two datapoints points to cluster " +
+        "sensibly");
+    Preconditions.checkArgument(datapoints.size() >= numClusters,
+        String.format("Must have more datapoints [%d] than clusters [%d]", datapoints.size(), numClusters));
+    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
+    Centroid center = new Centroid(datapoints.iterator().next());
+    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
+      center.update(row);
+    }
+
+    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints
+    // this accelerates seed selection.
+    // NOTE(review): this accumulates distanceMeasure.distance(), which is a squared distance only
+    // for measures like SquaredEuclidean — confirm the intended measure matches the math above.
+    double deltaX = 0;
+    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
+    for (WeightedVector row : datapoints) {
+      deltaX += distanceMeasure.distance(row, center);
+    }
+
+    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
+    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
+    // by first selecting c_1 with probability:
+    //
+    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
+    //
+    // This can be simplified to:
+    //
+    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
+    //
+    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
+    //
+    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
+    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.
+
+    // Multinomial distribution of vector indices for the selection seeds. These correspond to
+    // the indices of the vectors in the original datapoints list.
+    Multinomial<Integer> seedSelector = new Multinomial<>();
+    for (int i = 0; i < datapoints.size(); ++i) {
+      double selectionProbability =
+          deltaX + datapoints.size() * distanceMeasure.distance(datapoints.get(i), center);
+      seedSelector.add(i, selectionProbability);
+    }
+
+    // NOTE(review): the weighted probabilities built just above are never sampled — c_1 is chosen
+    // uniformly via random.nextInt and every weight is overwritten in the loop below. Confirm
+    // whether seedSelector.sample() was intended here instead.
+    int selected = random.nextInt(datapoints.size());
+    Centroid c_1 = new Centroid(datapoints.get(selected).clone());
+    c_1.setIndex(0);
+    // Construct a set of weighted things which can be used for random selection.  Initial weights are
+    // set to the squared distance from c_1
+    for (int i = 0; i < datapoints.size(); ++i) {
+      WeightedVector row = datapoints.get(i);
+      double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight());
+      seedSelector.set(i, w);
+    }
+
+    // From here, seeds are selected with probability proportional to:
+    //
+    // r_i = min_{c_j} || x_i - c_j ||^2
+    //
+    // when we only have c_1, we have already set these distances and as we select each new
+    // seed, we update the minimum distances.
+    centroids.add(c_1);
+    int clusterIndex = 1;
+    while (centroids.size() < numClusters) {
+      // Select according to weights.
+      int seedIndex = seedSelector.sample();
+      Centroid nextSeed = new Centroid(datapoints.get(seedIndex));
+      nextSeed.setIndex(clusterIndex++);
+      centroids.add(nextSeed);
+      // Don't select this one again.
+      seedSelector.delete(seedIndex);
+      // Re-weight everything according to the minimum distance to a seed.
+      for (int currSeedIndex : seedSelector) {
+        WeightedVector curr = datapoints.get(currSeedIndex);
+        double newWeight = nextSeed.getWeight() * distanceMeasure.distance(nextSeed, curr);
+        if (newWeight < seedSelector.getWeight(currSeedIndex)) {
+          seedSelector.set(currSeedIndex, newWeight);
+        }
+      }
+    }
+  }
+
+  /**
+   * Examines the datapoints and updates cluster centers to be the centroid of the nearest datapoints.  To
+   * compute a new center for cluster c_i, we average all points that are closer than d_i * trimFraction
+   * where d_i is
+   *
+   * d_i = min_j \sqrt ||c_j - c_i||^2
+   *
+   * By ignoring distant points, the centroids converge more quickly to a good approximation of the
+   * optimal k-means solution (given good starting points).
+   *
+   * @param datapoints the points to cluster.
+   */
+  private void iterativeAssignment(List<? extends WeightedVector> datapoints) {
+    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
+    // closestClusterDistances.get(i) is the distance from the i'th cluster to its closest
+    // neighboring cluster.
+    // NOTE(review): indexing this list by centroid index below assumes the searcher iterates
+    // centroids in index order — confirm for the Searcher implementations used here.
+    List<Double> closestClusterDistances = new ArrayList<>(numClusters);
+    // clusterAssignments[i] == j means that the i'th point is assigned to the j'th cluster. When
+    // these don't change, we are done.
+    // Each point is assigned to the invalid "-1" cluster initially.
+    List<Integer> clusterAssignments = new ArrayList<>(Collections.nCopies(datapoints.size(), -1));
+
+    boolean changed = true;
+    for (int i = 0; changed && i < maxNumIterations; i++) {
+      changed = false;
+      // We compute what the distance between each cluster and its closest neighbor is to set a
+      // proportional distance threshold for points that should be involved in calculating the
+      // centroid.
+      closestClusterDistances.clear();
+      for (Vector center : centroids) {
+        // If a centroid has no points assigned to it, the clustering failed.
+        Vector closestOtherCluster = centroids.searchFirst(center, true).getValue();
+        closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster));
+      }
+
+      // Copies the current cluster centroids to newClusters and sets their weights to 0. This is
+      // so we calculate the new centroids as we go through the datapoints.
+      List<Centroid> newCentroids = new ArrayList<>();
+      for (Vector centroid : centroids) {
+        // need a deep copy because we will mutate these values
+        Centroid newCentroid = (Centroid)centroid.clone();
+        newCentroid.setWeight(0);
+        newCentroids.add(newCentroid);
+      }
+
+      // Pass over the datapoints computing new centroids.
+      for (int j = 0; j < datapoints.size(); ++j) {
+        WeightedVector datapoint = datapoints.get(j);
+        // Get the closest cluster this point belongs to.
+        WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false);
+        int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex();
+        double closestDistance = closestPair.getWeight();
+        // Update its cluster assignment if necessary.
+        if (closestIndex != clusterAssignments.get(j)) {
+          changed = true;
+          clusterAssignments.set(j, closestIndex);
+        }
+        // Only update if the datapoints point is near enough. What this means is that the weight
+        // of outliers is NOT taken into account and the final weights of the centroids will
+        // reflect this (it will be less or equal to the initial sum of the weights).
+        if (closestDistance < trimFraction * closestClusterDistances.get(closestIndex)) {
+          newCentroids.get(closestIndex).update(datapoint);
+        }
+      }
+      // Add the new centers back into searcher.
+      centroids.clear();
+      centroids.addAll(newCentroids);
+    }
+
+    if (correctWeights) {
+      // Recompute every centroid's weight as the full sum of its assigned points' weights,
+      // undoing the trimming's effect on the weights (coordinates are left unchanged).
+      for (Vector v : centroids) {
+        ((Centroid)v).setWeight(0);
+      }
+      for (WeightedVector datapoint : datapoints) {
+        Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue();
+        closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight());
+      }
+    }
+  }
+
+  /**
+   * Iterates over the final centroids, asserting that every element the searcher holds is a Centroid.
+   */
+  @Override
+  public Iterator<Centroid> iterator() {
+    return Iterators.transform(centroids.iterator(), new Function<Vector, Centroid>() {
+      @Override
+      public Centroid apply(Vector input) {
+        Preconditions.checkArgument(input instanceof Centroid, "Non-centroid in centroids " +
+            "searcher");
+        //noinspection ConstantConditions
+        return (Centroid)input;
+      }
+    });
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java
new file mode 100644
index 0000000..604bc9d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.cluster;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.jet.math.Constants;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.apache.mahout.math.random.WeightedThing;
+
+/**
+ * Implements a streaming k-means algorithm for weighted vectors.
+ * The goal is to cluster points one at a time, which is especially useful for MapReduce mappers
+ * that get inputs one at a time.
+ *
+ * A rough description of the algorithm:
+ * Suppose there are l clusters at one point and a new point p is added.
+ * The new point can either be added to one of the existing l clusters or become a new cluster. To decide:
+ * - let c be the closest cluster to point p;
+ * - let d be the distance between c and p;
+ * - if d > distanceCutoff, create a new cluster from p (p is too far away from the clusters to be part of them;
+ * distanceCutoff represents the largest distance from a point to its assigned cluster's centroid);
+ * - else (d <= distanceCutoff), create a new cluster with probability d / distanceCutoff (the probability of creating
+ * a new cluster increases as d increases).
+ * There will be either l points or l + 1 points after processing a new point.
+ *
+ * As the number of clusters increases, it will go over the numClusters limit (numClusters represents a recommendation
+ * for the number of clusters that there should be at the end). To decrease the number of clusters the existing clusters
+ * are treated as data points and are re-clustered (collapsed). This tends to make the number of clusters go down.
+ * If the number of clusters is still too high, distanceCutoff is increased.
+ *
+ * For more details, see:
+ * - "Streaming  k-means approximation" by N. Ailon, R. Jaiswal, C. Monteleoni
+ * http://books.nips.cc/papers/files/nips22/NIPS2009_1085.pdf
+ * - "Fast and Accurate k-means for Large Datasets" by M. Shindler, A. Wong, A. Meyerson,
+ * http://books.nips.cc/papers/files/nips24/NIPS2011_1271.pdf
+ */
+public class StreamingKMeans implements Iterable<Centroid> {
+  /**
+   * The searcher containing the centroids that resulted from the clustering of points until now. When adding a new
+   * point we either assign it to one of the existing clusters in this searcher or create a new centroid for it.
+   */
+  private final UpdatableSearcher centroids;
+
+  /**
+   * The estimated number of clusters to cluster the data in. If the actual number of clusters increases beyond this
+   * limit, the clusters will be "collapsed" (re-clustered, by treating them as data points). This doesn't happen
+   * recursively and a collapse might not necessarily make the number of actual clusters drop to less than this limit.
+   *
+   * If the goal is clustering a large data set into k clusters, numClusters SHOULD NOT BE SET to k. StreamingKMeans is
+   * useful to reduce the size of the data set by the mappers so that it can fit into memory in one reducer that runs
+   * BallKMeans.
+   *
+   * It is NOT MEANT to cluster the data into k clusters in one pass because it can't guarantee that there will in fact
+   * be k clusters in total. This is because of the dynamic nature of numClusters over the course of the runtime.
+   * To get an exact number of clusters, another clustering algorithm needs to be applied to the results.
+   */
+  private int numClusters;
+
+  /**
+   * The number of data points seen so far. This is important for re-estimating numClusters when deciding to collapse
+   * the existing clusters.
+   */
+  private int numProcessedDatapoints = 0;
+
+  /**
+   * This is the current value of the distance cutoff.  Points which are much closer than this to a centroid will stick
+   * to it almost certainly. Points further than this to any centroid will form a new cluster.
+   *
+   * This increases (is multiplied by beta) when a cluster collapse did not make the number of clusters drop to below
+   * numClusters (it effectively increases the tolerance for cluster compactness discouraging the creation of new
+   * clusters). Since a collapse only happens when centroids.size() > clusterOvershoot * numClusters, the cutoff
+   * increases when the collapse didn't at least remove the slack in the number of clusters.
+   */
+  private double distanceCutoff;
+
+  /**
+   * Parameter that controls the growth of the distanceCutoff. After n increases of the
+   * distanceCutoff starting at d_0, the final value is d_0 * beta^n (distance cutoffs increase following a geometric
+   * progression with ratio beta).
+   */
+  private final double beta;
+
+  /**
+   * Multiplying clusterLogFactor with numProcessedDatapoints gets an estimate of the suggested
+   * number of clusters. This mirrors the recommended number of clusters for n points where there should be k actual
+   * clusters, k * log n. In the case of our estimate we use clusterLogFactor * log(numProcessedDataPoints).
+   *
+   * It is important to note that numClusters is NOT k. It is an estimate of k * log n.
+   */
+  private final double clusterLogFactor;
+
+  /**
+   * Centroids are collapsed when the number of clusters becomes greater than clusterOvershoot * numClusters. This
+   * effectively means having a slack in numClusters so that the actual number of centroids, centroids.size() tracks
+   * numClusters approximately. The idea is that the actual number of clusters should be at least numClusters but not
+   * much more (so that we don't end up having 1 cluster / point).
+   */
+  private final double clusterOvershoot;
+
+  /**
+   * Random object to sample values from.
+   */
+  private final Random random = RandomUtils.getRandom();
+
+  /**
+   * Calls StreamingKMeans(searcher, numClusters, 1.0 / numClusters, 1.3, 20, 2), i.e. the initial
+   * distance cutoff shrinks as the requested number of clusters grows.
+   * @see StreamingKMeans#StreamingKMeans(org.apache.mahout.math.neighborhood.UpdatableSearcher, int,
+   * double, double, double, double)
+   */
+  public StreamingKMeans(UpdatableSearcher searcher, int numClusters) {
+    this(searcher, numClusters, 1.0 / numClusters, 1.3, 20, 2);
+  }
+
+  /**
+   * Calls StreamingKMeans(searcher, numClusters, distanceCutoff, 1.3, 20, 2).
+   * @see StreamingKMeans#StreamingKMeans(org.apache.mahout.math.neighborhood.UpdatableSearcher, int,
+   * double, double, double, double)
+   */
+  public StreamingKMeans(UpdatableSearcher searcher, int numClusters, double distanceCutoff) {
+    this(searcher, numClusters, distanceCutoff, 1.3, 20, 2);
+  }
+
+  /**
+   * Creates a new StreamingKMeans class given a searcher and the number of clusters to generate.
+   *
+   * @param searcher A Searcher that is used for performing nearest neighbor search. It MUST BE
+   *                 EMPTY initially because it will be used to keep track of the cluster
+   *                 centroids.
+   * @param numClusters An estimated number of clusters to generate for the data points.
+   *                    This can be adjusted, but the actual number will depend on the data.
+   * @param distanceCutoff  The initial distance cutoff representing the value of the
+   *                      distance between a point and its closest centroid after which
+   *                      the new point will definitely be assigned to a new cluster.
+   * @param beta Ratio of geometric progression to use when increasing distanceCutoff. After n increases, distanceCutoff
+   *             becomes distanceCutoff * beta^n. A smaller value increases the distanceCutoff less aggressively.
+   * @param clusterLogFactor Value multiplied with the number of points counted so far estimating the number of clusters
+   *                         to aim for. If the final number of clusters is known and this clustering is only for a
+   *                         sketch of the data, this can be the final number of clusters, k.
+   * @param clusterOvershoot Multiplicative slack factor for slowing down the collapse of the clusters.
+   */
+  public StreamingKMeans(UpdatableSearcher searcher, int numClusters,
+                         double distanceCutoff, double beta, double clusterLogFactor, double clusterOvershoot) {
+    this.centroids = searcher;
+    this.numClusters = numClusters;
+    this.distanceCutoff = distanceCutoff;
+    this.beta = beta;
+    this.clusterLogFactor = clusterLogFactor;
+    this.clusterOvershoot = clusterOvershoot;
+  }
+
+  /**
+   * @return an Iterator to the Centroids contained in this clusterer.
+   */
+  @Override
+  public Iterator<Centroid> iterator() {
+    // The cast is safe because every vector inserted into the searcher in this class is a Centroid
+    // (see clusterInternal, which only adds Centroid instances or their clones).
+    return Iterators.transform(centroids.iterator(), new Function<Vector, Centroid>() {
+      @Override
+      public Centroid apply(Vector input) {
+        return (Centroid)input;
+      }
+    });
+  }
+
+  /**
+   * Cluster the rows of a matrix, treating them as Centroids with weight 1.
+   * @param data matrix whose rows are to be clustered.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  public UpdatableSearcher cluster(Matrix data) {
+    return cluster(Iterables.transform(data, new Function<MatrixSlice, Centroid>() {
+      @Override
+      public Centroid apply(MatrixSlice input) {
+        // The key in a Centroid is actually the MatrixSlice's index.
+        return Centroid.create(input.index(), input.vector());
+      }
+    }));
+  }
+
+  /**
+   * Cluster the data points in an Iterable<Centroid>.
+   * @param datapoints Iterable whose elements are to be clustered.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  public UpdatableSearcher cluster(Iterable<Centroid> datapoints) {
+    return clusterInternal(datapoints, false);
+  }
+
+  /**
+   * Cluster one data point.
+   * @param datapoint to be clustered.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  public UpdatableSearcher cluster(final Centroid datapoint) {
+    // Wraps the single point in a one-shot Iterable so the main clustering path can be reused.
+    return cluster(new Iterable<Centroid>() {
+      @Override
+      public Iterator<Centroid> iterator() {
+        return new Iterator<Centroid>() {
+          private boolean accessed = false;
+
+          @Override
+          public boolean hasNext() {
+            return !accessed;
+          }
+
+          @Override
+          public Centroid next() {
+            accessed = true;
+            return datapoint;
+          }
+
+          @Override
+          public void remove() {
+            throw new UnsupportedOperationException();
+          }
+        };
+      }
+    });
+  }
+
+  /**
+   * @return the number of clusters computed from the points until now.
+   */
+  public int getNumClusters() {
+    return centroids.size();
+  }
+
+  /**
+   * Internal clustering method that gets called from the other wrappers.
+   * @param datapoints Iterable of data points to be clustered.
+   * @param collapseClusters whether this is an "inner" clustering and the datapoints are the previously computed
+   *                         centroids. Some logic is different to ensure counters are consistent but it behaves
+   *                         nearly the same.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  private UpdatableSearcher clusterInternal(Iterable<Centroid> datapoints, boolean collapseClusters) {
+    Iterator<Centroid> datapointsIterator = datapoints.iterator();
+    if (!datapointsIterator.hasNext()) {
+      return centroids;
+    }
+
+    int oldNumProcessedDataPoints = numProcessedDatapoints;
+    // We clear the centroids we have in case of cluster collapse, the old clusters are the
+    // datapoints but we need to re-cluster them.
+    if (collapseClusters) {
+      centroids.clear();
+      numProcessedDatapoints = 0;
+    }
+
+    if (centroids.size() == 0) {
+      // Assign the first datapoint to the first cluster.
+      // Adding a vector to a searcher would normally just reference the copy,
+      // but we could potentially mutate it and so we need to make a clone.
+      centroids.add(datapointsIterator.next().clone());
+      ++numProcessedDatapoints;
+    }
+
+    // To cluster, we scan the data and either add each point to the nearest group or create a new group.
+    // when we get too many groups, we need to increase the threshold and rescan our current groups
+    while (datapointsIterator.hasNext()) {
+      Centroid row = datapointsIterator.next();
+      // Get the closest vector and its weight as a WeightedThing<Vector>.
+      // The weight of the WeightedThing is the distance to the query and the value is a
+      // reference to one of the vectors we added to the searcher previously.
+      WeightedThing<Vector> closestPair = centroids.searchFirst(row, false);
+
+      // We get a uniformly distributed random number between 0 and 1 and compare it with the
+      // distance to the closest cluster divided by the distanceCutoff.
+      // This is so that if the closest cluster is further than distanceCutoff,
+      // closestPair.getWeight() / distanceCutoff > 1 which will trigger the creation of a new
+      // cluster anyway.
+      // However, if the ratio is less than 1, we want to create a new cluster with probability
+      // proportional to the distance to the closest cluster.
+      // Note that row.getWeight() also multiplies the ratio, so heavier points are
+      // proportionally more likely to seed a new cluster.
+      double sample = random.nextDouble();
+      if (sample < row.getWeight() * closestPair.getWeight() / distanceCutoff) {
+        // Add new centroid, note that the vector is copied because we may mutate it later.
+        centroids.add(row.clone());
+      } else {
+        // Merge the new point with the existing centroid. This will update the centroid's actual
+        // position.
+        // We know that all the points we inserted in the centroids searcher are (or extend)
+        // WeightedVector, so the cast will always succeed.
+        Centroid centroid = (Centroid) closestPair.getValue();
+
+        // We will update the centroid by removing it from the searcher and reinserting it to
+        // ensure consistency.
+        if (!centroids.remove(centroid, Constants.EPSILON)) {
+          throw new RuntimeException("Unable to remove centroid");
+        }
+        centroid.update(row);
+        centroids.add(centroid);
+
+      }
+      ++numProcessedDatapoints;
+
+      if (!collapseClusters && centroids.size() > clusterOvershoot * numClusters) {
+        numClusters = (int) Math.max(numClusters, clusterLogFactor * Math.log(numProcessedDatapoints));
+
+        List<Centroid> shuffled = new ArrayList<>();
+        for (Vector vector : centroids) {
+          shuffled.add((Centroid) vector);
+        }
+        Collections.shuffle(shuffled);
+        // Re-cluster using the shuffled centroids as data points. The centroids member variable
+        // is modified directly.
+        clusterInternal(shuffled, true);
+
+        if (centroids.size() > numClusters) {
+          distanceCutoff *= beta;
+        }
+      }
+    }
+
+    if (collapseClusters) {
+      numProcessedDatapoints = oldNumProcessedDataPoints;
+    }
+    return centroids;
+  }
+
+  /**
+   * Renumbers the centroids consecutively from 0, in iteration order.
+   */
+  public void reindexCentroids() {
+    int numCentroids = 0;
+    for (Centroid centroid : this) {
+      centroid.setIndex(numCentroids++);
+    }
+  }
+
+  /**
+   * @return the distanceCutoff (an upper bound for the maximum distance within a cluster).
+   */
+  public double getDistanceCutoff() {
+    return distanceCutoff;
+  }
+
+  public void setDistanceCutoff(double distanceCutoff) {
+    this.distanceCutoff = distanceCutoff;
+  }
+
+  public DistanceMeasure getDistanceMeasure() {
+    return centroids.getDistanceMeasure();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java
new file mode 100644
index 0000000..a41940b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Hadoop {@link Writable} wrapper around a {@link Centroid}, serialized as
+ * (index, weight, vector).
+ */
+public class CentroidWritable implements Writable {
+  // The wrapped centroid; null until set via the constructor or readFields().
+  private Centroid centroid = null;
+
+  public CentroidWritable() {}
+
+  public CentroidWritable(Centroid centroid) {
+    this.centroid = centroid;
+  }
+
+  /**
+   * @return the wrapped Centroid, or null if neither the constructor nor readFields() set one.
+   */
+  public Centroid getCentroid() {
+    return centroid;
+  }
+
+  /**
+   * Serializes the centroid as (index, weight, vector).
+   * Precondition: a centroid must have been set before calling this method.
+   */
+  @Override
+  public void write(DataOutput dataOutput) throws IOException {
+    dataOutput.writeInt(centroid.getIndex());
+    dataOutput.writeDouble(centroid.getWeight());
+    VectorWritable.writeVector(dataOutput, centroid.getVector());
+  }
+
+  /**
+   * Deserializes a centroid written by {@link #write(DataOutput)}. Reuses the existing
+   * Centroid instance when one is present, as is conventional for Writables.
+   */
+  @Override
+  public void readFields(DataInput dataInput) throws IOException {
+    if (centroid == null) {
+      centroid = read(dataInput);
+      return;
+    }
+    centroid.setIndex(dataInput.readInt());
+    centroid.setWeight(dataInput.readDouble());
+    centroid.assign(VectorWritable.readVector(dataInput));
+  }
+
+  /**
+   * Reads a Centroid directly from a stream, in the same (index, weight, vector) format.
+   */
+  public static Centroid read(DataInput dataInput) throws IOException {
+    int index = dataInput.readInt();
+    double weight = dataInput.readDouble();
+    Vector v = VectorWritable.readVector(dataInput);
+    return new Centroid(index, v, weight);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof CentroidWritable)) {
+      return false;
+    }
+    CentroidWritable writable = (CentroidWritable) o;
+    // Null-safe: a default-constructed instance (centroid == null) no longer throws
+    // NullPointerException; two uninitialized writables compare equal.
+    return centroid == null ? writable.centroid == null : centroid.equals(writable.centroid);
+  }
+
+  @Override
+  public int hashCode() {
+    // Null-safe and consistent with equals().
+    return centroid == null ? 0 : centroid.hashCode();
+  }
+
+  @Override
+  public String toString() {
+    // String.valueOf handles the uninitialized (null) case gracefully.
+    return String.valueOf(centroid);
+  }
+}


[21/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
new file mode 100644
index 0000000..3463ff5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.model.UpdatableIDMigrator;
+
+/**
+ * Implementation which stores the reverse long-to-String mapping in memory.
+ */
+public final class MemoryIDMigrator extends AbstractIDMigrator implements UpdatableIDMigrator {
+
+  /** Reverse mapping from hashed long IDs back to their original string IDs; guarded by itself. */
+  private final FastByIDMap<String> longToString = new FastByIDMap<>(100);
+
+  /** Records that {@code longID} maps back to {@code stringID}. */
+  @Override
+  public void storeMapping(long longID, String stringID) {
+    synchronized (longToString) {
+      longToString.put(longID, stringID);
+    }
+  }
+
+  /** @return the original string ID for {@code longID}, or null if no mapping was stored */
+  @Override
+  public String toStringID(long longID) {
+    synchronized (longToString) {
+      return longToString.get(longID);
+    }
+  }
+
+  /** Bulk-registers a mapping for every string ID in {@code stringIDs}. */
+  @Override
+  public void initialize(Iterable<String> stringIDs) {
+    for (String id : stringIDs) {
+      storeMapping(toLongID(id), id);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
new file mode 100644
index 0000000..b134598
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import javax.sql.DataSource;
+
+/**
+ * <p>
+ * An implementation for MySQL. The following statement would create a table suitable for use with this class:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_id_migration (
+ *   long_id BIGINT NOT NULL PRIMARY KEY,
+ *   string_id VARCHAR(255) NOT NULL UNIQUE
+ * )
+ * </pre>
+ *
+ * </p>
+ *
+ * <p>
+ * Separately, note that in a MySQL database, the following function calls will convert a string value into a
+ * numeric value in the same way that the standard implementations in this package do. This may be useful in
+ * writing SQL statements for use with
+ * {@code AbstractJDBCDataModel} subclasses which convert string
+ * column values to appropriate numeric values -- though this should be viewed as a temporary arrangement
+ * since it will impact performance:
+ * </p>
+ *
+ * <p>
+ * {@code cast(conv(substring(md5([column name]), 1, 16),16,10) as signed)}
+ * </p>
+ */
+public final class MySQLJDBCIDMigrator extends AbstractJDBCIDMigrator {
+  
+  /**
+   * Creates a migrator using the default mapping table and column names
+   * ({@code taste_id_migration}, {@code long_id}, {@code string_id}).
+   *
+   * @param dataSource source of MySQL connections
+   */
+  public MySQLJDBCIDMigrator(DataSource dataSource) {
+    this(dataSource, DEFAULT_MAPPING_TABLE,
+        DEFAULT_LONG_ID_COLUMN, DEFAULT_STRING_ID_COLUMN);
+  }
+  
+  /**
+   * Creates a migrator against a custom mapping table. Uses MySQL's
+   * {@code INSERT IGNORE} so storing the same mapping twice is a no-op.
+   *
+   * @param dataSource source of MySQL connections
+   * @param mappingTable name of the table holding the long-to-string mapping
+   * @param longIDColumn name of the BIGINT primary key column
+   * @param stringIDColumn name of the string ID column
+   */
+  public MySQLJDBCIDMigrator(DataSource dataSource,
+                             String mappingTable,
+                             String longIDColumn,
+                             String stringIDColumn) {
+    super(dataSource,
+          "SELECT " + stringIDColumn + " FROM " + mappingTable + " WHERE " + longIDColumn + "=?",
+          "INSERT IGNORE INTO " + mappingTable + " (" + longIDColumn + ',' + stringIDColumn + ") VALUES (?,?)");
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java
new file mode 100644
index 0000000..c97a545
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import com.google.common.base.Preconditions;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * This is a special thread-safe version of {@link PlusAnonymousUserDataModel}
+ * which allows multiple concurrent anonymous requests.
+ * </p>
+ *
+ * <p>
+ * To use it, you have to estimate the number of concurrent anonymous users of your application.
+ * The pool of users with the given size will be created. For each anonymous recommendations request,
+ * a user has to be taken from the pool and returned back immediately afterwards.
+ * </p>
+ *
+ * <p>
+ * If no more users are available in the pool, anonymous recommendations cannot be produced.
+ * </p>
+ *
+ * <p>
+ *
+ * Setup:
+ * <pre>
+ * int concurrentUsers = 100;
+ * DataModel realModel = ..
+ * PlusAnonymousConcurrentUserDataModel plusModel =
+ *   new PlusAnonymousConcurrentUserDataModel(realModel, concurrentUsers);
+ * Recommender recommender = ...;
+ * </pre>
+ *
+ * Real-time recommendation:
+ * <pre>
+ * PlusAnonymousConcurrentUserDataModel plusModel =
+ *   (PlusAnonymousConcurrentUserDataModel) recommender.getDataModel();
+ *
+ * // Take the next available anonymous user from the pool
+ * Long anonymousUserID = plusModel.takeAvailableUser();
+ *
+ * PreferenceArray tempPrefs = ..
+ * tempPrefs.setUserID(0, anonymousUserID);
+ * tempPrefs.setItemID(0, itemID);
+ * plusModel.setTempPrefs(tempPrefs, anonymousUserID);
+ *
+ * // Produce recommendations
+ * recommender.recommend(anonymousUserID, howMany);
+ *
+ * // It is very IMPORTANT to release user back to the pool
+ * plusModel.releaseUser(anonymousUserID);
+ * </pre>
+ *
+ * </p>
+ */
+public final class PlusAnonymousConcurrentUserDataModel extends PlusAnonymousUserDataModel {
+
+  /** Preferences for all anonymous users */
+  private final Map<Long,PreferenceArray> tempPrefs;
+  /** Item IDs set for all anonymous users */
+  private final Map<Long,FastIDSet> prefItemIDs;
+  /** Pool of the users (FIFO) */
+  private Queue<Long> usersPool;
+
+  private static final Logger log = LoggerFactory.getLogger(PlusAnonymousUserDataModel.class);
+
+  /**
+   * @param delegate Real model where anonymous users will be added to
+   * @param maxConcurrentUsers Maximum allowed number of concurrent anonymous users
+   */
+  public PlusAnonymousConcurrentUserDataModel(DataModel delegate, int maxConcurrentUsers) {
+    super(delegate);
+
+    tempPrefs = new ConcurrentHashMap<>();
+    prefItemIDs = new ConcurrentHashMap<>();
+
+    initializeUsersPools(maxConcurrentUsers);
+  }
+
+  /**
+   * Initialize the pool of concurrent anonymous users.
+   *
+   * @param usersPoolSize Maximum allowed number of concurrent anonymous user. Depends on the consumer system.
+   */
+  private void initializeUsersPools(int usersPoolSize) {
+    usersPool = new ConcurrentLinkedQueue<>();
+    for (int i = 0; i < usersPoolSize; i++) {
+      usersPool.add(TEMP_USER_ID + i);
+    }
+  }
+
+  /**
+   * Take the next available concurrent anonymous users from the pool.
+   *
+   * @return User ID or null if no more users are available
+   */
+  public Long takeAvailableUser() {
+    Long takenUserID = usersPool.poll();
+    if (takenUserID != null) {
+      // Initialize the preferences array to indicate that the user is taken.
+      tempPrefs.put(takenUserID, new GenericUserPreferenceArray(0));
+      return takenUserID;
+    }
+    return null;
+  }
+
+  /**
+   * Release previously taken anonymous user and return it to the pool.
+   *
+   * @param userID ID of a previously taken anonymous user
+   * @return true if the user was previously taken, false otherwise
+   */
+  public boolean releaseUser(Long userID) {
+    if (tempPrefs.containsKey(userID)) {
+      this.clearTempPrefs(userID);
+      // Return previously taken user to the pool
+      usersPool.offer(userID);
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Reports whether the given ID belongs to a currently acquired anonymous user.
+   * A user is anonymous exactly while it owns an entry in {@code tempPrefs}.
+   */
+  private boolean isAnonymousUser(long userID) {
+    return tempPrefs.containsKey(userID);
+  }
+
+  /**
+   * Stores temporary preferences for a given anonymous user and indexes the item IDs
+   * so {@code getItemIDsFromUser} can answer without scanning the array.
+   *
+   * @param prefs non-null, non-empty preferences for the anonymous user
+   * @param anonymousUserID ID previously obtained from {@code takeAvailableUser}
+   */
+  public void setTempPrefs(PreferenceArray prefs, long anonymousUserID) {
+    Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty");
+
+    tempPrefs.put(anonymousUserID, prefs);
+
+    FastIDSet itemIDs = new FastIDSet();
+    for (int i = 0; i < prefs.length(); i++) {
+      itemIDs.add(prefs.getItemID(i));
+    }
+    prefItemIDs.put(anonymousUserID, itemIDs);
+  }
+
+  /**
+   * Removes the temporary preferences and the item-ID index of a given anonymous user.
+   */
+  public void clearTempPrefs(long anonymousUserID) {
+    tempPrefs.remove(anonymousUserID);
+    prefItemIDs.remove(anonymousUserID);
+  }
+
+  @Override
+  public LongPrimitiveIterator getUserIDs() throws TasteException {
+    // Anonymous users have a short lifetime and should not be included in the neighborhoods
+    // of the real users. Thus exclude them from the universe.
+    return getDelegate().getUserIDs();
+  }
+
+  @Override
+  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+    // Anonymous users are answered from the temp store; everyone else from the delegate.
+    return isAnonymousUser(userID)
+        ? tempPrefs.get(userID)
+        : getDelegate().getPreferencesFromUser(userID);
+  }
+
+  @Override
+  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+    // Anonymous users are answered from the pre-built item-ID index; others from the delegate.
+    return isAnonymousUser(userID)
+        ? prefItemIDs.get(userID)
+        : getDelegate().getItemIDsFromUser(userID);
+  }
+
+  @Override
+  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+    // Fast path: no anonymous users active, so the delegate's answer is complete.
+    if (tempPrefs.isEmpty()) {
+      return getDelegate().getPreferencesForItem(itemID);
+    }
+
+    PreferenceArray delegatePrefs = null;
+
+    try {
+      delegatePrefs = getDelegate().getPreferencesForItem(itemID);
+    } catch (NoSuchItemException nsie) {
+      // OK. Probably an item that only the anonymous user has
+      if (log.isDebugEnabled()) {
+        log.debug("Item {} unknown", itemID);
+      }
+    }
+
+    // Collect preferences for this item held by any currently active anonymous user.
+    List<Preference> anonymousPreferences = Lists.newArrayList();
+
+    for (Map.Entry<Long, PreferenceArray> prefsMap : tempPrefs.entrySet()) {
+      PreferenceArray singleUserTempPrefs = prefsMap.getValue();
+      for (int i = 0; i < singleUserTempPrefs.length(); i++) {
+        if (singleUserTempPrefs.getItemID(i) == itemID) {
+          anonymousPreferences.add(singleUserTempPrefs.get(i));
+        }
+      }
+    }
+
+    int delegateLength = delegatePrefs == null ? 0 : delegatePrefs.length();
+    int anonymousPrefsLength = anonymousPreferences.size();
+    int prefsCounter = 0;
+
+    // Merge the delegate and anonymous preferences into a single array
+    PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(delegateLength + anonymousPrefsLength);
+
+    for (int i = 0; i < delegateLength; i++) {
+      newPreferenceArray.set(prefsCounter++, delegatePrefs.get(i));
+    }
+
+    for (Preference anonymousPreference : anonymousPreferences) {
+      newPreferenceArray.set(prefsCounter++, anonymousPreference);
+    }
+
+    if (newPreferenceArray.length() == 0) {
+      // Neither the delegate nor any anonymous user knows this item.
+      throw new NoSuchItemException(itemID);
+    }
+
+    return newPreferenceArray;
+  }
+
+  @Override
+  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+    if (!isAnonymousUser(userID)) {
+      return getDelegate().getPreferenceValue(userID, itemID);
+    }
+    // Linear scan over the anonymous user's temp preferences.
+    PreferenceArray prefs = tempPrefs.get(userID);
+    for (int i = 0; i < prefs.length(); i++) {
+      if (prefs.getItemID(i) == itemID) {
+        return prefs.getValue(i);
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+    if (isAnonymousUser(userID)) {
+      return null; // Timestamps are never stored for anonymous preferences
+    }
+    return getDelegate().getPreferenceTime(userID, itemID);
+  }
+
+  @Override
+  public int getNumUsers() throws TasteException {
+    // Anonymous users have a short lifetime and should not be included in the neighborhoods
+    // of the real users. Thus exclude them from the universe.
+    return getDelegate().getNumUsers();
+  }
+
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+    if (tempPrefs.isEmpty()) {
+      return getDelegate().getNumUsersWithPreferenceFor(itemID);
+    }
+
+    int countAnonymousUsersWithPreferenceFor = 0;
+
+    for (Map.Entry<Long, PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) {
+      for (int i = 0; i < singleUserTempPrefs.getValue().length(); i++) {
+        if (singleUserTempPrefs.getValue().getItemID(i) == itemID) {
+          countAnonymousUsersWithPreferenceFor++;
+          break;
+        }
+      }
+    }
+    return getDelegate().getNumUsersWithPreferenceFor(itemID) + countAnonymousUsersWithPreferenceFor;
+  }
+
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+    if (tempPrefs.isEmpty()) {
+      return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2);
+    }
+
+    int countAnonymousUsersWithPreferenceFor = 0;
+
+    for (Map.Entry<Long, PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) {
+      boolean found1 = false;
+      boolean found2 = false;
+      for (int i = 0; i < singleUserTempPrefs.getValue().length() && !(found1 && found2); i++) {
+        long itemID = singleUserTempPrefs.getValue().getItemID(i);
+        if (itemID == itemID1) {
+          found1 = true;
+        }
+        if (itemID == itemID2) {
+          found2 = true;
+        }
+      }
+
+      if (found1 && found2) {
+        countAnonymousUsersWithPreferenceFor++;
+      }
+    }
+
+    return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2) + countAnonymousUsersWithPreferenceFor;
+  }
+
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    if (isAnonymousUser(userID)) {
+      // Anonymous preferences may only be installed via setTempPrefs().
+      throw new UnsupportedOperationException();
+    }
+    getDelegate().setPreference(userID, itemID, value);
+  }
+
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    if (isAnonymousUser(userID)) {
+      // Anonymous preferences may only be removed via clearTempPrefs().
+      throw new UnsupportedOperationException();
+    }
+    getDelegate().removePreference(userID, itemID);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
new file mode 100644
index 0000000..546349b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
@@ -0,0 +1,320 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * This {@link DataModel} decorator class is useful in a situation where you wish to recommend to a user that
+ * doesn't really exist yet in your actual {@link DataModel}. For example maybe you wish to recommend DVDs to
+ * a user who has browsed a few titles on your DVD store site, but, the user is not yet registered.
+ * </p>
+ *
+ * <p>
+ * This enables you to temporarily add a temporary user to an existing {@link DataModel} in a way that
+ * recommenders can then produce recommendations anyway. To do so, wrap your real implementation in this
+ * class:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * DataModel realModel = ...;
+ * DataModel plusModel = new PlusAnonymousUserDataModel(realModel);
+ * ...
+ * ItemSimilarity similarity = new LogLikelihoodSimilarity(realModel); // not plusModel
+ * </pre>
+ *
+ * </p>
+ *
+ * <p>
+ * But, you may continue to use {@code realModel} as input to other components. To recommend, first construct and
+ * set the temporary user information on the model and then simply call the recommender. The
+ * {@code synchronized} block exists to remind you that this is of course not thread-safe. Only one set
+ * of temp data can be inserted into the model and used at one time.
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * Recommender recommender = ...;
+ * ...
+ * synchronized(...) {
+ *   PreferenceArray tempPrefs = ...;
+ *   plusModel.setTempPrefs(tempPrefs);
+ *   recommender.recommend(PlusAnonymousUserDataModel.TEMP_USER_ID, 10);
+ *   plusModel.clearTempPrefs();
+ * }
+ * </pre>
+ *
+ * </p>
+ */
+public class PlusAnonymousUserDataModel implements DataModel {
+
+  public static final long TEMP_USER_ID = Long.MIN_VALUE;
+  
+  private final DataModel delegate;
+  private PreferenceArray tempPrefs;
+  private final FastIDSet prefItemIDs;
+
+  private static final Logger log = LoggerFactory.getLogger(PlusAnonymousUserDataModel.class);
+
+  public PlusAnonymousUserDataModel(DataModel delegate) {
+    this.delegate = delegate;
+    this.prefItemIDs = new FastIDSet();
+  }
+
+  protected DataModel getDelegate() {
+    return delegate;
+  }
+  
+  /**
+   * Installs the single temporary user's preferences. Must be non-null and non-empty;
+   * use {@link #clearTempPrefs()} to remove them afterwards.
+   */
+  public void setTempPrefs(PreferenceArray prefs) {
+    Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty");
+    this.tempPrefs = prefs;
+    this.prefItemIDs.clear();
+    for (int i = 0; i < prefs.length(); i++) {
+      this.prefItemIDs.add(prefs.getItemID(i));
+    }
+  }
+
+  /**
+   * Removes the temporary user's preferences so only the delegate's data remains visible.
+   */
+  public void clearTempPrefs() {
+    tempPrefs = null;
+    prefItemIDs.clear();
+  }
+  
+  @Override
+  public LongPrimitiveIterator getUserIDs() throws TasteException {
+    if (tempPrefs == null) {
+      return delegate.getUserIDs();
+    }
+    // Splice TEMP_USER_ID into the delegate's ID sequence in sorted position.
+    return new PlusAnonymousUserLongPrimitiveIterator(delegate.getUserIDs(), TEMP_USER_ID);
+  }
+  
+  @Override
+  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+    if (userID == TEMP_USER_ID) {
+      if (tempPrefs == null) {
+        throw new NoSuchUserException(TEMP_USER_ID);
+      }
+      return tempPrefs;
+    }
+    return delegate.getPreferencesFromUser(userID);
+  }
+  
+  @Override
+  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+    if (userID == TEMP_USER_ID) {
+      if (tempPrefs == null) {
+        throw new NoSuchUserException(TEMP_USER_ID);
+      }
+      return prefItemIDs;
+    }
+    return delegate.getItemIDsFromUser(userID);
+  }
+  
+  @Override
+  public LongPrimitiveIterator getItemIDs() throws TasteException {
+    return delegate.getItemIDs();
+    // Items known only to the temporary user are ignored here; in practice that cannot happen
+  }
+  
+  @Override
+  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+    if (tempPrefs == null) {
+      return delegate.getPreferencesForItem(itemID);
+    }
+    PreferenceArray delegatePrefs = null;
+    try {
+      delegatePrefs = delegate.getPreferencesForItem(itemID);
+    } catch (NoSuchItemException nsie) {
+      // OK. Probably an item that only the anonymous user has
+      if (log.isDebugEnabled()) {
+        log.debug("Item {} unknown", itemID);
+      }
+    }
+    // If the temp user also rated this item, merge their preference into the result.
+    for (int i = 0; i < tempPrefs.length(); i++) {
+      if (tempPrefs.getItemID(i) == itemID) {
+        return cloneAndMergeInto(delegatePrefs, itemID, tempPrefs.getUserID(i), tempPrefs.getValue(i));
+      }
+    }
+    if (delegatePrefs == null) {
+      // No, didn't find it among the anonymous user prefs
+      throw new NoSuchItemException(itemID);
+    }
+    return delegatePrefs;
+  }
+
+  /**
+   * Returns a copy of {@code delegatePrefs} with one extra (user, value) preference for
+   * {@code itemID} inserted so the result stays ordered by user ID.
+   */
+  private static PreferenceArray cloneAndMergeInto(PreferenceArray delegatePrefs,
+                                                   long itemID,
+                                                   long newUserID,
+                                                   float value) {
+
+    int length = delegatePrefs == null ? 0 : delegatePrefs.length();
+    int newLength = length + 1;
+    PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(newLength);
+
+    // Set item ID once
+    newPreferenceArray.setItemID(0, itemID);
+
+    // Find the sorted insertion point for the new user ID.
+    int positionToInsert = 0;
+    while (positionToInsert < length && newUserID > delegatePrefs.getUserID(positionToInsert)) {
+      positionToInsert++;
+    }
+
+    // Copy entries before the insertion point, insert, then copy the remainder shifted by one.
+    for (int i = 0; i < positionToInsert; i++) {
+      newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i));
+      newPreferenceArray.setValue(i, delegatePrefs.getValue(i));
+    }
+    newPreferenceArray.setUserID(positionToInsert, newUserID);
+    newPreferenceArray.setValue(positionToInsert, value);
+    for (int i = positionToInsert + 1; i < newLength; i++) {
+      newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i - 1));
+      newPreferenceArray.setValue(i, delegatePrefs.getValue(i - 1));
+    }
+
+    return newPreferenceArray;
+  }
+  
+  @Override
+  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+    if (userID == TEMP_USER_ID) {
+      if (tempPrefs == null) {
+        throw new NoSuchUserException(TEMP_USER_ID);
+      }
+      for (int i = 0; i < tempPrefs.length(); i++) {
+        if (tempPrefs.getItemID(i) == itemID) {
+          return tempPrefs.getValue(i);
+        }
+      }
+      return null;
+    }
+    return delegate.getPreferenceValue(userID, itemID);
+  }
+
+  @Override
+  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+    if (userID == TEMP_USER_ID) {
+      if (tempPrefs == null) {
+        throw new NoSuchUserException(TEMP_USER_ID);
+      }
+      // Timestamps are not stored for the temporary user's preferences.
+      return null;
+    }
+    return delegate.getPreferenceTime(userID, itemID);
+  }
+  
+  @Override
+  public int getNumItems() throws TasteException {
+    return delegate.getNumItems();
+  }
+  
+  @Override
+  public int getNumUsers() throws TasteException {
+    // One extra user while temp prefs are installed.
+    return delegate.getNumUsers() + (tempPrefs == null ? 0 : 1);
+  }
+  
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+    if (tempPrefs == null) {
+      return delegate.getNumUsersWithPreferenceFor(itemID);
+    }
+    boolean found = false;
+    for (int i = 0; i < tempPrefs.length(); i++) {
+      if (tempPrefs.getItemID(i) == itemID) {
+        found = true;
+        break;
+      }
+    }
+    return delegate.getNumUsersWithPreferenceFor(itemID) + (found ? 1 : 0);
+  }
+
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+    if (tempPrefs == null) {
+      return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
+    }
+    boolean found1 = false;
+    boolean found2 = false;
+    for (int i = 0; i < tempPrefs.length() && !(found1 && found2); i++) {
+      long itemID = tempPrefs.getItemID(i);
+      if (itemID == itemID1) {
+        found1 = true;
+      }
+      if (itemID == itemID2) {
+        found2 = true;
+      }
+    }
+    return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2) + (found1 && found2 ? 1 : 0);
+  }
+  
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    if (userID == TEMP_USER_ID) {
+      if (tempPrefs == null) {
+        throw new NoSuchUserException(TEMP_USER_ID);
+      }
+      // The temporary user's prefs are immutable; use setTempPrefs() instead.
+      throw new UnsupportedOperationException();
+    }
+    delegate.setPreference(userID, itemID, value);
+  }
+  
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    if (userID == TEMP_USER_ID) {
+      if (tempPrefs == null) {
+        throw new NoSuchUserException(TEMP_USER_ID);
+      }
+      // The temporary user's prefs are immutable; use clearTempPrefs() instead.
+      throw new UnsupportedOperationException();
+    }
+    delegate.removePreference(userID, itemID);
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    delegate.refresh(alreadyRefreshed);
+  }
+
+  @Override
+  public boolean hasPreferenceValues() {
+    return delegate.hasPreferenceValues();
+  }
+
+  @Override
+  public float getMaxPreference() {
+    return delegate.getMaxPreference();
+  }
+
+  @Override
+  public float getMinPreference() {
+    return delegate.getMinPreference();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java
new file mode 100644
index 0000000..ea4df85
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+
+final class PlusAnonymousUserLongPrimitiveIterator extends AbstractLongPrimitiveIterator {
+  
+  private final LongPrimitiveIterator delegate;
+  private final long extraDatum;
+  private boolean datumConsumed;
+  
+  /**
+   * @param delegate underlying sorted iterator of IDs
+   * @param extraDatum one additional ID to splice into the sequence in sorted position
+   */
+  PlusAnonymousUserLongPrimitiveIterator(LongPrimitiveIterator delegate, long extraDatum) {
+    this.delegate = delegate;
+    this.extraDatum = extraDatum;
+    this.datumConsumed = false;
+  }
+  
+  @Override
+  public long nextLong() {
+    if (datumConsumed) {
+      return delegate.nextLong();
+    }
+    // Emit the extra datum as soon as it sorts at or before the delegate's next value,
+    // or when the delegate is exhausted.
+    if (!delegate.hasNext() || extraDatum <= delegate.peek()) {
+      datumConsumed = true;
+      return extraDatum;
+    }
+    return delegate.next();
+  }
+  
+  @Override
+  public long peek() {
+    if (datumConsumed) {
+      return delegate.peek();
+    }
+    if (!delegate.hasNext() || extraDatum <= delegate.peek()) {
+      return extraDatum;
+    }
+    return delegate.peek();
+  }
+  
+  @Override
+  public boolean hasNext() {
+    return !datumConsumed || delegate.hasNext();
+  }
+  
+  @Override
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public void skip(int n) {
+    // Advance by repeated consumption so the extra-datum bookkeeping stays correct.
+    for (int i = 0; i < n; i++) {
+      nextLong();
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
new file mode 100644
index 0000000..0399618
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
@@ -0,0 +1,758 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.file;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.locks.ReentrantLock;
+
+import com.google.common.base.Preconditions;
+import com.google.common.base.Splitter;
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.AbstractDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * A {@link DataModel} backed by a delimited file. This class expects a file where each line
+ * contains a user ID, followed by item ID, followed by optional preference value, followed by
+ * optional timestamp. Commas or tabs delimit fields:
+ * </p>
+ *
+ * <p>{@code userID,itemID[,preference[,timestamp]]}</p>
+ *
+ * <p>
+ * Preference value is optional to accommodate applications that have no notion of a
+ * preference value (that is, the user simply expresses a
+ * preference for an item, but no degree of preference).
+ * </p>
+ *
+ * <p>
+ * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
+ * parsed as {@code long}s. The timestamp, if present, is assumed to be parseable as a
+ * {@code long}, though this can be overridden via {@link #readTimestampFromString(String)}.
+ * The preference value may be empty, to indicate "no preference value", but the field itself
+ * cannot be omitted. That is, this is legal:
+ * </p>
+ *
+ * <p>{@code 123,456,,129050099059}</p>
+ *
+ * <p>But this isn't:</p>
+ *
+ * <p>{@code 123,456,129050099059}</p>
+ *
+ * <p>
+ * It is also acceptable for the lines to contain additional fields. Fields beyond the third will be ignored.
+ * An empty line, or one that begins with '#' will be ignored as a comment.
+ * </p>
+ *
+ * <p>
+ * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
+ * has been reloaded very recently already.
+ * </p>
+ *
+ * <p>
+ * This class will also look for update "delta" files in the same directory, with file names that start the
+ * same way (up to the first period). These files have the same format, and provide updated data that
+ * supersedes what is in the main data file. This is a mechanism that allows an application to push updates to
+ * {@link FileDataModel} without re-copying the entire data file.
+ * </p>
+ *
+ * <p>
+ * One small format difference exists. Update files must also be able to express deletes.
+ * This is done by ending with a blank preference value, as in "123,456,".
+ * </p>
+ *
+ * <p>
+ * Note that it's all-or-nothing -- all of the items in the file must express no preference, or all of them must.
+ * These cannot be mixed. Put another way there will always be the same number of delimiters on every line of
+ * the file!
+ * </p>
+ *
+ * <p>
+ * This class is not intended for use with very large amounts of data (over, say, tens of millions of rows).
+ * For that, a JDBC-backed {@link DataModel} and a database are more appropriate.
+ * </p>
+ *
+ * <p>
+ * It is possible and likely useful to subclass this class and customize its behavior to accommodate
+ * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap, FastByIDMap, boolean)} and
+ * {@link #processLineWithoutID(String, FastByIDMap, FastByIDMap)}
+ */
+public class FileDataModel extends AbstractDataModel {
+
+  private static final Logger log = LoggerFactory.getLogger(FileDataModel.class);
+
+  public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
+  private static final char COMMENT_CHAR = '#';
+  private static final char[] DELIMIETERS = {',', '\t'};
+
+  private final File dataFile;
+  private long lastModified;
+  private long lastUpdateFileModified;
+  private final transient Splitter delimiterPattern;
+  private final boolean hasPrefValues;
+  private DataModel delegate;
+  private final ReentrantLock reloadLock;
+  private final boolean transpose;
+  private final long minReloadIntervalMS;
+
+  /**
+   * Creates a FileDataModel with the default minimum reload interval.
+   *
+   * @param dataFile
+   *          file containing preferences data. If file is compressed (and name ends in .gz or .zip
+   *          accordingly) it will be decompressed as it is read
+   * @throws FileNotFoundException
+   *           if dataFile does not exist
+   * @throws IOException
+   *           if file can't be read
+   */
+  public FileDataModel(File dataFile) throws IOException {
+    this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS);
+  }
+  
+  /**
+   * @param delimiterRegex if your data file does not use '\t' or ',' as its delimiter, you can
+   *          specify a custom regex pattern
+   * @see #FileDataModel(File)
+   */
+  public FileDataModel(File dataFile, String delimiterRegex) throws IOException {
+    this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS, delimiterRegex);
+  }
+  
+  /**
+   * @param transpose
+   *          transposes user IDs and item IDs -- convenient for 'flipping' the data model this way
+   * @param minReloadIntervalMS
+   *  the minimum interval in milliseconds after which a full reload of the original datafile is done
+   *  when refresh() is called
+   * @see #FileDataModel(File)
+   */
+  public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS) throws IOException {
+    this(dataFile, transpose, minReloadIntervalMS, null);
+  }
+  
+  /**
+   * @param transpose
+   *          transposes user IDs and item IDs -- convenient for 'flipping' the data model this way
+   * @param minReloadIntervalMS
+   *          the minimum interval in milliseconds after which a full reload of the original datafile
+   *          is done when refresh() is called
+   * @param delimiterRegex if your data file does not use '\t' or ',' as its delimiter, you can
+   *          specify your own delimiter as a regex pattern; may be null to auto-detect
+   * @throws IOException if the file can't be read
+   */
+  public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS, String delimiterRegex)
+    throws IOException {
+
+    this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile());
+    if (!dataFile.exists() || dataFile.isDirectory()) {
+      throw new FileNotFoundException(dataFile.toString());
+    }
+    Preconditions.checkArgument(dataFile.length() > 0L, "dataFile is empty");
+    Preconditions.checkArgument(minReloadIntervalMS >= 0L, "minReloadIntervalMs must be non-negative");
+
+    log.info("Creating FileDataModel for file {}", dataFile);
+
+    this.lastModified = dataFile.lastModified();
+    this.lastUpdateFileModified = readLastUpdateFileModified();
+
+    // Peek at the first non-blank, non-comment line to sniff the delimiter and whether
+    // preference values are present. Close the iterator in a finally block so the
+    // underlying file handle is not leaked if peeking throws.
+    String firstLine;
+    FileLineIterator iterator = new FileLineIterator(dataFile, false);
+    try {
+      firstLine = iterator.peek();
+      while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) {
+        iterator.next();
+        firstLine = iterator.peek();
+      }
+    } finally {
+      Closeables.close(iterator, true);
+    }
+
+    char delimiter;
+    if (delimiterRegex == null) {
+      delimiter = determineDelimiter(firstLine);
+      delimiterPattern = Splitter.on(delimiter);
+    } else {
+      delimiter = '\0';
+      delimiterPattern = Splitter.onPattern(delimiterRegex);
+      if (!delimiterPattern.split(firstLine).iterator().hasNext()) {
+        throw new IllegalArgumentException("Did not find a delimiter(pattern) in first line");
+      }
+    }
+    List<String> firstLineSplit = new ArrayList<>();
+    for (String token : delimiterPattern.split(firstLine)) {
+      firstLineSplit.add(token);
+    }
+    // If preference value exists and isn't empty then the file is specifying pref values
+    hasPrefValues = firstLineSplit.size() >= 3 && !firstLineSplit.get(2).isEmpty();
+
+    this.reloadLock = new ReentrantLock();
+    this.transpose = transpose;
+    this.minReloadIntervalMS = minReloadIntervalMS;
+
+    reload();
+  }
+
+  /**
+   * @return the underlying data file this model reads from
+   */
+  public File getDataFile() {
+    return dataFile;
+  }
+
+  /**
+   * Rebuilds the delegate model from the data file. Uses a non-blocking tryLock so that
+   * if another thread is already reloading, this call returns immediately rather than
+   * reloading a second time.
+   */
+  protected void reload() {
+    if (reloadLock.tryLock()) {
+      try {
+        delegate = buildModel();
+      } catch (IOException ioe) {
+        log.warn("Exception while reloading", ioe);
+      } finally {
+        reloadLock.unlock();
+      }
+    }
+  }
+
+  /**
+   * Builds (or incrementally updates) the in-memory delegate model from the main data file
+   * plus any newer update "delta" files found next to it.
+   *
+   * <p>A full reload of the main file happens when no delegate exists yet or its last-modified
+   * time has advanced past the configured minimum reload interval; otherwise only update files
+   * newer than the last load are merged into the existing raw data.</p>
+   *
+   * @return a {@link GenericDataModel} when the file carries preference values, otherwise a
+   *         {@link GenericBooleanPrefDataModel}
+   * @throws IOException if any file can't be read
+   */
+  protected DataModel buildModel() throws IOException {
+
+    long newLastModified = dataFile.lastModified();
+    long newLastUpdateFileModified = readLastUpdateFileModified();
+
+    boolean loadFreshData = delegate == null || newLastModified > lastModified + minReloadIntervalMS;
+
+    // Fixed misspelled local name ("...Modifieid").
+    long oldLastUpdateFileModified = lastUpdateFileModified;
+    lastModified = newLastModified;
+    lastUpdateFileModified = newLastUpdateFileModified;
+
+    FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();
+
+    if (hasPrefValues) {
+
+      if (loadFreshData) {
+
+        // Full reload: parse the main file, then layer every update file on top.
+        FastByIDMap<Collection<Preference>> data = new FastByIDMap<>();
+        FileLineIterator iterator = new FileLineIterator(dataFile, false);
+        processFile(iterator, data, timestamps, false);
+
+        for (File updateFile : findUpdateFilesAfter(newLastModified)) {
+          processFile(new FileLineIterator(updateFile, false), data, timestamps, false);
+        }
+
+        return new GenericDataModel(GenericDataModel.toDataMap(data, true), timestamps);
+
+      } else {
+
+        // Incremental: merge only update files newer than what was already loaded.
+        FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();
+
+        for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModified, newLastModified))) {
+          processFile(new FileLineIterator(updateFile, false), rawData, timestamps, true);
+        }
+
+        return new GenericDataModel(rawData, timestamps);
+
+      }
+
+    } else {
+
+      // No preference values in the file: boolean-preference variant of the same logic.
+      if (loadFreshData) {
+
+        FastByIDMap<FastIDSet> data = new FastByIDMap<>();
+        FileLineIterator iterator = new FileLineIterator(dataFile, false);
+        processFileWithoutID(iterator, data, timestamps);
+
+        for (File updateFile : findUpdateFilesAfter(newLastModified)) {
+          processFileWithoutID(new FileLineIterator(updateFile, false), data, timestamps);
+        }
+
+        return new GenericBooleanPrefDataModel(data, timestamps);
+
+      } else {
+
+        FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData();
+
+        for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModified, newLastModified))) {
+          processFileWithoutID(new FileLineIterator(updateFile, false), rawData, timestamps);
+        }
+
+        return new GenericBooleanPrefDataModel(rawData, timestamps);
+
+      }
+
+    }
+  }
+
+  /**
+   * Finds update delta files in the same directory as the data file. This finds any file whose name starts
+   * the same way as the data file (up to first period) but isn't the data file itself. For example, if the
+   * data file is /foo/data.txt.gz, you might place update files at /foo/data.1.txt.gz, /foo/data.2.txt.gz,
+   * etc. Results are ordered by ascending last-modified time.
+   *
+   * @param minimumLastModified only files modified at or after this time are returned
+   */
+  private Iterable<File> findUpdateFilesAfter(long minimumLastModified) {
+    String dataFileName = dataFile.getName();
+    int period = dataFileName.indexOf('.');
+    String startName = period < 0 ? dataFileName : dataFileName.substring(0, period);
+    File parentDir = dataFile.getParentFile();
+    FileFilter onlyFiles = new FileFilter() {
+      @Override
+      public boolean accept(File file) {
+        return !file.isDirectory();
+      }
+    };
+    File[] candidates = parentDir.listFiles(onlyFiles);
+    List<File> result = new ArrayList<>();
+    if (candidates == null) {
+      // listFiles() returns null on I/O error or if the directory vanished; treat as "no updates"
+      return result;
+    }
+    // Group by modification time so two update files sharing a timestamp are both kept;
+    // the previous TreeMap<Long,File> keyed on the time silently dropped one of them.
+    Map<Long,List<File>> modTimeToUpdateFiles = new TreeMap<>();
+    for (File updateFile : candidates) {
+      String updateFileName = updateFile.getName();
+      if (updateFileName.startsWith(startName)
+          && !updateFileName.equals(dataFileName)
+          && updateFile.lastModified() >= minimumLastModified) {
+        Long modTime = updateFile.lastModified();
+        List<File> group = modTimeToUpdateFiles.get(modTime);
+        if (group == null) {
+          group = new ArrayList<>();
+          modTimeToUpdateFiles.put(modTime, group);
+        }
+        group.add(updateFile);
+      }
+    }
+    for (List<File> group : modTimeToUpdateFiles.values()) {
+      result.addAll(group);
+    }
+    return result;
+  }
+
+  /**
+   * @return the newest lastModified time among all update files, or {@link Long#MIN_VALUE}
+   *   when no update files exist
+   */
+  private long readLastUpdateFileModified() {
+    long mostRecentModification = Long.MIN_VALUE;
+    for (File updateFile : findUpdateFilesAfter(0L)) {
+      mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified());
+    }
+    return mostRecentModification;
+  }
+
+  /**
+   * Scans {@code line} for the first known delimiter character (from the {@code DELIMIETERS}
+   * constant) and returns it.
+   *
+   * @param line first non-comment line of the data file
+   * @return the first known delimiter found in the line
+   * @throws IllegalArgumentException if no known delimiter occurs in the line
+   */
+  public static char determineDelimiter(String line) {
+    // Local spelling fixed; the DELIMIETERS field is declared elsewhere in this class and
+    // keeps its historical (misspelled) name for compatibility.
+    for (char possibleDelimiter : DELIMIETERS) {
+      if (line.indexOf(possibleDelimiter) >= 0) {
+        return possibleDelimiter;
+      }
+    }
+    throw new IllegalArgumentException("Did not find a delimiter in first line");
+  }
+
+  /**
+   * Reads every non-empty line from the iterator and feeds it to
+   * {@link #processLine(String, FastByIDMap, FastByIDMap, boolean)}, logging progress
+   * every million lines.
+   */
+  protected void processFile(FileLineIterator dataOrUpdateFileIterator,
+                             FastByIDMap<?> data,
+                             FastByIDMap<FastByIDMap<Long>> timestamps,
+                             boolean fromPriorData) {
+    log.info("Reading file info...");
+    int count = 0;
+    while (dataOrUpdateFileIterator.hasNext()) {
+      String line = dataOrUpdateFileIterator.next();
+      if (line.isEmpty()) {
+        continue;
+      }
+      processLine(line, data, timestamps, fromPriorData);
+      count++;
+      if (count % 1000000 == 0) {
+        log.info("Processed {} lines", count);
+      }
+    }
+    log.info("Read lines: {}", count);
+  }
+
+  /**
+   * <p>
+   * Reads one line from the input file and adds the data to a {@link FastByIDMap} data structure which maps user IDs
+   * to preferences. This assumes that each line of the input file corresponds to one preference. After
+   * reading a line and determining which user and item the preference pertains to, the method should look to
+   * see if the data contains a mapping for the user ID already, and if not, add an empty data structure of preferences
+   * as appropriate to the data.
+   * </p>
+   *
+   * <p>
+   * Note that if the line is empty or begins with '#' it will be ignored as a comment.
+   * </p>
+   *
+   * @param line
+   *          line from input data file
+   * @param data
+   *          all data read so far, as a mapping from user IDs to preferences
+   * @param timestamps
+   *          all timestamps read so far, as user ID to (item ID to timestamp); updated in place
+   * @param fromPriorData an implementation detail -- if true, data will map IDs to
+   *  {@link PreferenceArray} since the framework is attempting to read and update raw
+   *  data that is already in memory. Otherwise it maps to {@link Collection}s of
+   *  {@link Preference}s, since it's reading fresh data. Subclasses must be prepared
+   *  to handle this wrinkle.
+   */
+  protected void processLine(String line,
+                             FastByIDMap<?> data, 
+                             FastByIDMap<FastByIDMap<Long>> timestamps,
+                             boolean fromPriorData) {
+
+    // Ignore empty lines and comments
+    if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) {
+      return;
+    }
+
+    // NOTE(review): assumes at least three tokens per line; a shorter line makes
+    // Iterator.next() throw NoSuchElementException -- confirm callers pre-validate format.
+    Iterator<String> tokens = delimiterPattern.split(line).iterator();
+    String userIDString = tokens.next();
+    String itemIDString = tokens.next();
+    String preferenceValueString = tokens.next();
+    boolean hasTimestamp = tokens.hasNext();
+    String timestampString = hasTimestamp ? tokens.next() : null;
+
+    long userID = readUserIDFromString(userIDString);
+    long itemID = readItemIDFromString(itemIDString);
+
+    // Optionally swap roles so the model is built with items as "users" and vice versa
+    if (transpose) {
+      long tmp = userID;
+      userID = itemID;
+      itemID = tmp;
+    }
+
+    // This is kind of gross but need to handle two types of storage
+    Object maybePrefs = data.get(userID);
+    if (fromPriorData) {
+      // Data are PreferenceArray
+
+      PreferenceArray prefs = (PreferenceArray) maybePrefs;
+      if (!hasTimestamp && preferenceValueString.isEmpty()) {
+        // Then line is of form "userID,itemID,", meaning remove
+        if (prefs != null) {
+          boolean exists = false;
+          int length = prefs.length();
+          for (int i = 0; i < length; i++) {
+            if (prefs.getItemID(i) == itemID) {
+              exists = true;
+              break;
+            }
+          }
+          if (exists) {
+            if (length == 1) {
+              // Last remaining preference for this user: drop the user entry entirely
+              data.remove(userID);
+            } else {
+              // Copy all but the removed item into a new, one-smaller array
+              PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1);
+              for (int i = 0, j = 0; i < length; i++, j++) {
+                if (prefs.getItemID(i) == itemID) {
+                  j--;
+                } else {
+                  newPrefs.set(j, prefs.get(i));
+                }
+              }
+              // Unchecked cast by design: fromPriorData == true implies PreferenceArray values
+              ((FastByIDMap<PreferenceArray>) data).put(userID, newPrefs);
+            }
+          }
+        }
+
+        removeTimestamp(userID, itemID, timestamps);
+
+      } else {
+
+        float preferenceValue = Float.parseFloat(preferenceValueString);
+
+        // Update the value in place when the (user, item) pair already exists
+        boolean exists = false;
+        if (prefs != null) {
+          for (int i = 0; i < prefs.length(); i++) {
+            if (prefs.getItemID(i) == itemID) {
+              exists = true;
+              prefs.setValue(i, preferenceValue);
+              break;
+            }
+          }
+        }
+
+        if (!exists) {
+          if (prefs == null) {
+            prefs = new GenericUserPreferenceArray(1);
+          } else {
+            // Grow by one, copying old entries into slots 1..n and leaving slot 0 for the new pref
+            PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1);
+            for (int i = 0, j = 1; i < prefs.length(); i++, j++) {
+              newPrefs.set(j, prefs.get(i));
+            }
+            prefs = newPrefs;
+          }
+          prefs.setUserID(0, userID);
+          prefs.setItemID(0, itemID);
+          prefs.setValue(0, preferenceValue);
+          ((FastByIDMap<PreferenceArray>) data).put(userID, prefs);          
+        }
+      }
+
+      addTimestamp(userID, itemID, timestampString, timestamps);
+
+    } else {
+      // Data are Collection<Preference>
+
+      Collection<Preference> prefs = (Collection<Preference>) maybePrefs;
+
+      if (!hasTimestamp && preferenceValueString.isEmpty()) {
+        // Then line is of form "userID,itemID,", meaning remove
+        if (prefs != null) {
+          // remove pref
+          Iterator<Preference> prefsIterator = prefs.iterator();
+          while (prefsIterator.hasNext()) {
+            Preference pref = prefsIterator.next();
+            if (pref.getItemID() == itemID) {
+              prefsIterator.remove();
+              break;
+            }
+          }
+        }
+
+        removeTimestamp(userID, itemID, timestamps);
+        
+      } else {
+
+        float preferenceValue = Float.parseFloat(preferenceValueString);
+
+        // Update the value in place when the (user, item) pair already exists
+        boolean exists = false;
+        if (prefs != null) {
+          for (Preference pref : prefs) {
+            if (pref.getItemID() == itemID) {
+              exists = true;
+              pref.setValue(preferenceValue);
+              break;
+            }
+          }
+        }
+
+        if (!exists) {
+          if (prefs == null) {
+            prefs = new ArrayList<>(2);
+            ((FastByIDMap<Collection<Preference>>) data).put(userID, prefs);
+          }
+          prefs.add(new GenericPreference(userID, itemID, preferenceValue));
+        }
+
+        addTimestamp(userID, itemID, timestampString, timestamps);
+
+      }
+
+    }
+  }
+
+  /**
+   * Reads every non-empty line from the iterator and feeds it to
+   * {@link #processLineWithoutID(String, FastByIDMap, FastByIDMap)}, accumulating boolean
+   * (pref-value-less) preference data.
+   *
+   * @param dataOrUpdateFileIterator lines of the main data file or of an update file
+   * @param data user ID to set of item IDs, updated in place
+   * @param timestamps user ID to (item ID to timestamp), updated in place
+   */
+  protected void processFileWithoutID(FileLineIterator dataOrUpdateFileIterator,
+                                      FastByIDMap<FastIDSet> data,
+                                      FastByIDMap<FastByIDMap<Long>> timestamps) {
+    log.info("Reading file info...");
+    int count = 0;
+    while (dataOrUpdateFileIterator.hasNext()) {
+      String line = dataOrUpdateFileIterator.next();
+      if (!line.isEmpty()) {
+        processLineWithoutID(line, data, timestamps);
+        // Log every 1,000,000 lines, consistent with processFile() (was 100,000 here)
+        if (++count % 1000000 == 0) {
+          log.info("Processed {} lines", count);
+        }
+      }
+    }
+    log.info("Read lines: {}", count);
+  }
+
+  /**
+   * Like {@link #processLine(String, FastByIDMap, FastByIDMap, boolean)} but for boolean
+   * (pref-value-less) data: records only that the user is associated with the item. A line of
+   * the form "userID,itemID," (explicit empty preference value, no timestamp) removes the
+   * association instead. Empty lines and lines starting with '#' are ignored.
+   *
+   * @param line line from input data file
+   * @param data user ID to set of item IDs, updated in place
+   * @param timestamps user ID to (item ID to timestamp), updated in place
+   */
+  protected void processLineWithoutID(String line,
+                                      FastByIDMap<FastIDSet> data,
+                                      FastByIDMap<FastByIDMap<Long>> timestamps) {
+
+    // Ignore empty lines and comments
+    if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) {
+      return;
+    }
+
+    Iterator<String> tokens = delimiterPattern.split(line).iterator();
+    String userIDString = tokens.next();
+    String itemIDString = tokens.next();
+    boolean hasPreference = tokens.hasNext();
+    String preferenceValueString = hasPreference ? tokens.next() : "";
+    boolean hasTimestamp = tokens.hasNext();
+    String timestampString = hasTimestamp ? tokens.next() : null;
+
+    long userID = readUserIDFromString(userIDString);
+    long itemID = readItemIDFromString(itemIDString);
+
+    // Optionally swap roles so the model is built with items as "users" and vice versa
+    if (transpose) {
+      long tmp = userID;
+      userID = itemID;
+      itemID = tmp;
+    }
+
+    if (hasPreference && !hasTimestamp && preferenceValueString.isEmpty()) {
+      // Then line is of form "userID,itemID,", meaning remove
+
+      FastIDSet itemIDs = data.get(userID);
+      if (itemIDs != null) {
+        itemIDs.remove(itemID);
+      }
+
+      removeTimestamp(userID, itemID, timestamps);
+
+    } else {
+
+      // Record the association, creating the user's item set on first sight
+      FastIDSet itemIDs = data.get(userID);
+      if (itemIDs == null) {
+        itemIDs = new FastIDSet(2);
+        data.put(userID, itemIDs);
+      }
+      itemIDs.add(itemID);
+
+      addTimestamp(userID, itemID, timestampString, timestamps);
+
+    }
+  }
+
+  /**
+   * Records the parsed timestamp for a (user, item) pair; a null token means the line
+   * carried no timestamp and nothing is recorded.
+   */
+  private void addTimestamp(long userID,
+                            long itemID,
+                            String timestampString,
+                            FastByIDMap<FastByIDMap<Long>> timestamps) {
+    if (timestampString == null) {
+      return;
+    }
+    FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
+    if (itemTimestamps == null) {
+      itemTimestamps = new FastByIDMap<>();
+      timestamps.put(userID, itemTimestamps);
+    }
+    itemTimestamps.put(itemID, readTimestampFromString(timestampString));
+  }
+
+  /**
+   * Drops the recorded timestamp for a (user, item) pair, if the user has any timestamps at all.
+   */
+  private static void removeTimestamp(long userID,
+                                      long itemID,
+                                      FastByIDMap<FastByIDMap<Long>> timestamps) {
+    FastByIDMap<Long> userTimestamps = timestamps.get(userID);
+    if (userTimestamps == null) {
+      return;
+    }
+    userTimestamps.remove(itemID);
+  }
+
+  /**
+   * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by
+   * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
+   * translation.
+   *
+   * @param value token from the user-ID column
+   * @return numeric user ID; by default parsed with {@link Long#parseLong(String)}
+   */
+  protected long readUserIDFromString(String value) {
+    return Long.parseLong(value);
+  }
+
+  /**
+   * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by
+   * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
+   * translation.
+   *
+   * @param value token from the item-ID column
+   * @return numeric item ID; by default parsed with {@link Long#parseLong(String)}
+   */
+  protected long readItemIDFromString(String value) {
+    return Long.parseLong(value);
+  }
+
+  /**
+   * Subclasses may wish to override this to change how time values in the input file are parsed.
+   * By default they are expected to be numeric, expressing a time as milliseconds since the epoch.
+   *
+   * @param value token from the timestamp column
+   * @return timestamp as milliseconds since the epoch
+   */
+  protected long readTimestampFromString(String value) {
+    return Long.parseLong(value);
+  }
+
+  // --- Pure delegation: all reads are served by the in-memory delegate built in buildModel() ---
+
+  @Override
+  public LongPrimitiveIterator getUserIDs() throws TasteException {
+    return delegate.getUserIDs();
+  }
+
+  @Override
+  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+    return delegate.getPreferencesFromUser(userID);
+  }
+
+  @Override
+  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+    return delegate.getItemIDsFromUser(userID);
+  }
+
+  @Override
+  public LongPrimitiveIterator getItemIDs() throws TasteException {
+    return delegate.getItemIDs();
+  }
+
+  @Override
+  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+    return delegate.getPreferencesForItem(itemID);
+  }
+
+  @Override
+  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+    return delegate.getPreferenceValue(userID, itemID);
+  }
+
+  @Override
+  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+    return delegate.getPreferenceTime(userID, itemID);
+  }
+
+  @Override
+  public int getNumItems() throws TasteException {
+    return delegate.getNumItems();
+  }
+
+  @Override
+  public int getNumUsers() throws TasteException {
+    return delegate.getNumUsers();
+  }
+
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+    return delegate.getNumUsersWithPreferenceFor(itemID);
+  }
+
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+    return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
+  }
+
+  /**
+   * Note that this method only updates the in-memory preference data that this {@link FileDataModel}
+   * maintains; it does not modify any data on disk. Therefore any updates from this method are only
+   * temporary, and lost when data is reloaded from a file. This method should also be considered relatively
+   * slow.
+   */
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    delegate.setPreference(userID, itemID, value);
+  }
+
+  /** See the warning at {@link #setPreference(long, long, float)}: in-memory only, lost on reload. */
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    delegate.removePreference(userID, itemID);
+  }
+
+  /**
+   * Triggers a {@link #reload()} when either the main data file or the newest update file has
+   * been modified more than {@code minReloadIntervalMS} after what was seen at the last load.
+   */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    if (dataFile.lastModified() > lastModified + minReloadIntervalMS
+        || readLastUpdateFileModified() > lastUpdateFileModified + minReloadIntervalMS) {
+      log.debug("File has changed; reloading...");
+      reload();
+    }
+  }
+
+  // Delegated: whether the model carries explicit preference values (decided at construction)
+  @Override
+  public boolean hasPreferenceValues() {
+    return delegate.hasPreferenceValues();
+  }
+
+  @Override
+  public float getMaxPreference() {
+    return delegate.getMaxPreference();
+  }
+
+  @Override
+  public float getMinPreference() {
+    return delegate.getMinPreference();
+  }
+
+  @Override
+  public String toString() {
+    return "FileDataModel[dataFile:" + dataFile + ']';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java
new file mode 100644
index 0000000..1bcb4ef
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.file;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.concurrent.locks.ReentrantLock;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.model.AbstractIDMigrator;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An {@link org.apache.mahout.cf.taste.model.IDMigrator} backed by a file.
+ * This class typically expects a file where each line
+ * contains a single stringID to be stored in this migrator.
+ * </p>
+ *
+ * <p>
+ * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
+ * has been reloaded very recently already.
+ * </p>
+ */
+public class FileIDMigrator extends AbstractIDMigrator {
+
+  /** Minimum time between reloads triggered by {@link #refresh(Collection)}: one minute. */
+  public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L;
+
+  private final File dataFile;
+  private FastByIDMap<String> longToString;
+  private final ReentrantLock reloadLock;
+
+  private long lastModified;
+  private final long minReloadIntervalMS;
+
+  private static final Logger log = LoggerFactory.getLogger(FileIDMigrator.class);
+
+  public FileIDMigrator(File dataFile) throws FileNotFoundException {
+    this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS);
+  }
+
+  /**
+   * @param dataFile file containing one string ID per line
+   * @param minReloadIntervalMS minimum interval before {@link #refresh(Collection)} re-reads the file
+   * @throws FileNotFoundException if {@code dataFile} does not exist or is a directory
+   */
+  public FileIDMigrator(File dataFile, long minReloadIntervalMS) throws FileNotFoundException {
+    longToString = new FastByIDMap<>(100);
+    this.dataFile = Preconditions.checkNotNull(dataFile);
+    if (!dataFile.exists() || dataFile.isDirectory()) {
+      throw new FileNotFoundException(dataFile.toString());
+    }
+
+    // Fixed: message previously referenced a stale class name, "FileReadonlyIDMigrator"
+    log.info("Creating FileIDMigrator for file {}", dataFile);
+
+    this.reloadLock = new ReentrantLock();
+    this.lastModified = dataFile.lastModified();
+    this.minReloadIntervalMS = minReloadIntervalMS;
+
+    reload();
+  }
+
+  @Override
+  public String toStringID(long longID) {
+    return longToString.get(longID);
+  }
+
+  /**
+   * Rebuilds the long-to-string mapping. If another thread already holds the reload lock, this
+   * returns immediately and keeps the current mapping. A read failure is rethrown as
+   * {@link IllegalStateException}.
+   */
+  private void reload() {
+    if (reloadLock.tryLock()) {
+      try {
+        longToString = buildMapping();
+      } catch (IOException ioe) {
+        throw new IllegalStateException(ioe);
+      } finally {
+        reloadLock.unlock();
+      }
+    }
+  }
+
+  private FastByIDMap<String> buildMapping() throws IOException {
+    FastByIDMap<String> mapping = new FastByIDMap<>();
+    for (String line : new FileLineIterable(dataFile)) {
+      mapping.put(toLongID(line), line);
+    }
+    // Record the file state we just read so refresh() measures staleness against it
+    lastModified = dataFile.lastModified();
+    return mapping;
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    if (dataFile.lastModified() > lastModified + minReloadIntervalMS) {
+      log.debug("File has changed; reloading...");
+      reload();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "FileIDMigrator[dataFile:" + dataFile + ']';
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
new file mode 100644
index 0000000..8d33f60
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Contains methods and resources useful to all classes in this package.
+ * </p>
+ */
+abstract class AbstractUserNeighborhood implements UserNeighborhood {
+  
+  private final UserSimilarity userSimilarity;
+  private final DataModel dataModel;
+  private final double samplingRate;
+  private final RefreshHelper refreshHelper;
+  
+  AbstractUserNeighborhood(UserSimilarity userSimilarity, DataModel dataModel, double samplingRate) {
+    Preconditions.checkArgument(userSimilarity != null, "userSimilarity is null");
+    Preconditions.checkArgument(dataModel != null, "dataModel is null");
+    Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, "samplingRate must be in (0,1]");
+    this.userSimilarity = userSimilarity;
+    this.dataModel = dataModel;
+    this.samplingRate = samplingRate;
+    this.refreshHelper = new RefreshHelper(null);
+    this.refreshHelper.addDependency(this.dataModel);
+    this.refreshHelper.addDependency(this.userSimilarity);
+  }
+  
+  final UserSimilarity getUserSimilarity() {
+    return userSimilarity;
+  }
+  
+  final DataModel getDataModel() {
+    return dataModel;
+  }
+  
+  final double getSamplingRate() {
+    return samplingRate;
+  }
+  
+  @Override
+  public final void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
new file mode 100644
index 0000000..998e476
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+
+import com.google.common.base.Preconditions;
+
+/** A caching wrapper around an underlying {@link UserNeighborhood} implementation. */
+public final class CachingUserNeighborhood implements UserNeighborhood {
+
+  private final UserNeighborhood neighborhood;
+  private final Cache<Long,long[]> neighborhoodCache;
+
+  public CachingUserNeighborhood(UserNeighborhood neighborhood, DataModel dataModel) throws TasteException {
+    Preconditions.checkArgument(neighborhood != null, "neighborhood is null");
+    this.neighborhood = neighborhood;
+    // Size the cache by user count: at most one cached neighborhood per known user
+    int cacheSize = dataModel.getNumUsers();
+    this.neighborhoodCache = new Cache<>(new NeighborhoodRetriever(neighborhood), cacheSize);
+  }
+
+  @Override
+  public long[] getUserNeighborhood(long userID) throws TasteException {
+    // Computed on a miss via NeighborhoodRetriever, then served from the cache
+    return neighborhoodCache.get(userID);
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    // Drop every cached neighborhood, then refresh the delegate at most once per component
+    neighborhoodCache.clear();
+    RefreshHelper.maybeRefresh(RefreshHelper.buildRefreshed(alreadyRefreshed), neighborhood);
+  }
+
+  /** Adapts the wrapped neighborhood to the {@link Retriever} contract used by {@link Cache}. */
+  private static final class NeighborhoodRetriever implements Retriever<Long,long[]> {
+
+    private final UserNeighborhood neighborhood;
+
+    private NeighborhoodRetriever(UserNeighborhood neighborhood) {
+      this.neighborhood = neighborhood;
+    }
+
+    @Override
+    public long[] get(Long key) throws TasteException {
+      return neighborhood.getUserNeighborhood(key);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
new file mode 100644
index 0000000..7f3a98a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Computes a neighborhood consisting of the nearest n users to a given user. "Nearest" is defined by the
+ * given {@link UserSimilarity}.
+ * </p>
+ */
+public final class NearestNUserNeighborhood extends AbstractUserNeighborhood {
+  
+  private final int n;
+  private final double minSimilarity;
+  
+  /**
+   * @param n neighborhood size; capped at the number of users in the data model
+   * @throws IllegalArgumentException
+   *           if {@code n < 1}, or userSimilarity or dataModel are {@code null}
+   */
+  public NearestNUserNeighborhood(int n, UserSimilarity userSimilarity, DataModel dataModel) throws TasteException {
+    this(n, Double.NEGATIVE_INFINITY, userSimilarity, dataModel, 1.0);
+  }
+  
+  /**
+   * @param n neighborhood size; capped at the number of users in the data model
+   * @param minSimilarity minimal similarity required for neighbors
+   * @throws IllegalArgumentException
+   *           if {@code n < 1}, or userSimilarity or dataModel are {@code null}
+   */
+  public NearestNUserNeighborhood(int n,
+                                  double minSimilarity,
+                                  UserSimilarity userSimilarity,
+                                  DataModel dataModel) throws TasteException {
+    this(n, minSimilarity, userSimilarity, dataModel, 1.0);
+  }
+  
+  /**
+   * @param n neighborhood size; capped at the number of users in the data model
+   * @param minSimilarity minimal similarity required for neighbors
+   * @param samplingRate percentage of users to consider when building neighborhood -- decrease to trade quality for
+   *   performance
+   * @throws IllegalArgumentException
+   *           if {@code n < 1} or samplingRate is NaN or not in (0,1], or userSimilarity or dataModel are
+   *           {@code null}
+   */
+  public NearestNUserNeighborhood(int n,
+                                  double minSimilarity,
+                                  UserSimilarity userSimilarity,
+                                  DataModel dataModel,
+                                  double samplingRate) throws TasteException {
+    super(userSimilarity, dataModel, samplingRate);
+    Preconditions.checkArgument(n >= 1, "n must be at least 1");
+    int numUsers = dataModel.getNumUsers();
+    this.n = n > numUsers ? numUsers : n;
+    this.minSimilarity = minSimilarity;
+  }
+  
+  @Override
+  public long[] getUserNeighborhood(long userID) throws TasteException {
+    
+    DataModel dataModel = getDataModel();
+    UserSimilarity userSimilarityImpl = getUserSimilarity();
+    
+    TopItems.Estimator<Long> estimator = new Estimator(userSimilarityImpl, userID, minSimilarity);
+    
+    LongPrimitiveIterator userIDs = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel.getUserIDs(),
+      getSamplingRate());
+    
+    return TopItems.getTopUsers(n, userIDs, null, estimator);
+  }
+  
+  @Override
+  public String toString() {
+    return "NearestNUserNeighborhood";
+  }
+  
+  private static final class Estimator implements TopItems.Estimator<Long> {
+    private final UserSimilarity userSimilarityImpl;
+    private final long theUserID;
+    private final double minSim;
+    
+    private Estimator(UserSimilarity userSimilarityImpl, long theUserID, double minSim) {
+      this.userSimilarityImpl = userSimilarityImpl;
+      this.theUserID = theUserID;
+      this.minSim = minSim;
+    }
+    
+    @Override
+    public double estimate(Long userID) throws TasteException {
+      if (userID == theUserID) {
+        return Double.NaN;
+      }
+      double sim = userSimilarityImpl.userSimilarity(theUserID, userID);
+      return sim >= minSim ? sim : Double.NaN;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java
new file mode 100644
index 0000000..d5246e4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Computes a neighborhood consisting of all users whose similarity to the given user meets or exceeds a
+ * certain threshold. Similarity is defined by the given {@link UserSimilarity}.
+ * </p>
+ */
+public final class ThresholdUserNeighborhood extends AbstractUserNeighborhood {
+  
+  private final double threshold;
+  
+  /**
+   * @param threshold
+   *          similarity threshold
+   * @param userSimilarity
+   *          similarity metric
+   * @param dataModel
+   *          data model
+   * @throws IllegalArgumentException
+   *           if threshold is {@link Double#NaN}, or if samplingRate is not positive and less than or equal
+   *           to 1.0, or if userSimilarity or dataModel are {@code null}
+   */
+  public ThresholdUserNeighborhood(double threshold, UserSimilarity userSimilarity, DataModel dataModel) {
+    this(threshold, userSimilarity, dataModel, 1.0);
+  }
+  
+  /**
+   * @param threshold
+   *          similarity threshold
+   * @param userSimilarity
+   *          similarity metric
+   * @param dataModel
+   *          data model
+   * @param samplingRate
+   *          percentage of users to consider when building neighborhood -- decrease to trade quality for
+   *          performance
+   * @throws IllegalArgumentException
+   *           if threshold or samplingRate is {@link Double#NaN}, or if samplingRate is not positive and less
+   *           than or equal to 1.0, or if userSimilarity or dataModel are {@code null}
+   */
+  public ThresholdUserNeighborhood(double threshold,
+                                   UserSimilarity userSimilarity,
+                                   DataModel dataModel,
+                                   double samplingRate) {
+    super(userSimilarity, dataModel, samplingRate);
+    Preconditions.checkArgument(!Double.isNaN(threshold), "threshold must not be NaN");
+    this.threshold = threshold;
+  }
+  
+  @Override
+  public long[] getUserNeighborhood(long userID) throws TasteException {
+    
+    DataModel dataModel = getDataModel();
+    FastIDSet neighborhood = new FastIDSet();
+    LongPrimitiveIterator usersIterable = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel
+        .getUserIDs(), getSamplingRate());
+    UserSimilarity userSimilarityImpl = getUserSimilarity();
+    
+    while (usersIterable.hasNext()) {
+      long otherUserID = usersIterable.next();
+      if (userID != otherUserID) {
+        double theSimilarity = userSimilarityImpl.userSimilarity(userID, otherUserID);
+        if (!Double.isNaN(theSimilarity) && theSimilarity >= threshold) {
+          neighborhood.add(otherUserID);
+        }
+      }
+    }
+    
+    return neighborhood.toArray();
+  }
+  
+  @Override
+  public String toString() {
+    return "ThresholdUserNeighborhood";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java
new file mode 100644
index 0000000..d24ea6a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy;
+
+import java.util.Collection;
+
+/**
+ * Abstract base implementation for retrieving candidate items to recommend
+ */
+public abstract class AbstractCandidateItemsStrategy implements CandidateItemsStrategy,
+    MostSimilarItemsCandidateItemsStrategy {
+
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException{
+      return doGetCandidateItems(preferredItemIDs, dataModel, false);
+  }
+  
+  @Override
+  public FastIDSet getCandidateItems(long userID, PreferenceArray preferencesFromUser, DataModel dataModel,
+      boolean includeKnownItems) throws TasteException {
+    return doGetCandidateItems(preferencesFromUser.getIDs(), dataModel, includeKnownItems);
+  }
+  
+  @Override
+  public FastIDSet getCandidateItems(long[] itemIDs, DataModel dataModel)
+    throws TasteException {
+    return doGetCandidateItems(itemIDs, dataModel, false);
+  }
+     
+  protected abstract FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel,
+      boolean includeKnownItems) throws TasteException;
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {}
+}


[22/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java
new file mode 100644
index 0000000..e267a39
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java
@@ -0,0 +1,431 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Evaluate recommender by comparing order of all raw prefs with order in 
+ * recommender's output for that user. Can also compare data models.
+ */
public final class OrderBasedRecommenderEvaluator {

  private static final Logger log = LoggerFactory.getLogger(OrderBasedRecommenderEvaluator.class);

  // Utility class: only static entry points, never instantiated.
  private OrderBasedRecommenderEvaluator() {
  }

  /**
   * Compares the top-{@code samples} recommendation lists of two recommenders for every user
   * in recommender1's data model. For each user with at least two items common to both lists,
   * logs a CSV line of order statistics and adds the per-user score (square root of the mean
   * absolute rank distance) to {@code tracker}.
   *
   * @param recommender1 supplies the user IDs iterated as well as the first recommendation list
   * @param recommender2 queried for the same users
   * @param samples maximum number of recommendations per user to compare
   * @param tracker accumulates the per-user scores
   * @param tag label written into each CSV log line to identify this run
   * @throws TasteException if either recommender fails
   */
  public static void evaluate(Recommender recommender1,
                              Recommender recommender2,
                              int samples,
                              RunningAverage tracker,
                              String tag) throws TasteException {
    printHeader();
    LongPrimitiveIterator users = recommender1.getDataModel().getUserIDs();

    while (users.hasNext()) {
      long userID = users.nextLong();
      List<RecommendedItem> recs1 = recommender1.recommend(userID, samples);
      List<RecommendedItem> recs2 = recommender2.recommend(userID, samples);
      FastIDSet commonSet = new FastIDSet();
      long maxItemID = setBits(commonSet, recs1, samples);
      FastIDSet otherSet = new FastIDSet();
      maxItemID = Math.max(maxItemID, setBits(otherSet, recs2, samples));
      // Reduce commonSet to the intersection of both lists; 'max' is its size.
      int max = mask(commonSet, otherSet, maxItemID);
      max = Math.min(max, samples);
      if (max < 2) {
        // Order statistics are meaningless with fewer than two common items.
        continue;
      }
      Long[] items1 = getCommonItems(commonSet, recs1, max);
      Long[] items2 = getCommonItems(commonSet, recs2, max);
      double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2);
      tracker.addDatum(variance);
    }
  }

  /**
   * Compares a recommender's output order against the user's actual preference order
   * (sorted by value, descending) from {@code model}. Scoring is the same as in
   * {@link #evaluate(Recommender, Recommender, int, RunningAverage, String)}.
   *
   * @param recommender recommender under test; also supplies the user IDs iterated
   * @param model data model holding the reference preferences
   * @param samples maximum number of items per user to compare
   * @param tracker accumulates the per-user scores
   * @param tag label written into each CSV log line
   * @throws TasteException if the recommender or model fails
   */
  public static void evaluate(Recommender recommender,
                              DataModel model,
                              int samples,
                              RunningAverage tracker,
                              String tag) throws TasteException {
    printHeader();
    LongPrimitiveIterator users = recommender.getDataModel().getUserIDs();
    while (users.hasNext()) {
      long userID = users.nextLong();
      // Ask for every item so the recommendation order covers the whole catalog.
      List<RecommendedItem> recs1 = recommender.recommend(userID, model.getNumItems());
      PreferenceArray prefs2 = model.getPreferencesFromUser(userID);
      prefs2.sortByValueReversed();
      FastIDSet commonSet = new FastIDSet();
      long maxItemID = setBits(commonSet, recs1, samples);
      FastIDSet otherSet = new FastIDSet();
      maxItemID = Math.max(maxItemID, setBits(otherSet, prefs2, samples));
      int max = mask(commonSet, otherSet, maxItemID);
      max = Math.min(max, samples);
      if (max < 2) {
        continue;
      }
      Long[] items1 = getCommonItems(commonSet, recs1, max);
      Long[] items2 = getCommonItems(commonSet, prefs2, max);
      double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2);
      tracker.addDatum(variance);
    }
  }

  /**
   * Compares the preference orderings (sorted by value, descending) of the same users in two
   * data models. Scoring is the same as in the other {@code evaluate} overloads.
   *
   * @param model1 supplies the user IDs iterated and the first preference order
   * @param model2 supplies the second preference order
   * @param samples maximum number of items per user to compare
   * @param tracker accumulates the per-user scores
   * @param tag label written into each CSV log line
   * @throws TasteException if either model fails
   */
  public static void evaluate(DataModel model1,
                              DataModel model2,
                              int samples,
                              RunningAverage tracker,
                              String tag) throws TasteException {
    printHeader();
    LongPrimitiveIterator users = model1.getUserIDs();
    while (users.hasNext()) {
      long userID = users.nextLong();
      PreferenceArray prefs1 = model1.getPreferencesFromUser(userID);
      PreferenceArray prefs2 = model2.getPreferencesFromUser(userID);
      prefs1.sortByValueReversed();
      prefs2.sortByValueReversed();
      FastIDSet commonSet = new FastIDSet();
      long maxItemID = setBits(commonSet, prefs1, samples);
      FastIDSet otherSet = new FastIDSet();
      maxItemID = Math.max(maxItemID, setBits(otherSet, prefs2, samples));
      int max = mask(commonSet, otherSet, maxItemID);
      max = Math.min(max, samples);
      if (max < 2) {
        continue;
      }
      Long[] items1 = getCommonItems(commonSet, prefs1, max);
      Long[] items2 = getCommonItems(commonSet, prefs2, max);
      double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2);
      tracker.addDatum(variance);
    }
  }

  /**
   * Retains in {@code commonSet} only the IDs also present in {@code otherSet}, and returns
   * how many IDs survived.
   *
   * This exists because FastIDSet has 'retainAll' as MASK, but there is
   * no count of the number of items in the set. size() is supposed to do
   * this but does not work.
   *
   * NOTE(review): the loop index is an {@code int} while {@code maxItemID} is a {@code long};
   * item IDs above {@code Integer.MAX_VALUE} would never be visited. Confirm item IDs are
   * expected to stay within int range.
   */
  private static int mask(FastIDSet commonSet, FastIDSet otherSet, long maxItemID) {
    int count = 0;
    for (int i = 0; i <= maxItemID; i++) {
      if (commonSet.contains(i)) {
        if (otherSet.contains(i)) {
          count++;
        } else {
          commonSet.remove(i);
        }
      }
    }
    return count;
  }

  /**
   * Collects, in recommendation order, up to {@code max} item IDs from {@code recs} that are
   * present in {@code commonSet}.
   */
  private static Long[] getCommonItems(FastIDSet commonSet, Iterable<RecommendedItem> recs, int max) {
    Long[] commonItems = new Long[max];
    int index = 0;
    for (RecommendedItem rec : recs) {
      Long item = rec.getItemID();
      if (commonSet.contains(item)) {
        commonItems[index++] = item;
      }
      if (index == max) {
        break;
      }
    }
    return commonItems;
  }

  /**
   * Collects, in preference order, up to {@code max} item IDs from {@code prefs1} that are
   * present in {@code commonSet}.
   */
  private static Long[] getCommonItems(FastIDSet commonSet, PreferenceArray prefs1, int max) {
    Long[] commonItems = new Long[max];
    int index = 0;
    for (int i = 0; i < prefs1.length(); i++) {
      Long item = prefs1.getItemID(i);
      if (commonSet.contains(item)) {
        commonItems[index++] = item;
      }
      if (index == max) {
        break;
      }
    }
    return commonItems;
  }

  /**
   * Adds the first {@code max} recommended item IDs to {@code modelSet} and returns the
   * largest ID seen, or -1 if the list was empty.
   */
  private static long setBits(FastIDSet modelSet, List<RecommendedItem> items, int max) {
    long maxItem = -1;
    for (int i = 0; i < items.size() && i < max; i++) {
      long itemID = items.get(i).getItemID();
      modelSet.add(itemID);
      if (itemID > maxItem) {
        maxItem = itemID;
      }
    }
    return maxItem;
  }

  /**
   * Adds the first {@code max} preferred item IDs to {@code modelSet} and returns the
   * largest ID seen, or -1 if the preference array was empty.
   */
  private static long setBits(FastIDSet modelSet, PreferenceArray prefs, int max) {
    long maxItem = -1;
    for (int i = 0; i < prefs.length() && i < max; i++) {
      long itemID = prefs.getItemID(i);
      modelSet.add(itemID);
      if (itemID > maxItem) {
        maxItem = itemID;
      }
    }
    return maxItem;
  }

  // CSV column names matching the values logged by scoreCommonSubset().
  private static void printHeader() {
    log.info("tag,user,samples,common,hamming,bubble,rank,normal,score");
  }

  /**
   * Common Subset Scoring
   *
   * These measurements are given the set of results that are common to both
   * recommendation lists. They only get ordered lists.
   *
   * These measures all return raw numbers do not correlate among the tests.
   * The numbers are not corrected against the total number of samples or the
   * number of common items.
   * The one contract is that all measures are 0 for an exact match and an
   * increasing positive number as differences increase.
   *
   * @param subset number of items common to both arrays (their length)
   * @param itemsL left-hand ordering; note getVectorZ() may null out entries of itemsL
   * @param itemsR right-hand ordering
   * @return square root of the mean absolute rank distance; also written to the log
   */
  private static double scoreCommonSubset(String tag,
                                          long userID,
                                          int samples,
                                          int subset,
                                          Long[] itemsL,
                                          Long[] itemsR) {
    int[] vectorZ = new int[subset];
    int[] vectorZabs = new int[subset];

    // Number of bubble-sort swaps needed to align the two orderings.
    long bubble = sort(itemsL, itemsR);
    // Positions matching within a +/-1 sliding window.
    int hamming = slidingWindowHamming(itemsR, itemsL);
    if (hamming > samples) {
      // Sanity check: cannot match more positions than there are samples.
      throw new IllegalStateException();
    }
    getVectorZ(itemsR, itemsL, vectorZ, vectorZabs);
    double normalW = normalWilcoxon(vectorZ, vectorZabs);
    double meanRank = getMeanRank(vectorZabs);
    // case statement for requested value
    double variance = Math.sqrt(meanRank);
    log.info("{},{},{},{},{},{},{},{},{}",
             tag, userID, samples, subset, hamming, bubble, meanRank, normalW, variance);
    return variance;
  }

  // simple sliding-window hamming distance: a[i or plus/minus 1] == b[i]
  // Note: itemID is a primitive long, so the == comparisons below unbox the
  // Long array entries rather than comparing references.
  private static int slidingWindowHamming(Long[] itemsR, Long[] itemsL) {
    int count = 0;
    int samples = itemsR.length;

    // First and last positions only have a one-sided window.
    if (itemsR[0].equals(itemsL[0]) || itemsR[0].equals(itemsL[1])) {
      count++;
    }
    for (int i = 1; i < samples - 1; i++) {
      long itemID = itemsL[i];
      if (itemsR[i] == itemID || itemsR[i - 1] == itemID || itemsR[i + 1] == itemID) {
        count++;
      }
    }
    if (itemsR[samples - 1].equals(itemsL[samples - 1]) || itemsR[samples - 1].equals(itemsL[samples - 2])) {
      count++;
    }
    return count;
  }

  /**
   * Normal-distribution probability value for matched sets of values.
   * Based upon:
   * http://comp9.psych.cornell.edu/Darlington/normscor.htm
   *
   * The Standard Wilcoxon is not used because it requires a lookup table.
   *
   * @return the smaller of the mean positive and mean negative signed ranks
   */
  static double normalWilcoxon(int[] vectorZ, int[] vectorZabs) {
    int nitems = vectorZ.length;

    double[] ranks = new double[nitems];
    double[] ranksAbs = new double[nitems];
    wilcoxonRanks(vectorZ, vectorZabs, ranks, ranksAbs);
    return Math.min(getMeanWplus(ranks), getMeanWminus(ranks));
  }

  /**
   * vector Z is a list of distances between the correct value and the recommended value
   * Z[i] = position i of correct itemID - position of correct itemID in recommendation list
   * can be positive or negative
   * the smaller the better - means recommendations are closer
   * both are the same length, and both sample from the same set
   *
   * destructive to items arrays - allows N log N instead of N^2 order
   */
  private static void getVectorZ(Long[] itemsR, Long[] itemsL, int[] vectorZ, int[] vectorZabs) {
    int nitems = itemsR.length;
    // [bottom, top] shrinks as matches are found at the edges; interior
    // matches are nulled out instead so they are not matched twice.
    int bottom = 0;
    int top = nitems - 1;
    for (int i = 0; i < nitems; i++) {
      long itemID = itemsR[i];
      for (int j = bottom; j <= top; j++) {
        if (itemsL[j] == null) {
          // Already consumed by an earlier match.
          continue;
        }
        long test = itemsL[j];
        if (itemID == test) {
          vectorZ[i] = i - j;
          vectorZabs[i] = Math.abs(i - j);
          if (j == bottom) {
            bottom++;
          } else if (j == top) {
            top--;
          } else {
            itemsL[j] = null;
          }
          break;
        }
      }
    }
  }

  /**
   * Ranks are the position of the value from low to high, divided by the # of values.
   * I had to walk through it a few times.
   */
  private static void wilcoxonRanks(int[] vectorZ, int[] vectorZabs, double[] ranks, double[] ranksAbs) {
    int nitems = vectorZ.length;
    int[] sorted = vectorZabs.clone();
    Arrays.sort(sorted);
    // Count leading zero distances; they are excluded from ranking.
    int zeros = 0;
    for (; zeros < nitems; zeros++) {
      if (sorted[zeros] > 0) {
        break;
      }
    }
    for (int i = 0; i < nitems; i++) {
      double rank = 0.0;
      int count = 0;
      int score = vectorZabs[i];
      // Average the ranks of tied values; sorted[] is ascending so we can stop early.
      for (int j = 0; j < nitems; j++) {
        if (score == sorted[j]) {
          rank += j + 1 - zeros;
          count++;
        } else if (score < sorted[j]) {
          break;
        }
      }
      if (vectorZ[i] != 0) {
        // Signed rank: carries the direction of the original distance.
        ranks[i] = (rank / count) * (vectorZ[i] < 0 ? -1 : 1);  // better be at least 1
        ranksAbs[i] = Math.abs(ranks[i]);
      }
    }
  }

  /** Mean of the given rank distances. */
  private static double getMeanRank(int[] ranks) {
    int nitems = ranks.length;
    double sum = 0.0;
    for (int rank : ranks) {
      sum += rank;
    }
    return sum / nitems;
  }

  /** Sum of the positive signed ranks, divided by the total number of ranks. */
  private static double getMeanWplus(double[] ranks) {
    int nitems = ranks.length;
    double sum = 0.0;
    for (double rank : ranks) {
      if (rank > 0) {
        sum += rank;
      }
    }
    return sum / nitems;
  }

  /** Absolute sum of the negative signed ranks, divided by the total number of ranks. */
  private static double getMeanWminus(double[] ranks) {
    int nitems = ranks.length;
    double sum = 0.0;
    for (double rank : ranks) {
      if (rank < 0) {
        sum -= rank;
      }
    }
    return sum / nitems;
  }

  /**
   * Do bubble sort and return number of swaps needed to match preference lists.
   * Sort itemsR using itemsL as the reference order.
   */
  static long sort(Long[] itemsL, Long[] itemsR) {
    int length = itemsL.length;
    if (length < 2) {
      return 0;
    }
    if (length == 2) {
      return itemsL[0].longValue() == itemsR[0].longValue() ? 0 : 1;
    }
    // 1) avoid changing originals; 2) primitive type is more efficient
    long[] reference = new long[length];
    long[] sortable = new long[length];
    for (int i = 0; i < length; i++) {
      reference[i] = itemsL[i];
      sortable[i] = itemsR[i];
    }
    int sorted = 0;
    long swaps = 0;
    while (sorted < length - 1) {
      // opportunistically trim back the top
      while (length > 0 && reference[length - 1] == sortable[length - 1]) {
        length--;
      }
      if (length == 0) {
        break;
      }
      if (reference[sorted] == sortable[sorted]) {
        sorted++;
      } else {
        for (int j = sorted; j < length - 1; j++) {
          // do not swap anything already in place
          int jump = 1;
          if (reference[j] == sortable[j]) {
            while (j + jump < length && reference[j + jump] == sortable[j + jump]) {
              jump++;
            }
          }
          // NOTE(review): the guard inspects index j + jump, but the swap is always
          // with j + 1 — confirm this is intended when jump > 1.
          if (j + jump < length && !(reference[j] == sortable[j] && reference[j + jump] == sortable[j + jump])) {
            long tmp = sortable[j];
            sortable[j] = sortable[j + 1];
            sortable[j + 1] = tmp;
            swaps++;
          }
        }
      }
    }
    return swaps;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java
new file mode 100644
index 0000000..97eda10
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.Preference;
+
+/**
+ * <p>
+ * A {@link org.apache.mahout.cf.taste.eval.RecommenderEvaluator} which computes the "root mean squared"
+ * difference between predicted and actual ratings for users. This is the square root of the average of this
+ * difference, squared.
+ * </p>
+ */
+public final class RMSRecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator {
+  
+  private RunningAverage average;
+  
+  @Override
+  protected void reset() {
+    average = new FullRunningAverage();
+  }
+  
+  @Override
+  protected void processOneEstimate(float estimatedPreference, Preference realPref) {
+    double diff = realPref.getValue() - estimatedPreference;
+    average.addDatum(diff * diff);
+  }
+  
+  @Override
+  protected double computeFinalEvaluation() {
+    return Math.sqrt(average.getAverage());
+  }
+  
+  @Override
+  public String toString() {
+    return "RMSRecommenderEvaluator";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java
new file mode 100644
index 0000000..036d0b4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+final class StatsCallable implements Callable<Void> {
+  
+  private static final Logger log = LoggerFactory.getLogger(StatsCallable.class);
+  
+  private final Callable<Void> delegate;
+  private final boolean logStats;
+  private final RunningAverageAndStdDev timing;
+  private final AtomicInteger noEstimateCounter;
+  
+  StatsCallable(Callable<Void> delegate,
+                boolean logStats,
+                RunningAverageAndStdDev timing,
+                AtomicInteger noEstimateCounter) {
+    this.delegate = delegate;
+    this.logStats = logStats;
+    this.timing = timing;
+    this.noEstimateCounter = noEstimateCounter;
+  }
+  
+  @Override
+  public Void call() throws Exception {
+    long start = System.currentTimeMillis();
+    delegate.call();
+    long end = System.currentTimeMillis();
+    timing.addDatum(end - start);
+    if (logStats) {
+      Runtime runtime = Runtime.getRuntime();
+      int average = (int) timing.getAverage();
+      log.info("Average time per recommendation: {}ms", average);
+      long totalMemory = runtime.totalMemory();
+      long memory = totalMemory - runtime.freeMemory();
+      log.info("Approximate memory used: {}MB / {}MB", memory / 1000000L, totalMemory / 1000000L);
+      log.info("Unable to recommend in {} cases", noEstimateCounter.get());
+    }
+    return null;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java
new file mode 100644
index 0000000..a1a2a1f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * Contains some features common to all implementations.
+ */
+public abstract class AbstractDataModel implements DataModel {
+
+  private float maxPreference;
+  private float minPreference;
+
+  protected AbstractDataModel() {
+    maxPreference = Float.NaN;
+    minPreference = Float.NaN;
+  }
+
+  @Override
+  public float getMaxPreference() {
+    return maxPreference;
+  }
+
+  protected void setMaxPreference(float maxPreference) {
+    this.maxPreference = maxPreference;
+  }
+
+  @Override
+  public float getMinPreference() {
+    return minPreference;
+  }
+
+  protected void setMinPreference(float minPreference) {
+    this.minPreference = minPreference;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
new file mode 100644
index 0000000..6efa6fa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collection;

import org.apache.commons.io.Charsets;
import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.model.IDMigrator;
+
+public abstract class AbstractIDMigrator implements IDMigrator {
+
+  private final MessageDigest md5Digest;
+  
+  protected AbstractIDMigrator() {
+    try {
+      md5Digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException nsae) {
+      // Can't happen
+      throw new IllegalStateException(nsae);
+    }
+  }
+  
+  /**
+   * @return most significant 8 bytes of the MD5 hash of the string, as a long
+   */
+  protected final long hash(String value) {
+    byte[] md5hash;
+    synchronized (md5Digest) {
+      md5hash = md5Digest.digest(value.getBytes(Charsets.UTF_8));
+      md5Digest.reset();
+    }
+    long hash = 0L;
+    for (int i = 0; i < 8; i++) {
+      hash = hash << 8 | md5hash[i] & 0x00000000000000FFL;
+    }
+    return hash;
+  }
+  
+  @Override
+  public long toLongID(String stringID) {
+    return hash(stringID);
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
new file mode 100644
index 0000000..cd3a434
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.UpdatableIDMigrator;
+import org.apache.mahout.common.IOUtils;
+
+/**
+ * Implementation which stores the reverse long-to-String mapping in a database. Subclasses can override and
+ * configure the class to operate with particular databases by supplying appropriate SQL statements to the
+ * constructor.
+ */
+public abstract class AbstractJDBCIDMigrator extends AbstractIDMigrator implements UpdatableIDMigrator {
+  
+  public static final String DEFAULT_MAPPING_TABLE = "taste_id_mapping";
+  public static final String DEFAULT_LONG_ID_COLUMN = "long_id";
+  public static final String DEFAULT_STRING_ID_COLUMN = "string_id";
+  
+  private final DataSource dataSource;
+  private final String getStringIDSQL;
+  private final String storeMappingSQL;
+  
+  /**
+   * @param getStringIDSQL
+   *          SQL statement which selects one column, the String ID, from a mapping table. The statement
+   *          should take one long parameter.
+   * @param storeMappingSQL
+   *          SQL statement which saves a mapping from long to String. It should take two parameters, a long
+   *          and a String.
+   */
+  protected AbstractJDBCIDMigrator(DataSource dataSource, String getStringIDSQL, String storeMappingSQL) {
+    this.dataSource = dataSource;
+    this.getStringIDSQL = getStringIDSQL;
+    this.storeMappingSQL = storeMappingSQL;
+  }
+  
+  @Override
+  public final void storeMapping(long longID, String stringID) throws TasteException {
+    Connection conn = null;
+    PreparedStatement stmt = null;
+    try {
+      conn = dataSource.getConnection();
+      stmt = conn.prepareStatement(storeMappingSQL);
+      stmt.setLong(1, longID);
+      stmt.setString(2, stringID);
+      stmt.executeUpdate();
+    } catch (SQLException sqle) {
+      throw new TasteException(sqle);
+    } finally {
+      IOUtils.quietClose(null, stmt, conn);
+    }
+  }
+  
+  @Override
+  public final String toStringID(long longID) throws TasteException {
+    Connection conn = null;
+    PreparedStatement stmt = null;
+    ResultSet rs = null;
+    try {
+      conn = dataSource.getConnection();
+      stmt = conn.prepareStatement(getStringIDSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+      stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+      stmt.setFetchSize(1);
+      stmt.setLong(1, longID);
+      rs = stmt.executeQuery();
+      if (rs.next()) {
+        return rs.getString(1);
+      } else {
+        return null;
+      }
+    } catch (SQLException sqle) {
+      throw new TasteException(sqle);
+    } finally {
+      IOUtils.quietClose(rs, stmt, conn);
+    }
+  }
+
+  @Override
+  public void initialize(Iterable<String> stringIDs) throws TasteException {
+    for (String stringID : stringIDs) {
+      storeMapping(toLongID(stringID), stringID);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java
new file mode 100644
index 0000000..6db5807
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
+/**
+ * <p>
+ * Like {@link BooleanUserPreferenceArray} but stores preferences for one item (all item IDs the same) rather
+ * than one user.
+ * </p>
+ * 
+ * @see BooleanPreference
+ * @see BooleanUserPreferenceArray
+ * @see GenericItemPreferenceArray
+ */
+public final class BooleanItemPreferenceArray implements PreferenceArray {
+  
+  private final long[] ids;
+  private long id;
+  
+  public BooleanItemPreferenceArray(int size) {
+    this.ids = new long[size];
+    this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
+  }
+  
+  public BooleanItemPreferenceArray(List<? extends Preference> prefs, boolean forOneUser) {
+    this(prefs.size());
+    int size = prefs.size();
+    for (int i = 0; i < size; i++) {
+      Preference pref = prefs.get(i);
+      ids[i] = forOneUser ? pref.getItemID() : pref.getUserID();
+    }
+    if (size > 0) {
+      id = forOneUser ? prefs.get(0).getUserID() : prefs.get(0).getItemID();
+    }
+  }
+  
+  /**
+   * This is a private copy constructor for clone().
+   */
+  private BooleanItemPreferenceArray(long[] ids, long id) {
+    this.ids = ids;
+    this.id = id;
+  }
+  
+  @Override
+  public int length() {
+    return ids.length;
+  }
+  
+  @Override
+  public Preference get(int i) {
+    return new PreferenceView(i);
+  }
+  
+  @Override
+  public void set(int i, Preference pref) {
+    id = pref.getItemID();
+    ids[i] = pref.getUserID();
+  }
+  
+  @Override
+  public long getUserID(int i) {
+    return ids[i];
+  }
+  
+  @Override
+  public void setUserID(int i, long userID) {
+    ids[i] = userID;
+  }
+  
+  @Override
+  public long getItemID(int i) {
+    return id;
+  }
+  
+  /**
+   * {@inheritDoc}
+   * 
+   * Note that this method will actually set the item ID for <em>all</em> preferences.
+   */
+  @Override
+  public void setItemID(int i, long itemID) {
+    id = itemID;
+  }
+
+  /**
+   * @return all user IDs
+   */
+  @Override
+  public long[] getIDs() {
+    return ids;
+  }
+  
+  @Override
+  public float getValue(int i) {
+    return 1.0f;
+  }
+  
+  @Override
+  public void setValue(int i, float value) {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public void sortByUser() {
+    Arrays.sort(ids);
+  }
+  
+  @Override
+  public void sortByItem() { }
+  
+  @Override
+  public void sortByValue() { }
+  
+  @Override
+  public void sortByValueReversed() { }
+  
+  @Override
+  public boolean hasPrefWithUserID(long userID) {
+    for (long id : ids) {
+      if (userID == id) {
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  @Override
+  public boolean hasPrefWithItemID(long itemID) {
+    return id == itemID;
+  }
+  
+  @Override
+  public BooleanItemPreferenceArray clone() {
+    return new BooleanItemPreferenceArray(ids.clone(), id);
+  }
+
+  @Override
+  public int hashCode() {
+    return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof BooleanItemPreferenceArray)) {
+      return false;
+    }
+    BooleanItemPreferenceArray otherArray = (BooleanItemPreferenceArray) other;
+    return id == otherArray.id && Arrays.equals(ids, otherArray.ids);
+  }
+  
+  @Override
+  public Iterator<Preference> iterator() {
+    return Iterators.transform(new CountingIterator(length()),
+        new Function<Integer, Preference>() {
+        @Override
+        public Preference apply(Integer from) {
+          return new PreferenceView(from);
+        }
+      });
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder(10 * ids.length);
+    result.append("BooleanItemPreferenceArray[itemID:");
+    result.append(id);
+    result.append(",{");
+    for (int i = 0; i < ids.length; i++) {
+      if (i > 0) {
+        result.append(',');
+      }
+      result.append(ids[i]);
+    }
+    result.append("}]");
+    return result.toString();
+  }
+  
+  private final class PreferenceView implements Preference {
+    
+    private final int i;
+    
+    private PreferenceView(int i) {
+      this.i = i;
+    }
+    
+    @Override
+    public long getUserID() {
+      return BooleanItemPreferenceArray.this.getUserID(i);
+    }
+    
+    @Override
+    public long getItemID() {
+      return BooleanItemPreferenceArray.this.getItemID(i);
+    }
+    
+    @Override
+    public float getValue() {
+      return 1.0f;
+    }
+    
+    @Override
+    public void setValue(float value) {
+      throw new UnsupportedOperationException();
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java
new file mode 100644
index 0000000..2093af8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.model.Preference;
+
+/**
+ * Encapsulates a simple boolean "preference" for an item whose value does not matter (is fixed at 1.0). This
+ * is appropriate in situations where users conceptually have only a general "yes" preference for items,
+ * rather than a spectrum of preference values.
+ */
+public final class BooleanPreference implements Preference, Serializable {
+  
+  private final long userID;
+  private final long itemID;
+  
+  public BooleanPreference(long userID, long itemID) {
+    this.userID = userID;
+    this.itemID = itemID;
+  }
+  
+  @Override
+  public long getUserID() {
+    return userID;
+  }
+  
+  @Override
+  public long getItemID() {
+    return itemID;
+  }
+  
+  @Override
+  public float getValue() {
+    return 1.0f;
+  }
+  
+  @Override
+  public void setValue(float value) {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public String toString() {
+    return "BooleanPreference[userID: " + userID + ", itemID:" + itemID + ']';
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java
new file mode 100644
index 0000000..629e0cf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
+/**
+ * <p>
+ * Like {@link GenericUserPreferenceArray} but stores, conceptually, {@link BooleanPreference} objects which
+ * have no associated preference value.
+ * </p>
+ * 
+ * @see BooleanPreference
+ * @see BooleanItemPreferenceArray
+ * @see GenericUserPreferenceArray
+ */
+public final class BooleanUserPreferenceArray implements PreferenceArray {
+  
+  private final long[] ids;
+  private long id;
+  
+  public BooleanUserPreferenceArray(int size) {
+    this.ids = new long[size];
+    this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
+  }
+  
+  public BooleanUserPreferenceArray(List<? extends Preference> prefs) {
+    this(prefs.size());
+    int size = prefs.size();
+    for (int i = 0; i < size; i++) {
+      Preference pref = prefs.get(i);
+      ids[i] = pref.getItemID();
+    }
+    if (size > 0) {
+      id = prefs.get(0).getUserID();
+    }
+  }
+  
+  /**
+   * This is a private copy constructor for clone().
+   */
+  private BooleanUserPreferenceArray(long[] ids, long id) {
+    this.ids = ids;
+    this.id = id;
+  }
+  
+  @Override
+  public int length() {
+    return ids.length;
+  }
+  
+  @Override
+  public Preference get(int i) {
+    return new PreferenceView(i);
+  }
+  
+  @Override
+  public void set(int i, Preference pref) {
+    id = pref.getUserID();
+    ids[i] = pref.getItemID();
+  }
+  
+  @Override
+  public long getUserID(int i) {
+    return id;
+  }
+  
+  /**
+   * {@inheritDoc}
+   * 
+   * Note that this method will actually set the user ID for <em>all</em> preferences.
+   */
+  @Override
+  public void setUserID(int i, long userID) {
+    id = userID;
+  }
+  
+  @Override
+  public long getItemID(int i) {
+    return ids[i];
+  }
+  
+  @Override
+  public void setItemID(int i, long itemID) {
+    ids[i] = itemID;
+  }
+
+  /**
+   * @return all item IDs
+   */
+  @Override
+  public long[] getIDs() {
+    return ids;
+  }
+  
+  @Override
+  public float getValue(int i) {
+    return 1.0f;
+  }
+  
+  @Override
+  public void setValue(int i, float value) {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public void sortByUser() { }
+  
+  @Override
+  public void sortByItem() {
+    Arrays.sort(ids);
+  }
+  
+  @Override
+  public void sortByValue() { }
+  
+  @Override
+  public void sortByValueReversed() { }
+  
+  @Override
+  public boolean hasPrefWithUserID(long userID) {
+    return id == userID;
+  }
+  
+  @Override
+  public boolean hasPrefWithItemID(long itemID) {
+    for (long id : ids) {
+      if (itemID == id) {
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  @Override
+  public BooleanUserPreferenceArray clone() {
+    return new BooleanUserPreferenceArray(ids.clone(), id);
+  }
+
+  @Override
+  public int hashCode() {
+    return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof BooleanUserPreferenceArray)) {
+      return false;
+    }
+    BooleanUserPreferenceArray otherArray = (BooleanUserPreferenceArray) other;
+    return id == otherArray.id && Arrays.equals(ids, otherArray.ids);
+  }
+  
+  @Override
+  public Iterator<Preference> iterator() {
+    return Iterators.transform(new CountingIterator(length()),
+        new Function<Integer, Preference>() {
+        @Override
+        public Preference apply(Integer from) {
+          return new PreferenceView(from);
+        }
+      });
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder result = new StringBuilder(10 * ids.length);
+    result.append("BooleanUserPreferenceArray[userID:");
+    result.append(id);
+    result.append(",{");
+    for (int i = 0; i < ids.length; i++) {
+      if (i > 0) {
+        result.append(',');
+      }
+      result.append(ids[i]);
+    }
+    result.append("}]");
+    return result.toString();
+  }
+
+  private final class PreferenceView implements Preference {
+    
+    private final int i;
+    
+    private PreferenceView(int i) {
+      this.i = i;
+    }
+    
+    @Override
+    public long getUserID() {
+      return BooleanUserPreferenceArray.this.getUserID(i);
+    }
+    
+    @Override
+    public long getItemID() {
+      return BooleanUserPreferenceArray.this.getItemID(i);
+    }
+    
+    @Override
+    public float getValue() {
+      return 1.0f;
+    }
+    
+    @Override
+    public void setValue(float value) {
+      throw new UnsupportedOperationException();
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
new file mode 100644
index 0000000..2c1ff4d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
@@ -0,0 +1,320 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+import com.google.common.base.Preconditions;
+
/**
 * <p>
 * A simple {@link DataModel} which uses given user data as its data source. This implementation
 * is mostly useful for small experiments and is not recommended for contexts where performance is important.
 * </p>
 */
public final class GenericBooleanPrefDataModel extends AbstractDataModel {
  
  /** All user IDs, sorted ascending. */
  private final long[] userIDs;
  /** User ID mapped to the set of item IDs that user has a (boolean) preference for. */
  private final FastByIDMap<FastIDSet> preferenceFromUsers;
  /** All item IDs, sorted ascending. */
  private final long[] itemIDs;
  /** Item ID mapped to the set of user IDs preferring it; inverse of {@link #preferenceFromUsers}. */
  private final FastByIDMap<FastIDSet> preferenceForItems;
  /** Optional: user ID -> (item ID -> timestamp in ms since epoch); may be null. */
  private final FastByIDMap<FastByIDMap<Long>> timestamps;
  
  /**
   * <p>
   * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
   * {@link DataModel} retains all this information in memory and is effectively immutable.
   * </p>
   * 
   * @param userData users to include
   */
  public GenericBooleanPrefDataModel(FastByIDMap<FastIDSet> userData) {
    this(userData, null);
  }

  /**
   * <p>
   * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
   * {@link DataModel} retains all this information in memory and is effectively immutable.
   * </p>
   *
   * @param userData users to include
   * @param timestamps optionally, provided timestamps of preferences as milliseconds since the epoch.
   *  User IDs are mapped to maps of item IDs to Long timestamps.
   */
  public GenericBooleanPrefDataModel(FastByIDMap<FastIDSet> userData, FastByIDMap<FastByIDMap<Long>> timestamps) {
    Preconditions.checkArgument(userData != null, "userData is null");

    this.preferenceFromUsers = userData;
    this.preferenceForItems = new FastByIDMap<>();
    FastIDSet itemIDSet = new FastIDSet();
    // Build the item -> users inverse index while collecting the universe of item IDs.
    for (Map.Entry<Long, FastIDSet> entry : preferenceFromUsers.entrySet()) {
      long userID = entry.getKey();
      FastIDSet itemIDs = entry.getValue();
      itemIDSet.addAll(itemIDs);
      LongPrimitiveIterator it = itemIDs.iterator();
      while (it.hasNext()) {
        long itemID = it.nextLong();
        FastIDSet userIDs = preferenceForItems.get(itemID);
        if (userIDs == null) {
          // Most items are expected to have few users; start small.
          userIDs = new FastIDSet(2);
          preferenceForItems.put(itemID, userIDs);
        }
        userIDs.add(userID);
      }
    }

    this.itemIDs = itemIDSet.toArray();
    itemIDSet = null; // Might help GC -- this is big
    Arrays.sort(itemIDs);

    // Snapshot all user IDs into a sorted primitive array for fast iteration.
    this.userIDs = new long[userData.size()];
    int i = 0;
    LongPrimitiveIterator it = userData.keySetIterator();
    while (it.hasNext()) {
      userIDs[i++] = it.next();
    }
    Arrays.sort(userIDs);

    this.timestamps = timestamps;
  }
  
  /**
   * <p>
   * Creates a new {@link GenericDataModel} containing an immutable copy of the data from another given
   * {@link DataModel}.
   * </p>
   * 
   * @param dataModel
   *          {@link DataModel} to copy
   * @throws TasteException
   *           if an error occurs while retrieving the other {@link DataModel}'s users
   * @deprecated without direct replacement.
   *  Consider {@link #toDataMap(DataModel)} with {@link #GenericBooleanPrefDataModel(FastByIDMap)}
   */
  @Deprecated
  public GenericBooleanPrefDataModel(DataModel dataModel) throws TasteException {
    this(toDataMap(dataModel));
  }

  /**
   * Exports the simple user IDs and associated item IDs in the data model.
   *
   * @return a {@link FastByIDMap} mapping user IDs to {@link FastIDSet}s representing
   *  that user's associated items
   */
  public static FastByIDMap<FastIDSet> toDataMap(DataModel dataModel) throws TasteException {
    FastByIDMap<FastIDSet> data = new FastByIDMap<>(dataModel.getNumUsers());
    LongPrimitiveIterator it = dataModel.getUserIDs();
    while (it.hasNext()) {
      long userID = it.nextLong();
      data.put(userID, dataModel.getItemIDsFromUser(userID));
    }
    return data;
  }

  /**
   * Swaps, in place, each {@link PreferenceArray} value of the given map for a {@link FastIDSet}
   * of just its item IDs. The same map object is mutated and returned under a different type
   * parameter; the unchecked casts avoid copying the (potentially large) map.
   *
   * @return the input map, retyped, with values replaced
   */
  public static FastByIDMap<FastIDSet> toDataMap(FastByIDMap<PreferenceArray> data) {
    for (Map.Entry<Long,Object> entry : ((FastByIDMap<Object>) (FastByIDMap<?>) data).entrySet()) {
      PreferenceArray prefArray = (PreferenceArray) entry.getValue();
      int size = prefArray.length();
      FastIDSet itemIDs = new FastIDSet(size);
      for (int i = 0; i < size; i++) {
        itemIDs.add(prefArray.getItemID(i));
      }
      entry.setValue(itemIDs);
    }
    return (FastByIDMap<FastIDSet>) (FastByIDMap<?>) data;
  }
  
  /**
   * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
   */
  public FastByIDMap<FastIDSet> getRawUserData() {
    return this.preferenceFromUsers;
  }

  /**
   * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
   */
  public FastByIDMap<FastIDSet> getRawItemData() {
    return this.preferenceForItems;
  }
  
  /** @return all user IDs, in ascending order */
  @Override
  public LongPrimitiveArrayIterator getUserIDs() {
    return new LongPrimitiveArrayIterator(userIDs);
  }
  
  /**
   * Builds a {@link BooleanUserPreferenceArray} on the fly from the user's item set.
   *
   * @throws NoSuchUserException
   *           if there is no such user
   */
  @Override
  public PreferenceArray getPreferencesFromUser(long userID) throws NoSuchUserException {
    FastIDSet itemIDs = preferenceFromUsers.get(userID);
    if (itemIDs == null) {
      throw new NoSuchUserException(userID);
    }
    PreferenceArray prefArray = new BooleanUserPreferenceArray(itemIDs.size());
    int i = 0;
    LongPrimitiveIterator it = itemIDs.iterator();
    while (it.hasNext()) {
      prefArray.setUserID(i, userID);
      prefArray.setItemID(i, it.nextLong());
      i++;
    }
    return prefArray;
  }
  
  /**
   * Returns the live internal item-ID set for the user; callers should not mutate it.
   *
   * @throws NoSuchUserException if there is no such user
   */
  @Override
  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
    FastIDSet itemIDs = preferenceFromUsers.get(userID);
    if (itemIDs == null) {
      throw new NoSuchUserException(userID);
    }
    return itemIDs;
  }
  
  /** @return all item IDs, in ascending order */
  @Override
  public LongPrimitiveArrayIterator getItemIDs() {
    return new LongPrimitiveArrayIterator(itemIDs);
  }
  
  /**
   * Builds a {@link BooleanItemPreferenceArray} on the fly from the item's user set.
   *
   * @throws NoSuchItemException if there is no such item
   */
  @Override
  public PreferenceArray getPreferencesForItem(long itemID) throws NoSuchItemException {
    FastIDSet userIDs = preferenceForItems.get(itemID);
    if (userIDs == null) {
      throw new NoSuchItemException(itemID);
    }
    PreferenceArray prefArray = new BooleanItemPreferenceArray(userIDs.size());
    int i = 0;
    LongPrimitiveIterator it = userIDs.iterator();
    while (it.hasNext()) {
      prefArray.setUserID(i, it.nextLong());
      prefArray.setItemID(i, itemID);
      i++;
    }
    return prefArray;
  }
  
  /**
   * @return 1.0 if the user has a preference for the item, or null if not
   * @throws NoSuchUserException if there is no such user
   */
  @Override
  public Float getPreferenceValue(long userID, long itemID) throws NoSuchUserException {
    FastIDSet itemIDs = preferenceFromUsers.get(userID);
    if (itemIDs == null) {
      throw new NoSuchUserException(userID);
    }
    if (itemIDs.contains(itemID)) {
      return 1.0f;
    }
    return null;
  }

  /**
   * @return the preference timestamp, or null when no timestamps were supplied at construction
   *  (note: in that case the user's existence is not checked)
   * @throws NoSuchUserException if timestamps exist but the user has none
   */
  @Override
  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
    if (timestamps == null) {
      return null;
    }
    FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
    if (itemTimestamps == null) {
      throw new NoSuchUserException(userID);
    }
    return itemTimestamps.get(itemID);
  }
  
  @Override
  public int getNumItems() {
    return itemIDs.length;
  }
  
  @Override
  public int getNumUsers() {
    return userIDs.length;
  }
  
  @Override
  public int getNumUsersWithPreferenceFor(long itemID) {
    FastIDSet userIDs1 = preferenceForItems.get(itemID);
    return userIDs1 == null ? 0 : userIDs1.size();
  }

  @Override
  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) {
    FastIDSet userIDs1 = preferenceForItems.get(itemID1);
    if (userIDs1 == null) {
      return 0;
    }
    FastIDSet userIDs2 = preferenceForItems.get(itemID2);
    if (userIDs2 == null) {
      return 0;
    }
    // Probe with the smaller set against the larger for a cheaper intersection.
    return userIDs1.size() < userIDs2.size()
        ? userIDs2.intersectionSize(userIDs1)
        : userIDs1.intersectionSize(userIDs2);
  }
  
  /** @throws UnsupportedOperationException always; this model is effectively immutable */
  @Override
  public void removePreference(long userID, long itemID) {
    throw new UnsupportedOperationException();
  }
  
  /** @throws UnsupportedOperationException always; this model is effectively immutable */
  @Override
  public void setPreference(long userID, long itemID, float value) {
    throw new UnsupportedOperationException();
  }
  
  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
  // Does nothing -- data is held in memory and never reloaded
  }

  /** @return false: boolean preferences carry no explicit values */
  @Override
  public boolean hasPreferenceValues() {
    return false;
  }
  
  /** Shows at most the first three user IDs, to keep output bounded. */
  @Override
  public String toString() {
    StringBuilder result = new StringBuilder(200);
    result.append("GenericBooleanPrefDataModel[users:");
    for (int i = 0; i < Math.min(3, userIDs.length); i++) {
      if (i > 0) {
        result.append(',');
      }
      result.append(userIDs[i]);
    }
    if (userIDs.length > 3) {
      result.append("...");
    }
    result.append(']');
    return result.toString();
  }
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
new file mode 100644
index 0000000..f58d349
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
@@ -0,0 +1,361 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
/**
 * <p>
 * A simple {@link DataModel} which uses a given {@link List} of users as its data source. This implementation
 * is mostly useful for small experiments and is not recommended for contexts where performance is important.
 * </p>
 */
public final class GenericDataModel extends AbstractDataModel {
  
  private static final Logger log = LoggerFactory.getLogger(GenericDataModel.class);
  
  /** All user IDs, sorted ascending. */
  private final long[] userIDs;
  /** User ID mapped to that user's preferences, sorted by item ID. */
  private final FastByIDMap<PreferenceArray> preferenceFromUsers;
  /** All item IDs, sorted ascending. */
  private final long[] itemIDs;
  /** Item ID mapped to its preferences, sorted by user ID; inverse of {@link #preferenceFromUsers}. */
  private final FastByIDMap<PreferenceArray> preferenceForItems;
  /** Optional: user ID -> (item ID -> timestamp in ms since epoch); may be null. */
  private final FastByIDMap<FastByIDMap<Long>> timestamps;
  
  /**
   * <p>
   * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
   * {@link DataModel} retains all this information in memory and is effectively immutable.
   * </p>
   * 
   * @param userData users to include; (see also {@link #toDataMap(FastByIDMap, boolean)})
   */
  public GenericDataModel(FastByIDMap<PreferenceArray> userData) {
    this(userData, null);
  }

  /**
   * <p>
   * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
   * {@link DataModel} retains all this information in memory and is effectively immutable.
   * </p>
   *
   * @param userData users to include; (see also {@link #toDataMap(FastByIDMap, boolean)})
   * @param timestamps optionally, provided timestamps of preferences as milliseconds since the epoch.
   *  User IDs are mapped to maps of item IDs to Long timestamps.
   */
  public GenericDataModel(FastByIDMap<PreferenceArray> userData, FastByIDMap<FastByIDMap<Long>> timestamps) {
    Preconditions.checkArgument(userData != null, "userData is null");

    this.preferenceFromUsers = userData;
    FastByIDMap<Collection<Preference>> prefsForItems = new FastByIDMap<>();
    FastIDSet itemIDSet = new FastIDSet();
    int currentCount = 0;
    float maxPrefValue = Float.NEGATIVE_INFINITY;
    float minPrefValue = Float.POSITIVE_INFINITY;
    // Single pass: sort each user's prefs by item, build the item -> prefs inverse index,
    // and track the global min/max preference values.
    for (Map.Entry<Long, PreferenceArray> entry : preferenceFromUsers.entrySet()) {
      PreferenceArray prefs = entry.getValue();
      prefs.sortByItem();
      for (Preference preference : prefs) {
        long itemID = preference.getItemID();
        itemIDSet.add(itemID);
        Collection<Preference> prefsForItem = prefsForItems.get(itemID);
        if (prefsForItem == null) {
          prefsForItem = Lists.newArrayListWithCapacity(2);
          prefsForItems.put(itemID, prefsForItem);
        }
        prefsForItem.add(preference);
        float value = preference.getValue();
        if (value > maxPrefValue) {
          maxPrefValue = value;
        }
        if (value < minPrefValue) {
          minPrefValue = value;
        }
      }
      // Progress logging for large inputs.
      if (++currentCount % 10000 == 0) {
        log.info("Processed {} users", currentCount);
      }
    }
    log.info("Processed {} users", currentCount);

    setMinPreference(minPrefValue);
    setMaxPreference(maxPrefValue);

    this.itemIDs = itemIDSet.toArray();
    itemIDSet = null; // Might help GC -- this is big
    Arrays.sort(itemIDs);

    // Convert the per-item Lists into PreferenceArrays, reusing the same map.
    this.preferenceForItems = toDataMap(prefsForItems, false);

    // Per-item arrays must be sorted by user ID; getNumUsersWithPreferenceFor relies on this.
    for (Map.Entry<Long, PreferenceArray> entry : preferenceForItems.entrySet()) {
      entry.getValue().sortByUser();
    }

    this.userIDs = new long[userData.size()];
    int i = 0;
    LongPrimitiveIterator it = userData.keySetIterator();
    while (it.hasNext()) {
      userIDs[i++] = it.next();
    }
    Arrays.sort(userIDs);

    this.timestamps = timestamps;
  }

  /**
   * <p>
   * Creates a new {@link GenericDataModel} containing an immutable copy of the data from another given
   * {@link DataModel}.
   * </p>
   *
   * @param dataModel {@link DataModel} to copy
   * @throws TasteException if an error occurs while retrieving the other {@link DataModel}'s users
   * @deprecated without direct replacement.
   *  Consider {@link #toDataMap(DataModel)} with {@link #GenericDataModel(FastByIDMap)}
   */
  @Deprecated
  public GenericDataModel(DataModel dataModel) throws TasteException {
    this(toDataMap(dataModel));
  }
  
  /**
   * Swaps, in-place, {@link List}s for arrays in {@link Map} values. The same map object is
   * mutated and returned under a different type parameter; the unchecked casts avoid copying.
   * 
   * @param byUser if true, values become {@link GenericUserPreferenceArray}s, else
   *  {@link GenericItemPreferenceArray}s
   * @return input value
   */
  public static FastByIDMap<PreferenceArray> toDataMap(FastByIDMap<Collection<Preference>> data,
                                                       boolean byUser) {
    for (Map.Entry<Long,Object> entry : ((FastByIDMap<Object>) (FastByIDMap<?>) data).entrySet()) {
      List<Preference> prefList = (List<Preference>) entry.getValue();
      entry.setValue(byUser ? new GenericUserPreferenceArray(prefList) : new GenericItemPreferenceArray(
          prefList));
    }
    return (FastByIDMap<PreferenceArray>) (FastByIDMap<?>) data;
  }

  /**
   * Exports the simple user IDs and preferences in the data model.
   *
   * @return a {@link FastByIDMap} mapping user IDs to {@link PreferenceArray}s representing
   *  that user's preferences
   */
  public static FastByIDMap<PreferenceArray> toDataMap(DataModel dataModel) throws TasteException {
    FastByIDMap<PreferenceArray> data = new FastByIDMap<>(dataModel.getNumUsers());
    LongPrimitiveIterator it = dataModel.getUserIDs();
    while (it.hasNext()) {
      long userID = it.nextLong();
      data.put(userID, dataModel.getPreferencesFromUser(userID));
    }
    return data;
  }
  
  /**
   * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
   */
  public FastByIDMap<PreferenceArray> getRawUserData() {
    return this.preferenceFromUsers;
  }

  /**
   * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
   */
  public FastByIDMap<PreferenceArray> getRawItemData() {
    return this.preferenceForItems;
  }

  /** @return all user IDs, in ascending order */
  @Override
  public LongPrimitiveArrayIterator getUserIDs() {
    return new LongPrimitiveArrayIterator(userIDs);
  }
  
  /**
   * Returns the live internal array, sorted by item ID; callers should not mutate it.
   *
   * @throws NoSuchUserException
   *           if there is no such user
   */
  @Override
  public PreferenceArray getPreferencesFromUser(long userID) throws NoSuchUserException {
    PreferenceArray prefs = preferenceFromUsers.get(userID);
    if (prefs == null) {
      throw new NoSuchUserException(userID);
    }
    return prefs;
  }
  
  /**
   * Builds a fresh {@link FastIDSet} of the user's item IDs on each call.
   *
   * @throws NoSuchUserException if there is no such user
   */
  @Override
  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
    PreferenceArray prefs = getPreferencesFromUser(userID);
    int size = prefs.length();
    FastIDSet result = new FastIDSet(size);
    for (int i = 0; i < size; i++) {
      result.add(prefs.getItemID(i));
    }
    return result;
  }
  
  /** @return all item IDs, in ascending order */
  @Override
  public LongPrimitiveArrayIterator getItemIDs() {
    return new LongPrimitiveArrayIterator(itemIDs);
  }
  
  /**
   * Returns the live internal array, sorted by user ID; callers should not mutate it.
   *
   * @throws NoSuchItemException if there is no such item
   */
  @Override
  public PreferenceArray getPreferencesForItem(long itemID) throws NoSuchItemException {
    PreferenceArray prefs = preferenceForItems.get(itemID);
    if (prefs == null) {
      throw new NoSuchItemException(itemID);
    }
    return prefs;
  }
  
  /**
   * Linear scan over the user's preferences.
   *
   * @return the preference value, or null if the user has no preference for the item
   * @throws NoSuchUserException if there is no such user
   */
  @Override
  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
    PreferenceArray prefs = getPreferencesFromUser(userID);
    int size = prefs.length();
    for (int i = 0; i < size; i++) {
      if (prefs.getItemID(i) == itemID) {
        return prefs.getValue(i);
      }
    }
    return null;
  }

  /**
   * @return the preference timestamp, or null when no timestamps were supplied at construction
   *  (note: in that case the user's existence is not checked)
   * @throws NoSuchUserException if timestamps exist but the user has none
   */
  @Override
  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
    if (timestamps == null) {
      return null;
    }
    FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
    if (itemTimestamps == null) {
      throw new NoSuchUserException(userID);
    }
    return itemTimestamps.get(itemID);
  }

  @Override
  public int getNumItems() {
    return itemIDs.length;
  }
  
  @Override
  public int getNumUsers() {
    return userIDs.length;
  }

  @Override
  public int getNumUsersWithPreferenceFor(long itemID) {
    PreferenceArray prefs1 = preferenceForItems.get(itemID);
    return prefs1 == null ? 0 : prefs1.length();
  }
  
  /**
   * Counts users preferring both items via a merge-style walk of the two per-item arrays,
   * which the constructor left sorted by user ID. Arrays in the index are never empty,
   * so reading index 0 up front is safe.
   */
  @Override
  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) {
    PreferenceArray prefs1 = preferenceForItems.get(itemID1);
    if (prefs1 == null) {
      return 0;
    }
    PreferenceArray prefs2 = preferenceForItems.get(itemID2);
    if (prefs2 == null) {
      return 0;
    }

    int size1 = prefs1.length();
    int size2 = prefs2.length();
    int count = 0;
    int i = 0;
    int j = 0;
    long userID1 = prefs1.getUserID(0);
    long userID2 = prefs2.getUserID(0);
    while (true) {
      if (userID1 < userID2) {
        if (++i == size1) {
          break;
        }
        userID1 = prefs1.getUserID(i);
      } else if (userID1 > userID2) {
        if (++j == size2) {
          break;
        }
        userID2 = prefs2.getUserID(j);
      } else {
        // Same user appears in both lists.
        count++;
        if (++i == size1 || ++j == size2) {
          break;
        }
        userID1 = prefs1.getUserID(i);
        userID2 = prefs2.getUserID(j);
      }
    }
    return count;
  }

  /** @throws UnsupportedOperationException always; this model is effectively immutable */
  @Override
  public void removePreference(long userID, long itemID) {
    throw new UnsupportedOperationException();
  }
  
  /** @throws UnsupportedOperationException always; this model is effectively immutable */
  @Override
  public void setPreference(long userID, long itemID, float value) {
    throw new UnsupportedOperationException();
  }
  
  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
  // Does nothing -- data is held in memory and never reloaded
  }

  /** @return true: this model stores explicit preference values */
  @Override
  public boolean hasPreferenceValues() {
    return true;
  }
  
  /** Shows at most the first three user IDs, to keep output bounded. */
  @Override
  public String toString() {
    StringBuilder result = new StringBuilder(200);
    result.append("GenericDataModel[users:");
    for (int i = 0; i < Math.min(3, userIDs.length); i++) {
      if (i > 0) {
        result.append(',');
      }
      result.append(userIDs[i]);
    }
    if (userIDs.length > 3) {
      result.append("...");
    }
    result.append(']');
    return result.toString();
  }
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java
new file mode 100644
index 0000000..fde9314
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java
@@ -0,0 +1,301 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
/**
 * <p>
 * Like {@link GenericUserPreferenceArray} but stores preferences for one item (all item IDs the same) rather
 * than one user.
 * </p>
 * 
 * @see BooleanItemPreferenceArray
 * @see GenericUserPreferenceArray
 * @see GenericPreference
 */
public final class GenericItemPreferenceArray implements PreferenceArray {

  // Sort-key codes for lateralSort(). There is no ITEM key: all item IDs here are equal.
  private static final int USER = 0;
  private static final int VALUE = 2;
  private static final int VALUE_REVERSED = 3;

  /** User IDs, parallel to {@link #values}. */
  private final long[] ids;
  /** The single item ID shared by every preference in this array. */
  private long id;
  /** Preference values, parallel to {@link #ids}. */
  private final float[] values;

  /**
   * Creates an array of the given size with the item ID unset.
   */
  public GenericItemPreferenceArray(int size) {
    this.ids = new long[size];
    values = new float[size];
    this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
  }

  /**
   * Copies the given preferences, which must all share one item ID.
   *
   * @throws IllegalArgumentException if the preferences name more than one item ID
   */
  public GenericItemPreferenceArray(List<? extends Preference> prefs) {
    this(prefs.size());
    int size = prefs.size();
    long itemID = Long.MIN_VALUE;
    for (int i = 0; i < size; i++) {
      Preference pref = prefs.get(i);
      ids[i] = pref.getUserID();
      if (i == 0) {
        itemID = pref.getItemID();
      } else {
        if (itemID != pref.getItemID()) {
          throw new IllegalArgumentException("Not all item IDs are the same");
        }
      }
      values[i] = pref.getValue();
    }
    id = itemID;
  }

  /**
   * This is a private copy constructor for clone().
   */
  private GenericItemPreferenceArray(long[] ids, long id, float[] values) {
    this.ids = ids;
    this.id = id;
    this.values = values;
  }

  @Override
  public int length() {
    return ids.length;
  }

  /** @return a live view; writes through it modify this array */
  @Override
  public Preference get(int i) {
    return new PreferenceView(i);
  }

  /** Note: also overwrites the shared item ID with {@code pref}'s item ID. */
  @Override
  public void set(int i, Preference pref) {
    id = pref.getItemID();
    ids[i] = pref.getUserID();
    values[i] = pref.getValue();
  }

  @Override
  public long getUserID(int i) {
    return ids[i];
  }

  @Override
  public void setUserID(int i, long userID) {
    ids[i] = userID;
  }

  /** The index is ignored: every entry shares the same item ID. */
  @Override
  public long getItemID(int i) {
    return id;
  }

  /**
   * {@inheritDoc}
   * 
   * Note that this method will actually set the item ID for <em>all</em> preferences.
   */
  @Override
  public void setItemID(int i, long itemID) {
    id = itemID;
  }

  /**
   * @return all user IDs
   */
  @Override
  public long[] getIDs() {
    return ids;
  }

  @Override
  public float getValue(int i) {
    return values[i];
  }

  @Override
  public void setValue(int i, float value) {
    values[i] = value;
  }

  @Override
  public void sortByUser() {
    lateralSort(USER);
  }

  /** No-op: all item IDs are identical, so any order is "sorted by item". */
  @Override
  public void sortByItem() { }

  @Override
  public void sortByValue() {
    lateralSort(VALUE);
  }

  @Override
  public void sortByValueReversed() {
    lateralSort(VALUE_REVERSED);
  }

  /** Linear scan over user IDs. */
  @Override
  public boolean hasPrefWithUserID(long userID) {
    for (long id : ids) {
      if (userID == id) {
        return true;
      }
    }
    return false;
  }

  @Override
  public boolean hasPrefWithItemID(long itemID) {
    return id == itemID;
  }

  /**
   * Sorts ids[] and values[] together ("laterally") by the given key, keeping the
   * parallel arrays in sync.
   */
  private void lateralSort(int type) {
    //Comb sort: http://en.wikipedia.org/wiki/Comb_sort
    int length = length();
    int gap = length;
    boolean swapped = false;
    while (gap > 1 || swapped) {
      if (gap > 1) {
        // Compound assignment narrows the double quotient back to int.
        gap /= 1.247330950103979; // = 1 / (1 - 1/e^phi)
      }
      swapped = false;
      int max = length - gap;
      for (int i = 0; i < max; i++) {
        int other = i + gap;
        if (isLess(other, i, type)) {
          swap(i, other);
          swapped = true;
        }
      }
    }
  }

  /** @return true if element i orders strictly before element j under the given sort key */
  private boolean isLess(int i, int j, int type) {
    switch (type) {
      case USER:
        return ids[i] < ids[j];
      case VALUE:
        return values[i] < values[j];
      case VALUE_REVERSED:
        return values[i] > values[j];
      default:
        throw new IllegalStateException();
    }
  }

  /** Swaps entries i and j in both parallel arrays. */
  private void swap(int i, int j) {
    long temp1 = ids[i];
    float temp2 = values[i];
    ids[i] = ids[j];
    values[i] = values[j];
    ids[j] = temp1;
    values[j] = temp2;
  }

  /** Deep copy: the backing arrays are cloned. */
  @Override
  public GenericItemPreferenceArray clone() {
    return new GenericItemPreferenceArray(ids.clone(), id, values.clone());
  }

  @Override
  public int hashCode() {
    return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids) ^ Arrays.hashCode(values);
  }

  @Override
  public boolean equals(Object other) {
    if (!(other instanceof GenericItemPreferenceArray)) {
      return false;
    }
    GenericItemPreferenceArray otherArray = (GenericItemPreferenceArray) other;
    return id == otherArray.id && Arrays.equals(ids, otherArray.ids) && Arrays.equals(values, otherArray.values);
  }

  /** @return a lazy iterator of live {@link PreferenceView}s over indices [0, length()) */
  @Override
  public Iterator<Preference> iterator() {
    return Iterators.transform(new CountingIterator(length()),
      new Function<Integer, Preference>() {
        @Override
        public Preference apply(Integer from) {
          return new PreferenceView(from);
        }
      });
  }

  @Override
  public String toString() {
    if (ids == null || ids.length == 0) {
      return "GenericItemPreferenceArray[{}]";
    }
    StringBuilder result = new StringBuilder(20 * ids.length);
    result.append("GenericItemPreferenceArray[itemID:");
    result.append(id);
    result.append(",{");
    for (int i = 0; i < ids.length; i++) {
      if (i > 0) {
        result.append(',');
      }
      result.append(ids[i]);
      result.append('=');
      result.append(values[i]);
    }
    result.append("}]");
    return result.toString();
  }

  /**
   * Live view of one slot of the enclosing array; reads and writes go straight
   * to the backing arrays.
   */
  private final class PreferenceView implements Preference {

    private final int i;

    private PreferenceView(int i) {
      this.i = i;
    }

    @Override
    public long getUserID() {
      return GenericItemPreferenceArray.this.getUserID(i);
    }

    @Override
    public long getItemID() {
      return GenericItemPreferenceArray.this.getItemID(i);
    }

    @Override
    public float getValue() {
      return values[i];
    }

    @Override
    public void setValue(float value) {
      values[i] = value;
    }

  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java
new file mode 100644
index 0000000..e6c7f43
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.model.Preference;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple {@link Preference} encapsulating an item and preference value.
+ * </p>
+ */
+public class GenericPreference implements Preference, Serializable {
+  
+  // userID and itemID are immutable; only the preference value is mutable
+  private final long userID;
+  private final long itemID;
+  private float value;
+  
+  /**
+   * @param userID ID of the user expressing the preference
+   * @param itemID ID of the item the preference applies to
+   * @param value strength of the preference; must not be NaN
+   * @throws IllegalArgumentException if {@code value} is NaN
+   */
+  public GenericPreference(long userID, long itemID, float value) {
+    Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+    this.userID = userID;
+    this.itemID = itemID;
+    this.value = value;
+  }
+  
+  @Override
+  public long getUserID() {
+    return userID;
+  }
+  
+  @Override
+  public long getItemID() {
+    return itemID;
+  }
+  
+  @Override
+  public float getValue() {
+    return value;
+  }
+  
+  /**
+   * @throws IllegalArgumentException if {@code value} is NaN
+   */
+  @Override
+  public void setValue(float value) {
+    Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+    this.value = value;
+  }
+  
+  @Override
+  public String toString() {
+    return "GenericPreference[userID: " + userID + ", itemID:" + itemID + ", value:" + value + ']';
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java
new file mode 100644
index 0000000..647feeb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
+/**
+ * <p>
+ * Like {@link GenericItemPreferenceArray} but stores preferences for one user (all user IDs the same) rather
+ * than one item.
+ * </p>
+ *
+ * <p>
+ * This implementation maintains two parallel arrays, of item IDs and values. The idea is to save allocating
+ * {@link Preference} objects themselves. This saves the overhead of {@link Preference} objects but also
+ * duplicating the user ID value.
+ * </p>
+ * 
+ * @see BooleanUserPreferenceArray
+ * @see GenericItemPreferenceArray
+ * @see GenericPreference
+ */
+public final class GenericUserPreferenceArray implements PreferenceArray {
+
+  // Codes selecting which comparison lateralSort() uses
+  private static final int ITEM = 1;
+  private static final int VALUE = 2;
+  private static final int VALUE_REVERSED = 3;
+
+  // Parallel arrays: ids[i] is the item ID and values[i] the preference value
+  // of preference i; 'id' is the single user ID shared by every preference.
+  private final long[] ids;
+  private long id;
+  private final float[] values;
+
+  /**
+   * Creates an array with room for {@code size} preferences; the shared user ID
+   * starts as {@code Long.MIN_VALUE} until explicitly set.
+   */
+  public GenericUserPreferenceArray(int size) {
+    this.ids = new long[size];
+    values = new float[size];
+    this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
+  }
+
+  /**
+   * Copies the given preferences into a new array.
+   *
+   * @throws IllegalArgumentException if the preferences do not all share one user ID
+   */
+  public GenericUserPreferenceArray(List<? extends Preference> prefs) {
+    this(prefs.size());
+    int size = prefs.size();
+    long userID = Long.MIN_VALUE;
+    for (int i = 0; i < size; i++) {
+      Preference pref = prefs.get(i);
+      if (i == 0) {
+        userID = pref.getUserID();
+      } else {
+        if (userID != pref.getUserID()) {
+          throw new IllegalArgumentException("Not all user IDs are the same");
+        }
+      }
+      ids[i] = pref.getItemID();
+      values[i] = pref.getValue();
+    }
+    id = userID;
+  }
+
+  /**
+   * This is a private copy constructor for clone().
+   */
+  private GenericUserPreferenceArray(long[] ids, long id, float[] values) {
+    this.ids = ids;
+    this.id = id;
+    this.values = values;
+  }
+
+  @Override
+  public int length() {
+    return ids.length;
+  }
+
+  // Returns a live view over index i; writes through the view mutate this array
+  @Override
+  public Preference get(int i) {
+    return new PreferenceView(i);
+  }
+
+  // Note: setting any single element also overwrites the shared user ID
+  @Override
+  public void set(int i, Preference pref) {
+    id = pref.getUserID();
+    ids[i] = pref.getItemID();
+    values[i] = pref.getValue();
+  }
+
+  // The index is ignored: every preference shares the one user ID
+  @Override
+  public long getUserID(int i) {
+    return id;
+  }
+
+  /**
+   * {@inheritDoc}
+   * 
+   * Note that this method will actually set the user ID for <em>all</em> preferences.
+   */
+  @Override
+  public void setUserID(int i, long userID) {
+    id = userID;
+  }
+
+  @Override
+  public long getItemID(int i) {
+    return ids[i];
+  }
+
+  @Override
+  public void setItemID(int i, long itemID) {
+    ids[i] = itemID;
+  }
+
+  /**
+   * @return all item IDs
+   */
+  @Override
+  public long[] getIDs() {
+    return ids;
+  }
+
+  @Override
+  public float getValue(int i) {
+    return values[i];
+  }
+
+  @Override
+  public void setValue(int i, float value) {
+    values[i] = value;
+  }
+
+  // Intentional no-op: all preferences have the same user ID, so any order
+  // is already "sorted by user"
+  @Override
+  public void sortByUser() { }
+
+  @Override
+  public void sortByItem() {
+    lateralSort(ITEM);
+  }
+
+  @Override
+  public void sortByValue() {
+    lateralSort(VALUE);
+  }
+
+  @Override
+  public void sortByValueReversed() {
+    lateralSort(VALUE_REVERSED);
+  }
+
+  @Override
+  public boolean hasPrefWithUserID(long userID) {
+    return id == userID;
+  }
+
+  // Linear scan over the item IDs; O(n) in the number of preferences
+  @Override
+  public boolean hasPrefWithItemID(long itemID) {
+    for (long id : ids) {
+      if (itemID == id) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Sorts ids[] and values[] together (keeping the parallel arrays in step)
+  // using the comparison selected by 'type': ITEM, VALUE or VALUE_REVERSED.
+  private void lateralSort(int type) {
+    //Comb sort: http://en.wikipedia.org/wiki/Comb_sort
+    int length = length();
+    int gap = length;
+    boolean swapped = false;
+    while (gap > 1 || swapped) {
+      if (gap > 1) {
+        // int /= double truncates toward zero, shrinking the gap each pass
+        gap /= 1.247330950103979; // = 1 / (1 - 1/e^phi)
+      }
+      swapped = false;
+      int max = length - gap;
+      for (int i = 0; i < max; i++) {
+        int other = i + gap;
+        if (isLess(other, i, type)) {
+          swap(i, other);
+          swapped = true;
+        }
+      }
+    }
+  }
+
+  // Comparison predicate for lateralSort(); note VALUE_REVERSED inverts '<'
+  private boolean isLess(int i, int j, int type) {
+    switch (type) {
+      case ITEM:
+        return ids[i] < ids[j];
+      case VALUE:
+        return values[i] < values[j];
+      case VALUE_REVERSED:
+        return values[i] > values[j];
+      default:
+        throw new IllegalStateException();
+    }
+  }
+
+  // Swaps entries i and j in both parallel arrays
+  private void swap(int i, int j) {
+    long temp1 = ids[i];
+    float temp2 = values[i];
+    ids[i] = ids[j];
+    values[i] = values[j];
+    ids[j] = temp1;
+    values[j] = temp2;
+  }
+
+  // Shallow fields are copied via the private copy constructor; arrays are cloned
+  @Override
+  public GenericUserPreferenceArray clone() {
+    return new GenericUserPreferenceArray(ids.clone(), id, values.clone());
+  }
+
+  // NOTE(review): uses '>>' (arithmetic shift) rather than the conventional
+  // '>>>' to fold the long user ID; harmless since both halves are XORed,
+  // but confirm this matches the convention used elsewhere in the codebase.
+  @Override
+  public int hashCode() {
+    return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids) ^ Arrays.hashCode(values);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof GenericUserPreferenceArray)) {
+      return false;
+    }
+    GenericUserPreferenceArray otherArray = (GenericUserPreferenceArray) other;
+    return id == otherArray.id && Arrays.equals(ids, otherArray.ids) && Arrays.equals(values, otherArray.values);
+  }
+
+  /**
+   * Returns an iterator of live {@code PreferenceView}s over each index,
+   * avoiding allocation of full {@link Preference} objects per element.
+   */
+  @Override
+  public Iterator<Preference> iterator() {
+    return Iterators.transform(new CountingIterator(length()),
+      new Function<Integer, Preference>() {
+        @Override
+        public Preference apply(Integer from) {
+          return new PreferenceView(from);
+        }
+      });
+  }
+
+  // Renders as GenericUserPreferenceArray[userID:<id>,{item=value,...}]
+  @Override
+  public String toString() {
+    if (ids == null || ids.length == 0) {
+      return "GenericUserPreferenceArray[{}]";
+    }
+    StringBuilder result = new StringBuilder(20 * ids.length);
+    result.append("GenericUserPreferenceArray[userID:");
+    result.append(id);
+    result.append(",{");
+    for (int i = 0; i < ids.length; i++) {
+      if (i > 0) {
+        result.append(',');
+      }
+      result.append(ids[i]);
+      result.append('=');
+      result.append(values[i]);
+    }
+    result.append("}]");
+    return result.toString();
+  }
+
+  /**
+   * A lightweight {@link Preference} view over one index of the enclosing
+   * parallel arrays; setValue writes straight through to values[i].
+   */
+  private final class PreferenceView implements Preference {
+
+    private final int i;
+
+    private PreferenceView(int i) {
+      this.i = i;
+    }
+
+    @Override
+    public long getUserID() {
+      return GenericUserPreferenceArray.this.getUserID(i);
+    }
+
+    @Override
+    public long getItemID() {
+      return GenericUserPreferenceArray.this.getItemID(i);
+    }
+
+    @Override
+    public float getValue() {
+      return values[i];
+    }
+
+    @Override
+    public void setValue(float value) {
+      values[i] = value;
+    }
+
+  }
+
+}


[17/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
new file mode 100644
index 0000000..8ea1660
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of the Pearson correlation. For users X and Y, the following values are calculated:
+ * </p>
+ *
+ * <ul>
+ * <li>sumX2: sum of the square of all X's preference values</li>
+ * <li>sumY2: sum of the square of all Y's preference values</li>
+ * <li>sumXY: sum of the product of X and Y's preference value for all items for which both X and Y express a
+ * preference</li>
+ * </ul>
+ *
+ * <p>
+ * The correlation is then:
+ *
+ * <p>
+ * {@code sumXY / sqrt(sumX2 * sumY2)}
+ * </p>
+ *
+ * <p>
+ * Note that this correlation "centers" its data, shifts the user's preference values so that each of their
+ * means is 0. This is necessary to achieve expected behavior on all data sets.
+ * </p>
+ *
+ * <p>
+ * This correlation implementation is equivalent to the cosine similarity since the data it receives
+ * is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle
+ * between the two vectors defined by the users' preference values.
+ * </p>
+ *
+ * <p>
+ * For cosine similarity on uncentered data, see {@link UncenteredCosineSimilarity}.
+ * </p> 
+ */
+public final class PearsonCorrelationSimilarity extends AbstractSimilarity {
+
+  /**
+   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+   */
+  public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException {
+    this(dataModel, Weighting.UNWEIGHTED);
+  }
+
+  /**
+   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+   */
+  public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+    // 'true' asks the superclass to center the data (subtract each mean);
+    // see AbstractSimilarity for the exact contract — confirm there
+    super(dataModel, weighting, true);
+    Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+  }
+  
+  /**
+   * @return {@code sumXY / sqrt(sumX2 * sumY2)}, or {@code Double.NaN} when
+   *         {@code n == 0} or either centered vector has zero magnitude
+   */
+  @Override
+  double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+    if (n == 0) {
+      return Double.NaN;
+    }
+    // Note that sum of X and sum of Y don't appear here since they are assumed to be 0;
+    // the data is assumed to be centered.
+    double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
+    if (denominator == 0.0) {
+      // One or both parties has -all- the same ratings;
+      // can't really say much similarity under this measure
+      return Double.NaN;
+    }
+    return sumXY / denominator;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
new file mode 100644
index 0000000..1116368
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Like {@link PearsonCorrelationSimilarity}, but compares relative ranking of preference values instead of
+ * preference values themselves. That is, each user's preferences are sorted and then assign a rank as their
+ * preference value, with 1 being assigned to the least preferred item.
+ * </p>
+ */
+public final class SpearmanCorrelationSimilarity implements UserSimilarity {
+  
+  private final DataModel dataModel;
+  
+  public SpearmanCorrelationSimilarity(DataModel dataModel) {
+    this.dataModel = Preconditions.checkNotNull(dataModel);
+  }
+  
+  /**
+   * Computes Spearman rank correlation between two users' preferences:
+   * clones both preference arrays, replaces values with ranks over the items
+   * the users have in common, then applies 1 - 6*sum(d^2) / (n*(n^2-1)).
+   *
+   * @return the correlation, or {@code Double.NaN} if either user has fewer
+   *         than two preferences or the users share fewer than two items
+   * @throws TasteException if the {@link DataModel} cannot supply preferences
+   */
+  @Override
+  public double userSimilarity(long userID1, long userID2) throws TasteException {
+    PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
+    PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
+    int xLength = xPrefs.length();
+    int yLength = yPrefs.length();
+    
+    if (xLength <= 1 || yLength <= 1) {
+      return Double.NaN;
+    }
+    
+    // Copy prefs since we need to modify pref values to ranks
+    xPrefs = xPrefs.clone();
+    yPrefs = yPrefs.clone();
+    
+    // First sort by values from low to high
+    xPrefs.sortByValue();
+    yPrefs.sortByValue();
+    
+    // Assign ranks from low to high
+    float nextRank = 1.0f;
+    for (int i = 0; i < xLength; i++) {
+      // ... but only for items that are common to both pref arrays
+      if (yPrefs.hasPrefWithItemID(xPrefs.getItemID(i))) {
+        xPrefs.setValue(i, nextRank);
+        nextRank += 1.0f;
+      }
+      // Other values are bogus but don't matter
+    }
+    nextRank = 1.0f;
+    for (int i = 0; i < yLength; i++) {
+      if (xPrefs.hasPrefWithItemID(yPrefs.getItemID(i))) {
+        yPrefs.setValue(i, nextRank);
+        nextRank += 1.0f;
+      }
+    }
+    
+    // Re-sort by item ID so both arrays can be merged in one linear pass
+    xPrefs.sortByItem();
+    yPrefs.sortByItem();
+    
+    long xIndex = xPrefs.getItemID(0);
+    long yIndex = yPrefs.getItemID(0);
+    int xPrefIndex = 0;
+    int yPrefIndex = 0;
+    
+    double sumXYRankDiff2 = 0.0;
+    int count = 0;
+    
+    // Merge-join over the two item-sorted arrays, accumulating squared rank
+    // differences for items present in both
+    while (true) {
+      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
+      if (compare == 0) {
+        double diff = xPrefs.getValue(xPrefIndex) - yPrefs.getValue(yPrefIndex);
+        sumXYRankDiff2 += diff * diff;
+        count++;
+      }
+      if (compare <= 0) {
+        if (++xPrefIndex >= xLength) {
+          break;
+        }
+        xIndex = xPrefs.getItemID(xPrefIndex);
+      }
+      if (compare >= 0) {
+        if (++yPrefIndex >= yLength) {
+          break;
+        }
+        yIndex = yPrefs.getItemID(yPrefIndex);
+      }
+    }
+    
+    if (count <= 1) {
+      return Double.NaN;
+    }
+    
+    // NOTE(review): count * (count * count - 1) is evaluated in int
+    // arithmetic and can overflow for count > ~1290 common items; confirm
+    // the expected scale of overlapping preferences before relying on this.
+    // When ranks are unique, this formula actually gives the Pearson correlation
+    return 1.0 - 6.0 * sumXYRankDiff2 / (count * (count * count - 1));
+  }
+  
+  /**
+   * @throws UnsupportedOperationException always; inference is not supported here
+   */
+  @Override
+  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+    RefreshHelper.maybeRefresh(alreadyRefreshed, dataModel);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
new file mode 100644
index 0000000..0c3a0a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * <p>
+ * An implementation of a "similarity" based on the <a
+ * href="http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_coefficient_.28extended_Jaccard_coefficient.29">
+ * Tanimoto coefficient</a>, or extended <a href="http://en.wikipedia.org/wiki/Jaccard_index">Jaccard
+ * coefficient</a>.
+ * </p>
+ * 
+ * <p>
+ * This is intended for "binary" data sets where a user either expresses a generic "yes" preference for an
+ * item or has no preference. The actual preference values do not matter here, only their presence or absence.
+ * </p>
+ * 
+ * <p>
+ * The value returned is in [0,1].
+ * </p>
+ */
+public final class TanimotoCoefficientSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+  public TanimotoCoefficientSimilarity(DataModel dataModel) {
+    super(dataModel);
+  }
+  
+  /**
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /**
+   * Computes |intersection| / |union| over the two users' item ID sets.
+   *
+   * @return NaN if both users have no preferences or the sets don't intersect;
+   *         0.0 if exactly one user has no preferences; otherwise a value in (0,1]
+   * @throws TasteException if the {@link DataModel} cannot supply item IDs
+   */
+  @Override
+  public double userSimilarity(long userID1, long userID2) throws TasteException {
+
+    DataModel dataModel = getDataModel();
+    FastIDSet xPrefs = dataModel.getItemIDsFromUser(userID1);
+    FastIDSet yPrefs = dataModel.getItemIDsFromUser(userID2);
+
+    int xPrefsSize = xPrefs.size();
+    int yPrefsSize = yPrefs.size();
+    if (xPrefsSize == 0 && yPrefsSize == 0) {
+      return Double.NaN;
+    }
+    if (xPrefsSize == 0 || yPrefsSize == 0) {
+      return 0.0;
+    }
+    
+    // Intersect from the larger set's side; presumably the cheaper direction
+    // for FastIDSet.intersectionSize — confirm against its implementation
+    int intersectionSize =
+        xPrefsSize < yPrefsSize ? yPrefs.intersectionSize(xPrefs) : xPrefs.intersectionSize(yPrefs);
+    if (intersectionSize == 0) {
+      return Double.NaN;
+    }
+    
+    // |union| = |X| + |Y| - |X ∩ Y| (inclusion-exclusion)
+    int unionSize = xPrefsSize + yPrefsSize - intersectionSize;
+    
+    return (double) intersectionSize / (double) unionSize;
+  }
+  
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1);
+    return doItemSimilarity(itemID1, itemID2, preferring1);
+  }
+
+  // Batch variant: fetches the itemID1 count once and reuses it per pair
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1);
+    int length = itemID2s.length;
+    double[] result = new double[length];
+    for (int i = 0; i < length; i++) {
+      result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1);
+    }
+    return result;
+  }
+
+  // Tanimoto over user sets of two items: |both| / (|1| + |2| - |both|);
+  // NaN when no user prefers both items
+  private double doItemSimilarity(long itemID1, long itemID2, int preferring1) throws TasteException {
+    DataModel dataModel = getDataModel();
+    int preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+    if (preferring1and2 == 0) {
+      return Double.NaN;
+    }
+    int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+    return (double) preferring1and2 / (double) (preferring1 + preferring2 - preferring1and2);
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+    RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel());
+  }
+  
+  @Override
+  public String toString() {
+    return "TanimotoCoefficientSimilarity[dataModel:" + getDataModel() + ']';
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
new file mode 100644
index 0000000..6260606
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of the cosine similarity. The result is the cosine of the angle formed between
+ * the two preference vectors.
+ * </p>
+ *
+ * <p>
+ * Note that this similarity does not "center" its data, shifts the user's preference values so that each of their
+ * means is 0. For this behavior, use {@link PearsonCorrelationSimilarity}, which actually is mathematically
+ * equivalent for centered data.
+ * </p>
+ */
+public final class UncenteredCosineSimilarity extends AbstractSimilarity {
+
+  /**
+   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+   */
+  public UncenteredCosineSimilarity(DataModel dataModel) throws TasteException {
+    this(dataModel, Weighting.UNWEIGHTED);
+  }
+
+  /**
+   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+   */
+  public UncenteredCosineSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+    // 'false' here is the only difference from PearsonCorrelationSimilarity:
+    // the superclass is told not to center the data
+    super(dataModel, weighting, false);
+    Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+  }
+
+  /**
+   * @return {@code sumXY / sqrt(sumX2 * sumY2)}, or {@code Double.NaN} when
+   *         {@code n == 0} or either vector has zero magnitude
+   */
+  @Override
+  double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+    if (n == 0) {
+      return Double.NaN;
+    }
+    double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
+    if (denominator == 0.0) {
+      // One or both parties has -all- the same ratings;
+      // can't really say much similarity under this measure
+      return Double.NaN;
+    }
+    return sumXY / denominator;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
new file mode 100644
index 0000000..1ae45c2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * {@link Iterable} to be able to read a file linewise into a {@link GenericItemSimilarity}
+ */
+final class FileItemItemSimilarityIterable implements Iterable<GenericItemSimilarity.ItemItemSimilarity> {
+
+  private final File similaritiesFile;
+
+  FileItemItemSimilarityIterable(File similaritiesFile) {
+    this.similaritiesFile = similaritiesFile;
+  }
+
+  /**
+   * Opens the file and returns a fresh parsing iterator over its lines.
+   * Any {@link IOException} while opening is rethrown unchecked so this can
+   * satisfy the {@link Iterable} contract, which allows no checked exceptions.
+   */
+  @Override
+  public Iterator<GenericItemSimilarity.ItemItemSimilarity> iterator() {
+    try {
+      return new FileItemItemSimilarityIterator(similaritiesFile);
+    } catch (IOException ioe) {
+      throw new IllegalStateException("Can't read " + similaritiesFile, ioe);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
new file mode 100644
index 0000000..c071159
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.common.iterator.FileLineIterator;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+/**
+ * a simple iterator using a {@link FileLineIterator} internally, parsing each
+ * line into an {@link GenericItemSimilarity.ItemItemSimilarity}. Each line is
+ * expected to contain two item IDs and a similarity value separated by a comma
+ * or a tab; parsing is lazy, so a malformed line only surfaces (as an unchecked
+ * exception) when the iterator reaches it.
+ */
+final class FileItemItemSimilarityIterator extends ForwardingIterator<GenericItemSimilarity.ItemItemSimilarity> {
+
+  // Field separator: either a comma or a tab character.
+  private static final Pattern SEPARATOR = Pattern.compile("[,\t]");
+
+  private final Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate;
+
+  FileItemItemSimilarityIterator(File similaritiesFile) throws IOException {
+    // Lazily transform each raw line into an ItemItemSimilarity as it is consumed.
+    // NOTE(review): the FileLineIterator is never explicitly closed here; it is
+    // presumed to release its file handle once fully consumed -- confirm before
+    // using this iterator in a code path that may abandon it early.
+    delegate = Iterators.transform(
+        new FileLineIterator(similaritiesFile),
+        new Function<String, GenericItemSimilarity.ItemItemSimilarity>() {
+          @Override
+          public GenericItemSimilarity.ItemItemSimilarity apply(String from) {
+            // tokens: [itemID1, itemID2, similarity]
+            String[] tokens = SEPARATOR.split(from);
+            return new GenericItemSimilarity.ItemItemSimilarity(Long.parseLong(tokens[0]),
+                                                                Long.parseLong(tokens[1]),
+                                                                Double.parseDouble(tokens[2]));
+          }
+        });
+  }
+
+  @Override
+  protected Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate() {
+    return delegate;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
new file mode 100644
index 0000000..712b96a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An {@link ItemSimilarity} backed by a comma-delimited file. This class typically expects a file where each line
+ * contains an item ID, followed by another item ID, followed by a similarity value, separated by commas. You may also
+ * use tabs.
+ * </p>
+ *
+ * <p>
+ * The similarity value is assumed to be parseable as a {@code double} having a value between -1 and 1. The
+ * item IDs are parsed as {@code long}s. Similarities are symmetric so for a pair of items you do not have to
+ * include 2 lines in the file.
+ * </p>
+ *
+ * <p>
+ * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
+ * has been reloaded very recently already.
+ * </p>
+ *
+ * <p>
+ * This class is not intended for use with very large amounts of data. For that, a JDBC-backed {@link ItemSimilarity}
+ * and a database are more appropriate.
+ * </p>
+ */
+public class FileItemSimilarity implements ItemSimilarity {
+
+  public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute
+
+  // volatile: query threads read this field while refresh() may concurrently
+  // swap in a freshly loaded instance from another thread; without volatile
+  // there is no guarantee the swap ever becomes visible to readers.
+  private volatile ItemSimilarity delegate;
+  private final ReentrantLock reloadLock;
+  private final File dataFile;
+  // Modification time of dataFile at the last (re)load; volatile because it is
+  // written under reloadLock but read without the lock in refresh().
+  private volatile long lastModified;
+  private final long minReloadIntervalMS;
+
+  private static final Logger log = LoggerFactory.getLogger(FileItemSimilarity.class);
+
+  /**
+   * @param dataFile
+   *          file containing the similarity data
+   */
+  public FileItemSimilarity(File dataFile) {
+    this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS);
+  }
+
+  /**
+   * @param dataFile
+   *          file containing the similarity data
+   * @param minReloadIntervalMS
+   *          the minimum interval in milliseconds after which a full reload of the original datafile is done
+   *          when refresh() is called
+   * @see #FileItemSimilarity(File)
+   */
+  public FileItemSimilarity(File dataFile, long minReloadIntervalMS) {
+    Preconditions.checkArgument(dataFile != null, "dataFile is null");
+    Preconditions.checkArgument(dataFile.exists() && !dataFile.isDirectory(),
+      "dataFile is missing or a directory: %s", dataFile);
+
+    log.info("Creating FileItemSimilarity for file {}", dataFile);
+
+    this.dataFile = dataFile.getAbsoluteFile();
+    this.lastModified = dataFile.lastModified();
+    this.minReloadIntervalMS = minReloadIntervalMS;
+    this.reloadLock = new ReentrantLock();
+
+    // Load eagerly so the delegate is usable as soon as construction finishes.
+    reload();
+  }
+
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    return delegate.itemSimilarities(itemID1, itemID2s);
+  }
+
+  @Override
+  public long[] allSimilarItemIDs(long itemID) throws TasteException {
+    return delegate.allSimilarItemIDs(itemID);
+  }
+
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    return delegate.itemSimilarity(itemID1, itemID2);
+  }
+
+  /**
+   * Reloads the similarity data from the backing file, but only if the file's modification time is more than
+   * {@code minReloadIntervalMS} milliseconds past the last load, to avoid needless re-reads.
+   */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    if (dataFile.lastModified() > lastModified + minReloadIntervalMS) {
+      log.debug("File has changed; reloading...");
+      reload();
+    }
+  }
+
+  /**
+   * Replaces {@link #delegate} with a {@link GenericItemSimilarity} built from the current file contents.
+   * If another thread is already reloading, this call returns immediately without waiting; readers keep
+   * using the previous delegate until that reload completes.
+   */
+  protected void reload() {
+    if (reloadLock.tryLock()) {
+      try {
+        // Capture the timestamp before parsing, so a concurrent file update during
+        // the parse still triggers another reload on the next refresh().
+        long newLastModified = dataFile.lastModified();
+        delegate = new GenericItemSimilarity(new FileItemItemSimilarityIterable(dataFile));
+        lastModified = newLastModified;
+      } finally {
+        reloadLock.unlock();
+      }
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "FileItemSimilarity[dataFile:" + dataFile + ']';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java
new file mode 100644
index 0000000..631ec9b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.precompute;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+
+import com.google.common.io.Closeables;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItems;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItemsWriter;
+
+/**
+ * Persist the precomputed item similarities to a file that can later be used
+ * by a {@link org.apache.mahout.cf.taste.impl.similarity.file.FileItemSimilarity}.
+ * Every similar item becomes one line of the form "itemID,similarItemID,similarity".
+ */
+public class FileSimilarItemsWriter implements SimilarItemsWriter {
+
+  private final File file;
+  private BufferedWriter writer;
+
+  public FileSimilarItemsWriter(File file) {
+    this.file = file;
+  }
+
+  @Override
+  public void open() throws IOException {
+    // UTF-8 explicitly, so the output does not depend on the platform default charset.
+    FileOutputStream stream = new FileOutputStream(file);
+    writer = new BufferedWriter(new OutputStreamWriter(stream, Charsets.UTF_8));
+  }
+
+  @Override
+  public void add(SimilarItems similarItems) throws IOException {
+    String itemID = String.valueOf(similarItems.getItemID());
+    for (SimilarItem similarItem : similarItems.getSimilarItems()) {
+      StringBuilder line = new StringBuilder(itemID);
+      line.append(',').append(similarItem.getItemID());
+      line.append(',').append(similarItem.getSimilarity());
+      writer.write(line.toString());
+      writer.newLine();
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    // swallowIOException=false: propagate any failure while flushing/closing.
+    Closeables.close(writer, false);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java
new file mode 100644
index 0000000..b7b52cf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java
@@ -0,0 +1,230 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.precompute;
+
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.similarity.precompute.BatchItemSimilarities;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItems;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItemsWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Precompute item similarities in parallel on a single machine. The recommender given to this class must use a
+ * DataModel that holds the interactions in memory (such as
+ * {@link org.apache.mahout.cf.taste.impl.model.GenericDataModel} or
+ * {@link org.apache.mahout.cf.taste.impl.model.file.FileDataModel}) as fast random access to the data is required
+ */
+public class MultithreadedBatchItemSimilarities extends BatchItemSimilarities {
+
+  // Number of item IDs handed to a worker thread at a time; assigned once in the
+  // constructor and never modified afterwards.
+  private int batchSize;
+
+  private static final int DEFAULT_BATCH_SIZE = 100;
+
+  private static final Logger log = LoggerFactory.getLogger(MultithreadedBatchItemSimilarities.class);
+
+  /**
+   * @param recommender recommender to use
+   * @param similarItemsPerItem number of similar items to compute per item
+   */
+  public MultithreadedBatchItemSimilarities(ItemBasedRecommender recommender, int similarItemsPerItem) {
+    this(recommender, similarItemsPerItem, DEFAULT_BATCH_SIZE);
+  }
+
+  /**
+   * @param recommender recommender to use
+   * @param similarItemsPerItem number of similar items to compute per item
+   * @param batchSize size of item batches sent to worker threads
+   */
+  public MultithreadedBatchItemSimilarities(ItemBasedRecommender recommender, int similarItemsPerItem, int batchSize) {
+    super(recommender, similarItemsPerItem);
+    this.batchSize = batchSize;
+  }
+
+  /**
+   * Producer/consumer pipeline: {@code degreeOfParallelism} worker threads take batches of item IDs off a
+   * pre-filled queue and compute the most similar items for each; one extra Output thread drains the result
+   * queue and passes every result to {@code writer} -- hence the pool size of {@code degreeOfParallelism + 1}.
+   * The calling thread blocks until all work finishes or {@code maxDurationInHours} elapses.
+   */
+  @Override
+  public int computeItemSimilarities(int degreeOfParallelism, int maxDurationInHours, SimilarItemsWriter writer)
+    throws IOException {
+
+    ExecutorService executorService = Executors.newFixedThreadPool(degreeOfParallelism + 1);
+
+    Output output = null;
+    try {
+      writer.open();
+
+      DataModel dataModel = getRecommender().getDataModel();
+
+      BlockingQueue<long[]> itemsIDsInBatches = queueItemIDsInBatches(dataModel, batchSize, degreeOfParallelism);
+      BlockingQueue<List<SimilarItems>> results = new LinkedBlockingQueue<>();
+
+      // Workers decrement this when done, so the Output thread knows when to stop.
+      AtomicInteger numActiveWorkers = new AtomicInteger(degreeOfParallelism);
+      for (int n = 0; n < degreeOfParallelism; n++) {
+        executorService.execute(new SimilarItemsWorker(n, itemsIDsInBatches, results, numActiveWorkers));
+      }
+
+      output = new Output(results, writer, numActiveWorkers);
+      executorService.execute(output);
+
+    } catch (Exception e) {
+      throw new IOException(e);
+    } finally {
+      // Always shut down and wait; the deadline applies to the whole computation.
+      executorService.shutdown();
+      try {
+        boolean succeeded = executorService.awaitTermination(maxDurationInHours, TimeUnit.HOURS);
+        if (!succeeded) {
+          throw new RuntimeException("Unable to complete the computation in " + maxDurationInHours + " hours!");
+        }
+      } catch (InterruptedException e) {
+        throw new RuntimeException(e);
+      }
+      Closeables.close(writer, false);
+    }
+
+    // Only reached after awaitTermination succeeded, so output is non-null and its
+    // counter is safely published to this thread.
+    return output.getNumSimilaritiesProcessed();
+  }
+
+  /**
+   * Splits all item IDs of the model into batches of {@code batchSize} (the last batch may be smaller) and
+   * queues them for the workers.
+   *
+   * @throws IllegalStateException if there are fewer batches than workers
+   */
+  private static BlockingQueue<long[]> queueItemIDsInBatches(DataModel dataModel, int batchSize,
+                                                             int degreeOfParallelism)
+      throws TasteException {
+
+    LongPrimitiveIterator itemIDs = dataModel.getItemIDs();
+    int numItems = dataModel.getNumItems();
+
+    BlockingQueue<long[]> itemIDBatches = new LinkedBlockingQueue<>((numItems / batchSize) + 1);
+
+    long[] batch = new long[batchSize];
+    int pos = 0;
+    while (itemIDs.hasNext()) {
+      batch[pos] = itemIDs.nextLong();
+      pos++;
+      if (pos == batchSize) {
+        // clone() because the buffer array is reused for the next batch
+        itemIDBatches.add(batch.clone());
+        pos = 0;
+      }
+    }
+
+    if (pos > 0) {
+      long[] lastBatch = new long[pos];
+      System.arraycopy(batch, 0, lastBatch, 0, pos);
+      itemIDBatches.add(lastBatch);
+    }
+
+    if (itemIDBatches.size() < degreeOfParallelism) {
+      throw new IllegalStateException("Degree of parallelism [" + degreeOfParallelism + "] " +
+          " is larger than number of batches [" + itemIDBatches.size() +"].");
+    }
+
+    log.info("Queued {} items in {} batches", numItems, itemIDBatches.size());
+
+    return itemIDBatches;
+  }
+
+
+  // Drains the result queue and writes every computed SimilarItems, looping until
+  // all workers have finished AND the queue is empty.
+  private static class Output implements Runnable {
+
+    private final BlockingQueue<List<SimilarItems>> results;
+    private final SimilarItemsWriter writer;
+    private final AtomicInteger numActiveWorkers;
+    private int numSimilaritiesProcessed = 0;
+
+    Output(BlockingQueue<List<SimilarItems>> results, SimilarItemsWriter writer, AtomicInteger numActiveWorkers) {
+      this.results = results;
+      this.writer = writer;
+      this.numActiveWorkers = numActiveWorkers;
+    }
+
+    private int getNumSimilaritiesProcessed() {
+      return numSimilaritiesProcessed;
+    }
+
+    @Override
+    public void run() {
+      while (numActiveWorkers.get() != 0 || !results.isEmpty()) {
+        try {
+          // Short poll timeout so the loop re-checks the termination condition regularly.
+          List<SimilarItems> similarItemsOfABatch = results.poll(10, TimeUnit.MILLISECONDS);
+          if (similarItemsOfABatch != null) {
+            for (SimilarItems similarItems : similarItemsOfABatch) {
+              writer.add(similarItems);
+              numSimilaritiesProcessed += similarItems.numSimilarItems();
+            }
+          }
+        } catch (Exception e) {
+          throw new RuntimeException(e);
+        }
+      }
+    }
+  }
+
+  // Takes batches off the queue and computes the most similar items for every ID in the batch.
+  private class SimilarItemsWorker implements Runnable {
+
+    private final int number;
+    private final BlockingQueue<long[]> itemIDBatches;
+    private final BlockingQueue<List<SimilarItems>> results;
+    private final AtomicInteger numActiveWorkers;
+
+    SimilarItemsWorker(int number, BlockingQueue<long[]> itemIDBatches, BlockingQueue<List<SimilarItems>> results,
+        AtomicInteger numActiveWorkers) {
+      this.number = number;
+      this.itemIDBatches = itemIDBatches;
+      this.results = results;
+      this.numActiveWorkers = numActiveWorkers;
+    }
+
+    @Override
+    public void run() {
+
+      int numBatchesProcessed = 0;
+      // NOTE(review): there is a window between isEmpty() and take() in which another
+      // worker can drain the last batch, leaving this take() blocked until the overall
+      // deadline expires -- consider poll() with a timeout instead; confirm before changing.
+      while (!itemIDBatches.isEmpty()) {
+        try {
+          long[] itemIDBatch = itemIDBatches.take();
+
+          List<SimilarItems> similarItemsOfBatch = new ArrayList<>(itemIDBatch.length);
+          for (long itemID : itemIDBatch) {
+            List<RecommendedItem> similarItems = getRecommender().mostSimilarItems(itemID, getSimilarItemsPerItem());
+            similarItemsOfBatch.add(new SimilarItems(itemID, similarItems));
+          }
+
+          results.offer(similarItemsOfBatch);
+
+          if (++numBatchesProcessed % 5 == 0) {
+            log.info("worker {} processed {} batches", number, numBatchesProcessed);
+          }
+
+        } catch (Exception e) {
+          throw new RuntimeException(e);
+        }
+      }
+      log.info("worker {} processed {} batches. done.", number, numBatchesProcessed);
+      numActiveWorkers.decrementAndGet();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java
new file mode 100644
index 0000000..022d02d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+
+/**
+ * <p>
+ * Implementations represent a repository of information about users and their associated {@link Preference}s
+ * for items.
+ * </p>
+ */
+public interface DataModel extends Refreshable, Serializable {
+  
+  /**
+   * @return all user IDs in the model, in order
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  LongPrimitiveIterator getUserIDs() throws TasteException;
+  
+  /**
+   * @param userID
+   *          ID of user to get prefs for
+   * @return user's preferences, ordered by item ID
+   * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+   *           if the user does not exist
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  PreferenceArray getPreferencesFromUser(long userID) throws TasteException;
+  
+  /**
+   * @param userID
+   *          ID of user to get prefs for
+   * @return IDs of items user expresses a preference for
+   * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+   *           if the user does not exist
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  FastIDSet getItemIDsFromUser(long userID) throws TasteException;
+  
+  /**
+   * @return a {@link LongPrimitiveIterator} of all item IDs in the model, in order
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  LongPrimitiveIterator getItemIDs() throws TasteException;
+  
+  /**
+   * @param itemID
+   *          item ID
+   * @return all existing {@link Preference}s expressed for that item, ordered by user ID, as an array
+   * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+   *           if the item does not exist
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  PreferenceArray getPreferencesForItem(long itemID) throws TasteException;
+  
+  /**
+   * Retrieves the preference value for a single user and item.
+   * 
+   * @param userID
+   *          user ID to get pref value from
+   * @param itemID
+   *          item ID to get pref value for
+   * @return preference value from the given user for the given item or null if none exists
+   * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+   *           if the user does not exist
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  Float getPreferenceValue(long userID, long itemID) throws TasteException;
+
+  /**
+   * Retrieves the time at which a preference value from a user and item was set, if known.
+   * Time is expressed in the usual way, as a number of milliseconds since the epoch.
+   *
+   * @param userID user ID for preference in question
+   * @param itemID item ID for preference in question
+   * @return time at which preference was set or null if no preference exists or its time is not known
+   * @throws org.apache.mahout.cf.taste.common.NoSuchUserException if the user does not exist
+   * @throws TasteException if an error occurs while accessing the data
+   */
+  Long getPreferenceTime(long userID, long itemID) throws TasteException;
+  
+  /**
+   * @return total number of items known to the model. This is generally the union of all items preferred by
+   *         at least one user but could include more.
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  int getNumItems() throws TasteException;
+  
+  /**
+   * @return total number of users known to the model.
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  int getNumUsers() throws TasteException;
+  
+  /**
+   * @param itemID item ID to check for
+   * @return the number of users who have expressed a preference for the item
+   * @throws TasteException if an error occurs while accessing the data
+   */
+  int getNumUsersWithPreferenceFor(long itemID) throws TasteException;
+
+  /**
+   * @param itemID1 first item ID to check for
+   * @param itemID2 second item ID to check for
+   * @return the number of users who have expressed a preference for the items
+   * @throws TasteException if an error occurs while accessing the data
+   */
+  int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException;
+  
+  /**
+   * <p>
+   * Sets a particular preference (item plus rating) for a user.
+   * </p>
+   * 
+   * @param userID
+   *          user to set preference for
+   * @param itemID
+   *          item to set preference for
+   * @param value
+   *          preference value
+   * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+   *           if the item does not exist
+   * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+   *           if the user does not exist
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  void setPreference(long userID, long itemID, float value) throws TasteException;
+  
+  /**
+   * <p>
+   * Removes a particular preference for a user.
+   * </p>
+   * 
+   * @param userID
+   *          user from which to remove preference
+   * @param itemID
+   *          item to remove preference for
+   * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+   *           if the item does not exist
+   * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+   *           if the user does not exist
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  void removePreference(long userID, long itemID) throws TasteException;
+
+  /**
+   * @return true if this implementation actually stores and returns distinct preference values;
+   *  that is, if it is not a 'boolean' DataModel
+   */
+  boolean hasPreferenceValues();
+
+  /**
+   * @return the maximum preference value that is possible in the current problem domain being evaluated. For
+   * example, if the domain is movie ratings on a scale of 1 to 5, this should be 5. While a
+   * {@link org.apache.mahout.cf.taste.recommender.Recommender} may estimate a preference value above 5.0, it
+   * isn't "fair" to consider that the system is actually suggesting an impossible rating of, say, 5.4 stars.
+   * In practice the application would cap this estimate to 5.0. Since evaluators evaluate
+   * the difference between estimated and actual value, this at least prevents this effect from unfairly
+   * penalizing a {@link org.apache.mahout.cf.taste.recommender.Recommender}
+   */
+  float getMaxPreference();
+
+  /**
+   * @return the minimum preference value that is possible in the current problem domain being evaluated
+   * @see #getMaxPreference()
+   */
+  float getMinPreference();
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
new file mode 100644
index 0000000..cc477fe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Mahout 0.2 changed the framework to operate only in terms of numeric (long) ID values for users and items.
+ * This is, obviously, not compatible with applications that used other key types -- most commonly
+ * {@link String}. Implementations of this interface provide support for mapping Strings to longs and vice versa in
+ * order to provide a smoother migration path to applications that must still use strings as IDs.
+ * </p>
+ * 
+ * <p>
+ * The mapping from strings to 64-bit numeric values is fixed here, to provide a standard implementation that
+ * is 'portable' or reproducible outside the framework easily. See {@link #toLongID(String)}.
+ * </p>
+ * 
+ * <p>
+ * Because this mapping is deterministically computable, it does not need to be stored. Indeed, subclasses'
+ * job is to store the reverse mapping. There are an infinite number of strings but only a fixed number of
+ * longs, so, it is possible for two strings to map to the same value. Subclasses do not treat this as an
+ * error but rather retain only the most recent mapping, overwriting a previous mapping. The probability of
+ * collision in a 64-bit space is quite small, but not zero. However, in the context of a collaborative
+ * filtering problem, the consequence of a collision is small, at worst -- perhaps one user receives another
+ * user's recommendations.
+ * </p>
+ * 
+ * @since 0.2
+ */
+public interface IDMigrator {
+  
+  /**
+   * @param stringID
+   *          the string ID to map to a numeric ID
+   * @return the top 8 bytes of the MD5 hash of the bytes of the given {@link String}'s UTF-8 encoding as a
+   *         long.
+   */
+  long toLongID(String stringID);
+  
+  /**
+   * @param longID
+   *          the long ID whose most recently stored string mapping is wanted
+   * @return the string ID most recently associated with the given long ID, or null if doesn't exist
+   * @throws TasteException
+   *           if an error occurs while retrieving the mapping
+   */
+  String toStringID(long longID) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java
new file mode 100644
index 0000000..e91ed48
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+
+public interface JDBCDataModel extends DataModel {
+  
+  /**
+   * @return {@link DataSource} underlying this model
+   */
+  DataSource getDataSource();
+  
+  /**
+   * Exports all preference data held by this model. This is most relevant for a database-backed
+   * implementation, whose data is not held in memory but which may wish to export its contents
+   * to an in-memory representation.
+   * 
+   * @return all user preference data
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  FastByIDMap<PreferenceArray> exportWithPrefs() throws TasteException;
+  
+  /**
+   * Like {@link #exportWithPrefs()}, but exports only the user-item associations, without
+   * preference values.
+   * 
+   * @return all user-item association data
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  FastByIDMap<FastIDSet> exportWithIDsOnly() throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java
new file mode 100644
index 0000000..fe0150a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+/**
+ * <p>
+ * A {@link Preference} encapsulates an item and a preference value, which indicates the strength of the
+ * preference for it. {@link Preference}s are associated to users.
+ * </p>
+ */
+public interface Preference {
+  
+  /** @return ID of the user who prefers the item */
+  long getUserID();
+  
+  /** @return ID of the item that is preferred */
+  long getItemID();
+  
+  /**
+   * @return strength of the preference for that item. Zero should indicate "no preference either way";
+   *         positive values indicate preference and negative values indicate dislike
+   */
+  float getValue();
+  
+  /**
+   * Sets the strength of the preference for this item
+   * 
+   * @param value
+   *          new preference value
+   */
+  void setValue(float value);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java
new file mode 100644
index 0000000..3886bc6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import java.io.Serializable;
+
+/**
+ * An alternate representation of an array of {@link Preference}. Implementations, in theory, can produce a
+ * more memory-efficient representation.
+ */
+public interface PreferenceArray extends Cloneable, Serializable, Iterable<Preference> {
+  
+  /**
+   * @return length of the "array", that is, the number of preferences it holds
+   */
+  int length();
+  
+  /**
+   * @param i
+   *          index
+   * @return a materialized {@link Preference} representation of the preference at i
+   */
+  Preference get(int i);
+  
+  /**
+   * Sets preference at i from information in the given {@link Preference}
+   * 
+   * @param i
+   *          index
+   * @param pref
+   *          preference whose data is copied into position i
+   */
+  void set(int i, Preference pref);
+  
+  /**
+   * @param i
+   *          index
+   * @return user ID from preference at i
+   */
+  long getUserID(int i);
+  
+  /**
+   * Sets user ID for preference at i.
+   * 
+   * @param i
+   *          index
+   * @param userID
+   *          new user ID
+   */
+  void setUserID(int i, long userID);
+  
+  /**
+   * @param i
+   *          index
+   * @return item ID from preference at i
+   */
+  long getItemID(int i);
+  
+  /**
+   * Sets item ID for preference at i.
+   * 
+   * @param i
+   *          index
+   * @param itemID
+   *          new item ID
+   */
+  void setItemID(int i, long itemID);
+
+  /**
+   * @return all user or item IDs held in the array; which of the two depends on the implementation
+   */
+  long[] getIDs();
+  
+  /**
+   * @param i
+   *          index
+   * @return preference value from preference at i
+   */
+  float getValue(int i);
+  
+  /**
+   * Sets preference value for preference at i.
+   * 
+   * @param i
+   *          index
+   * @param value
+   *          new preference value
+   */
+  void setValue(int i, float value);
+  
+  /**
+   * @return independent copy of this object
+   */
+  PreferenceArray clone();
+  
+  /**
+   * Sorts underlying array by user ID, ascending.
+   */
+  void sortByUser();
+  
+  /**
+   * Sorts underlying array by item ID, ascending.
+   */
+  void sortByItem();
+  
+  /**
+   * Sorts underlying array by preference value, ascending.
+   */
+  void sortByValue();
+  
+  /**
+   * Sorts underlying array by preference value, descending.
+   */
+  void sortByValueReversed();
+  
+  /**
+   * @param userID
+   *          user ID
+   * @return true if array contains a preference with given user ID
+   */
+  boolean hasPrefWithUserID(long userID);
+  
+  /**
+   * @param itemID
+   *          item ID
+   * @return true if array contains a preference with given item ID
+   */
+  boolean hasPrefWithItemID(long itemID);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java
new file mode 100644
index 0000000..ff29a34
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
+public interface UpdatableIDMigrator extends IDMigrator {
+  
+  /**
+   * Stores the reverse long-to-String mapping in some kind of backing store. Note that this must be called
+   * directly (or indirectly through {@link #initialize(Iterable)}) for every String that might be encountered
+   * in the application, or else the mapping will not be known.
+   *
+   * @param longID
+   *          long ID
+   * @param stringID
+   *          string ID that maps to/from that long ID
+   * @throws TasteException
+   *           if an error occurs while saving the mapping
+   */
+  void storeMapping(long longID, String stringID) throws TasteException;
+
+  /**
+   * Make the mapping aware of the given string IDs. This must be called initially before the implementation
+   * is used, or else it will not be aware of reverse long-to-String mappings.
+   *
+   * @param stringIDs
+   *          all string IDs the application may later need to reverse-map from their long form
+   * @throws TasteException
+   *           if an error occurs while storing the mappings
+   */
+  void initialize(Iterable<String> stringIDs) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java
new file mode 100644
index 0000000..2a143e1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.neighborhood;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface compute a "neighborhood" of users like a given user. This neighborhood
+ * can then be used to compute recommendations.
+ * </p>
+ */
+public interface UserNeighborhood extends Refreshable {
+  
+  /**
+   * @param userID
+   *          ID of the user for which a neighborhood will be computed
+   * @return IDs of users in the neighborhood
+   * @throws TasteException
+   *           if an error occurs while accessing data
+   */
+  long[] getUserNeighborhood(long userID) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java
new file mode 100644
index 0000000..ada1949
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+/**
+ * Used to retrieve all items that could possibly be recommended to the user
+ */
+public interface CandidateItemsStrategy extends Refreshable {
+
+  /**
+   * @param userID
+   *          ID of the user for whom candidate items are sought
+   * @param preferencesFromUser
+   *          the user's known preferences
+   * @param dataModel
+   *          data model from which candidates are drawn
+   * @param includeKnownItems
+   *          whether items the user already has a preference for may be included (inferred from the
+   *          parameter name; confirm against implementations)
+   * @return IDs of all items that could be recommended to the user
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  FastIDSet getCandidateItems(long userID, PreferenceArray preferencesFromUser, DataModel dataModel,
+     boolean includeKnownItems) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java
new file mode 100644
index 0000000..d9a9cf7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+/**
+ * <p>
+ * A {@link Rescorer} which operates on {@code long} primitive IDs, rather than arbitrary {@link Object}s.
+ * This is provided since most uses of this interface in the framework take IDs (as {@code long}) as an
+ * argument, and so this can be used to avoid unnecessary boxing/unboxing.
+ * </p>
+ */
+public interface IDRescorer {
+  
+  /**
+   * Adjusts the score assigned to a thing.
+   *
+   * @param id
+   *          ID of thing (user, item, etc.) to rescore
+   * @param originalScore
+   *          original score assigned to the thing
+   * @return modified score, or {@link Double#NaN} to indicate that this should be excluded entirely
+   */
+  double rescore(long id, double originalScore);
+  
+  /**
+   * Returns {@code true} to exclude the given thing.
+   *
+   * @param id
+   *          ID of thing (user, item, etc.) to rescore
+   * @return {@code true} to exclude, {@code false} otherwise
+   */
+  boolean isFiltered(long id);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java
new file mode 100644
index 0000000..570f851
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * <p>
+ * Interface implemented by "item-based" recommenders.
+ * </p>
+ */
+public interface ItemBasedRecommender extends Recommender {
+  
+  /**
+   * @param itemID
+   *          ID of item for which to find most similar other items
+   * @param howMany
+   *          desired number of most similar items to find
+   * @return items most similar to the given item, ordered from most similar to least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  List<RecommendedItem> mostSimilarItems(long itemID, int howMany) throws TasteException;
+  
+  /**
+   * @param itemID
+   *          ID of item for which to find most similar other items
+   * @param howMany
+   *          desired number of most similar items to find
+   * @param rescorer
+   *          {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar
+   *          items
+   * @return items most similar to the given item, ordered from most similar to least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  List<RecommendedItem> mostSimilarItems(long itemID, int howMany, Rescorer<LongPair> rescorer) throws TasteException;
+  
+  /**
+   * @param itemIDs
+   *          IDs of items for which to find most similar other items
+   * @param howMany
+   *          desired number of most similar items to find
+   * @return items most similar to the given items, ordered from most similar to least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany) throws TasteException;
+  
+  /**
+   * @param itemIDs
+   *          IDs of items for which to find most similar other items
+   * @param howMany
+   *          desired number of most similar items to find
+   * @param rescorer
+   *          {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar
+   *          items
+   * @return items most similar to the given items, ordered from most similar to least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  List<RecommendedItem> mostSimilarItems(long[] itemIDs,
+                                         int howMany,
+                                         Rescorer<LongPair> rescorer) throws TasteException;
+
+  /**
+   * @param itemIDs
+   *          IDs of items for which to find most similar other items
+   * @param howMany
+   *          desired number of most similar items to find
+   * @param excludeItemIfNotSimilarToAll
+   *          exclude an item if it is not similar to each of the input items
+   * @return items most similar to the given items, ordered from most similar to least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  List<RecommendedItem> mostSimilarItems(long[] itemIDs,
+                                         int howMany,
+                                         boolean excludeItemIfNotSimilarToAll) throws TasteException;
+
+  /**
+   * @param itemIDs
+   *          IDs of items for which to find most similar other items
+   * @param howMany
+   *          desired number of most similar items to find
+   * @param rescorer
+   *          {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar
+   *          items
+   * @param excludeItemIfNotSimilarToAll
+   *          exclude an item if it is not similar to each of the input items
+   * @return items most similar to the given items, ordered from most similar to least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  List<RecommendedItem> mostSimilarItems(long[] itemIDs,
+                                         int howMany,
+                                         Rescorer<LongPair> rescorer,
+                                         boolean excludeItemIfNotSimilarToAll) throws TasteException;
+
+  /**
+   * <p>
+   * Lists the items that were most influential in recommending a given item to a given user. Exactly how this
+   * is determined is left to the implementation, but, generally this will return items that the user prefers
+   * and that are similar to the given item.
+   * </p>
+   * 
+   * <p>
+   * This returns a {@link List} of {@link RecommendedItem} which is a little misleading since it's returning
+   * recommend<strong>ing</strong> items, but, I thought it more natural to just reuse this class since it
+   * encapsulates an item and value. The value here does not necessarily have a consistent interpretation or
+   * expected range; it will be higher the more influential the item was in the recommendation.
+   * </p>
+   * 
+   * @param userID
+   *          ID of user who was recommended the item
+   * @param itemID
+   *          ID of item that was recommended
+   * @param howMany
+   *          maximum number of items to return
+   * @return {@link List} of {@link RecommendedItem}, ordered from most influential in recommending the given
+   *         item to least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  List<RecommendedItem> recommendedBecause(long userID, long itemID, int howMany) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java
new file mode 100644
index 0000000..282ceff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * Used to retrieve all items that could possibly be similar
+ */
+public interface MostSimilarItemsCandidateItemsStrategy extends Refreshable {
+
+  /**
+   * @param itemIDs
+   *          IDs of the items for which similar candidates are sought
+   * @param dataModel
+   *          data model from which candidates are drawn
+   * @return IDs of all items that could possibly be similar to the given items
+   * @throws TasteException
+   *           if an error occurs while accessing the data
+   */
+  FastIDSet getCandidateItems(long[] itemIDs, DataModel dataModel) throws TasteException;
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java
new file mode 100644
index 0000000..1fcece8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+/**
+ * <p>
+ * Implementations encapsulate items that are recommended, and include the item recommended and a value
+ * expressing the strength of the preference.
+ * </p>
+ */
+public interface RecommendedItem {
+  
+  /** @return the recommended item ID */
+  long getItemID();
+  
+  /**
+   * <p>
+   * A value expressing the strength of the preference for the recommended item. The range of the values
+   * depends on the implementation. Implementations must use larger values to express stronger preference.
+   * </p>
+   * 
+   * @return strength of the preference
+   */
+  float getValue();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java
new file mode 100644
index 0000000..4135aff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * <p>
+ * Implementations of this interface can recommend items for a user. Implementations will likely take
+ * advantage of several classes in other packages here to compute this.
+ * </p>
+ */
+public interface Recommender extends Refreshable {
+  
+  /**
+   * @param userID
+   *          user for which recommendations are to be computed
+   * @param howMany
+   *          desired number of recommendations
+   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
+   *         least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  List<RecommendedItem> recommend(long userID, int howMany) throws TasteException;
+
+  /**
+   * @param userID
+   *          user for which recommendations are to be computed
+   * @param howMany
+   *          desired number of recommendations
+   * @param includeKnownItems
+   *          whether to include items already known by the user in recommendations
+   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
+   *         least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException;
+
+  /**
+   * @param userID
+   *          user for which recommendations are to be computed
+   * @param howMany
+   *          desired number of recommendations
+   * @param rescorer
+   *          rescoring function to apply before final list of recommendations is determined
+   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
+   *         least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException;
+  
+  /**
+   * @param userID
+   *          user for which recommendations are to be computed
+   * @param howMany
+   *          desired number of recommendations
+   * @param rescorer
+   *          rescoring function to apply before final list of recommendations is determined
+   * @param includeKnownItems
+   *          whether to include items already known by the user in recommendations
+   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
+   *         least
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  
+  List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+      throws TasteException;
+  
+  /**
+   * @param userID
+   *          user ID whose preference is to be estimated
+   * @param itemID
+   *          item ID to estimate preference for
+   * @return an estimated preference if the user has not expressed a preference for the item, or else the
+   *         user's actual preference for the item. If a preference cannot be estimated, returns
+   *         {@link Double#NaN}
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  float estimatePreference(long userID, long itemID) throws TasteException;
+  
+  /**
+   * @param userID
+   *          user to set preference for
+   * @param itemID
+   *          item to set preference for
+   * @param value
+   *          preference value
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  void setPreference(long userID, long itemID, float value) throws TasteException;
+  
+  /**
+   * @param userID
+   *          user from which to remove preference
+   * @param itemID
+   *          item for which to remove preference
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  void removePreference(long userID, long itemID) throws TasteException;
+
+  /**
+   * @return underlying {@link DataModel} used by this {@link Recommender} implementation
+   */
+  DataModel getDataModel();
+
+}


[23/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java
new file mode 100644
index 0000000..0f94c22
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * A read-only view of another {@link RunningAverage} whose reported average is the negation of the
+ * delegate's. All mutating operations throw {@link UnsupportedOperationException}.
+ */
+public final class InvertedRunningAverage implements RunningAverage {
+  
+  // The average being negated; never modified through this view.
+  private final RunningAverage original;
+  
+  public InvertedRunningAverage(RunningAverage delegate) {
+    this.original = delegate;
+  }
+  
+  /** Unsupported; this view is read-only. */
+  @Override
+  public void addDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /** Unsupported; this view is read-only. */
+  @Override
+  public void removeDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /** Unsupported; this view is read-only. */
+  @Override
+  public void changeDatum(double delta) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /** @return the count of the underlying average */
+  @Override
+  public int getCount() {
+    return original.getCount();
+  }
+  
+  /** @return the underlying average, negated */
+  @Override
+  public double getAverage() {
+    return -original.getAverage();
+  }
+
+  /** @return the delegate itself, since negating twice restores the original */
+  @Override
+  public RunningAverage inverse() {
+    return original;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java
new file mode 100644
index 0000000..147012d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * A read-only view of another {@link RunningAverageAndStdDev} whose reported average is the negation
+ * of the delegate's; the standard deviation is unchanged by negation and is passed through as-is.
+ * All mutating operations throw {@link UnsupportedOperationException}.
+ */
+public final class InvertedRunningAverageAndStdDev implements RunningAverageAndStdDev {
+  
+  // The average being negated; never modified through this view.
+  private final RunningAverageAndStdDev original;
+  
+  public InvertedRunningAverageAndStdDev(RunningAverageAndStdDev delegate) {
+    this.original = delegate;
+  }
+  
+  /** Unsupported; this view is read-only. */
+  @Override
+  public void addDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /** Unsupported; this view is read-only. */
+  @Override
+  public void removeDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /** Unsupported; this view is read-only. */
+  @Override
+  public void changeDatum(double delta) {
+    throw new UnsupportedOperationException();
+  }
+  
+  /** @return the count of the underlying average */
+  @Override
+  public int getCount() {
+    return original.getCount();
+  }
+  
+  /** @return the underlying average, negated */
+  @Override
+  public double getAverage() {
+    return -original.getAverage();
+  }
+  
+  /** @return the underlying standard deviation (unaffected by negating the series) */
+  @Override
+  public double getStandardDeviation() {
+    return original.getStandardDeviation();
+  }
+
+  /** @return the delegate itself, since negating twice restores the original */
+  @Override
+  public RunningAverageAndStdDev inverse() {
+    return original;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java
new file mode 100644
index 0000000..5127df0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.NoSuchElementException;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * While long[] is an Iterable, it is not an Iterable&lt;Long&gt;. This adapter class addresses that.
+ */
+public final class LongPrimitiveArrayIterator implements LongPrimitiveIterator {
+  
+  // Backing array; deliberately not copied, for performance.
+  private final long[] values;
+  // Index of the next element to return.
+  private int cursor;
+  
+  /**
+   * Creates a {@link LongPrimitiveArrayIterator} over an entire array.
+   *
+   * @param values
+   *          array to iterate over (not copied, so callers should not mutate it mid-iteration)
+   */
+  public LongPrimitiveArrayIterator(long[] values) {
+    this.values = Preconditions.checkNotNull(values);
+    this.cursor = 0;
+  }
+  
+  @Override
+  public boolean hasNext() {
+    return cursor < values.length;
+  }
+  
+  @Override
+  public Long next() {
+    return nextLong();
+  }
+  
+  @Override
+  public long nextLong() {
+    if (cursor >= values.length) {
+      throw new NoSuchElementException();
+    }
+    return values[cursor++];
+  }
+  
+  @Override
+  public long peek() {
+    if (cursor >= values.length) {
+      throw new NoSuchElementException();
+    }
+    return values[cursor];
+  }
+  
+  /**
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public void skip(int n) {
+    if (n > 0) {
+      cursor += n;
+    }
+  }
+  
+  @Override
+  public String toString() {
+    return "LongPrimitiveArrayIterator";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java
new file mode 100644
index 0000000..0840749
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * Adds notion of iterating over {@code long} primitives in the style of an {@link java.util.Iterator} -- as
+ * opposed to iterating over {@link Long}. Implementations of this interface however also implement
+ * {@link java.util.Iterator} and {@link Iterable} over {@link Long} for convenience.
+ */
+public interface LongPrimitiveIterator extends SkippingIterator<Long> {
+  
+  /**
+   * @return next {@code long} in iteration
+   * @throws java.util.NoSuchElementException
+   *           if no more elements exist in the iteration
+   */
+  long nextLong();
+  
+  /**
+   * @return next {@code long} in iteration without advancing iteration
+   * @throws java.util.NoSuchElementException
+   *           if no more elements exist in the iteration
+   */
+  long peek();
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java
new file mode 100644
index 0000000..3e03108
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A helper class for implementing {@link Refreshable}. This object is typically included in an
+ * implementation of {@link Refreshable} to implement {@link Refreshable#refresh(Collection)}. It executes
+ * the class's own supplied update logic, after updating all the object's dependencies. This also ensures
+ * that dependencies are not updated multiple times.
+ */
+public final class RefreshHelper implements Refreshable {
+  
+  private static final Logger log = LoggerFactory.getLogger(RefreshHelper.class);
+  
+  // Dependencies refreshed before this object's own refresh logic runs
+  private final List<Refreshable> dependencies;
+  // Ensures at most one refresh runs at a time; concurrent calls are simply skipped
+  private final ReentrantLock refreshLock;
+  // Encapsulates the containing object's own refresh logic; may be null
+  private final Callable<?> refreshRunnable;
+  
+  /**
+   * @param refreshRunnable
+   *          encapsulates the containing object's own refresh logic
+   */
+  public RefreshHelper(Callable<?> refreshRunnable) {
+    this.dependencies = new ArrayList<>(3);
+    this.refreshLock = new ReentrantLock();
+    this.refreshRunnable = refreshRunnable;
+  }
+  
+  /** Add a dependency to be refreshed first when the encapsulating object does. */
+  public void addDependency(Refreshable refreshable) {
+    if (refreshable != null) {
+      dependencies.add(refreshable);
+    }
+  }
+  
+  /** Remove a previously added dependency; null or unknown arguments are ignored. */
+  public void removeDependency(Refreshable refreshable) {
+    if (refreshable != null) {
+      dependencies.remove(refreshable);
+    }
+  }
+  
+  /**
+   * Typically this is called in {@link Refreshable#refresh(java.util.Collection)} and is the entire body of
+   * that method. If a refresh is already in progress on another thread, this call returns immediately
+   * without refreshing.
+   */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    if (refreshLock.tryLock()) {
+      try {
+        // Use a local rather than reassigning the parameter; also guards against a null argument
+        Collection<Refreshable> refreshed = buildRefreshed(alreadyRefreshed);
+        for (Refreshable dependency : dependencies) {
+          maybeRefresh(refreshed, dependency);
+        }
+        if (refreshRunnable != null) {
+          try {
+            refreshRunnable.call();
+          } catch (Exception e) {
+            log.warn("Unexpected exception while refreshing", e);
+          }
+        }
+      } finally {
+        refreshLock.unlock();
+      }
+    }
+  }
+  
+  /**
+   * Creates a new and empty {@link Collection} if the method parameter is {@code null}.
+   *
+   * @param currentAlreadyRefreshed
+   *          {@link Refreshable}s to refresh later on
+   * @return an empty {@link Collection} if the method param was {@code null} or the unmodified method
+   *         param.
+   */
+  public static Collection<Refreshable> buildRefreshed(Collection<Refreshable> currentAlreadyRefreshed) {
+    return currentAlreadyRefreshed == null ? new HashSet<Refreshable>(3) : currentAlreadyRefreshed;
+  }
+  
+  /**
+   * Adds the specified {@link Refreshable} to the given collection of {@link Refreshable}s if it is not
+   * already there and immediately refreshes it.
+   * 
+   * @param alreadyRefreshed
+   *          the collection of {@link Refreshable}s
+   * @param refreshable
+   *          the {@link Refreshable} to potentially add and refresh
+   */
+  public static void maybeRefresh(Collection<Refreshable> alreadyRefreshed, Refreshable refreshable) {
+    if (!alreadyRefreshed.contains(refreshable)) {
+      alreadyRefreshed.add(refreshable);
+      log.info("Added refreshable: {}", refreshable);
+      refreshable.refresh(alreadyRefreshed);
+      // Log the object just refreshed, not the whole accumulating collection
+      log.info("Refreshed: {}", refreshable);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java
new file mode 100644
index 0000000..40da9de
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations can retrieve a value for a given key.
+ * </p>
+ *
+ * @param <K> key type
+ * @param <V> value type
+ */
+public interface Retriever<K,V> {
+  
+  /**
+   * @param key key for which a value should be retrieved
+   * @return value for key
+   * @throws TasteException if an error occurs while retrieving the value
+   */
+  V get(K key) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java
new file mode 100644
index 0000000..bf8e39c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * <p>
+ * Interface for classes that can keep track of a running average of a series of numbers. One can add to or
+ * remove from the series, as well as update a datum in the series. The class does not actually keep track of
+ * the series of values, just its running average, so it doesn't even matter if you remove/change a value that
+ * wasn't added.
+ * </p>
+ */
+public interface RunningAverage {
+  
+  /**
+   * @param datum
+   *          new item to add to the running average
+   * @throws IllegalArgumentException
+   *           if datum is {@link Double#NaN}
+   */
+  void addDatum(double datum);
+  
+  /**
+   * @param datum
+   *          item to remove from the running average
+   * @throws IllegalArgumentException
+   *           if datum is {@link Double#NaN}
+   * @throws IllegalStateException
+   *           if count is 0
+   */
+  void removeDatum(double datum);
+  
+  /**
+   * @param delta
+   *          amount by which to change a datum in the running average
+   * @throws IllegalArgumentException
+   *           if delta is {@link Double#NaN}
+   * @throws IllegalStateException
+   *           if count is 0
+   */
+  void changeDatum(double delta);
+  
+  /** @return the number of data in the series whose average is tracked */
+  int getCount();
+  
+  /** @return the current running average */
+  double getAverage();
+
+  /**
+   * @return a (possibly immutable) object whose average is the negative of this object's
+   */
+  RunningAverage inverse();
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java
new file mode 100644
index 0000000..4ac6108
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * <p>
+ * Extends {@link RunningAverage} by adding standard deviation too.
+ * </p>
+ */
+public interface RunningAverageAndStdDev extends RunningAverage {
+  
+  /** @return standard deviation of the data in the series */
+  double getStandardDeviation();
+
+  /**
+   * @return a (possibly immutable) object whose average is the negative of this object's
+   */
+  @Override
+  RunningAverageAndStdDev inverse();
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java
new file mode 100644
index 0000000..6da709d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.NoSuchElementException;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.math3.distribution.PascalDistribution;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
+/**
+ * Wraps a {@link LongPrimitiveIterator} and returns only some subset of the elements that it would,
+ * as determined by a sampling rate parameter.
+ */
+public final class SamplingLongPrimitiveIterator extends AbstractLongPrimitiveIterator {
+  
+  // Geometric distribution over how many delegate elements to discard before keeping one
+  private final PascalDistribution geometricDistribution;
+  private final LongPrimitiveIterator delegate;
+  private long next;       // cached next sampled element; valid only while hasNext is true
+  private boolean hasNext;
+  
+  public SamplingLongPrimitiveIterator(LongPrimitiveIterator delegate, double samplingRate) {
+    this(RandomUtils.getRandom(), delegate, samplingRate);
+  }
+
+  /**
+   * @param random
+   *          source of randomness for the sampling decisions
+   * @param delegate
+   *          iterator whose elements are sampled; must not be null
+   * @param samplingRate
+   *          probability of keeping any given element; must satisfy 0.0 &lt; samplingRate &lt;= 1.0
+   * @throws IllegalArgumentException
+   *           if samplingRate is out of range
+   */
+  public SamplingLongPrimitiveIterator(RandomWrapper random, LongPrimitiveIterator delegate, double samplingRate) {
+    Preconditions.checkNotNull(delegate);
+    Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, "Must be: 0.0 < samplingRate <= 1.0");
+    // Geometric distribution is special case of negative binomial (aka Pascal) with r=1:
+    geometricDistribution = new PascalDistribution(random.getRandomGenerator(), 1, samplingRate);
+    this.delegate = delegate;
+    this.hasNext = true;
+    doNext();
+  }
+  
+  @Override
+  public boolean hasNext() {
+    return hasNext;
+  }
+  
+  @Override
+  public long nextLong() {
+    if (hasNext) {
+      long result = next;
+      doNext();
+      return result;
+    }
+    throw new NoSuchElementException();
+  }
+  
+  @Override
+  public long peek() {
+    if (hasNext) {
+      return next;
+    }
+    throw new NoSuchElementException();
+  }
+  
+  /**
+   * Skips {@code delegateSkip} elements of the delegate, then caches the following element,
+   * or marks the iteration exhausted if none remains. Shared by {@link #doNext()} and
+   * {@link #skip(int)}, which previously duplicated this logic.
+   */
+  private void advance(int delegateSkip) {
+    delegate.skip(delegateSkip);
+    if (delegate.hasNext()) {
+      next = delegate.nextLong();
+    } else {
+      hasNext = false;
+    }
+  }
+  
+  private void doNext() {
+    advance(geometricDistribution.sample());
+  }
+  
+  /**
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+  
+  @Override
+  public void skip(int n) {
+    if (n <= 0) {
+      // Per the SkippingIterator contract, skip(n) has the same effect as calling next() n times,
+      // so skipping zero (or fewer) elements must be a no-op. The previous implementation
+      // incorrectly advanced the iterator by one element in this case.
+      return;
+    }
+    int toSkip = 0;
+    for (int i = 0; i < n; i++) {
+      toSkip += geometricDistribution.sample();
+    }
+    advance(toSkip);
+  }
+  
+  /**
+   * @return the delegate unchanged when samplingRate &gt;= 1.0 (no sampling needed),
+   *         otherwise a sampling wrapper around it
+   */
+  public static LongPrimitiveIterator maybeWrapIterator(LongPrimitiveIterator delegate, double samplingRate) {
+    return samplingRate >= 1.0 ? delegate : new SamplingLongPrimitiveIterator(delegate, samplingRate);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java
new file mode 100644
index 0000000..e88f98a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.Iterator;
+
+/**
+ * Adds ability to skip ahead in an iterator, perhaps more efficiently than by calling {@link #next()}
+ * repeatedly.
+ */
+public interface SkippingIterator<V> extends Iterator<V> {
+  
+  /**
+   * Skip the next n elements supplied by this {@link Iterator}. If there are fewer than n elements remaining,
+   * this skips all remaining elements in the {@link Iterator}. This method has the same effect as calling
+   * {@link #next()} n times, except that it will never throw {@link java.util.NoSuchElementException}.
+   *
+   * @param n number of elements to skip
+   */
+  void skip(int n);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java
new file mode 100644
index 0000000..76e5239
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A {@link RunningAverage} in which each datum contributes with an associated weight. The
+ * unweighted {@link RunningAverage} methods delegate to the weighted variants with weight 1.0.
+ */
+public class WeightedRunningAverage implements RunningAverage, Serializable {
+
+  /** Sum of all weights seen so far; 0.0 when no data is present. */
+  private double weightSum;
+  /** Current weighted mean; NaN when no data is present. */
+  private double mean;
+
+  public WeightedRunningAverage() {
+    weightSum = 0.0;
+    mean = Double.NaN;
+  }
+
+  /** Adds a datum with the default weight of 1.0. */
+  @Override
+  public synchronized void addDatum(double datum) {
+    addDatum(datum, 1.0);
+  }
+
+  /** Folds {@code datum} into the running mean with the given weight. */
+  public synchronized void addDatum(double datum, double weight) {
+    double previousWeightSum = weightSum;
+    weightSum += weight;
+    mean = previousWeightSum <= 0.0
+        ? datum
+        : mean * previousWeightSum / weightSum + datum * weight / weightSum;
+  }
+
+  /** Removes a datum with the default weight of 1.0. */
+  @Override
+  public synchronized void removeDatum(double datum) {
+    removeDatum(datum, 1.0);
+  }
+
+  /** Removes a previously added datum of the given weight from the running mean. */
+  public synchronized void removeDatum(double datum, double weight) {
+    double previousWeightSum = weightSum;
+    weightSum -= weight;
+    if (weightSum <= 0.0) {
+      // No data remains; reset to the empty state.
+      mean = Double.NaN;
+      weightSum = 0.0;
+    } else {
+      mean = mean * previousWeightSum / weightSum - datum * weight / weightSum;
+    }
+  }
+
+  /** Shifts the mean as if one datum of weight 1.0 changed by {@code delta}. */
+  @Override
+  public synchronized void changeDatum(double delta) {
+    changeDatum(delta, 1.0);
+  }
+
+  /** Shifts the mean as if one datum of the given weight changed by {@code delta}. */
+  public synchronized void changeDatum(double delta, double weight) {
+    Preconditions.checkArgument(weight <= weightSum, "weight must be <= totalWeight");
+    mean += delta * weight / weightSum;
+  }
+
+  public synchronized double getTotalWeight() {
+    return weightSum;
+  }
+
+  /** @return {@link #getTotalWeight()} truncated to an int */
+  @Override
+  public synchronized int getCount() {
+    return (int) weightSum;
+  }
+
+  @Override
+  public synchronized double getAverage() {
+    return mean;
+  }
+
+  @Override
+  public RunningAverage inverse() {
+    return new InvertedRunningAverage(this);
+  }
+
+  @Override
+  public synchronized String toString() {
+    return String.valueOf(mean);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java
new file mode 100644
index 0000000..bed5812
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * This subclass also provides for a weighted estimate of the sample standard deviation.
+ * See <a href="http://en.wikipedia.org/wiki/Mean_square_weighted_deviation">estimate formulae here</a>.
+ */
+public final class WeightedRunningAverageAndStdDev extends WeightedRunningAverage implements RunningAverageAndStdDev {
+
+  // Accumulators for the weighted variance estimate: sum of squared weights,
+  // sum of weight*datum, and sum of weight*datum^2.
+  private double totalSquaredWeight;
+  private double totalWeightedData;
+  private double totalWeightedSquaredData;
+
+  public WeightedRunningAverageAndStdDev() {
+    totalSquaredWeight = 0.0;
+    totalWeightedData = 0.0;
+    totalWeightedSquaredData = 0.0;
+  }
+
+  @Override
+  public synchronized void addDatum(double datum, double weight) {
+    super.addDatum(datum, weight);
+    totalSquaredWeight += weight * weight;
+    double weightedData = datum * weight;
+    totalWeightedData += weightedData;
+    totalWeightedSquaredData += weightedData * datum;
+  }
+
+  @Override
+  public synchronized void removeDatum(double datum, double weight) {
+    super.removeDatum(datum, weight);
+    totalSquaredWeight -= weight * weight;
+    if (totalSquaredWeight <= 0.0) {
+      totalSquaredWeight = 0.0;
+    }
+    double weightedData = datum * weight;
+    totalWeightedData -= weightedData;
+    if (totalWeightedData <= 0.0) {
+      // NOTE(review): this clamp guards against rounding drift below zero, but it also zeroes
+      // a sum that is legitimately negative when data values are negative -- TODO confirm
+      // whether negative data is expected here.
+      totalWeightedData = 0.0;
+    }
+    totalWeightedSquaredData -= weightedData * datum;
+    if (totalWeightedSquaredData <= 0.0) {
+      totalWeightedSquaredData = 0.0;
+    }
+  }
+
+  /**
+   * Not supported for the weighted-with-std-dev variant.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public synchronized void changeDatum(double delta, double weight) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * @return weighted estimate of the sample standard deviation
+   *         (mean square weighted deviation formulation)
+   */
+  @Override
+  public synchronized double getStandardDeviation() {
+    double totalWeight = getTotalWeight();
+    return Math.sqrt((totalWeightedSquaredData * totalWeight - totalWeightedData * totalWeightedData)
+                         / (totalWeight * totalWeight - totalSquaredWeight));
+  }
+
+  @Override
+  public RunningAverageAndStdDev inverse() {
+    return new InvertedRunningAverageAndStdDev(this);
+  }
+
+  @Override
+  public synchronized String toString() {
+    // Fixed: removed a redundant outer String.valueOf() wrapping an expression
+    // that is already a String; output is unchanged ("average,stddev").
+    return String.valueOf(getAverage()) + ',' + getStandardDeviation();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java
new file mode 100644
index 0000000..d1e93ab
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common.jdbc;
+
+import javax.naming.Context;
+import javax.naming.InitialContext;
+import javax.naming.NamingException;
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A helper class with common elements for several JDBC-related components.
+ */
+public abstract class AbstractJDBCComponent {
+  
+  private static final Logger log = LoggerFactory.getLogger(AbstractJDBCComponent.class);
+  
+  private static final int DEFAULT_FETCH_SIZE = 1000; // A max, "big" number of rows to buffer at once
+  protected static final String DEFAULT_DATASOURCE_NAME = "jdbc/taste";
+  
+  protected static void checkNotNullAndLog(String argName, Object value) {
+    Preconditions.checkArgument(value != null && !value.toString().isEmpty(),
+      argName + " is null or empty");
+    log.debug("{}: {}", argName, value);
+  }
+  
+  protected static void checkNotNullAndLog(String argName, Object[] values) {
+    Preconditions.checkArgument(values != null && values.length != 0, argName + " is null or zero-length");
+    for (Object value : values) {
+      checkNotNullAndLog(argName, value);
+    }
+  }
+  
+  /**
+   * <p>
+   * Looks up a {@link DataSource} by name from JNDI. "java:comp/env/" is prepended to the argument before
+   * looking up the name in JNDI.
+   * </p>
+   * 
+   * @param dataSourceName
+   *          JNDI name where a {@link DataSource} is bound (e.g. "jdbc/taste")
+   * @return {@link DataSource} under that JNDI name
+   * @throws TasteException
+   *           if a JNDI error occurs
+   */
+  public static DataSource lookupDataSource(String dataSourceName) throws TasteException {
+    Context context = null;
+    try {
+      context = new InitialContext();
+      return (DataSource) context.lookup("java:comp/env/" + dataSourceName);
+    } catch (NamingException ne) {
+      throw new TasteException(ne);
+    } finally {
+      if (context != null) {
+        try {
+          context.close();
+        } catch (NamingException ne) {
+          log.warn("Error while closing Context; continuing...", ne);
+        }
+      }
+    }
+  }
+  
+  protected int getFetchSize() {
+    return DEFAULT_FETCH_SIZE;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java
new file mode 100644
index 0000000..3f024bc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common.jdbc;
+
+import javax.sql.DataSource;
+import java.io.Closeable;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Provides an {@link java.util.Iterator} over the result of an SQL query, as an iteration over the {@link ResultSet}.
+ * While the same object will be returned from the iteration each time, it will be returned once for each row
+ * of the result.
+ */
+final class EachRowIterator extends AbstractIterator<ResultSet> implements Closeable {
+
+  private static final Logger log = LoggerFactory.getLogger(EachRowIterator.class);
+
+  private final Connection connection;
+  private final PreparedStatement statement;
+  private final ResultSet resultSet;
+
+  /**
+   * Opens a connection and executes {@code sqlQuery} as a forward-only, read-only statement.
+   * On failure, releases whatever was acquired so far and rethrows.
+   *
+   * @throws SQLException if obtaining the connection, preparing or executing the query fails
+   */
+  EachRowIterator(DataSource dataSource, String sqlQuery) throws SQLException {
+    try {
+      connection = dataSource.getConnection();
+      statement = connection.prepareStatement(sqlQuery, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+      statement.setFetchDirection(ResultSet.FETCH_FORWARD);
+      //statement.setFetchSize(getFetchSize());
+      log.debug("Executing SQL query: {}", sqlQuery);
+      resultSet = statement.executeQuery();
+    } catch (SQLException sqle) {
+      // NOTE(review): fields not yet assigned at this point are still null; this assumes
+      // IOUtils.quietClose tolerates null arguments -- verify.
+      close();
+      throw sqle;
+    }
+  }
+
+  /** Advances the cursor one row; returns the shared ResultSet per row, ends iteration after the last row. */
+  @Override
+  protected ResultSet computeNext() {
+    try {
+      if (resultSet.next()) {
+        return resultSet;
+      } else {
+        close();
+        return null;
+      }
+    } catch (SQLException sqle) {
+      close();
+      throw new IllegalStateException(sqle);
+    }
+  }
+
+  /**
+   * Moves the cursor forward n rows, or to the end if fewer remain.
+   *
+   * @throws SQLException if the manual row-by-row advance fails
+   */
+  public void skip(int n) throws SQLException {
+    try {
+      resultSet.relative(n);
+    } catch (SQLException sqle) {
+      // Can't use relative on MySQL Connector/J; try advancing manually
+      int i = 0;
+      while (i < n && resultSet.next()) {
+        i++;
+      }
+    }
+  }
+
+  /** Quietly releases the JDBC resources and marks the iteration finished. */
+  @Override
+  public void close() {
+    IOUtils.quietClose(resultSet, statement, connection);
+    endOfData();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java
new file mode 100644
index 0000000..273ebd5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common.jdbc;
+
+import javax.sql.DataSource;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Iterator;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+
+/**
+ * Iterates over the rows of an SQL query result, converting each row into an element of type
+ * {@code T} via {@link #parseElement(ResultSet)}.
+ */
+public abstract class ResultSetIterator<T> extends ForwardingIterator<T> {
+
+  // Element-typed view of the rows, produced by transforming the raw row iterator.
+  private final Iterator<T> delegate;
+  // Underlying per-row iterator; retained so skip() can move the cursor directly.
+  private final EachRowIterator rowDelegate;
+
+  /**
+   * @throws SQLException if the query cannot be executed
+   */
+  protected ResultSetIterator(DataSource dataSource, String sqlQuery) throws SQLException {
+    this.rowDelegate = new EachRowIterator(dataSource, sqlQuery);
+    delegate = Iterators.transform(rowDelegate,
+      new Function<ResultSet, T>() {
+        @Override
+        public T apply(ResultSet from) {
+          try {
+            return parseElement(from);
+          } catch (SQLException sqle) {
+            // Checked SQLException cannot cross the Function interface; rethrow unchecked.
+            throw new IllegalStateException(sqle);
+          }
+        }
+      });
+  }
+
+  @Override
+  protected Iterator<T> delegate() {
+    return delegate;
+  }
+
+  /** Converts the row the cursor is currently on into an element; invoked once per row. */
+  protected abstract T parseElement(ResultSet resultSet) throws SQLException;
+
+  /** Skips the next n rows; a no-op for n &lt; 1. */
+  public void skip(int n) {
+    if (n >= 1) {
+      try {
+        rowDelegate.skip(n);
+      } catch (SQLException sqle) {
+        throw new IllegalStateException(sqle);
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java
new file mode 100644
index 0000000..f926f18
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java
@@ -0,0 +1,276 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Abstract superclass of a couple implementations, providing shared functionality.
+ */
+public abstract class AbstractDifferenceRecommenderEvaluator implements RecommenderEvaluator {
+  
+  private static final Logger log = LoggerFactory.getLogger(AbstractDifferenceRecommenderEvaluator.class);
+  
+  private final Random random;
+  // NaN means "no bound set": comparisons against NaN are always false, so
+  // capEstimatedPreference passes estimates through unchanged until a bound is set.
+  private float maxPreference;
+  private float minPreference;
+  
+  protected AbstractDifferenceRecommenderEvaluator() {
+    random = RandomUtils.getRandom();
+    maxPreference = Float.NaN;
+    minPreference = Float.NaN;
+  }
+  
+  @Override
+  public final float getMaxPreference() {
+    return maxPreference;
+  }
+  
+  @Override
+  public final void setMaxPreference(float maxPreference) {
+    this.maxPreference = maxPreference;
+  }
+  
+  @Override
+  public final float getMinPreference() {
+    return minPreference;
+  }
+  
+  @Override
+  public final void setMinPreference(float minPreference) {
+    this.minPreference = minPreference;
+  }
+  
+  /**
+   * Randomly selects about {@code evaluationPercentage} of the users, splits each selected user's
+   * preferences into training/test portions according to {@code trainingPercentage}, builds a
+   * recommender on the training data, and scores its estimates against the held-out test data.
+   *
+   * @return the score produced by {@link #computeFinalEvaluation()}
+   * @throws TasteException on error accessing the model or building/running the recommender
+   */
+  @Override
+  public double evaluate(RecommenderBuilder recommenderBuilder,
+                         DataModelBuilder dataModelBuilder,
+                         DataModel dataModel,
+                         double trainingPercentage,
+                         double evaluationPercentage) throws TasteException {
+    Preconditions.checkNotNull(recommenderBuilder);
+    Preconditions.checkNotNull(dataModel);
+    Preconditions.checkArgument(trainingPercentage >= 0.0 && trainingPercentage <= 1.0,
+      "Invalid trainingPercentage: " + trainingPercentage + ". Must be: 0.0 <= trainingPercentage <= 1.0");
+    Preconditions.checkArgument(evaluationPercentage >= 0.0 && evaluationPercentage <= 1.0,
+      "Invalid evaluationPercentage: " + evaluationPercentage + ". Must be: 0.0 <= evaluationPercentage <= 1.0");
+
+    log.info("Beginning evaluation using {} of {}", trainingPercentage, dataModel);
+    
+    int numUsers = dataModel.getNumUsers();
+    FastByIDMap<PreferenceArray> trainingPrefs = new FastByIDMap<>(
+        1 + (int) (evaluationPercentage * numUsers));
+    FastByIDMap<PreferenceArray> testPrefs = new FastByIDMap<>(
+        1 + (int) (evaluationPercentage * numUsers));
+    
+    LongPrimitiveIterator it = dataModel.getUserIDs();
+    while (it.hasNext()) {
+      long userID = it.nextLong();
+      // Each user is included in the evaluation with probability evaluationPercentage.
+      if (random.nextDouble() < evaluationPercentage) {
+        splitOneUsersPrefs(trainingPercentage, trainingPrefs, testPrefs, userID, dataModel);
+      }
+    }
+    
+    DataModel trainingModel = dataModelBuilder == null ? new GenericDataModel(trainingPrefs)
+        : dataModelBuilder.buildDataModel(trainingPrefs);
+    
+    Recommender recommender = recommenderBuilder.buildRecommender(trainingModel);
+    
+    double result = getEvaluation(testPrefs, recommender);
+    log.info("Evaluation result: {}", result);
+    return result;
+  }
+  
+  /** Assigns each of one user's preferences to the training or test set at random. */
+  private void splitOneUsersPrefs(double trainingPercentage,
+                                  FastByIDMap<PreferenceArray> trainingPrefs,
+                                  FastByIDMap<PreferenceArray> testPrefs,
+                                  long userID,
+                                  DataModel dataModel) throws TasteException {
+    List<Preference> oneUserTrainingPrefs = null;
+    List<Preference> oneUserTestPrefs = null;
+    PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+    int size = prefs.length();
+    for (int i = 0; i < size; i++) {
+      Preference newPref = new GenericPreference(userID, prefs.getItemID(i), prefs.getValue(i));
+      if (random.nextDouble() < trainingPercentage) {
+        if (oneUserTrainingPrefs == null) {
+          oneUserTrainingPrefs = new ArrayList<>(3);
+        }
+        oneUserTrainingPrefs.add(newPref);
+      } else {
+        if (oneUserTestPrefs == null) {
+          oneUserTestPrefs = new ArrayList<>(3);
+        }
+        oneUserTestPrefs.add(newPref);
+      }
+    }
+    // A user with no training prefs cannot be evaluated, so the test prefs are dropped too.
+    if (oneUserTrainingPrefs != null) {
+      trainingPrefs.put(userID, new GenericUserPreferenceArray(oneUserTrainingPrefs));
+      if (oneUserTestPrefs != null) {
+        testPrefs.put(userID, new GenericUserPreferenceArray(oneUserTestPrefs));
+      }
+    }
+  }
+
+  /** Clamps the estimate into [minPreference, maxPreference]; NaN bounds impose no limit. */
+  private float capEstimatedPreference(float estimate) {
+    if (estimate > maxPreference) {
+      return maxPreference;
+    }
+    if (estimate < minPreference) {
+      return minPreference;
+    }
+    return estimate;
+  }
+
+  /** Estimates every held-out preference in parallel and returns the final metric. */
+  private double getEvaluation(FastByIDMap<PreferenceArray> testPrefs, Recommender recommender)
+    throws TasteException {
+    reset();
+    Collection<Callable<Void>> estimateCallables = new ArrayList<>();
+    AtomicInteger noEstimateCounter = new AtomicInteger();
+    for (Map.Entry<Long,PreferenceArray> entry : testPrefs.entrySet()) {
+      estimateCallables.add(
+          new PreferenceEstimateCallable(recommender, entry.getKey(), entry.getValue(), noEstimateCounter));
+    }
+    log.info("Beginning evaluation of {} users", estimateCallables.size());
+    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
+    execute(estimateCallables, noEstimateCounter, timing);
+    return computeFinalEvaluation();
+  }
+  
+  /**
+   * Runs the callables on a fixed-size thread pool and waits for completion.
+   *
+   * @throws TasteException if a task fails or the wait is interrupted
+   */
+  protected static void execute(Collection<Callable<Void>> callables,
+                                AtomicInteger noEstimateCounter,
+                                RunningAverageAndStdDev timing) throws TasteException {
+
+    Collection<Callable<Void>> wrappedCallables = wrapWithStatsCallables(callables, noEstimateCounter, timing);
+    int numProcessors = Runtime.getRuntime().availableProcessors();
+    ExecutorService executor = Executors.newFixedThreadPool(numProcessors);
+    log.info("Starting timing of {} tasks in {} threads", wrappedCallables.size(), numProcessors);
+    try {
+      List<Future<Void>> futures = executor.invokeAll(wrappedCallables);
+      // Go look for exceptions here, really
+      for (Future<Void> future : futures) {
+        future.get();
+      }
+    } catch (InterruptedException ie) {
+      // Restore the interrupt flag for callers and keep the exception itself as the cause.
+      Thread.currentThread().interrupt();
+      throw new TasteException(ie);
+    } catch (ExecutionException ee) {
+      throw new TasteException(ee.getCause());
+    } finally {
+      // Fixed: shutdown was previously skipped when a task failed, leaking the executor.
+      executor.shutdown();
+    }
+    try {
+      executor.awaitTermination(10, TimeUnit.SECONDS);
+    } catch (InterruptedException e) {
+      // Fixed: the original wrapped e.getCause(), which is always null for a fresh
+      // InterruptedException, discarding the failure; wrap the exception itself instead.
+      Thread.currentThread().interrupt();
+      throw new TasteException(e);
+    }
+  }
+  
+  /** Wraps each callable so timing and progress are logged every 1000 or so iterations. */
+  private static Collection<Callable<Void>> wrapWithStatsCallables(Iterable<Callable<Void>> callables,
+                                                                   AtomicInteger noEstimateCounter,
+                                                                   RunningAverageAndStdDev timing) {
+    Collection<Callable<Void>> wrapped = new ArrayList<>();
+    int count = 0;
+    for (Callable<Void> callable : callables) {
+      boolean logStats = count++ % 1000 == 0; // log every 1000 or so iterations
+      wrapped.add(new StatsCallable(callable, logStats, timing, noEstimateCounter));
+    }
+    return wrapped;
+  }
+  
+  /** Clears any accumulated state before a new evaluation run. */
+  protected abstract void reset();
+  
+  /** Records one (estimated, actual) preference pair into the metric being computed. */
+  protected abstract void processOneEstimate(float estimatedPreference, Preference realPref);
+  
+  /** @return the final metric value accumulated via {@link #processOneEstimate}. */
+  protected abstract double computeFinalEvaluation();
+
+  /** Estimates all of one test user's held-out preferences and feeds them to the metric. */
+  public final class PreferenceEstimateCallable implements Callable<Void> {
+
+    private final Recommender recommender;
+    private final long testUserID;
+    private final PreferenceArray prefs;
+    private final AtomicInteger noEstimateCounter;
+
+    public PreferenceEstimateCallable(Recommender recommender,
+                                      long testUserID,
+                                      PreferenceArray prefs,
+                                      AtomicInteger noEstimateCounter) {
+      this.recommender = recommender;
+      this.testUserID = testUserID;
+      this.prefs = prefs;
+      this.noEstimateCounter = noEstimateCounter;
+    }
+
+    @Override
+    public Void call() throws TasteException {
+      for (Preference realPref : prefs) {
+        float estimatedPreference = Float.NaN;
+        try {
+          estimatedPreference = recommender.estimatePreference(testUserID, realPref.getItemID());
+        } catch (NoSuchUserException nsue) {
+          // It's possible that an item exists in the test data but not training data in which case
+          // NSEE will be thrown. Just ignore it and move on.
+          log.info("User exists in test data but not training data: {}", testUserID);
+        } catch (NoSuchItemException nsie) {
+          log.info("Item exists in test data but not training data: {}", realPref.getItemID());
+        }
+        if (Float.isNaN(estimatedPreference)) {
+          noEstimateCounter.incrementAndGet();
+        } else {
+          estimatedPreference = capEstimatedPreference(estimatedPreference);
+          processOneEstimate(estimatedPreference, realPref);
+        }
+      }
+      return null;
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java
new file mode 100644
index 0000000..4dad040
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.Preference;
+
+/**
+ * <p>
+ * A {@link org.apache.mahout.cf.taste.eval.RecommenderEvaluator} which computes the average absolute
+ * difference between predicted and actual ratings for users.
+ * </p>
+ * 
+ * <p>
+ * This algorithm is also called "mean average error".
+ * </p>
+ */
+public final class AverageAbsoluteDifferenceRecommenderEvaluator extends
+    AbstractDifferenceRecommenderEvaluator {
+  
+  private RunningAverage average;
+  
+  @Override
+  protected void reset() {
+    average = new FullRunningAverage();
+  }
+  
+  @Override
+  protected void processOneEstimate(float estimatedPreference, Preference realPref) {
+    average.addDatum(Math.abs(realPref.getValue() - estimatedPreference));
+  }
+  
+  @Override
+  protected double computeFinalEvaluation() {
+    return average.getAverage();
+  }
+  
+  @Override
+  public String toString() {
+    return "AverageAbsoluteDifferenceRecommenderEvaluator";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java
new file mode 100644
index 0000000..0e121d1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java
@@ -0,0 +1,237 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.List;
+import java.util.Random;
+
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.eval.IRStatistics;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
+import org.apache.mahout.cf.taste.eval.RelevantItemsDataSplitter;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
/**
 * <p>
 * For each user, this implementation determines the top {@code n} preferences, then evaluates the IR
 * statistics based on a {@link DataModel} that does not have these values. This number {@code n} is the
 * "at" value, as in "precision at 5". For example, this would mean precision evaluated by removing the top 5
 * preferences for a user and then finding the percentage of those 5 items included in the top 5
 * recommendations for that user.
 * </p>
 */
public final class GenericRecommenderIRStatsEvaluator implements RecommenderIRStatsEvaluator {

  private static final Logger log = LoggerFactory.getLogger(GenericRecommenderIRStatsEvaluator.class);

  // Cached for the change-of-base conversion in log2().
  private static final double LOG2 = Math.log(2.0);

  /**
   * Pass as "relevanceThreshold" argument to
   * {@link #evaluate(RecommenderBuilder, DataModelBuilder, DataModel, IDRescorer, int, double, double)} to
   * have it attempt to compute a reasonable threshold. Note that this will impact performance.
   */
  public static final double CHOOSE_THRESHOLD = Double.NaN;

  // Used to sample users according to evaluationPercentage.
  private final Random random;
  // Strategy that picks "relevant" items and builds the training split.
  private final RelevantItemsDataSplitter dataSplitter;

  /** Creates an evaluator using the default {@link GenericRelevantItemsDataSplitter}. */
  public GenericRecommenderIRStatsEvaluator() {
    this(new GenericRelevantItemsDataSplitter());
  }

  /**
   * @param dataSplitter strategy for choosing relevant items and splitting training data; must not be null
   */
  public GenericRecommenderIRStatsEvaluator(RelevantItemsDataSplitter dataSplitter) {
    Preconditions.checkNotNull(dataSplitter);
    random = RandomUtils.getRandom();
    this.dataSplitter = dataSplitter;
  }

  /**
   * Evaluates precision, recall, fall-out, nDCG and reach "at N" for the recommender
   * produced by {@code recommenderBuilder}, averaged over a random sample of users.
   *
   * @param at the "N" in "precision at N": number of relevant items withheld per user
   * @param relevanceThreshold minimum preference value for an item to count as relevant,
   *   or {@link #CHOOSE_THRESHOLD} to derive one per user from the preference distribution
   * @param evaluationPercentage fraction of users to evaluate, in (0, 1]
   * @throws TasteException if the data model or recommender fails
   */
  @Override
  public IRStatistics evaluate(RecommenderBuilder recommenderBuilder,
                               DataModelBuilder dataModelBuilder,
                               DataModel dataModel,
                               IDRescorer rescorer,
                               int at,
                               double relevanceThreshold,
                               double evaluationPercentage) throws TasteException {

    Preconditions.checkArgument(recommenderBuilder != null, "recommenderBuilder is null");
    Preconditions.checkArgument(dataModel != null, "dataModel is null");
    Preconditions.checkArgument(at >= 1, "at must be at least 1");
    Preconditions.checkArgument(evaluationPercentage > 0.0 && evaluationPercentage <= 1.0,
        "Invalid evaluationPercentage: " + evaluationPercentage + ". Must be: 0.0 < evaluationPercentage <= 1.0");

    int numItems = dataModel.getNumItems();
    // Per-user statistics are folded into these running averages.
    RunningAverage precision = new FullRunningAverage();
    RunningAverage recall = new FullRunningAverage();
    RunningAverage fallOut = new FullRunningAverage();
    RunningAverage nDCG = new FullRunningAverage();
    int numUsersRecommendedFor = 0;
    int numUsersWithRecommendations = 0;

    LongPrimitiveIterator it = dataModel.getUserIDs();
    while (it.hasNext()) {

      long userID = it.nextLong();

      // Randomly sample the requested fraction of users.
      if (random.nextDouble() >= evaluationPercentage) {
        // Skipped
        continue;
      }

      long start = System.currentTimeMillis();

      PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);

      // List some most-preferred items that would count as (most) "relevant" results
      double theRelevanceThreshold = Double.isNaN(relevanceThreshold) ? computeThreshold(prefs) : relevanceThreshold;
      FastIDSet relevantItemIDs = dataSplitter.getRelevantItemsIDs(userID, at, theRelevanceThreshold, dataModel);

      int numRelevantItems = relevantItemIDs.size();
      if (numRelevantItems <= 0) {
        // Nothing to measure against for this user.
        continue;
      }

      // Build a training data set that withholds this user's relevant items.
      FastByIDMap<PreferenceArray> trainingUsers = new FastByIDMap<>(dataModel.getNumUsers());
      LongPrimitiveIterator it2 = dataModel.getUserIDs();
      while (it2.hasNext()) {
        dataSplitter.processOtherUser(userID, relevantItemIDs, trainingUsers, it2.nextLong(), dataModel);
      }

      DataModel trainingModel = dataModelBuilder == null ? new GenericDataModel(trainingUsers)
          : dataModelBuilder.buildDataModel(trainingUsers);
      try {
        trainingModel.getPreferencesFromUser(userID);
      } catch (NoSuchUserException nsee) {
        continue; // Oops we excluded all prefs for the user -- just move on
      }

      int size = numRelevantItems + trainingModel.getItemIDsFromUser(userID).size();
      if (size < 2 * at) {
        // Really not enough prefs to meaningfully evaluate this user
        continue;
      }

      Recommender recommender = recommenderBuilder.buildRecommender(trainingModel);

      // Count how many of the top-N recommendations were withheld relevant items.
      int intersectionSize = 0;
      List<RecommendedItem> recommendedItems = recommender.recommend(userID, at, rescorer);
      for (RecommendedItem recommendedItem : recommendedItems) {
        if (relevantItemIDs.contains(recommendedItem.getItemID())) {
          intersectionSize++;
        }
      }

      int numRecommendedItems = recommendedItems.size();

      // Precision
      if (numRecommendedItems > 0) {
        precision.addDatum((double) intersectionSize / (double) numRecommendedItems);
      }

      // Recall
      recall.addDatum((double) intersectionSize / (double) numRelevantItems);

      // Fall-out
      if (numRelevantItems < size) {
        fallOut.addDatum((double) (numRecommendedItems - intersectionSize)
                         / (double) (numItems - numRelevantItems));
      }

      // nDCG
      // In computing, assume relevant IDs have relevance 1 and others 0
      double cumulativeGain = 0.0;
      double idealizedGain = 0.0;
      for (int i = 0; i < numRecommendedItems; i++) {
        RecommendedItem item = recommendedItems.get(i);
        double discount = 1.0 / log2(i + 2.0); // Classical formulation says log(i+1), but i is 0-based here
        if (relevantItemIDs.contains(item.getItemID())) {
          cumulativeGain += discount;
        }
        // otherwise we're multiplying discount by relevance 0 so it doesn't do anything

        // Ideally results would be ordered with all relevant ones first, so this theoretical
        // ideal list starts with number of relevant items equal to the total number of relevant items
        if (i < numRelevantItems) {
          idealizedGain += discount;
        }
      }
      if (idealizedGain > 0.0) {
        nDCG.addDatum(cumulativeGain / idealizedGain);
      }

      // Reach
      numUsersRecommendedFor++;
      if (numRecommendedItems > 0) {
        numUsersWithRecommendations++;
      }

      long end = System.currentTimeMillis();

      log.info("Evaluated with user {} in {}ms", userID, end - start);
      log.info("Precision/recall/fall-out/nDCG/reach: {} / {} / {} / {} / {}",
               precision.getAverage(), recall.getAverage(), fallOut.getAverage(), nDCG.getAverage(),
               (double) numUsersWithRecommendations / (double) numUsersRecommendedFor);
    }

    return new IRStatisticsImpl(
        precision.getAverage(),
        recall.getAverage(),
        fallOut.getAverage(),
        nDCG.getAverage(),
        (double) numUsersWithRecommendations / (double) numUsersRecommendedFor);
  }

  /**
   * Derives a per-user relevance threshold as mean + one standard deviation of the
   * user's preference values; with fewer than 2 preferences, admits everything.
   */
  private static double computeThreshold(PreferenceArray prefs) {
    if (prefs.length() < 2) {
      // Not enough data points -- return a threshold that allows everything
      return Double.NEGATIVE_INFINITY;
    }
    RunningAverageAndStdDev stdDev = new FullRunningAverageAndStdDev();
    int size = prefs.length();
    for (int i = 0; i < size; i++) {
      stdDev.addDatum(prefs.getValue(i));
    }
    return stdDev.getAverage() + stdDev.getStandardDeviation();
  }

  /** Base-2 logarithm, used by the nDCG discount. */
  private static double log2(double value) {
    return Math.log(value) / LOG2;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java
new file mode 100644
index 0000000..f4e4522
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RelevantItemsDataSplitter;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Picks relevant items to be those with the strongest preference, and
+ * includes the other users' preferences in full.
+ */
+public final class GenericRelevantItemsDataSplitter implements RelevantItemsDataSplitter {
+
+  @Override
+  public FastIDSet getRelevantItemsIDs(long userID,
+                                       int at,
+                                       double relevanceThreshold,
+                                       DataModel dataModel) throws TasteException {
+    PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+    FastIDSet relevantItemIDs = new FastIDSet(at);
+    prefs.sortByValueReversed();
+    for (int i = 0; i < prefs.length() && relevantItemIDs.size() < at; i++) {
+      if (prefs.getValue(i) >= relevanceThreshold) {
+        relevantItemIDs.add(prefs.getItemID(i));
+      }
+    }
+    return relevantItemIDs;
+  }
+
+  @Override
+  public void processOtherUser(long userID,
+                               FastIDSet relevantItemIDs,
+                               FastByIDMap<PreferenceArray> trainingUsers,
+                               long otherUserID,
+                               DataModel dataModel) throws TasteException {
+    PreferenceArray prefs2Array = dataModel.getPreferencesFromUser(otherUserID);
+    // If we're dealing with the very user that we're evaluating for precision/recall,
+    if (userID == otherUserID) {
+      // then must remove all the test IDs, the "relevant" item IDs
+      List<Preference> prefs2 = new ArrayList<>(prefs2Array.length());
+      for (Preference pref : prefs2Array) {
+        prefs2.add(pref);
+      }
+      for (Iterator<Preference> iterator = prefs2.iterator(); iterator.hasNext();) {
+        Preference pref = iterator.next();
+        if (relevantItemIDs.contains(pref.getItemID())) {
+          iterator.remove();
+        }
+      }
+      if (!prefs2.isEmpty()) {
+        trainingUsers.put(otherUserID, new GenericUserPreferenceArray(prefs2));
+      }
+    } else {
+      // otherwise just add all those other user's prefs
+      trainingUsers.put(otherUserID, prefs2Array);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java
new file mode 100644
index 0000000..2838b08
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.eval.IRStatistics;
+
+import com.google.common.base.Preconditions;
+
+public final class IRStatisticsImpl implements IRStatistics, Serializable {
+
+  private final double precision;
+  private final double recall;
+  private final double fallOut;
+  private final double ndcg;
+  private final double reach;
+
+  IRStatisticsImpl(double precision, double recall, double fallOut, double ndcg, double reach) {
+    Preconditions.checkArgument(Double.isNaN(precision) || (precision >= 0.0 && precision <= 1.0),
+        "Illegal precision: " + precision + ". Must be: 0.0 <= precision <= 1.0 or NaN");
+    Preconditions.checkArgument(Double.isNaN(recall) || (recall >= 0.0 && recall <= 1.0), 
+        "Illegal recall: " + recall + ". Must be: 0.0 <= recall <= 1.0 or NaN");
+    Preconditions.checkArgument(Double.isNaN(fallOut) || (fallOut >= 0.0 && fallOut <= 1.0),
+        "Illegal fallOut: " + fallOut + ". Must be: 0.0 <= fallOut <= 1.0 or NaN");
+    Preconditions.checkArgument(Double.isNaN(ndcg) || (ndcg >= 0.0 && ndcg <= 1.0), 
+        "Illegal nDCG: " + ndcg + ". Must be: 0.0 <= nDCG <= 1.0 or NaN");
+    Preconditions.checkArgument(Double.isNaN(reach) || (reach >= 0.0 && reach <= 1.0), 
+        "Illegal reach: " + reach + ". Must be: 0.0 <= reach <= 1.0 or NaN");
+    this.precision = precision;
+    this.recall = recall;
+    this.fallOut = fallOut;
+    this.ndcg = ndcg;
+    this.reach = reach;
+  }
+
+  @Override
+  public double getPrecision() {
+    return precision;
+  }
+
+  @Override
+  public double getRecall() {
+    return recall;
+  }
+
+  @Override
+  public double getFallOut() {
+    return fallOut;
+  }
+
+  @Override
+  public double getF1Measure() {
+    return getFNMeasure(1.0);
+  }
+
+  @Override
+  public double getFNMeasure(double b) {
+    double b2 = b * b;
+    double sum = b2 * precision + recall;
+    return sum == 0.0 ? Double.NaN : (1.0 + b2) * precision * recall / sum;
+  }
+
+  @Override
+  public double getNormalizedDiscountedCumulativeGain() {
+    return ndcg;
+  }
+
+  @Override
+  public double getReach() {
+    return reach;
+  }
+
+  @Override
+  public String toString() {
+    return "IRStatisticsImpl[precision:" + precision + ",recall:" + recall + ",fallOut:"
+        + fallOut + ",nDCG:" + ndcg + ",reach:" + reach + ']';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java
new file mode 100644
index 0000000..213f7f9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+import java.util.concurrent.Callable;
+
+final class LoadCallable implements Callable<Void> {
+
+  private final Recommender recommender;
+  private final long userID;
+
+  LoadCallable(Recommender recommender, long userID) {
+    this.recommender = recommender;
+    this.userID = userID;
+  }
+
+  @Override
+  public Void call() throws Exception {
+    recommender.recommend(userID, 10);
+    return null;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java
new file mode 100644
index 0000000..2d27a37
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+/**
+ * Simple helper class for running load on a Recommender.
+ */
+public final class LoadEvaluator {
+  
+  private LoadEvaluator() { }
+
+  public static LoadStatistics runLoad(Recommender recommender) throws TasteException {
+    return runLoad(recommender, 10);
+  }
+  
+  public static LoadStatistics runLoad(Recommender recommender, int howMany) throws TasteException {
+    DataModel dataModel = recommender.getDataModel();
+    int numUsers = dataModel.getNumUsers();
+    double sampleRate = 1000.0 / numUsers;
+    LongPrimitiveIterator userSampler =
+        SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel.getUserIDs(), sampleRate);
+    recommender.recommend(userSampler.next(), howMany); // Warm up
+    Collection<Callable<Void>> callables = new ArrayList<>();
+    while (userSampler.hasNext()) {
+      callables.add(new LoadCallable(recommender, userSampler.next()));
+    }
+    AtomicInteger noEstimateCounter = new AtomicInteger();
+    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
+    AbstractDifferenceRecommenderEvaluator.execute(callables, noEstimateCounter, timing);
+    return new LoadStatistics(timing);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java
new file mode 100644
index 0000000..f89160c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+
+public final class LoadStatistics {
+  
+  private final RunningAverage timing;
+
+  LoadStatistics(RunningAverage timing) {
+    this.timing = timing;
+  }
+
+  public RunningAverage getTiming() {
+    return timing;
+  }
+  
+}


[02/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
new file mode 100644
index 0000000..0e7ee96
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
@@ -0,0 +1,417 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.commandline;
+
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.clustering.kernel.TriangularKernelProfile;
+
+
+public final class DefaultOptionCreator {
+  
+  public static final String CLUSTERING_OPTION = "clustering";
+  
+  public static final String CLUSTERS_IN_OPTION = "clusters";
+  
+  public static final String CONVERGENCE_DELTA_OPTION = "convergenceDelta";
+  
+  public static final String DISTANCE_MEASURE_OPTION = "distanceMeasure";
+  
+  public static final String EMIT_MOST_LIKELY_OPTION = "emitMostLikely";
+  
+  public static final String INPUT_OPTION = "input";
+  
+  public static final String MAX_ITERATIONS_OPTION = "maxIter";
+  
+  public static final String MAX_REDUCERS_OPTION = "maxRed";
+  
+  public static final String METHOD_OPTION = "method";
+  
+  public static final String NUM_CLUSTERS_OPTION = "numClusters";
+  
+  public static final String OUTPUT_OPTION = "output";
+  
+  public static final String OVERWRITE_OPTION = "overwrite";
+  
+  public static final String T1_OPTION = "t1";
+  
+  public static final String T2_OPTION = "t2";
+  
+  public static final String T3_OPTION = "t3";
+  
+  public static final String T4_OPTION = "t4";
+  
+  public static final String OUTLIER_THRESHOLD = "outlierThreshold";
+  
+  public static final String CLUSTER_FILTER_OPTION = "clusterFilter";
+  
+  public static final String THRESHOLD_OPTION = "threshold";
+  
+  public static final String SEQUENTIAL_METHOD = "sequential";
+  
+  public static final String MAPREDUCE_METHOD = "mapreduce";
+  
+  public static final String KERNEL_PROFILE_OPTION = "kernelProfile";
+
+  public static final String ANALYZER_NAME_OPTION = "analyzerName";
+ 
+  public static final String RANDOM_SEED = "randomSeed";
+  
+  private DefaultOptionCreator() {}
+  
+  /**
+   * Returns a default command line option for help. Used by all clustering jobs
+   * and many others
+   * */
+  public static Option helpOption() {
+    return new DefaultOptionBuilder().withLongName("help")
+        .withDescription("Print out help").withShortName("h").create();
+  }
+  
+  /**
+   * Returns a default command line option for input directory specification.
+   * Used by all clustering jobs plus others
+   */
+  public static DefaultOptionBuilder inputOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(INPUT_OPTION)
+        .withRequired(false)
+        .withShortName("i")
+        .withArgument(
+            new ArgumentBuilder().withName(INPUT_OPTION).withMinimum(1)
+                .withMaximum(1).create())
+        .withDescription("Path to job input directory.");
+  }
+  
+  /**
+   * Returns a default command line option for clusters input directory
+   * specification. Used by FuzzyKmeans, Kmeans
+   */
+  public static DefaultOptionBuilder clustersInOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(CLUSTERS_IN_OPTION)
+        .withRequired(true)
+        .withArgument(
+            new ArgumentBuilder().withName(CLUSTERS_IN_OPTION).withMinimum(1)
+                .withMaximum(1).create())
+        .withDescription(
+            "The path to the initial clusters directory. Must be a SequenceFile of some type of Cluster")
+        .withShortName("c");
+  }
+  
+  /**
+   * Returns a default command line option for output directory specification.
+   * Used by all clustering jobs plus others
+   */
+  public static DefaultOptionBuilder outputOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(OUTPUT_OPTION)
+        .withRequired(false)
+        .withShortName("o")
+        .withArgument(
+            new ArgumentBuilder().withName(OUTPUT_OPTION).withMinimum(1)
+                .withMaximum(1).create())
+        .withDescription("The directory pathname for output.");
+  }
+  
+  /**
+   * Returns a default command line option for output directory overwriting.
+   * Used by all clustering jobs
+   */
+  public static DefaultOptionBuilder overwriteOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(OVERWRITE_OPTION)
+        .withRequired(false)
+        .withDescription(
+            "If present, overwrite the output directory before running job")
+        .withShortName("ow");
+  }
+  
+  /**
+   * Returns a default command line option for specification of distance measure
+   * class to use. Used by Canopy, FuzzyKmeans, Kmeans, MeanShift
+   */
+  public static DefaultOptionBuilder distanceMeasureOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(DISTANCE_MEASURE_OPTION)
+        .withRequired(false)
+        .withShortName("dm")
+        .withArgument(
+            new ArgumentBuilder().withName(DISTANCE_MEASURE_OPTION)
+                .withDefault(SquaredEuclideanDistanceMeasure.class.getName())
+                .withMinimum(1).withMaximum(1).create())
+        .withDescription(
+            "The classname of the DistanceMeasure. Default is SquaredEuclidean");
+  }
+  
+  /**
+   * Returns a default command line option for specification of sequential or
+   * parallel operation. Used by Canopy, FuzzyKmeans, Kmeans, MeanShift,
+   * Dirichlet
+   */
+  public static DefaultOptionBuilder methodOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(METHOD_OPTION)
+        .withRequired(false)
+        .withShortName("xm")
+        .withArgument(
+            new ArgumentBuilder().withName(METHOD_OPTION)
+                .withDefault(MAPREDUCE_METHOD).withMinimum(1).withMaximum(1)
+                .create())
+        .withDescription(
+            "The execution method to use: sequential or mapreduce. Default is mapreduce");
+  }
+  
+  /**
+   * Returns a default command line option for specification of T1. Used by
+   * Canopy, MeanShift
+   */
+  public static DefaultOptionBuilder t1Option() {
+    return new DefaultOptionBuilder()
+        .withLongName(T1_OPTION)
+        .withRequired(true)
+        .withArgument(
+            new ArgumentBuilder().withName(T1_OPTION).withMinimum(1)
+                .withMaximum(1).create()).withDescription("T1 threshold value")
+        .withShortName(T1_OPTION);
+  }
+  
+  /**
+   * Returns a default command line option for specification of T2. Used by
+   * Canopy, MeanShift
+   */
+  public static DefaultOptionBuilder t2Option() {
+    return new DefaultOptionBuilder()
+        .withLongName(T2_OPTION)
+        .withRequired(true)
+        .withArgument(
+            new ArgumentBuilder().withName(T2_OPTION).withMinimum(1)
+                .withMaximum(1).create()).withDescription("T2 threshold value")
+        .withShortName(T2_OPTION);
+  }
+  
+  /**
+   * Returns a default command line option for specification of T3 (Reducer T1).
+   * Used by Canopy
+   */
+  public static DefaultOptionBuilder t3Option() {
+    return new DefaultOptionBuilder()
+        .withLongName(T3_OPTION)
+        .withRequired(false)
+        .withArgument(
+            new ArgumentBuilder().withName(T3_OPTION).withMinimum(1)
+                .withMaximum(1).create())
+        .withDescription("T3 (Reducer T1) threshold value")
+        .withShortName(T3_OPTION);
+  }
+  
+  /**
+   * Returns a default command line option for specification of T4 (Reducer T2).
+   * Used by Canopy
+   */
+  public static DefaultOptionBuilder t4Option() {
+    return new DefaultOptionBuilder()
+        .withLongName(T4_OPTION)
+        .withRequired(false)
+        .withArgument(
+            new ArgumentBuilder().withName(T4_OPTION).withMinimum(1)
+                .withMaximum(1).create())
+        .withDescription("T4 (Reducer T2) threshold value")
+        .withShortName(T4_OPTION);
+  }
+  
+  /**
+ * @return a DefaultOptionBuilder for the clusterFilter option
+ */
+  public static DefaultOptionBuilder clusterFilterOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(CLUSTER_FILTER_OPTION)
+        .withShortName("cf")
+        .withRequired(false)
+        .withArgument(
+            new ArgumentBuilder().withName(CLUSTER_FILTER_OPTION).withMinimum(1)
+                .withMaximum(1).create())
+        .withDescription("Cluster filter suppresses small canopies from mapper")
+        .withShortName(CLUSTER_FILTER_OPTION);
+  }
+  
+  /**
+   * Returns a default command line option for specification of max number of
+   * iterations. Used by Dirichlet, FuzzyKmeans, Kmeans, LDA
+   */
+  public static DefaultOptionBuilder maxIterationsOption() {
+    // default value used by LDA which overrides withRequired(false)
+    return new DefaultOptionBuilder()
+        .withLongName(MAX_ITERATIONS_OPTION)
+        .withRequired(true)
+        .withShortName("x")
+        .withArgument(
+            new ArgumentBuilder().withName(MAX_ITERATIONS_OPTION)
+                .withDefault("-1").withMinimum(1).withMaximum(1).create())
+        .withDescription("The maximum number of iterations.");
+  }
+  
+  /**
+   * Returns a default command line option for specification of numbers of
+   * clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
+   */
+  public static DefaultOptionBuilder numClustersOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(NUM_CLUSTERS_OPTION)
+        .withRequired(false)
+        .withArgument(
+            new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1)
+                .create()).withDescription("The number of clusters to create")
+        .withShortName("k");
+  }
+
+  public static DefaultOptionBuilder useSetRandomSeedOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(RANDOM_SEED)
+        .withRequired(false)
+        .withArgument(new ArgumentBuilder().withName(RANDOM_SEED).create())
+        .withDescription("Seed to initaize Random Number Generator with")
+        .withShortName("rs");
+  }
+  
+  /**
+   * Returns a default command line option for convergence delta specification.
+   * Used by FuzzyKmeans, Kmeans, MeanShift
+   */
+  public static DefaultOptionBuilder convergenceOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(CONVERGENCE_DELTA_OPTION)
+        .withRequired(false)
+        .withShortName("cd")
+        .withArgument(
+            new ArgumentBuilder().withName(CONVERGENCE_DELTA_OPTION)
+                .withDefault("0.5").withMinimum(1).withMaximum(1).create())
+        .withDescription("The convergence delta value. Default is 0.5");
+  }
+  
+  /**
+   * Returns a default command line option for specifying the max number of
+   * reducers. Used by Dirichlet, FuzzyKmeans, Kmeans and LDA
+   * 
+   * @deprecated
+   */
+  @Deprecated
+  public static DefaultOptionBuilder numReducersOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(MAX_REDUCERS_OPTION)
+        .withRequired(false)
+        .withShortName("r")
+        .withArgument(
+            new ArgumentBuilder().withName(MAX_REDUCERS_OPTION)
+                .withDefault("2").withMinimum(1).withMaximum(1).create())
+        .withDescription("The number of reduce tasks. Defaults to 2");
+  }
+  
+  /**
+   * Returns a default command line option for clustering specification. Used by
+   * all clustering except LDA
+   */
+  public static DefaultOptionBuilder clusteringOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(CLUSTERING_OPTION)
+        .withRequired(false)
+        .withDescription(
+            "If present, run clustering after the iterations have taken place")
+        .withShortName("cl");
+  }
+
+  /**
+   * Returns a default command line option for specifying a Lucene analyzer class
+   * @return {@link DefaultOptionBuilder}
+   */
+  public static DefaultOptionBuilder analyzerOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(ANALYZER_NAME_OPTION)
+        .withRequired(false)
+        .withDescription("If present, the name of a Lucene analyzer class to use")
+        .withArgument(new ArgumentBuilder().withName(ANALYZER_NAME_OPTION).withDefault(StandardAnalyzer.class.getName())
+            .withMinimum(1).withMaximum(1).create())
+       .withShortName("an");
+  }
+
+  
+  /**
+   * Returns a default command line option for specifying the emitMostLikely
+   * flag. Used by Dirichlet and FuzzyKmeans
+   */
+  public static DefaultOptionBuilder emitMostLikelyOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(EMIT_MOST_LIKELY_OPTION)
+        .withRequired(false)
+        .withShortName("e")
+        .withArgument(
+            new ArgumentBuilder().withName(EMIT_MOST_LIKELY_OPTION)
+                .withDefault("true").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+            "True if clustering should emit the most likely point only, "
+                + "false for threshold clustering. Default is true");
+  }
+  
+  /**
+   * Returns a default command line option for specifying the clustering
+   * threshold value. Used by Dirichlet and FuzzyKmeans
+   */
+  public static DefaultOptionBuilder thresholdOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(THRESHOLD_OPTION)
+        .withRequired(false)
+        .withShortName("t")
+        .withArgument(
+            new ArgumentBuilder().withName(THRESHOLD_OPTION).withDefault("0")
+                .withMinimum(1).withMaximum(1).create())
+        .withDescription(
+            "The pdf threshold used for cluster determination. Default is 0");
+  }
+  
+  public static DefaultOptionBuilder kernelProfileOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(KERNEL_PROFILE_OPTION)
+        .withRequired(false)
+        .withShortName("kp")
+        .withArgument(
+            new ArgumentBuilder()
+                .withName(KERNEL_PROFILE_OPTION)
+                .withDefault(TriangularKernelProfile.class.getName())
+                .withMinimum(1).withMaximum(1).create())
+        .withDescription(
+            "The classname of the IKernelProfile. Default is TriangularKernelProfile");
+  }
+  
+  /**
+   * Returns a default command line option for specification of OUTLIER THRESHOLD value. Used for
+   * Cluster Classification.
+   */
+  public static DefaultOptionBuilder outlierThresholdOption() {
+    return new DefaultOptionBuilder()
+        .withLongName(OUTLIER_THRESHOLD)
+        .withRequired(false)
+        .withArgument(
+            new ArgumentBuilder().withName(OUTLIER_THRESHOLD).withMinimum(1)
+                .withMaximum(1).create()).withDescription("Outlier threshold value")
+        .withShortName(OUTLIER_THRESHOLD);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java
new file mode 100644
index 0000000..61aa9a5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * This class implements a "Chebyshev distance" metric by finding the maximum difference
+ * between each coordinate. Also 'chessboard distance' due to the moves a king can make.
+ */
+public class ChebyshevDistanceMeasure implements DistanceMeasure {
+  
+  @Override
+  public void configure(Configuration job) {
+    // nothing to do
+  }
+  
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return Collections.emptyList();
+  }
+  
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    // nothing to do
+  }
+  
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    if (v1.size() != v2.size()) {
+      throw new CardinalityException(v1.size(), v2.size());
+    }
+    return v1.aggregate(v2, Functions.MAX_ABS, Functions.MINUS);
+  }
+  
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java
new file mode 100644
index 0000000..37265eb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.Vector;
+
+/**
+ * This class implements a cosine distance metric by dividing the dot product of two vectors by the product of their
+ * lengths.  That gives the cosine of the angle between the two vectors.  To convert this to a usable distance,
+ * 1-cos(angle) is what is actually returned.
+ */
+public class CosineDistanceMeasure implements DistanceMeasure {
+  
+  @Override
+  public void configure(Configuration job) {
+    // nothing to do
+  }
+  
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return Collections.emptyList();
+  }
+  
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    // nothing to do
+  }
+  
+  public static double distance(double[] p1, double[] p2) {
+    double dotProduct = 0.0;
+    double lengthSquaredp1 = 0.0;
+    double lengthSquaredp2 = 0.0;
+    for (int i = 0; i < p1.length; i++) {
+      lengthSquaredp1 += p1[i] * p1[i];
+      lengthSquaredp2 += p2[i] * p2[i];
+      dotProduct += p1[i] * p2[i];
+    }
+    double denominator = Math.sqrt(lengthSquaredp1) * Math.sqrt(lengthSquaredp2);
+    
+    // correct for floating-point rounding errors
+    if (denominator < dotProduct) {
+      denominator = dotProduct;
+    }
+    
+    // correct for zero-vector corner case
+    if (denominator == 0 && dotProduct == 0) {
+      return 0;
+    }
+    
+    return 1.0 - dotProduct / denominator;
+  }
+  
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    if (v1.size() != v2.size()) {
+      throw new CardinalityException(v1.size(), v2.size());
+    }
+    double lengthSquaredv1 = v1.getLengthSquared();
+    double lengthSquaredv2 = v2.getLengthSquared();
+    
+    double dotProduct = v2.dot(v1);
+    double denominator = Math.sqrt(lengthSquaredv1) * Math.sqrt(lengthSquaredv2);
+    
+    // correct for floating-point rounding errors
+    if (denominator < dotProduct) {
+      denominator = dotProduct;
+    }
+    
+    // correct for zero-vector corner case
+    if (denominator == 0 && dotProduct == 0) {
+      return 0;
+    }
+    
+    return 1.0 - dotProduct / denominator;
+  }
+  
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    
+    double lengthSquaredv = v.getLengthSquared();
+    
+    double dotProduct = v.dot(centroid);
+    double denominator = Math.sqrt(centroidLengthSquare) * Math.sqrt(lengthSquaredv);
+    
+    // correct for floating-point rounding errors
+    if (denominator < dotProduct) {
+      denominator = dotProduct;
+    }
+    
+    // correct for zero-vector corner case
+    if (denominator == 0 && dotProduct == 0) {
+      return 0;
+    }
+    
+    return 1.0 - dotProduct / denominator;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java
new file mode 100644
index 0000000..696e79c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.common.parameters.Parametered;
+import org.apache.mahout.math.Vector;
+
/**
 * Defines a distance metric between two points represented as {@link Vector}s.
 * Extends {@link Parametered} so implementations can expose configurable
 * parameters.
 */
public interface DistanceMeasure extends Parametered {
  
  /**
   * Returns the distance metric applied to the arguments
   * 
   * @param v1
   *          a Vector defining a multidimensional point in some feature space
   * @param v2
   *          a Vector defining a multidimensional point in some feature space
   * @return a scalar double of the distance
   */
  double distance(Vector v1, Vector v2);
  
  /**
   * Optimized version of distance metric for sparse vectors. This distance computation requires operations
   * proportional to the number of non-zero elements in the vector instead of the cardinality of the vector.
   * 
   * @param centroidLengthSquare
   *          Square of the length of centroid
   * @param centroid
   *          Centroid vector
   * @param v
   *          the point whose distance from {@code centroid} is measured
   * @return a scalar double of the distance
   */
  double distance(double centroidLengthSquare, Vector centroid, Vector v);
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java
new file mode 100644
index 0000000..665678d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * This class implements a Euclidean distance metric by summing the square root of the squared differences
+ * between each coordinate.
+ * <p/>
+ * If you don't care about the true distance and only need the values for comparison, then the base class,
+ * {@link SquaredEuclideanDistanceMeasure}, will be faster since it doesn't do the actual square root of the
+ * squared differences.
+ */
+public class EuclideanDistanceMeasure extends SquaredEuclideanDistanceMeasure {
+  
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    return Math.sqrt(super.distance(v1, v2));
+  }
+  
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return Math.sqrt(super.distance(centroidLengthSquare, centroid, v));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java
new file mode 100644
index 0000000..17ee714
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java
@@ -0,0 +1,197 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.io.DataInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.parameters.ClassParameter;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.common.parameters.PathParameter;
+import org.apache.mahout.math.Algebra;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.SingularValueDecomposition;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Distance measure implementing the Mahalanobis distance:
+ * d(v1, v2) = sqrt((v1 - v2)^T * S^-1 * (v1 - v2)), where S is a covariance matrix.
+ * See http://en.wikipedia.org/wiki/Mahalanobis_distance for details.
+ */
+public class MahalanobisDistanceMeasure implements DistanceMeasure {
+
+  // Precomputed inverse covariance matrix S^-1; must be set (directly or via
+  // configure()) before any distance() method is called.
+  private Matrix inverseCovarianceMatrix;
+  // Mean of the distribution; used only by the single-argument distance(Vector).
+  private Vector meanVector;
+
+  // Serialization classes and DFS paths used by configure() to load the matrix/vector.
+  private ClassParameter vectorClass;
+  private ClassParameter matrixClass;
+  private List<Parameter<?>> parameters;
+  private Parameter<Path> inverseCovarianceFile;
+  private Parameter<Path> meanVectorFile;
+
+  /**
+   * Loads the inverse covariance matrix and mean vector from the DFS paths named in
+   * the job configuration, for whichever of the two parameters are present.
+   *
+   * @throws IllegalStateException if a configured file is missing or unreadable
+   */
+  @Override
+  public void configure(Configuration jobConf) {
+    if (parameters == null) {
+      ParameteredGeneralizations.configureParameters(this, jobConf);
+    }
+    try {
+      if (inverseCovarianceFile.get() != null) {
+        FileSystem fs = FileSystem.get(inverseCovarianceFile.get().toUri(), jobConf);
+        // Unchecked cast: matrixClass is expected to name a MatrixWritable implementation.
+        MatrixWritable inverseCovarianceMatrix = 
+            ClassUtils.instantiateAs((Class<? extends MatrixWritable>) matrixClass.get(), MatrixWritable.class);
+        if (!fs.exists(inverseCovarianceFile.get())) {
+          throw new FileNotFoundException(inverseCovarianceFile.get().toString());
+        }
+        try (DataInputStream in = fs.open(inverseCovarianceFile.get())){
+          inverseCovarianceMatrix.readFields(in);
+        }
+        this.inverseCovarianceMatrix = inverseCovarianceMatrix.get();
+        Preconditions.checkArgument(this.inverseCovarianceMatrix != null, "inverseCovarianceMatrix not initialized");
+      }
+
+      if (meanVectorFile.get() != null) {
+        FileSystem fs = FileSystem.get(meanVectorFile.get().toUri(), jobConf);
+        // Unchecked cast: vectorClass is expected to name a VectorWritable implementation.
+        VectorWritable meanVector = 
+            ClassUtils.instantiateAs((Class<? extends VectorWritable>) vectorClass.get(), VectorWritable.class);
+        if (!fs.exists(meanVectorFile.get())) {
+          throw new FileNotFoundException(meanVectorFile.get().toString());
+        }
+        try (DataInputStream in = fs.open(meanVectorFile.get())){
+          meanVector.readFields(in);
+        }
+        this.meanVector = meanVector.get();
+        Preconditions.checkArgument(this.meanVector != null, "meanVector not initialized");
+      }
+
+    } catch (IOException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return parameters;
+  }
+
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    parameters = new ArrayList<>();
+    inverseCovarianceFile = new PathParameter(prefix, "inverseCovarianceFile", jobConf, null,
+            "Path on DFS to a file containing the inverse covariance matrix.");
+    parameters.add(inverseCovarianceFile);
+
+    // NOTE(review): the key "maxtrixClass" is misspelled, but existing job
+    // configurations may already rely on it -- confirm before renaming.
+    matrixClass = new ClassParameter(prefix, "maxtrixClass", jobConf, DenseMatrix.class,
+            "Class<Matix> file specified in parameter inverseCovarianceFile has been serialized with.");
+    parameters.add(matrixClass);
+
+    meanVectorFile = new PathParameter(prefix, "meanVectorFile", jobConf, null,
+            "Path on DFS to a file containing the mean Vector.");
+    parameters.add(meanVectorFile);
+
+    vectorClass = new ClassParameter(prefix, "vectorClass", jobConf, DenseVector.class,
+            "Class file specified in parameter meanVectorFile has been serialized with.");
+    parameters.add(vectorClass);
+  }
+
+  /**
+   * @param v The vector to compute the distance to
+   * @return Mahalanobis distance of a multivariate vector
+   */
+  public double distance(Vector v) {
+    return Math.sqrt(v.minus(meanVector).dot(Algebra.mult(inverseCovarianceMatrix, v.minus(meanVector))));
+  }
+
+  /**
+   * @throws CardinalityException if the two vectors differ in size
+   */
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    if (v1.size() != v2.size()) {
+      throw new CardinalityException(v1.size(), v2.size());
+    }
+    return Math.sqrt(v1.minus(v2).dot(Algebra.mult(inverseCovarianceMatrix, v1.minus(v2))));
+  }
+
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO
+  }
+
+  public void setInverseCovarianceMatrix(Matrix inverseCovarianceMatrix) {
+    Preconditions.checkArgument(inverseCovarianceMatrix != null, "inverseCovarianceMatrix not initialized");
+    this.inverseCovarianceMatrix = inverseCovarianceMatrix;
+  }
+
+
+  /**
+   * Computes the inverse covariance from the input covariance matrix given in input.
+   *
+   * @param m A covariance matrix.
+   * @throws IllegalArgumentException if <tt>eigen values equal to 0 found</tt>.
+   */
+  public void setCovarianceMatrix(Matrix m) {
+    if (m.numRows() != m.numCols()) {
+      throw new CardinalityException(m.numRows(), m.numCols());
+    }
+    // See http://www.mlahanas.de/Math/svd.htm for details,
+    // which specifically details the case of covariance matrix inversion
+    // Complexity: O(min(nm2,mn2))
+    SingularValueDecomposition svd = new SingularValueDecomposition(m);
+    Matrix sInv = svd.getS();
+    // Inverse Diagonal Elems
+    for (int i = 0; i < sInv.numRows(); i++) {
+      double diagElem = sInv.get(i, i);
+      if (diagElem > 0.0) {
+        sInv.set(i, i, 1 / diagElem);
+      } else {
+        // NOTE(review): throws IllegalStateException although the javadoc above
+        // advertises IllegalArgumentException -- confirm which is intended.
+        throw new IllegalStateException("Eigen Value equals to 0 found.");
+      }
+    }
+    inverseCovarianceMatrix = svd.getU().times(sInv.times(svd.getU().transpose()));
+    Preconditions.checkArgument(inverseCovarianceMatrix != null, "inverseCovarianceMatrix not initialized");
+  }
+
+  public Matrix getInverseCovarianceMatrix() {
+    return inverseCovarianceMatrix;
+  }
+
+  public void setMeanVector(Vector meanVector) {
+    Preconditions.checkArgument(meanVector != null, "meanVector not initialized");
+    this.meanVector = meanVector;
+  }
+
+  public Vector getMeanVector() {
+    return meanVector;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java
new file mode 100644
index 0000000..5c32fcf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * This class implements a "manhattan distance" metric by summing the absolute values of the difference
+ * between each coordinate
+ */
+public class ManhattanDistanceMeasure implements DistanceMeasure {
+
+  public static double distance(double[] p1, double[] p2) {
+    double result = 0.0;
+    for (int i = 0; i < p1.length; i++) {
+      result += Math.abs(p2[i] - p1[i]);
+    }
+    return result;
+  }
+
+  @Override
+  public void configure(Configuration job) {
+  // nothing to do
+  }
+
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return Collections.emptyList();
+  }
+
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+  // nothing to do
+  }
+
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    if (v1.size() != v2.size()) {
+      throw new CardinalityException(v1.size(), v2.size());
+    }
+    return v1.aggregate(v2, Functions.PLUS, Functions.MINUS_ABS);
+  }
+
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java
new file mode 100644
index 0000000..c3a48cb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.DoubleParameter;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/** 
+ * Implement Minkowski distance, a real-valued generalization of the 
+ * integral L(n) distances: Manhattan = L1, Euclidean = L2. 
+ * For high numbers of dimensions, very high exponents give more useful distances. 
+ * 
+ * Note: Math.pow is clever about integer-valued doubles.
+ **/
+public class MinkowskiDistanceMeasure implements DistanceMeasure {
+
+  private static final double EXPONENT = 3.0;
+
+  private List<Parameter<?>> parameters;
+  private double exponent = EXPONENT;
+  
+  public MinkowskiDistanceMeasure() {
+  }
+  
+  public MinkowskiDistanceMeasure(double exponent) {
+    this.exponent = exponent;
+  }
+
+  @Override
+  public void createParameters(String prefix, Configuration conf) {
+    parameters = new ArrayList<>();
+    Parameter<?> param =
+        new DoubleParameter(prefix, "exponent", conf, EXPONENT, "Exponent for Fractional Lagrange distance");
+    parameters.add(param);
+  }
+
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return parameters;
+  }
+
+  @Override
+  public void configure(Configuration jobConf) {
+    if (parameters == null) {
+      ParameteredGeneralizations.configureParameters(this, jobConf);
+    }
+  }
+
+  public double getExponent() {
+    return exponent;
+  }
+
+  public void setExponent(double exponent) {
+    this.exponent = exponent;
+  }
+
+  /**
+   *  Math.pow is clever about integer-valued doubles
+   */
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    return Math.pow(v1.aggregate(v2, Functions.PLUS, Functions.minusAbsPow(exponent)), 1.0 / exponent);
+  }
+
+  // TODO: how?
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO - can this use centroidLengthSquare somehow?
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java
new file mode 100644
index 0000000..66da121
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Like {@link EuclideanDistanceMeasure} but it does not take the square root.
+ * <p/>
+ * Thus, it is not actually the Euclidean Distance, but it is saves on computation when you only need the
+ * distance for comparison and don't care about the actual value as a distance.
+ */
+public class SquaredEuclideanDistanceMeasure implements DistanceMeasure {
+  
+  @Override
+  public void configure(Configuration job) {
+  // nothing to do
+  }
+  
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return Collections.emptyList();
+  }
+  
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+  // nothing to do
+  }
+  
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    return v2.getDistanceSquared(v1);
+  }
+  
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return centroidLengthSquare - 2 * v.dot(centroid) + v.getLengthSquared();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java
new file mode 100644
index 0000000..cfeb119
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * Tanimoto coefficient implementation.
+ * 
+ * http://en.wikipedia.org/wiki/Jaccard_index
+ */
+public class TanimotoDistanceMeasure extends WeightedDistanceMeasure {
+  
+  /**
+   * Calculates the distance between two vectors.
+   * 
+   * The coefficient (a measure of similarity) is: T(a, b) = a.b / (|a|^2 + |b|^2 - a.b)
+   * 
+   * The distance d(a,b) = 1 - T(a,b)
+   * 
+   * @return 0 for perfect match, > 0 for greater distance
+   */
+  @Override
+  public double distance(Vector a, Vector b) {
+    double ab;
+    double denominator;
+    if (getWeights() != null) {
+      ab = a.times(b).aggregate(getWeights(), Functions.PLUS, Functions.MULT);
+      denominator = a.aggregate(getWeights(), Functions.PLUS, Functions.MULT_SQUARE_LEFT)
+          + b.aggregate(getWeights(), Functions.PLUS, Functions.MULT_SQUARE_LEFT)
+          - ab;
+    } else {
+      ab = b.dot(a); // b is SequentialAccess
+      denominator = a.getLengthSquared() + b.getLengthSquared() - ab;
+    }
+    
+    if (denominator < ab) { // correct for fp round-off: distance >= 0
+      denominator = ab;
+    }
+    if (denominator > 0) {
+      // denominator == 0 only when dot(a,a) == dot(b,b) == dot(a,b) == 0
+      return 1.0 - ab / denominator;
+    } else {
+      return 0.0;
+    }
+  }
+
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java
new file mode 100644
index 0000000..1acbe86
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.io.DataInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.parameters.ClassParameter;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.common.parameters.PathParameter;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Abstract implementation of DistanceMeasure with support for weights.
+ * The optional per-dimension weight vector is {@code null} unless set via
+ * {@link #setWeights(Vector)} or loaded from DFS by {@link #configure(Configuration)}.
+ */
+public abstract class WeightedDistanceMeasure implements DistanceMeasure {
+  
+  private List<Parameter<?>> parameters;
+  // DFS path of the serialized weight vector; may be left unset (no weighting).
+  private Parameter<Path> weightsFile;
+  // Vector implementation the weights file was serialized with.
+  private ClassParameter vectorClass;
+  private Vector weights;
+  
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    parameters = new ArrayList<>();
+    weightsFile = new PathParameter(prefix, "weightsFile", jobConf, null,
+        "Path on DFS to a file containing the weights.");
+    parameters.add(weightsFile);
+    vectorClass = new ClassParameter(prefix, "vectorClass", jobConf, DenseVector.class,
+        "Class<Vector> file specified in parameter weightsFile has been serialized with.");
+    parameters.add(vectorClass);
+  }
+  
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return parameters;
+  }
+  
+  /**
+   * Loads the weight vector from the DFS path named in the job configuration,
+   * when the weightsFile parameter is set.
+   *
+   * @throws IllegalStateException if the configured file is missing or unreadable
+   */
+  @Override
+  public void configure(Configuration jobConf) {
+    if (parameters == null) {
+      ParameteredGeneralizations.configureParameters(this, jobConf);
+    }
+    try {
+      if (weightsFile.get() != null) {
+        FileSystem fs = FileSystem.get(weightsFile.get().toUri(), jobConf);
+        // Unchecked cast: vectorClass is expected to name a VectorWritable implementation.
+        VectorWritable weights =
+            ClassUtils.instantiateAs((Class<? extends VectorWritable>) vectorClass.get(), VectorWritable.class);
+        if (!fs.exists(weightsFile.get())) {
+          throw new FileNotFoundException(weightsFile.get().toString());
+        }
+        try (DataInputStream in = fs.open(weightsFile.get())){
+          weights.readFields(in);
+        }
+        this.weights = weights.get();
+      }
+    } catch (IOException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+  
+  /** @return the weight vector, or {@code null} if no weights are configured */
+  public Vector getWeights() {
+    return weights;
+  }
+  
+  public void setWeights(Vector weights) {
+    this.weights = weights;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java
new file mode 100644
index 0000000..4c78d9f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+
+/**
+ * This class implements a Euclidean distance metric by summing the square root of the squared differences
+ * between each coordinate, optionally adding weights.
+ */
+public class WeightedEuclideanDistanceMeasure extends WeightedDistanceMeasure {
+  
+  @Override
+  public double distance(Vector p1, Vector p2) {
+    double result = 0;
+    Vector res = p2.minus(p1);
+    Vector theWeights = getWeights();
+    if (theWeights == null) {
+      for (Element elt : res.nonZeroes()) {
+        result += elt.get() * elt.get();
+      }
+    } else {
+      for (Element elt : res.nonZeroes()) {
+        result += elt.get() * elt.get() * theWeights.get(elt.index());
+      }
+    }
+    return Math.sqrt(result);
+  }
+  
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java
new file mode 100644
index 0000000..2c280e2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+
+/**
+ * This class implements a "Manhattan distance" metric by summing the absolute values of the difference
+ * between each coordinate, optionally with weights.
+ */
+public class WeightedManhattanDistanceMeasure extends WeightedDistanceMeasure {
+  
+  @Override
+  public double distance(Vector p1, Vector p2) {
+    double result = 0;
+    
+    Vector res = p2.minus(p1);
+    if (getWeights() == null) {
+      for (Element elt : res.nonZeroes()) {
+        result += Math.abs(elt.get());
+      }
+      
+    } else {
+      for (Element elt : res.nonZeroes()) {
+        result += Math.abs(elt.get() * getWeights().get(elt.index()));
+      }
+    }
+    
+    return result;
+  }
+  
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java
new file mode 100644
index 0000000..73cc821
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Iterator;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+
+/**
+ * An iterator that copies the values in an underlying iterator by finding an appropriate copy constructor.
+ */
+public final class CopyConstructorIterator<T> extends ForwardingIterator<T> {
+
+  private final Iterator<T> delegate;
+  private Constructor<T> constructor;
+
+  public CopyConstructorIterator(Iterator<? extends T> copyFrom) {
+    this.delegate = Iterators.transform(
+        copyFrom,
+        new Function<T,T>() {
+          @Override
+          public T apply(T from) {
+            if (constructor == null) {
+              Class<T> elementClass = (Class<T>) from.getClass();
+              try {
+                constructor = elementClass.getConstructor(elementClass);
+              } catch (NoSuchMethodException e) {
+                throw new IllegalStateException(e);
+              }
+            }
+            try {
+              return constructor.newInstance(from);
+            } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
+              throw new IllegalStateException(e);
+            }
+          }
+        });
+  }
+
+  @Override
+  protected Iterator<T> delegate() {
+    return delegate;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java
new file mode 100644
index 0000000..658c1f1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import com.google.common.collect.AbstractIterator;
+
+/**
+ * Iterator over the integers {@code 0, 1, ..., to-1}, produced in ascending order.
+ * A non-positive bound yields an empty iterator.
+ */
+public final class CountingIterator extends AbstractIterator<Integer> {
+
+  // Exclusive upper bound of the sequence.
+  private final int to;
+  // Next value to emit; starts at 0.
+  private int count;
+
+  /**
+   * @param to exclusive upper bound of the counting sequence
+   */
+  public CountingIterator(int to) {
+    this.to = to;
+  }
+
+  @Override
+  protected Integer computeNext() {
+    if (count >= to) {
+      return endOfData();
+    }
+    return count++;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java
new file mode 100644
index 0000000..cfc18d6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Iterator;
+
+import com.google.common.base.Charsets;
+
+/**
+ * An {@link Iterable} over the lines of a text file or stream; each call to {@link #iterator()}
+ * produces a {@link FileLineIterator}. Lines are delimited in a manner consistent with how
+ * {@link java.io.BufferedReader} defines lines.
+ */
+public final class FileLineIterable implements Iterable<String> {
+
+  private final InputStream stream;
+  private final Charset charset;
+  private final boolean skipFirst;
+  // Original file name, used by FileLineIterator to detect .gz/.zip compression; "" when unknown.
+  private final String name;
+
+  /** Iterates over a given file's lines, assuming a UTF-8 encoding. */
+  public FileLineIterable(File file) throws IOException {
+    this(file, Charsets.UTF_8, false);
+  }
+
+  /** Iterates over a given file's lines, assuming UTF-8, optionally dropping the first line. */
+  public FileLineIterable(File file, boolean skipFirstLine) throws IOException {
+    this(file, Charsets.UTF_8, skipFirstLine);
+  }
+
+  /** Iterates over a given file's lines in the given encoding, optionally dropping the first line. */
+  public FileLineIterable(File file, Charset encoding, boolean skipFirstLine) throws IOException {
+    this(FileLineIterator.getFileInputStream(file), encoding, skipFirstLine);
+  }
+
+  public FileLineIterable(InputStream is) {
+    this(is, Charsets.UTF_8, false);
+  }
+
+  public FileLineIterable(InputStream is, boolean skipFirstLine) {
+    this(is, Charsets.UTF_8, skipFirstLine);
+  }
+
+  public FileLineIterable(InputStream is, Charset encoding, boolean skipFirstLine) {
+    this(is, encoding, skipFirstLine, "");
+  }
+
+  public FileLineIterable(InputStream is, Charset encoding, boolean skipFirstLine, String filename) {
+    this.stream = is;
+    this.charset = encoding;
+    this.skipFirst = skipFirstLine;
+    this.name = filename;
+  }
+
+  @Override
+  public Iterator<String> iterator() {
+    try {
+      return new FileLineIterator(stream, charset, skipFirst, this.name);
+    } catch (IOException ioe) {
+      // Iterable.iterator() cannot declare checked exceptions, so wrap.
+      throw new IllegalStateException(ioe);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java
new file mode 100644
index 0000000..b7cc51e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipInputStream;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Iterates over the lines of a text file. This assumes the text file's lines are delimited in a manner
+ * consistent with how {@link BufferedReader} defines lines.
+ * <p/>
+ * This class will uncompress files that end in .zip or .gz accordingly, too.
+ */
+public final class FileLineIterator extends AbstractIterator<String> implements SkippingIterator<String>, Closeable {
+
+  private final BufferedReader reader;
+
+  private static final Logger log = LoggerFactory.getLogger(FileLineIterator.class);
+
+  /**
+   * Creates a {@link FileLineIterator} over a given file, assuming a UTF-8 encoding.
+   *
+   * @throws java.io.FileNotFoundException if the file does not exist
+   * @throws IOException
+   *           if the file cannot be read
+   */
+  public FileLineIterator(File file) throws IOException {
+    this(file, Charsets.UTF_8, false);
+  }
+
+  /**
+   * Creates a {@link FileLineIterator} over a given file, assuming a UTF-8 encoding.
+   *
+   * @throws java.io.FileNotFoundException if the file does not exist
+   * @throws IOException                   if the file cannot be read
+   */
+  public FileLineIterator(File file, boolean skipFirstLine) throws IOException {
+    this(file, Charsets.UTF_8, skipFirstLine);
+  }
+
+  /**
+   * Creates a {@link FileLineIterator} over a given file, using the given encoding.
+   *
+   * @throws java.io.FileNotFoundException if the file does not exist
+   * @throws IOException                   if the file cannot be read
+   */
+  public FileLineIterator(File file, Charset encoding, boolean skipFirstLine) throws IOException {
+    this(getFileInputStream(file), encoding, skipFirstLine);
+  }
+
+  public FileLineIterator(InputStream is) throws IOException {
+    this(is, Charsets.UTF_8, false);
+  }
+
+  public FileLineIterator(InputStream is, boolean skipFirstLine) throws IOException {
+    this(is, Charsets.UTF_8, skipFirstLine);
+  }
+
+  /**
+   * Creates a {@link FileLineIterator} over a raw stream; no decompression is attempted here
+   * (there is no file name to detect a compression format from).
+   */
+  public FileLineIterator(InputStream is, Charset encoding, boolean skipFirstLine) throws IOException {
+    reader = new BufferedReader(new InputStreamReader(is, encoding));
+    if (skipFirstLine) {
+      reader.readLine();
+    }
+  }
+
+  /**
+   * Creates a {@link FileLineIterator} over a stream, using {@code filename}'s extension to decide
+   * whether the stream must be uncompressed (.gz or .zip) first.
+   */
+  public FileLineIterator(InputStream is, Charset encoding, boolean skipFirstLine, String filename)
+    throws IOException {
+    reader = new BufferedReader(new InputStreamReader(maybeDecompress(is, filename), encoding));
+    if (skipFirstLine) {
+      reader.readLine();
+    }
+  }
+
+  static InputStream getFileInputStream(File file) throws IOException {
+    return maybeDecompress(new FileInputStream(file), file.getName());
+  }
+
+  /**
+   * Wraps the stream in a {@link GZIPInputStream} or {@link ZipInputStream} when the name's
+   * extension is .gz or .zip (case-insensitive); otherwise returns the stream unchanged.
+   */
+  private static InputStream maybeDecompress(InputStream is, String name) throws IOException {
+    String extension = Files.getFileExtension(name.toLowerCase());
+    if ("gz".equals(extension)) {
+      return new GZIPInputStream(is);
+    }
+    if ("zip".equals(extension)) {
+      return new ZipInputStream(is);
+    }
+    return is;
+  }
+
+  @Override
+  protected String computeNext() {
+    String line;
+    try {
+      line = reader.readLine();
+    } catch (IOException ioe) {
+      // Best-effort close, then surface the read failure as unchecked.
+      try {
+        close();
+      } catch (IOException e) {
+        log.error(e.getMessage(), e);
+      }
+      throw new IllegalStateException(ioe);
+    }
+    return line == null ? endOfData() : line;
+  }
+
+
+  @Override
+  public void skip(int n) {
+    try {
+      for (int i = 0; i < n; i++) {
+        if (reader.readLine() == null) {
+          break;
+        }
+      }
+    } catch (IOException ioe) {
+      // Close best-effort, then rethrow unchecked — previously the triggering exception was
+      // silently swallowed when close() succeeded, hiding the skip failure from callers.
+      // This now matches computeNext()'s error handling.
+      try {
+        close();
+      } catch (IOException e) {
+        log.error(e.getMessage(), e);
+      }
+      throw new IllegalStateException(ioe);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    // Mark the iterator exhausted so subsequent hasNext() returns false, then release the reader
+    // (swallowing any close-time IOException, per Closeables.close(..., true)).
+    endOfData();
+    Closeables.close(reader, true);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java
new file mode 100644
index 0000000..1905654
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Lists;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Reservoir-samples a fixed number of elements from an Iterator. The sampled results can appear
+ * in any order.
+ */
+public final class FixedSizeSamplingIterator<T> extends ForwardingIterator<T> {
+
+  private final Iterator<T> delegate;
+
+  /**
+   * @param size   maximum number of elements to retain
+   * @param source iterator to sample from; fully consumed by this constructor
+   */
+  public FixedSizeSamplingIterator(int size, Iterator<T> source) {
+    List<T> reservoir = Lists.newArrayListWithCapacity(size);
+    int seen = 0;
+    Random random = RandomUtils.getRandom();
+    while (source.hasNext()) {
+      T element = source.next();
+      seen++;
+      if (reservoir.size() < size) {
+        // Fill phase: keep everything until the reservoir is full.
+        reservoir.add(element);
+      } else {
+        // Replacement phase: element #seen survives with probability size/seen.
+        int slot = random.nextInt(seen);
+        if (slot < reservoir.size()) {
+          reservoir.set(slot, element);
+        }
+      }
+    }
+    delegate = reservoir.iterator();
+  }
+
+  @Override
+  protected Iterator<T> delegate() {
+    return delegate;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java
new file mode 100644
index 0000000..425b44b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Iterator;
+
+/**
+ * Wraps an {@link Iterable} so that each {@link Iterable#iterator()} yields only a random subset
+ * of the underlying elements, as governed by a sampling rate parameter.
+ */
+public final class SamplingIterable<T> implements Iterable<T> {
+
+  private final Iterable<? extends T> wrapped;
+  private final double rate;
+
+  public SamplingIterable(Iterable<? extends T> delegate, double samplingRate) {
+    this.wrapped = delegate;
+    this.rate = samplingRate;
+  }
+
+  @Override
+  public Iterator<T> iterator() {
+    return new SamplingIterator<>(wrapped.iterator(), rate);
+  }
+
+  /** Returns {@code delegate} unchanged when {@code samplingRate >= 1.0}; otherwise a sampling wrapper. */
+  public static <T> Iterable<T> maybeWrapIterable(Iterable<T> delegate, double samplingRate) {
+    if (samplingRate >= 1.0) {
+      return delegate;
+    }
+    return new SamplingIterable<>(delegate, samplingRate);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java
new file mode 100644
index 0000000..2ba46fd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.AbstractIterator;
+import org.apache.commons.math3.distribution.PascalDistribution;
+import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
+/**
+ * Wraps an {@link Iterator} and yields only a random subset of its elements, as governed by a
+ * sampling rate parameter. Gaps between retained elements are drawn from a geometric distribution.
+ */
+public final class SamplingIterator<T> extends AbstractIterator<T> {
+
+  private final PascalDistribution geometricDistribution;
+  private final Iterator<? extends T> delegate;
+
+  public SamplingIterator(Iterator<? extends T> delegate, double samplingRate) {
+    this(RandomUtils.getRandom(), delegate, samplingRate);
+  }
+
+  /**
+   * @param random       source of randomness for the gap distribution
+   * @param delegate     underlying iterator to sample from
+   * @param samplingRate must satisfy {@code 0.0 < samplingRate <= 1.0}
+   */
+  public SamplingIterator(RandomWrapper random, Iterator<? extends T> delegate, double samplingRate) {
+    Preconditions.checkNotNull(delegate);
+    Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0,
+        "Must be: 0.0 < samplingRate <= 1.0. But samplingRate = " + samplingRate);
+    // A geometric distribution is the r=1 special case of the negative binomial (Pascal) distribution.
+    geometricDistribution = new PascalDistribution(random.getRandomGenerator(), 1, samplingRate);
+    this.delegate = delegate;
+  }
+
+  @Override
+  protected T computeNext() {
+    // Number of elements to discard before the next retained one.
+    int gap = geometricDistribution.sample();
+    if (delegate instanceof SkippingIterator<?>) {
+      // Let the underlying iterator skip cheaply when it supports that.
+      SkippingIterator<? extends T> skipper = (SkippingIterator<? extends T>) delegate;
+      skipper.skip(gap);
+      return skipper.hasNext() ? skipper.next() : endOfData();
+    }
+    for (int i = 0; i < gap && delegate.hasNext(); i++) {
+      delegate.next();
+    }
+    return delegate.hasNext() ? delegate.next() : endOfData();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java
new file mode 100644
index 0000000..c4ddf7b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Reservoir-samples a fixed number of elements from an Iterator while preserving the elements'
+ * original encounter order, at some extra cost in time and memory relative to a FixedSizeSampler.
+ */
+public class StableFixedSizeSamplingIterator<T> extends ForwardingIterator<T> {
+
+  private final Iterator<T> delegate;
+
+  /**
+   * @param size   maximum number of elements to retain
+   * @param source iterator to sample from; fully consumed by this constructor
+   */
+  public StableFixedSizeSamplingIterator(int size, Iterator<T> source) {
+    // Each entry pairs the element with its 1-based encounter position for later re-ordering.
+    List<Pair<Integer,T>> reservoir = Lists.newArrayListWithCapacity(size);
+    int seen = 0;
+    Random random = RandomUtils.getRandom();
+    while (source.hasNext()) {
+      T element = source.next();
+      seen++;
+      if (reservoir.size() < size) {
+        reservoir.add(new Pair<>(seen, element));
+      } else {
+        // Element #seen survives with probability size/seen.
+        int slot = random.nextInt(seen);
+        if (slot < reservoir.size()) {
+          reservoir.set(slot, new Pair<>(seen, element));
+        }
+      }
+    }
+
+    // Restore encounter order by sorting on the recorded position before exposing the sample.
+    Collections.sort(reservoir);
+    delegate = Iterators.transform(reservoir.iterator(),
+      new Function<Pair<Integer,T>,T>() {
+        @Override
+        public T apply(Pair<Integer,T> from) {
+          return from.getSecond();
+        }
+      });
+  }
+
+  @Override
+  protected Iterator<T> delegate() {
+    return delegate;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java
new file mode 100644
index 0000000..73b841e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.common.Pair;
+
+/**
+ * Transforms an iterable of text lines into (token list, count) records by splitting each line on
+ * a regular expression; every record carries a count of 1.
+ */
+public class StringRecordIterator extends ForwardingIterator<Pair<List<String>,Long>> {
+
+  private static final Long ONE = 1L;
+
+  // Compiled once up front; reused for every line.
+  private final Pattern splitter;
+  private final Iterator<Pair<List<String>,Long>> delegate;
+
+  /**
+   * @param stringIterator lines to tokenize
+   * @param pattern        regular expression to split each line on
+   */
+  public StringRecordIterator(Iterable<String> stringIterator, String pattern) {
+    this.splitter = Pattern.compile(pattern);
+    delegate = Iterators.transform(
+        stringIterator.iterator(),
+        new Function<String,Pair<List<String>,Long>>() {
+          @Override
+          public Pair<List<String>,Long> apply(String from) {
+            // Each line becomes one record with unit count.
+            return new Pair<>(Arrays.asList(splitter.split(from)), ONE);
+          }
+        });
+  }
+
+  @Override
+  protected Iterator<Pair<List<String>,Long>> delegate() {
+    return delegate;
+  }
+
+}


[13/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java
new file mode 100644
index 0000000..56b1a04
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataUtils;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.TreeSet;
+
+/**
+ * <p>Optimized implementation of IgSplit.
+ * This class can be used when the criterion variable is the categorical attribute.</p>
+ *
+ * <p>This code was changed in MAHOUT-1419 to deal in sampled splits among numeric
+ * features to fix a performance problem. To generate some synthetic data that exercises
+ * the issue, try for example generating 4 features of Normal(0,1) values with a random
+ * boolean 0/1 categorical feature. In Scala:</p>
+ *
+ * {@code
+ *  val r = new scala.util.Random()
+ *  val pw = new java.io.PrintWriter("random.csv")
+ *  (1 to 10000000).foreach(e =>
+ *    pw.println(r.nextDouble() + "," +
+ *               r.nextDouble() + "," +
+ *               r.nextDouble() + "," +
+ *               r.nextDouble() + "," +
+ *               (if (r.nextBoolean()) 1 else 0))
+ *   )
+ *   pw.close()
+ * }
+ */
+@Deprecated
+public class OptIgSplit extends IgSplit {
+
+  // Cap on the number of candidate split points evaluated for a numeric attribute (MAHOUT-1419).
+  private static final int MAX_NUMERIC_SPLITS = 16;
+
+  /**
+   * Dispatches to the numerical or categorical split computation based on the attribute's type.
+   */
+  @Override
+  public Split computeSplit(Data data, int attr) {
+    if (data.getDataset().isNumerical(attr)) {
+      return numericalSplit(data, attr);
+    } else {
+      return categoricalSplit(data, attr);
+    }
+  }
+
+  /**
+   * Computes the split for a CATEGORICAL attribute.
+   *
+   * <p>The information gain is computed over the partition induced by the attribute's distinct
+   * values: ig = H(Y) - H(Y|X). No single best split point is chosen, so the returned
+   * {@link Split} carries only the gain.</p>
+   */
+  private static Split categoricalSplit(Data data, int attr) {
+    double[] values = data.values(attr).clone();
+
+    double[] splitPoints = chooseCategoricalSplitPoints(values);
+
+    int numLabels = data.getDataset().nblabels();
+    // counts[i][label] = number of instances in partition i with that label.
+    int[][] counts = new int[splitPoints.length][numLabels];
+    int[] countAll = new int[numLabels];
+
+    computeFrequencies(data, attr, splitPoints, counts, countAll);
+
+    int size = data.size();
+    double hy = entropy(countAll, size); // H(Y)
+    double hyx = 0.0; // H(Y|X)
+    double invDataSize = 1.0 / size;
+
+    // H(Y|X) = sum over partitions of (partition weight) * (partition entropy).
+    for (int index = 0; index < splitPoints.length; index++) {
+      size = DataUtils.sum(counts[index]);
+      hyx += size * invDataSize * entropy(counts[index], size);
+    }
+
+    double ig = hy - hyx;
+    return new Split(attr, ig);
+  }
+
+  /**
+   * Tallies per-label counts for each bucket delimited by {@code splitPoints}, plus overall
+   * label counts.
+   *
+   * @param splitPoints ascending candidate split values; bucket i holds values <= splitPoints[i]
+   * @param counts      out-parameter: counts[i][label] incremented per bucketed instance
+   * @param countAll    out-parameter: countAll[label] incremented for every instance
+   */
+  static void computeFrequencies(Data data,
+                                 int attr,
+                                 double[] splitPoints,
+                                 int[][] counts,
+                                 int[] countAll) {
+    Dataset dataset = data.getDataset();
+
+    for (int index = 0; index < data.size(); index++) {
+      Instance instance = data.get(index);
+      int label = (int) dataset.getLabel(instance);
+      double value = instance.get(attr);
+      // Linear scan to find the first split point not exceeded by this value.
+      int split = 0;
+      while (split < splitPoints.length && value > splitPoints[split]) {
+        split++;
+      }
+      if (split < splitPoints.length) {
+        counts[split][label]++;
+      } // Otherwise it's in the last split, which we don't need to count
+      countAll[label]++;
+    }
+  }
+
+  /**
+   * Computes the best split for a NUMERICAL attribute by maximizing information gain over the
+   * sampled candidate split points.
+   *
+   * @throws IllegalStateException if no candidate improves on a gain of -1.0 (no best split found)
+   */
+  static Split numericalSplit(Data data, int attr) {
+    double[] values = data.values(attr).clone();
+    Arrays.sort(values);
+
+    double[] splitPoints = chooseNumericSplitPoints(values);
+
+    int numLabels = data.getDataset().nblabels();
+    int[][] counts = new int[splitPoints.length][numLabels];
+    int[] countAll = new int[numLabels];
+    // Running totals for instances below the current candidate split.
+    int[] countLess = new int[numLabels];
+
+    computeFrequencies(data, attr, splitPoints, counts, countAll);
+
+    int size = data.size();
+    double hy = entropy(countAll, size);
+    double invDataSize = 1.0 / size;
+
+    int best = -1;
+    double bestIg = -1.0;
+
+    // try each possible split value
+    for (int index = 0; index < splitPoints.length; index++) {
+      double ig = hy;
+
+      // Shift this bucket's counts from the ">=" side to the "<" side incrementally.
+      DataUtils.add(countLess, counts[index]);
+      DataUtils.dec(countAll, counts[index]);
+
+      // instance with attribute value < values[index]
+      size = DataUtils.sum(countLess);
+      ig -= size * invDataSize * entropy(countLess, size);
+      // instance with attribute value >= values[index]
+      size = DataUtils.sum(countAll);
+      ig -= size * invDataSize * entropy(countAll, size);
+
+      if (ig > bestIg) {
+        bestIg = ig;
+        best = index;
+      }
+    }
+
+    if (best == -1) {
+      throw new IllegalStateException("no best split found !");
+    }
+    return new Split(attr, bestIg, splitPoints[best]);
+  }
+
+  /**
+   * @return an array of values to split the numeric feature's values on when
+   *  building candidate splits. When input size is <= MAX_NUMERIC_SPLITS + 1, it will
+   *  return the averages between successive values as split points. When larger, it will
+   *  return MAX_NUMERIC_SPLITS approximate percentiles through the data.
+   */
+  private static double[] chooseNumericSplitPoints(double[] values) {
+    if (values.length <= 1) {
+      return values;
+    }
+    if (values.length <= MAX_NUMERIC_SPLITS + 1) {
+      // Small input: exact midpoints between consecutive sorted values.
+      double[] splitPoints = new double[values.length - 1];
+      for (int i = 1; i < values.length; i++) {
+        splitPoints[i-1] = (values[i] + values[i-1]) / 2.0;
+      }
+      return splitPoints;
+    }
+    // Large input: sample evenly-spaced percentiles instead of every midpoint (MAHOUT-1419).
+    Percentile distribution = new Percentile();
+    distribution.setData(values);
+    double[] percentiles = new double[MAX_NUMERIC_SPLITS];
+    for (int i = 0 ; i < percentiles.length; i++) {
+      double p = 100.0 * ((i + 1.0) / (MAX_NUMERIC_SPLITS + 1.0));
+      percentiles[i] = distribution.evaluate(p);
+    }
+    return percentiles;
+  }
+
+  /**
+   * @return the distinct categorical values in ascending order, used as partition boundaries.
+   */
+  private static double[] chooseCategoricalSplitPoints(double[] values) {
+    // There is no great reason to believe that categorical value order matters,
+    // but the original code worked this way, and it's not terrible in the absence
+    // of more sophisticated analysis
+    Collection<Double> uniqueOrderedCategories = new TreeSet<>();
+    for (double v : values) {
+      uniqueOrderedCategories.add(v);
+    }
+    double[] uniqueValues = new double[uniqueOrderedCategories.size()];
+    Iterator<Double> it = uniqueOrderedCategories.iterator();
+    for (int i = 0; i < uniqueValues.length; i++) {
+      uniqueValues[i] = it.next();
+    }
+    return uniqueValues;
+  }
+
+  /**
+   * Computes the Entropy, in base-2 via the natural-log sum divided by LOG2.
+   *
+   * @param counts   counts[i] = numInstances with label i
+   * @param dataSize numInstances
+   */
+  private static double entropy(int[] counts, int dataSize) {
+    if (dataSize == 0) {
+      return 0.0;
+    }
+
+    double entropy = 0.0;
+
+    for (int count : counts) {
+      if (count > 0) {
+        double p = count / (double) dataSize;
+        entropy -= p * Math.log(p);
+      }
+    }
+
+    return entropy / LOG2;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java
new file mode 100644
index 0000000..38695a3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * Regression problem implementation of IgSplit. This class can be used when the criterion variable is the numerical
+ * attribute.
+ */
+@Deprecated
+public class RegressionSplit extends IgSplit {
+  
+  /**
+   * Comparator for Instance sort: orders instances ascending by the value of a
+   * single attribute, so candidate numerical split points can be scanned in order.
+   */
+  private static class InstanceComparator implements Comparator<Instance>, Serializable {
+    // index of the attribute this comparator inspects
+    private final int attr;
+
+    InstanceComparator(int attr) {
+      this.attr = attr;
+    }
+    
+    @Override
+    public int compare(Instance arg0, Instance arg1) {
+      return Double.compare(arg0.get(attr), arg1.get(attr));
+    }
+  }
+  
+  /**
+   * Computes the split for the given attribute, dispatching on whether the
+   * attribute is numerical or categorical.
+   */
+  @Override
+  public Split computeSplit(Data data, int attr) {
+    if (data.getDataset().isNumerical(attr)) {
+      return numericalSplit(data, attr);
+    } else {
+      return categoricalSplit(data, attr);
+    }
+  }
+  
+  /**
+   * Computes the split for a CATEGORICAL attribute.
+   *
+   * <p>For each category value of {@code attr} a running mean and a running sum of
+   * squared label deviations are maintained (Welford-style incremental update),
+   * together with the same statistics over all instances. The reported gain is the
+   * total sum of squared deviations minus the per-category sums.</p>
+   */
+  private static Split categoricalSplit(Data data, int attr) {
+    // ra[v] / sk[v]: running mean and sum of squared label deviations for category v
+    FullRunningAverage[] ra = new FullRunningAverage[data.getDataset().nbValues(attr)];
+    double[] sk = new double[data.getDataset().nbValues(attr)];
+    for (int i = 0; i < ra.length; i++) {
+      ra[i] = new FullRunningAverage();
+    }
+    // running mean / sum of squared deviations over the whole data
+    FullRunningAverage totalRa = new FullRunningAverage();
+    double totalSk = 0.0;
+
+    for (int i = 0; i < data.size(); i++) {
+      // computes the variance
+      Instance instance = data.get(i);
+      int value = (int) instance.get(attr);
+      double xk = data.getDataset().getLabel(instance);
+      if (ra[value].getCount() == 0) {
+        ra[value].addDatum(xk);
+        sk[value] = 0.0;
+      } else {
+        double mk = ra[value].getAverage();
+        ra[value].addDatum(xk);
+        // Welford update: s += (x - oldMean) * (x - newMean)
+        sk[value] += (xk - mk) * (xk - ra[value].getAverage());
+      }
+
+      // total variance
+      if (i == 0) {
+        totalRa.addDatum(xk);
+        totalSk = 0.0;
+      } else {
+        double mk = totalRa.getAverage();
+        totalRa.addDatum(xk);
+        totalSk += (xk - mk) * (xk - totalRa.getAverage());
+      }
+    }
+
+    // computes the variance gain
+    double ig = totalSk;
+    for (double aSk : sk) {
+      ig -= aSk;
+    }
+
+    return new Split(attr, ig);
+  }
+  
+  /**
+   * Computes the best split for a NUMERICAL attribute.
+   *
+   * <p>Instances are sorted by the attribute; all labels are first accumulated into
+   * the "right" branch (ra[1]/sk[1]) and then moved one by one into the "left"
+   * branch (ra[0]/sk[0]). Whenever the attribute value changes, the sum of the two
+   * branches' mean squared deviations is evaluated and the midpoint with the lowest
+   * value is kept. The reported gain is total minus best combined deviation.</p>
+   */
+  private static Split numericalSplit(Data data, int attr) {
+    // ra[0]: left branch statistics, ra[1]: right branch statistics
+    FullRunningAverage[] ra = new FullRunningAverage[2];
+    for (int i = 0; i < ra.length; i++) {
+      ra[i] = new FullRunningAverage();
+    }
+
+    // Instance sort
+    Instance[] instances = new Instance[data.size()];
+    for (int i = 0; i < data.size(); i++) {
+      instances[i] = data.get(i);
+    }
+    Arrays.sort(instances, new InstanceComparator(attr));
+
+    // seed the right branch with every instance's label (Welford update as above)
+    double[] sk = new double[2];
+    for (Instance instance : instances) {
+      double xk = data.getDataset().getLabel(instance);
+      if (ra[1].getCount() == 0) {
+        ra[1].addDatum(xk);
+        sk[1] = 0.0;
+      } else {
+        double mk = ra[1].getAverage();
+        ra[1].addDatum(xk);
+        sk[1] += (xk - mk) * (xk - ra[1].getAverage());
+      }
+    }
+    double totalSk = sk[1];
+
+    // find the best split point
+    double split = Double.NaN;
+    double preSplit = Double.NaN;
+    double bestVal = Double.MAX_VALUE;
+    double bestSk = 0.0;
+
+    // computes total variance
+    for (Instance instance : instances) {
+      double xk = data.getDataset().getLabel(instance);
+
+      // evaluate a candidate only when the attribute value increases; on the first
+      // iteration preSplit is NaN, so this comparison is false and nothing is tested
+      if (instance.get(attr) > preSplit) {
+        double curVal = sk[0] / ra[0].getCount() + sk[1] / ra[1].getCount();
+        if (curVal < bestVal) {
+          bestVal = curVal;
+          bestSk = sk[0] + sk[1];
+          // split threshold is the midpoint between the two distinct values
+          split = (instance.get(attr) + preSplit) / 2.0;
+        }
+      }
+
+      // computes the variance
+      if (ra[0].getCount() == 0) {
+        ra[0].addDatum(xk);
+        sk[0] = 0.0;
+      } else {
+        double mk = ra[0].getAverage();
+        ra[0].addDatum(xk);
+        sk[0] += (xk - mk) * (xk - ra[0].getAverage());
+      }
+
+      // remove this instance's label from the right branch (inverse Welford step)
+      double mk = ra[1].getAverage();
+      ra[1].removeDatum(xk);
+      sk[1] -= (xk - mk) * (xk - ra[1].getAverage());
+
+      preSplit = instance.get(attr);
+    }
+
+    // computes the variance gain
+    double ig = totalSk - bestSk;
+
+    return new Split(attr, ig, split);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java
new file mode 100644
index 0000000..2a6a322
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import java.util.Locale;
+
+/**
+ * Immutable description of a single candidate split: the attribute index, its
+ * information gain, and (for numerical attributes) the split threshold.
+ */
+@Deprecated
+public final class Split {
+  
+  private final int attr;
+  private final double ig;
+  private final double split;
+  
+  /**
+   * Builds a split with an explicit numerical threshold.
+   *
+   * @param attr  index of the attribute being split on
+   * @param ig    information gain of the split
+   * @param split threshold value for a NUMERICAL attribute
+   */
+  public Split(int attr, double ig, double split) {
+    this.attr = attr;
+    this.ig = ig;
+    this.split = split;
+  }
+  
+  /**
+   * Builds a split with no numerical threshold; the threshold is recorded as
+   * {@code Double.NaN}.
+   */
+  public Split(int attr, double ig) {
+    this(attr, ig, Double.NaN);
+  }
+
+  /** @return split value for NUMERICAL attributes ({@code NaN} when absent) */
+  public double getSplit() {
+    return split;
+  }
+
+  /** @return Information Gain of the split */
+  public double getIg() {
+    return ig;
+  }
+
+  /** @return attribute to split for */
+  public int getAttr() {
+    return attr;
+  }
+
+  @Override
+  public String toString() {
+    return String.format(Locale.ENGLISH, "attr: %d, ig: %f, split: %f", attr, ig, split);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
new file mode 100644
index 0000000..f29faed
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.data.DataLoader;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.DescriptorException;
+import org.apache.mahout.classifier.df.data.DescriptorUtils;
+import org.apache.mahout.common.CommandLineUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Generates a file descriptor for a given dataset
+ */
+public final class Describe implements Tool {
+
+  private static final Logger log = LoggerFactory.getLogger(Describe.class);
+
+  private Describe() {}
+
+  // NOTE(review): the JVM requires an entry point that returns void; with this
+  // "static int main" signature the class cannot be launched directly with
+  // "java Describe". Presumably it is invoked through another driver class —
+  // confirm before relying on direct launch.
+  public static int main(String[] args) throws Exception {
+    return ToolRunner.run(new Describe(), args);
+  }
+
+  /**
+   * Parses the command line and generates the descriptor file.
+   *
+   * @return -1 when help was requested; 0 otherwise, including when the command
+   *         line could not be parsed (help is printed in that case)
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    // -p: input data path (exactly one value)
+    Option pathOpt = obuilder.withLongName("path").withShortName("p").withRequired(true).withArgument(
+        abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+
+    // -d: one or more attribute-descriptor tokens
+    Option descriptorOpt = obuilder.withLongName("descriptor").withShortName("d").withRequired(true)
+        .withArgument(abuilder.withName("descriptor").withMinimum(1).create()).withDescription(
+            "data descriptor").create();
+
+    // -f: output path for the generated descriptor file
+    Option descPathOpt = obuilder.withLongName("file").withShortName("f").withRequired(true).withArgument(
+        abuilder.withName("file").withMinimum(1).withMaximum(1).create()).withDescription(
+        "Path to generated descriptor file").create();
+
+    // -r: flag, treat the label as a regression target
+    Option regOpt = obuilder.withLongName("regression").withDescription("Regression Problem").withShortName("r")
+        .create();
+
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        .create();
+
+    Group group = gbuilder.withName("Options").withOption(pathOpt).withOption(descPathOpt).withOption(
+        descriptorOpt).withOption(regOpt).withOption(helpOpt).create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return -1;
+      }
+
+      String dataPath = cmdLine.getValue(pathOpt).toString();
+      String descPath = cmdLine.getValue(descPathOpt).toString();
+      List<String> descriptor = convert(cmdLine.getValues(descriptorOpt));
+      boolean regression = cmdLine.hasOption(regOpt);
+
+      log.debug("Data path : {}", dataPath);
+      log.debug("Descriptor path : {}", descPath);
+      log.debug("Descriptor : {}", descriptor);
+      log.debug("Regression : {}", regression);
+
+      runTool(dataPath, descriptor, descPath, regression);
+    } catch (OptionException e) {
+      // parse failure: print help but still return 0 (historical behavior)
+      log.warn(e.toString());
+      CommandLineUtil.printHelp(group);
+    }
+    return 0;
+  }
+
+  /**
+   * Generates the dataset descriptor and stores its JSON form at {@code filePath}.
+   *
+   * @throws IllegalStateException if the descriptor file already exists
+   */
+  private void runTool(String dataPath, Iterable<String> description, String filePath, boolean regression)
+    throws DescriptorException, IOException {
+    log.info("Generating the descriptor...");
+    String descriptor = DescriptorUtils.generateDescriptor(description);
+
+    Path fPath = validateOutput(filePath);
+
+    log.info("generating the dataset...");
+    Dataset dataset = generateDataset(descriptor, dataPath, regression);
+
+    log.info("storing the dataset description");
+    String json = dataset.toJSON();
+    DFUtils.storeString(conf, fPath, json);
+  }
+
+  // Loads the data at dataPath and builds the Dataset for the given descriptor.
+  private Dataset generateDataset(String descriptor, String dataPath, boolean regression) throws IOException,
+      DescriptorException {
+    Path path = new Path(dataPath);
+    FileSystem fs = path.getFileSystem(conf);
+
+    return DataLoader.generateDataset(descriptor, regression, fs, path);
+  }
+
+  // Ensures the output descriptor file does not already exist before writing.
+  private Path validateOutput(String filePath) throws IOException {
+    Path path = new Path(filePath);
+    FileSystem fs = path.getFileSystem(conf);
+    if (fs.exists(path)) {
+      throw new IllegalStateException("Descriptor's file already exists");
+    }
+
+    return path;
+  }
+
+  // Converts the raw CLI option values to their String representations.
+  private static List<String> convert(Collection<?> values) {
+    List<String> list = new ArrayList<>(values.size());
+    for (Object value : values) {
+      list.add(value.toString());
+    }
+    return list;
+  }
+
+  // Hadoop configuration injected via setConf(). NOTE(review): declared after
+  // the methods that read it — legal Java, but unconventional placement.
+  private Configuration conf;
+
+  @Override
+  public void setConf(Configuration entries) {
+    this.conf = entries;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java
new file mode 100644
index 0000000..b421c4e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.CommandLineUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This tool is to visualize the Decision Forest
+ */
+@Deprecated
+public final class ForestVisualizer {
+
+  private static final Logger log = LoggerFactory.getLogger(ForestVisualizer.class);
+
+  private ForestVisualizer() {
+  }
+
+  /**
+   * Renders every tree of the forest as text, one "Tree[n]:" section per tree.
+   *
+   * <p>The forest's {@code getTrees()} method is reached via reflection with
+   * {@code setAccessible(true)} — presumably because it is not publicly
+   * accessible on {@link DecisionForest}.</p>
+   */
+  @SuppressWarnings("unchecked") // getTrees() is invoked reflectively; cast to List<Node> is unchecked
+  public static String toString(DecisionForest forest, Dataset dataset, String[] attrNames) {
+
+    List<Node> trees;
+    try {
+      Method getTrees = forest.getClass().getDeclaredMethod("getTrees");
+      getTrees.setAccessible(true);
+      trees = (List<Node>) getTrees.invoke(forest);
+    } catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
+      // any reflection failure means the forest cannot be visualized at all
+      throw new IllegalStateException(e);
+    }
+
+    int cnt = 1;
+    StringBuilder buff = new StringBuilder();
+    for (Node tree : trees) {
+      buff.append("Tree[").append(cnt).append("]:");
+      buff.append(TreeVisualizer.toString(tree, dataset, attrNames));
+      buff.append('\n');
+      cnt++;
+    }
+    return buff.toString();
+  }
+
+  /**
+   * Decision Forest to String
+   * @param forestPath
+   *          path to the Decision Forest
+   * @param datasetPath
+   *          dataset path
+   * @param attrNames
+   *          attribute names
+   */
+  public static String toString(String forestPath, String datasetPath, String[] attrNames) throws IOException {
+    Configuration conf = new Configuration();
+    DecisionForest forest = DecisionForest.load(conf, new Path(forestPath));
+    Dataset dataset = Dataset.load(conf, new Path(datasetPath));
+    return toString(forest, dataset, attrNames);
+  }
+
+  /**
+   * Print Decision Forest
+   * @param forestPath
+   *          path to the Decision Forest
+   * @param datasetPath
+   *          dataset path
+   * @param attrNames
+   *          attribute names
+   */
+  public static void print(String forestPath, String datasetPath, String[] attrNames) throws IOException {
+    System.out.println(toString(forestPath, datasetPath, attrNames));
+  }
+  
+  public static void main(String[] args) {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
+      .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
+      .withDescription("Dataset path").create();
+
+    Option modelOpt = obuilder.withLongName("model").withShortName("m").withRequired(true)
+      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
+      .withDescription("Path to the Decision Forest").create();
+
+    Option attrNamesOpt = obuilder.withLongName("names").withShortName("n").withRequired(false)
+      .withArgument(abuilder.withName("names").withMinimum(1).create())
+      .withDescription("Optional, Attribute names").create();
+
+    Option helpOpt = obuilder.withLongName("help").withShortName("h")
+      .withDescription("Print out help").create();
+  
+    Group group = gbuilder.withName("Options").withOption(datasetOpt).withOption(modelOpt)
+      .withOption(attrNamesOpt).withOption(helpOpt).create();
+  
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      
+      // use the Option object (not the raw "help" trigger string) for
+      // consistency with the sibling tools (Describe, Frequencies)
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+  
+      String datasetName = cmdLine.getValue(datasetOpt).toString();
+      String modelName = cmdLine.getValue(modelOpt).toString();
+      String[] attrNames = null;
+      if (cmdLine.hasOption(attrNamesOpt)) {
+        @SuppressWarnings("unchecked") // commons-cli2 returns a raw List
+        Collection<String> names = (Collection<String>) cmdLine.getValues(attrNamesOpt);
+        if (!names.isEmpty()) {
+          attrNames = new String[names.size()];
+          names.toArray(attrNames);
+        }
+      }
+      
+      print(modelName, datasetName, attrNames);
+    } catch (Exception e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java
new file mode 100644
index 0000000..c37af4e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.CommandLineUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Compute the frequency distribution of the "class label"<br>
+ * This class can be used when the criterion variable is the categorical attribute.
+ */
+@Deprecated
+public final class Frequencies extends Configured implements Tool {
+  
+  private static final Logger log = LoggerFactory.getLogger(Frequencies.class);
+  
+  private Frequencies() { }
+  
+  /**
+   * Parses the command line and runs the frequency computation.
+   *
+   * @return 0 always, including when the command line could not be parsed
+   *         (help is printed in that case)
+   */
+  @Override
+  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
+    
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+    
+    // -d: input data path (exactly one value)
+    Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument(
+      abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+    
+    // -ds: dataset descriptor path
+    Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument(
+      abuilder.withName("path").withMinimum(1).create()).withDescription("dataset path").create();
+    
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        .create();
+    
+    Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(helpOpt)
+        .create();
+    
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return 0;
+      }
+      
+      String dataPath = cmdLine.getValue(dataOpt).toString();
+      String datasetPath = cmdLine.getValue(datasetOpt).toString();
+      
+      log.debug("Data path : {}", dataPath);
+      log.debug("Dataset path : {}", datasetPath);
+      
+      runTool(dataPath, datasetPath);
+    } catch (OptionException e) {
+      log.warn(e.toString(), e);
+      CommandLineUtil.printHelp(group);
+    }
+    
+    return 0;
+  }
+  
+  /**
+   * Runs a {@link FrequenciesJob} over the data (writing its output under the
+   * filesystem working directory) and logs the per-partition label counts.
+   *
+   * @param data    path of the input data
+   * @param dataset path of the dataset descriptor
+   */
+  private void runTool(String data, String dataset) throws IOException,
+                                                   ClassNotFoundException,
+                                                   InterruptedException {
+    
+    FileSystem fs = FileSystem.get(getConf());
+    Path workingDir = fs.getWorkingDirectory();
+    
+    Path dataPath = new Path(data);
+    Path datasetPath = new Path(dataset);
+    
+    log.info("Computing the frequencies...");
+    FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);
+    
+    int[][] counts = job.run(getConf());
+    
+    // outputting the frequencies
+    log.info("counts[partition][class]");
+    for (int[] count : counts) {
+      log.info(Arrays.toString(count));
+    }
+  }
+  
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new Frequencies(), args);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java
new file mode 100644
index 0000000..9d7e2ff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java
@@ -0,0 +1,297 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+
+/**
+ * Temporary class used to compute the frequency distribution of the "class attribute".<br>
+ * This class can be used when the criterion variable is the categorical attribute.
+ */
+@Deprecated
+public class FrequenciesJob {
+  
+  private static final Logger log = LoggerFactory.getLogger(FrequenciesJob.class);
+  
+  /** directory that will hold this job's output */
+  private final Path outputPath;
+  
+  /** file that contains the serialized dataset */
+  private final Path datasetPath;
+  
+  /** directory that contains the data used in the first step */
+  private final Path dataPath;
+  
+  /**
+   * @param base
+   *          base directory
+   * @param dataPath
+   *          data used in the first step
+   * @param datasetPath
+   *          file that contains the serialized dataset
+   */
+  public FrequenciesJob(Path base, Path dataPath, Path datasetPath) {
+    this.outputPath = new Path(base, "frequencies.output");
+    this.dataPath = dataPath;
+    this.datasetPath = datasetPath;
+  }
+  
+  /**
+   * Configures and runs the MapReduce job, then parses and returns its output.
+   *
+   * @param conf
+   *          job configuration
+   * @return counts[partition][label] = num tuples from 'partition' with class == label
+   * @throws IOException
+   *           if the output path already exists, or if the job output cannot be read
+   * @throws IllegalStateException
+   *           if the job does not complete successfully
+   */
+  public int[][] run(Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
+    
+    // check the output
+    FileSystem fs = outputPath.getFileSystem(conf);
+    if (fs.exists(outputPath)) {
+      throw new IOException("Output path already exists : " + outputPath);
+    }
+    
+    // put the dataset into the DistributedCache
+    URI[] files = {datasetPath.toUri()};
+    DistributedCache.setCacheFiles(files, conf);
+    
+    Job job = new Job(conf);
+    job.setJarByClass(FrequenciesJob.class);
+    
+    FileInputFormat.setInputPaths(job, dataPath);
+    FileOutputFormat.setOutputPath(job, outputPath);
+    
+    job.setMapOutputKeyClass(LongWritable.class);
+    job.setMapOutputValueClass(IntWritable.class);
+    job.setOutputKeyClass(LongWritable.class);
+    job.setOutputValueClass(Frequencies.class);
+    
+    job.setMapperClass(FrequenciesMapper.class);
+    job.setReducerClass(FrequenciesReducer.class);
+    
+    job.setInputFormatClass(TextInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    
+    // run the job
+    boolean succeeded = job.waitForCompletion(true);
+    if (!succeeded) {
+      throw new IllegalStateException("Job failed!");
+    }
+    
+    int[][] counts = parseOutput(job);
+
+    // the raw job output is no longer needed once the counts have been extracted
+    HadoopUtil.delete(conf, outputPath);
+    
+    return counts;
+  }
+  
+  /**
+   * Extracts the output and processes it
+   * 
+   * @return counts[partition][label] = num tuples from 'partition' with class == label
+   * @throws IllegalStateException
+   *           if fewer Frequencies records than mappers were found in the output
+   */
+  int[][] parseOutput(JobContext job) throws IOException {
+    Configuration conf = job.getConfiguration();
+    
+    // NOTE(review): "mapred.map.tasks" is the pre-Hadoop-2.x property name; confirm it is
+    // still populated on the targeted Hadoop version, otherwise numMaps stays -1 here.
+    int numMaps = conf.getInt("mapred.map.tasks", -1);
+    log.info("mapred.map.tasks = {}", numMaps);
+    
+    FileSystem fs = outputPath.getFileSystem(conf);
+    
+    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
+    
+    Frequencies[] values = new Frequencies[numMaps];
+    
+    // read all the outputs
+    int index = 0;
+    for (Path path : outfiles) {
+      for (Frequencies value : new SequenceFileValueIterable<Frequencies>(path, conf)) {
+        values[index++] = value;
+      }
+    }
+    
+    if (index < numMaps) {
+      throw new IllegalStateException("number of output Frequencies (" + index
+          + ") is lesser than the number of mappers!");
+    }
+    
+    // sort the frequencies using the firstIds
+    Arrays.sort(values);
+    return Frequencies.extractCounts(values);
+  }
+  
+  /**
+   * Outputs the first key and the label of each tuple
+   * 
+   */
+  private static class FrequenciesMapper extends Mapper<LongWritable,Text,LongWritable,IntWritable> {
+    
+    // key of the first tuple seen by this mapper; reused as the output key for every
+    // tuple, so all tuples of one input split share the same "partition" id
+    private LongWritable firstId;
+    
+    private DataConverter converter;
+    private Dataset dataset;
+    
+    @Override
+    protected void setup(Context context) throws IOException, InterruptedException {
+      Configuration conf = context.getConfiguration();
+      
+      dataset = Builder.loadDataset(conf);
+      setup(dataset);
+    }
+    
+    /**
+     * Useful when testing
+     */
+    void setup(Dataset dataset) {
+      converter = new DataConverter(dataset);
+    }
+    
+    @Override
+    protected void map(LongWritable key, Text value, Context context) throws IOException,
+                                                                     InterruptedException {
+      // remember the first key: it identifies this mapper's partition
+      if (firstId == null) {
+        firstId = new LongWritable(key.get());
+      }
+      
+      Instance instance = converter.convert(value.toString());
+      
+      context.write(firstId, new IntWritable((int) dataset.getLabel(instance)));
+    }
+    
+  }
+  
+  /**
+   * Sums, for one partition (key == the partition's firstId), the number of tuples
+   * observed for each label, and emits a single {@link Frequencies} record.
+   */
+  private static class FrequenciesReducer extends Reducer<LongWritable,IntWritable,LongWritable,Frequencies> {
+    
+    private int nblabels;
+    
+    @Override
+    protected void setup(Context context) throws IOException, InterruptedException {
+      Configuration conf = context.getConfiguration();
+      Dataset dataset = Builder.loadDataset(conf);
+      setup(dataset.nblabels());
+    }
+    
+    /**
+     * Useful when testing
+     */
+    void setup(int nblabels) {
+      this.nblabels = nblabels;
+    }
+    
+    @Override
+    protected void reduce(LongWritable key, Iterable<IntWritable> values, Context context)
+      throws IOException, InterruptedException {
+      // histogram of labels for this partition; each value is a label index
+      int[] counts = new int[nblabels];
+      for (IntWritable value : values) {
+        counts[value.get()]++;
+      }
+      
+      context.write(key, new Frequencies(key.get(), counts));
+    }
+  }
+  
+  /**
+   * Output of the job
+   * 
+   */
+  private static class Frequencies implements Writable, Comparable<Frequencies>, Cloneable {
+    
+    /** first key of the partition used to sort the partitions */
+    private long firstId;
+    
+    /** counts[c] = num tuples from the partition with label == c */
+    private int[] counts;
+    
+    /** no-arg constructor required by the Writable deserialization mechanism */
+    Frequencies() { }
+    
+    Frequencies(long firstId, int[] counts) {
+      this.firstId = firstId;
+      // defensive copy: the caller may reuse its array
+      this.counts = Arrays.copyOf(counts, counts.length);
+    }
+    
+    @Override
+    public void readFields(DataInput in) throws IOException {
+      firstId = in.readLong();
+      counts = DFUtils.readIntArray(in);
+    }
+    
+    @Override
+    public void write(DataOutput out) throws IOException {
+      out.writeLong(firstId);
+      DFUtils.writeArray(out, counts);
+    }
+    
+    @Override
+    public boolean equals(Object other) {
+      // identity is based on firstId only; counts are deliberately ignored,
+      // consistent with compareTo()
+      return other instanceof Frequencies && firstId == ((Frequencies) other).firstId;
+    }
+    
+    @Override
+    public int hashCode() {
+      return (int) firstId;
+    }
+    
+    @Override
+    protected Frequencies clone() {
+      // the constructor copies the counts array, so the clone is independent
+      return new Frequencies(firstId, counts);
+    }
+    
+    @Override
+    public int compareTo(Frequencies obj) {
+      // order partitions by their first key; relied upon by Arrays.sort() in parseOutput()
+      if (firstId < obj.firstId) {
+        return -1;
+      } else if (firstId > obj.firstId) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+    
+    /**
+     * @return counts[p] = the per-label counts of partition p (the internal arrays are
+     *         shared with the Frequencies instances, not copied)
+     */
+    public static int[][] extractCounts(Frequencies[] partitions) {
+      int[][] counts = new int[partitions.length][];
+      for (int p = 0; p < partitions.length; p++) {
+        counts[p] = partitions[p].counts;
+      }
+      return counts;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
new file mode 100644
index 0000000..a2a3458
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
@@ -0,0 +1,264 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.lang.reflect.Field;
+import java.text.DecimalFormat;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.node.CategoricalNode;
+import org.apache.mahout.classifier.df.node.Leaf;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.classifier.df.node.NumericalNode;
+
+/**
+ * This tool is to visualize the Decision tree.
+ * <p>
+ * Node and Dataset internals are read through reflection (see {@link #getReflectMap()}),
+ * since the visualized fields are private to their declaring classes.
+ */
+@Deprecated
+public final class TreeVisualizer {
+  
+  private TreeVisualizer() {}
+  
+  /** Formats a double with at most two decimal places. */
+  private static String doubleToString(double value) {
+    DecimalFormat df = new DecimalFormat("0.##");
+    return df.format(value);
+  }
+  
+  /**
+   * Recursively renders a subtree as an indented text outline, one line per branch.
+   *
+   * @param fields
+   *          pre-resolved reflection handles, keyed by "ClassName.fieldName"
+   * @param layer
+   *          current depth, used to build the "|   " indentation prefix
+   */
+  private static String toStringNode(Node node, Dataset dataset,
+      String[] attrNames, Map<String,Field> fields, int layer) {
+    
+    StringBuilder buff = new StringBuilder();
+    
+    try {
+      if (node instanceof CategoricalNode) {
+        CategoricalNode cnode = (CategoricalNode) node;
+        int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
+        double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
+        Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
+        String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
+        // one output branch per categorical value that actually has a child node
+        for (int i = 0; i < attrValues[attr].length; i++) {
+          int index = ArrayUtils.indexOf(values, i);
+          if (index < 0) {
+            continue; // no child for this value
+          }
+          buff.append('\n');
+          for (int j = 0; j < layer; j++) {
+            buff.append("|   ");
+          }
+          buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ")
+              .append(attrValues[attr][i]);
+          buff.append(toStringNode(childs[index], dataset, attrNames, fields, layer + 1));
+        }
+      } else if (node instanceof NumericalNode) {
+        NumericalNode nnode = (NumericalNode) node;
+        int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
+        double split = (Double) fields.get("NumericalNode.split").get(nnode);
+        Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
+        Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
+        // left branch: attr < split
+        buff.append('\n');
+        for (int j = 0; j < layer; j++) {
+          buff.append("|   ");
+        }
+        buff.append(attrNames == null ? attr : attrNames[attr]).append(" < ")
+            .append(doubleToString(split));
+        buff.append(toStringNode(loChild, dataset, attrNames, fields, layer + 1));
+        // right branch: attr >= split
+        buff.append('\n');
+        for (int j = 0; j < layer; j++) {
+          buff.append("|   ");
+        }
+        buff.append(attrNames == null ? attr : attrNames[attr]).append(" >= ")
+            .append(doubleToString(split));
+        buff.append(toStringNode(hiChild, dataset, attrNames, fields, layer + 1));
+      } else if (node instanceof Leaf) {
+        // leaf: render the predicted label, numeric or categorical
+        Leaf leaf = (Leaf) node;
+        double label = (Double) fields.get("Leaf.label").get(leaf);
+        if (dataset.isNumerical(dataset.getLabelId())) {
+          buff.append(" : ").append(doubleToString(label));
+        } else {
+          buff.append(" : ").append(dataset.getLabelString(label));
+        }
+      }
+    } catch (IllegalAccessException iae) {
+      throw new IllegalStateException(iae);
+    }
+    
+    return buff.toString();
+  }
+  
+  /**
+   * Builds a map of accessible reflection handles for the private fields read by the
+   * visualizers. Keys follow the "ClassName.fieldName" convention used by the callers.
+   *
+   * @throws IllegalStateException
+   *           if one of the expected fields no longer exists on its class
+   */
+  private static Map<String,Field> getReflectMap() {
+    Map<String,Field> fields = new HashMap<>();
+    
+    try {
+      Field m = CategoricalNode.class.getDeclaredField("attr");
+      m.setAccessible(true);
+      fields.put("CategoricalNode.attr", m);
+      m = CategoricalNode.class.getDeclaredField("values");
+      m.setAccessible(true);
+      fields.put("CategoricalNode.values", m);
+      m = CategoricalNode.class.getDeclaredField("childs");
+      m.setAccessible(true);
+      fields.put("CategoricalNode.childs", m);
+      m = NumericalNode.class.getDeclaredField("attr");
+      m.setAccessible(true);
+      fields.put("NumericalNode.attr", m);
+      m = NumericalNode.class.getDeclaredField("split");
+      m.setAccessible(true);
+      fields.put("NumericalNode.split", m);
+      m = NumericalNode.class.getDeclaredField("loChild");
+      m.setAccessible(true);
+      fields.put("NumericalNode.loChild", m);
+      m = NumericalNode.class.getDeclaredField("hiChild");
+      m.setAccessible(true);
+      fields.put("NumericalNode.hiChild", m);
+      m = Leaf.class.getDeclaredField("label");
+      m.setAccessible(true);
+      fields.put("Leaf.label", m);
+      m = Dataset.class.getDeclaredField("values");
+      m.setAccessible(true);
+      fields.put("Dataset.values", m);
+    } catch (NoSuchFieldException nsfe) {
+      throw new IllegalStateException(nsfe);
+    }
+    
+    return fields;
+  }
+  
+  /**
+   * Decision tree to String
+   * 
+   * @param tree
+   *          Node of tree
+   * @param attrNames
+   *          attribute names
+   * @return indented text rendering of the whole tree
+   */
+  public static String toString(Node tree, Dataset dataset, String[] attrNames) {
+    return toStringNode(tree, dataset, attrNames, getReflectMap(), 0);
+  }
+  
+  /**
+   * Print Decision tree
+   * 
+   * @param tree
+   *          Node of tree
+   * @param attrNames
+   *          attribute names
+   */
+  public static void print(Node tree, Dataset dataset, String[] attrNames) {
+    System.out.println(toString(tree, dataset, attrNames));
+  }
+  
+  /**
+   * Renders the decision path followed by a single instance, as a chain of
+   * "test -> test -> ... -> label" steps.
+   */
+  private static String toStringPredict(Node node, Instance instance,
+      Dataset dataset, String[] attrNames, Map<String,Field> fields) {
+    StringBuilder buff = new StringBuilder();
+    
+    try {
+      if (node instanceof CategoricalNode) {
+        CategoricalNode cnode = (CategoricalNode) node;
+        int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
+        double[] values = (double[]) fields.get("CategoricalNode.values").get(
+            cnode);
+        Node[] childs = (Node[]) fields.get("CategoricalNode.childs")
+            .get(cnode);
+        String[][] attrValues = (String[][]) fields.get("Dataset.values").get(
+            dataset);
+        
+        // follow the child matching the instance's value; if there is none,
+        // the trace simply stops here
+        int index = ArrayUtils.indexOf(values, instance.get(attr));
+        if (index >= 0) {
+          buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ")
+              .append(attrValues[attr][(int) instance.get(attr)]);
+          buff.append(" -> ");
+          buff.append(toStringPredict(childs[index], instance, dataset,
+              attrNames, fields));
+        }
+      } else if (node instanceof NumericalNode) {
+        NumericalNode nnode = (NumericalNode) node;
+        int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
+        double split = (Double) fields.get("NumericalNode.split").get(nnode);
+        Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
+        Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
+        
+        if (instance.get(attr) < split) {
+          buff.append('(').append(attrNames == null ? attr : attrNames[attr])
+              .append(" = ").append(doubleToString(instance.get(attr)))
+              .append(") < ").append(doubleToString(split));
+          buff.append(" -> ");
+          buff.append(toStringPredict(loChild, instance, dataset, attrNames,
+              fields));
+        } else {
+          buff.append('(').append(attrNames == null ? attr : attrNames[attr])
+              .append(" = ").append(doubleToString(instance.get(attr)))
+              .append(") >= ").append(doubleToString(split));
+          buff.append(" -> ");
+          buff.append(toStringPredict(hiChild, instance, dataset, attrNames,
+              fields));
+        }
+      } else if (node instanceof Leaf) {
+        Leaf leaf = (Leaf) node;
+        double label = (Double) fields.get("Leaf.label").get(leaf);
+        if (dataset.isNumerical(dataset.getLabelId())) {
+          buff.append(doubleToString(label));
+        } else {
+          buff.append(dataset.getLabelString(label));
+        }
+      }
+    } catch (IllegalAccessException iae) {
+      throw new IllegalStateException(iae);
+    }
+    
+    return buff.toString();
+  }
+  
+  /**
+   * Predict trace to String
+   * 
+   * @param tree
+   *          Node of tree
+   * @param attrNames
+   *          attribute names
+   * @return one decision-path string per instance in {@code data}
+   */
+  public static String[] predictTrace(Node tree, Data data, String[] attrNames) {
+    Map<String,Field> reflectMap = getReflectMap();
+    String[] prediction = new String[data.size()];
+    for (int i = 0; i < data.size(); i++) {
+      prediction[i] = toStringPredict(tree, data.get(i), data.getDataset(),
+          attrNames, reflectMap);
+    }
+    return prediction;
+  }
+  
+  /**
+   * Print predict trace
+   * 
+   * @param tree
+   *          Node of tree
+   * @param attrNames
+   *          attribute names
+   */
+  public static void predictTracePrint(Node tree, Data data, String[] attrNames) {
+    Map<String,Field> reflectMap = getReflectMap();
+    for (int i = 0; i < data.size(); i++) {
+      System.out.println(toStringPredict(tree, data.get(i), data.getDataset(),
+          attrNames, reflectMap));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
new file mode 100644
index 0000000..e1b55ab
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
@@ -0,0 +1,212 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Locale;
+import java.util.Random;
+import java.util.Scanner;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This tool is used to uniformly distribute the class of all the tuples of the dataset over a given number of
+ * partitions.<br>
+ * This class can be used when the criterion variable is the categorical attribute.
+ */
+@Deprecated
+public final class UDistrib {
+  
+  private static final Logger log = LoggerFactory.getLogger(UDistrib.class);
+  
+  private UDistrib() {}
+  
+  /**
+   * Launch the uniform distribution tool. Requires the following command line arguments:<br>
+   * 
+   * <ul>
+   * <li>data : data path</li>
+   * <li>dataset : dataset path</li>
+   * <li>numpartitions : num partitions</li>
+   * <li>output : output path</li>
+   * </ul>
+   *
+   * @throws java.io.IOException
+   */
+  public static void main(String[] args) throws IOException {
+    
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+    
+    Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument(
+      abuilder.withName("data").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+    
+    // NOTE(review): unlike the other options this argument has no withMaximum(1) bound — confirm
+    // whether the dataset option is really meant to accept multiple values
+    Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument(
+      abuilder.withName("dataset").withMinimum(1).create()).withDescription("Dataset path").create();
+    
+    Option outputOpt = obuilder.withLongName("output").withShortName("o").withRequired(true).withArgument(
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+      "Path to generated files").create();
+    
+    // NOTE(review): withMinimum(1) is called twice here; the second call was presumably
+    // intended to be withMaximum(1) — verify against the other option definitions
+    Option partitionsOpt = obuilder.withLongName("numpartitions").withShortName("p").withRequired(true)
+        .withArgument(abuilder.withName("numparts").withMinimum(1).withMinimum(1).create()).withDescription(
+          "Number of partitions to create").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        .create();
+    
+    Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(outputOpt).withOption(
+      datasetOpt).withOption(partitionsOpt).withOption(helpOpt).create();
+    
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+      
+      String data = cmdLine.getValue(dataOpt).toString();
+      String dataset = cmdLine.getValue(datasetOpt).toString();
+      int numPartitions = Integer.parseInt(cmdLine.getValue(partitionsOpt).toString());
+      String output = cmdLine.getValue(outputOpt).toString();
+      
+      runTool(data, dataset, output, numPartitions);
+    } catch (OptionException e) {
+      // bad command line: report it and show the usage message
+      log.warn(e.toString(), e);
+      CommandLineUtil.printHelp(group);
+    }
+    
+  }
+  
+  /**
+   * Reads the data file line by line and spreads the tuples of each class round-robin over
+   * {@code numPartitions} temporary part files, then merges the parts into the output path.
+   *
+   * @param dataStr
+   *          path of the input data file
+   * @param datasetStr
+   *          path of the serialized Dataset
+   * @param output
+   *          path of the merged output file (must not exist yet)
+   * @param numPartitions
+   *          number of partitions to create; must be &gt; 0
+   */
+  private static void runTool(String dataStr, String datasetStr, String output, int numPartitions) throws IOException {
+
+    Preconditions.checkArgument(numPartitions > 0, "numPartitions <= 0");
+    
+    // make sure the output file does not exist
+    Path outputPath = new Path(output);
+    Configuration conf = new Configuration();
+    FileSystem fs = outputPath.getFileSystem(conf);
+
+    Preconditions.checkArgument(!fs.exists(outputPath), "Output path already exists");
+    
+    // create a new file corresponding to each partition
+    // Path workingDir = fs.getWorkingDirectory();
+    // FileSystem wfs = workingDir.getFileSystem(conf);
+    // File parentFile = new File(workingDir.toString());
+    // File tempFile = FileUtil.createLocalTempFile(parentFile, "Parts", true);
+    // File tempFile = File.createTempFile("df.tools.UDistrib","");
+    // tempFile.deleteOnExit();
+    // temporary directory for the part files, created relative to the current directory
+    File tempFile = FileUtil.createLocalTempFile(new File(""), "df.tools.UDistrib", true);
+    Path partsPath = new Path(tempFile.toString());
+    FileSystem pfs = partsPath.getFileSystem(conf);
+    
+    Path[] partPaths = new Path[numPartitions];
+    FSDataOutputStream[] files = new FSDataOutputStream[numPartitions];
+    for (int p = 0; p < numPartitions; p++) {
+      partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p));
+      files[p] = pfs.create(partPaths[p]);
+    }
+    
+    Path datasetPath = new Path(datasetStr);
+    Dataset dataset = Dataset.load(conf, datasetPath);
+    
+    // currents[label] = next partition file where to place the tuple
+    int[] currents = new int[dataset.nblabels()];
+    
+    // currents is initialized randomly in the range [0, numpartitions[
+    Random random = RandomUtils.getRandom();
+    for (int c = 0; c < currents.length; c++) {
+      currents[c] = random.nextInt(numPartitions);
+    }
+    
+    // foreach tuple of the data
+    Path dataPath = new Path(dataStr);
+    FileSystem ifs = dataPath.getFileSystem(conf);
+    FSDataInputStream input = ifs.open(dataPath);
+    Scanner scanner = new Scanner(input, "UTF-8");
+    DataConverter converter = new DataConverter(dataset);
+    
+    int id = 0;
+    while (scanner.hasNextLine()) {
+      if (id % 1000 == 0) {
+        log.info("progress : {}", id);
+      }
+      
+      String line = scanner.nextLine();
+      if (line.isEmpty()) {
+        continue; // skip empty lines
+      }
+      
+      // write the tuple in files[tuple.label]
+      Instance instance = converter.convert(line);
+      int label = (int) dataset.getLabel(instance);
+      files[currents[label]].writeBytes(line);
+      files[currents[label]].writeChar('\n');
+      
+      // update currents: round-robin, the next tuple with this label goes
+      // to the following partition
+      currents[label]++;
+      if (currents[label] == numPartitions) {
+        currents[label] = 0;
+      }
+    }
+    
+    // close all the files.
+    scanner.close();
+    for (FSDataOutputStream file : files) {
+      Closeables.close(file, false);
+    }
+    
+    // merge all output files
+    FileUtil.copyMerge(pfs, partsPath, fs, outputPath, true, conf, null);
+    /*
+     * FSDataOutputStream joined = fs.create(new Path(outputPath, "uniform.data")); for (int p = 0; p <
+     * numPartitions; p++) {log.info("Joining part : {}", p); FSDataInputStream partStream =
+     * fs.open(partPaths[p]);
+     * 
+     * IOUtils.copyBytes(partStream, joined, conf, false);
+     * 
+     * partStream.close(); }
+     * 
+     * joined.close();
+     * 
+     * fs.delete(partsPath, true);
+     */
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java
new file mode 100644
index 0000000..049f9bf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.evaluation;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.list.DoubleArrayList;
+
+import com.google.common.base.Preconditions;
+
+import java.util.Random;
+
+/**
+ * Computes AUC and a few other accuracy statistics without storing huge amounts of data.  This is
+ * done by keeping uniform samples of the positive and negative scores.  Then, when AUC is to be
+ * computed, the remaining scores are sorted and a rank-sum statistic is used to compute the AUC.
+ * Since AUC is invariant with respect to down-sampling of either positives or negatives, this is
+ * close to correct and is exactly correct if maxBufferSize or fewer positive and negative scores
+ * are examined.
+ */
+public class Auc {
+
+  // Upper bound on scores retained per class; beyond this, reservoir sampling
+  // keeps a uniform sample (see add()).
+  private int maxBufferSize = 10000;
+  // scores[0] holds scores of true-negative examples, scores[1] of true-positives.
+  private final DoubleArrayList[] scores = {new DoubleArrayList(), new DoubleArrayList()};
+  private final Random rand;
+  // Total number of scored examples seen across both classes.
+  private int samples;
+  // Decision threshold used to turn a score into a 0/1 prediction for the confusion matrix.
+  private final double threshold;
+  private final Matrix confusion;
+  // Running average log-likelihood terms; only updated when scores are probabilities.
+  private final DenseMatrix entropy;
+
+  private boolean probabilityScore = true;
+
+  // True once add(trueValue, score) has been called; auc() requires scored examples.
+  private boolean hasScore;
+
+  /**
+   * Allocates a new data-structure for accumulating information about AUC and a few other accuracy
+   * measures.
+   * @param threshold The threshold to use in computing the confusion matrix.
+   */
+  public Auc(double threshold) {
+    confusion = new DenseMatrix(2, 2);
+    entropy = new DenseMatrix(2, 2);
+    this.rand = RandomUtils.getRandom();
+    this.threshold = threshold;
+  }
+
+  /** Creates an accumulator with the default decision threshold of 0.5. */
+  public Auc() {
+    this(0.5);
+  }
+
+  /**
+   * Adds a score to the AUC buffers.
+   *
+   * @param trueValue Whether this score is for a true-positive or a true-negative example.
+   * @param score     The score for this example.
+   */
+  public void add(int trueValue, double score) {
+    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
+    hasScore = true;
+
+    int predictedClass = score > threshold ? 1 : 0;
+    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);
+
+    samples++;
+    if (isProbabilityScore()) {
+      // Clamp to (0, 1) exclusive so that log(limited) and log1p(-limited) stay finite.
+      double limited = Math.max(1.0e-20, Math.min(score, 1 - 1.0e-20));
+      // Incremental mean update v' = v + (x - v) / n.
+      // NOTE(review): n here is the global sample count across both classes, not a
+      // per-trueValue count — confirm this matches the intended per-row average.
+      double v0 = entropy.get(trueValue, 0);
+      entropy.set(trueValue, 0, (Math.log1p(-limited) - v0) / samples + v0);
+
+      double v1 = entropy.get(trueValue, 1);
+      entropy.set(trueValue, 1, (Math.log(limited) - v1) / samples + v1);
+    }
+
+    // add to buffers
+    DoubleArrayList buf = scores[trueValue];
+    if (buf.size() >= maxBufferSize) {
+      // but if too many points are seen, we insert into a random
+      // place and discard the predecessor.  The random place could
+      // be anywhere, possibly not even in the buffer.
+      // this is a special case of Knuth's permutation algorithm
+      // but since we don't ever shuffle the first maxBufferSize
+      // samples, the result isn't just a fair sample of the prefixes
+      // of all permutations.  The CONTENTs of the result, however,
+      // will be a fair and uniform sample of maxBufferSize elements
+      // chosen from all elements without replacement
+      // NOTE(review): samples counts BOTH classes; a per-class reservoir would
+      // normally draw from a per-class stream count — verify this is intentional.
+      int index = rand.nextInt(samples);
+      if (index < buf.size()) {
+        buf.set(index, score);
+      }
+    } else {
+      // for small buffers, we collect all points without permuting
+      // since we sort the data later, permuting now would just be
+      // pedantic
+      buf.add(score);
+    }
+  }
+
+  /**
+   * Records a hard 0/1 prediction (no score); only the confusion matrix is updated,
+   * and any subsequent auc() call will fail because no scores are available.
+   */
+  public void add(int trueValue, int predictedClass) {
+    hasScore = false;
+    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
+    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);
+  }
+
+  /**
+   * Computes the AUC of points seen so far.  This can be moderately expensive since it requires
+   * that all points that have been retained be sorted.
+   *
+   * @return The value of the Area Under the receiver operating Curve.
+   */
+  public double auc() {
+    Preconditions.checkArgument(hasScore, "Can't compute AUC for classifier without a score");
+    scores[0].sort();
+    scores[1].sort();
+
+    double n0 = scores[0].size();
+    double n1 = scores[1].size();
+
+    // With no examples of one class, AUC is undefined; return the chance value.
+    if (n0 == 0 || n1 == 0) {
+      return 0.5;
+    }
+
+    // scan the data
+    // Merge-scan of the two sorted score lists, accumulating the rank-sum of the
+    // positive-class scores (Wilcoxon rank-sum).
+    int i0 = 0;
+    int i1 = 0;
+    int rank = 1;
+    double rankSum = 0;
+    while (i0 < n0 && i1 < n1) {
+
+      double v0 = scores[0].get(i0);
+      double v1 = scores[1].get(i1);
+
+      if (v0 < v1) {
+        i0++;
+        rank++;
+      } else if (v1 < v0) {
+        i1++;
+        rankSum += rank;
+        rank++;
+      } else {
+        // ties have to be handled delicately
+        double tieScore = v0;
+
+        // how many negatives are tied?
+        int k0 = 0;
+        while (i0 < n0 && scores[0].get(i0) == tieScore) {
+          k0++;
+          i0++;
+        }
+
+        // and how many positives
+        int k1 = 0;
+        while (i1 < n1 && scores[1].get(i1) == tieScore) {
+          k1++;
+          i1++;
+        }
+
+        // we found k0 + k1 tied values which have
+        // ranks in the half open interval [rank, rank + k0 + k1)
+        // the average rank is assigned to all
+        rankSum += (rank + (k0 + k1 - 1) / 2.0) * k1;
+        rank += k0 + k1;
+      }
+    }
+
+    // Any positives left over outrank all remaining examples; give them average ranks.
+    if (i1 < n1) {
+      rankSum += (rank + (n1 - i1 - 1) / 2.0) * (n1 - i1);
+      rank += (int) (n1 - i1);
+    }
+
+    // Rank-sum statistic converted to AUC: (R1/n1 - (n1+1)/2) / n0.
+    return (rankSum / n1 - (n1 + 1) / 2) / n0;
+  }
+
+  /**
+   * Returns the confusion matrix for the classifier supposing that we were to use a particular
+   * threshold.
+   * @return The confusion matrix.
+   */
+  public Matrix confusion() {
+    return confusion;
+  }
+
+  /**
+   * Returns a matrix related to the confusion matrix and to the log-likelihood.  For a
+   * pretty accurate classifier, N + entropy is nearly the same as the confusion matrix
+   * because log(1-eps) \approx -eps if eps is small.
+   *
+   * For lower accuracy classifiers, this measure will give us a better picture of how
+   * things work out.
+   *
+   * Also, by definition, log-likelihood = sum(diag(entropy))
+   * @return Returns a cell by cell break-down of the log-likelihood
+   */
+  public Matrix entropy() {
+    if (!hasScore) {
+      // find a constant score that would optimize log-likelihood, but use a dash of Bayesian
+      // conservatism to avoid dividing by zero or taking log(0)
+      double p = (0.5 + confusion.get(1, 1)) / (1 + confusion.get(0, 0) + confusion.get(1, 1));
+      entropy.set(0, 0, confusion.get(0, 0) * Math.log1p(-p));
+      entropy.set(0, 1, confusion.get(0, 1) * Math.log(p));
+      entropy.set(1, 0, confusion.get(1, 0) * Math.log1p(-p));
+      entropy.set(1, 1, confusion.get(1, 1) * Math.log(p));
+    }
+    return entropy;
+  }
+
+  /** Sets the per-class buffer cap; larger buffers give a more exact AUC at higher memory cost. */
+  public void setMaxBufferSize(int maxBufferSize) {
+    this.maxBufferSize = maxBufferSize;
+  }
+
+  public boolean isProbabilityScore() {
+    return probabilityScore;
+  }
+
+  /** When false, entropy accumulation in add() is skipped because scores are not probabilities. */
+  public void setProbabilityScore(boolean probabilityScore) {
+    this.probabilityScore = probabilityScore;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java
new file mode 100644
index 0000000..f0794b3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+
+/**
+ * Class implementing the Naive Bayes Classifier Algorithm. Note that this class
+ * supports {@link #classifyFull}, but not {@code classify} or
+ * {@code classifyScalar}. The reason that these two methods are not
+ * supported is because the scores computed by a NaiveBayesClassifier do not
+ * represent probabilities.
+ */
+public abstract class AbstractNaiveBayesClassifier extends AbstractVectorClassifier {
+
+  private final NaiveBayesModel model;
+
+  protected AbstractNaiveBayesClassifier(NaiveBayesModel model) {
+    this.model = model;
+  }
+
+  protected NaiveBayesModel getModel() {
+    return model;
+  }
+
+  /** Returns this classifier's (unnormalized) score for the given label/feature pair. */
+  protected abstract double getScoreForLabelFeature(int label, int feature);
+
+  /**
+   * Sums the per-feature label scores over the instance's non-zero elements,
+   * each weighted by the feature's value in the instance.
+   */
+  protected double getScoreForLabelInstance(int label, Vector instance) {
+    double result = 0.0;
+    for (Element e : instance.nonZeroes()) {
+      result += e.get() * getScoreForLabelFeature(label, e.index());
+    }
+    return result;
+  }
+
+  @Override
+  public int numCategories() {
+    return model.numLabels();
+  }
+
+  @Override
+  public Vector classifyFull(Vector instance) {
+    return classifyFull(model.createScoringVector(), instance);
+  }
+
+  /** Fills {@code r} with one score per label (indexes 0..numLabels-1) and returns it. */
+  @Override
+  public Vector classifyFull(Vector r, Vector instance) {
+    for (int label = 0; label < model.numLabels(); label++) {
+      r.setQuick(label, getScoreForLabelInstance(label, instance));
+    }
+    return r;
+  }
+
+  /** Unsupported method. This implementation simply throws an {@link UnsupportedOperationException}. */
+  @Override
+  public double classifyScalar(Vector instance) {
+    throw new UnsupportedOperationException("Not supported in Naive Bayes");
+  }
+
+  /** Unsupported method. This implementation simply throws an {@link UnsupportedOperationException}. */
+  @Override
+  public Vector classify(Vector instance) {
+    // Message spelling fixed (was "probabilites").
+    throw new UnsupportedOperationException("Probabilities not supported in Naive Bayes");
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java
new file mode 100644
index 0000000..4db8b17
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+import com.google.common.base.Preconditions;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.regex.Pattern;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.naivebayes.training.ThetaMapper;
+import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SparseMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+/**
+ * Static helpers for reading and writing naive Bayes models, label indexes and cached
+ * score vectors from/to a Hadoop filesystem.
+ */
+public final class BayesUtils {
+
+  // Compiled once; used to split "/label/..." style keys in writeLabelIndex.
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  private BayesUtils() {}
+
+  /**
+   * Reconstructs a {@link NaiveBayesModel} from the output directories written by the
+   * training job: per-feature/per-label weight vectors, the summed observation matrix,
+   * and (for the complementary model only) the per-label theta normalizer.
+   */
+  public static NaiveBayesModel readModelFromDir(Path base, Configuration conf) {
+
+    float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f);
+    boolean isComplementary = conf.getBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, true);
+
+    // read feature sums and label sums
+    Vector scoresPerLabel = null;
+    Vector scoresPerFeature = null;
+    for (Pair<Text,VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
+        new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) {
+      String key = record.getFirst().toString();
+      VectorWritable value = record.getSecond();
+      if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) {
+        scoresPerFeature = value.get();
+      } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) {
+        scoresPerLabel = value.get();
+      }
+    }
+
+    Preconditions.checkNotNull(scoresPerFeature);
+    Preconditions.checkNotNull(scoresPerLabel);
+
+    // One row per label, one column per feature, filled from the summed observations.
+    Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size());
+    for (Pair<IntWritable,VectorWritable> entry : new SequenceFileDirIterable<IntWritable,VectorWritable>(
+        new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST, PathFilters.partFilter(), conf)) {
+      scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get());
+    }
+    
+    // perLabelThetaNormalizer is only used by the complementary model, we do not instantiate it for the standard model
+    Vector perLabelThetaNormalizer = null;
+    if (isComplementary) {
+      perLabelThetaNormalizer=scoresPerLabel.like();    
+      for (Pair<Text,VectorWritable> entry : new SequenceFileDirIterable<Text,VectorWritable>(
+          new Path(base, TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(), conf)) {
+        if (entry.getFirst().toString().equals(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER)) {
+          perLabelThetaNormalizer = entry.getSecond().get();
+        }
+      }
+      Preconditions.checkNotNull(perLabelThetaNormalizer);
+    }
+     
+    return new NaiveBayesModel(scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel, perLabelThetaNormalizer,
+        alphaI, isComplementary);
+  }
+
+  /** Write the list of labels into a map file */
+  // Returns the number of labels written; each label is assigned its iteration index.
+  public static int writeLabelIndex(Configuration conf, Iterable<String> labels, Path indexPath)
+    throws IOException {
+    FileSystem fs = FileSystem.get(indexPath.toUri(), conf);
+    int i = 0;
+    try (SequenceFile.Writer writer =
+           SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(indexPath),
+             SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(IntWritable.class))) {
+      for (String label : labels) {
+        writer.append(new Text(label), new IntWritable(i++));
+      }
+    }
+    return i;
+  }
+
+  /**
+   * Writes the distinct labels extracted from the given key/value pairs to a sequence file,
+   * assigning each a sequential index. Returns the number of distinct labels written.
+   */
+  public static int writeLabelIndex(Configuration conf, Path indexPath,
+                                    Iterable<Pair<Text,IntWritable>> labels) throws IOException {
+    FileSystem fs = FileSystem.get(indexPath.toUri(), conf);
+    Collection<String> seen = new HashSet<>();
+    int i = 0;
+    try (SequenceFile.Writer writer =
+           SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(indexPath),
+             SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(IntWritable.class))){
+      for (Object label : labels) {
+        // Takes the second slash-separated segment of the key as the label name;
+        // assumes keys look like "/label/..." — TODO confirm against the producer of these pairs.
+        String theLabel = SLASH.split(((Pair<?, ?>) label).getFirst().toString())[1];
+        if (!seen.contains(theLabel)) {
+          writer.append(new Text(theLabel), new IntWritable(i++));
+          seen.add(theLabel);
+        }
+      }
+    }
+    return i;
+  }
+
+  /** Reads a label index written by {@link #writeLabelIndex} back into an index-to-label map. */
+  public static Map<Integer, String> readLabelIndex(Configuration conf, Path indexPath) {
+    Map<Integer, String> labelMap = new HashMap<>();
+    for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(indexPath, true, conf)) {
+      labelMap.put(pair.getSecond().get(), pair.getFirst().toString());
+    }
+    return labelMap;
+  }
+
+  /** Reads a string-to-index map from the single file in the distributed cache. */
+  public static OpenObjectIntHashMap<String> readIndexFromCache(Configuration conf) throws IOException {
+    OpenObjectIntHashMap<String> index = new OpenObjectIntHashMap<>();
+    for (Pair<Writable,IntWritable> entry
+        : new SequenceFileIterable<Writable,IntWritable>(HadoopUtil.getSingleCachedFile(conf), conf)) {
+      index.put(entry.getFirst().toString(), entry.getSecond().get());
+    }
+    return index;
+  }
+
+  /** Reads named score vectors from the single file in the distributed cache. */
+  public static Map<String,Vector> readScoresFromCache(Configuration conf) throws IOException {
+    Map<String,Vector> sumVectors = new HashMap<>();
+    for (Pair<Text,VectorWritable> entry
+        : new SequenceFileDirIterable<Text,VectorWritable>(HadoopUtil.getSingleCachedFile(conf),
+          PathType.LIST, PathFilters.partFilter(), conf)) {
+      sumVectors.put(entry.getFirst().toString(), entry.getSecond().get());
+    }
+    return sumVectors;
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java
new file mode 100644
index 0000000..18bd3d6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+
+/** Implementation of the Naive Bayes Classifier Algorithm */
+public class ComplementaryNaiveBayesClassifier extends AbstractNaiveBayesClassifier {
+  public ComplementaryNaiveBayesClassifier(NaiveBayesModel model) {
+    super(model);
+  }
+
+  /**
+   * Scores a label/feature pair using the complement-class weight, normalized by
+   * the label's theta normalizer to correct weight-magnitude errors.
+   */
+  @Override
+  public double getScoreForLabelFeature(int label, int feature) {
+    NaiveBayesModel model = getModel();
+    double weight = computeWeight(model.featureWeight(feature), model.weight(label, feature),
+        model.totalWeightSum(), model.labelWeight(label), model.alphaI(), model.numFeatures());
+    // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
+    return weight / model.thetaNormalizer(label);
+  }
+
+  // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.1, Skewed Data bias
+  // Computes -log of the smoothed complement-class frequency:
+  //   -(log((featureWeight - featureLabelWeight + alphaI)
+  //        / (totalWeight - labelWeight + alphaI * numFeatures)))
+  // i.e. how often the feature occurs in classes OTHER than this label,
+  // with Laplace-style smoothing controlled by alphaI.
+  public static double computeWeight(double featureWeight, double featureLabelWeight,
+      double totalWeight, double labelWeight, double alphaI, double numFeatures) {
+    double numerator = featureWeight - featureLabelWeight + alphaI;
+    double denominator = totalWeight - labelWeight + alphaI * numFeatures;
+    return -Math.log(numerator / denominator);
+  }
+}


[37/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/country.txt b/community/mahout-mr/mr-examples/bin/resources/country.txt
new file mode 100644
index 0000000..6a22091
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/country.txt
@@ -0,0 +1,229 @@
+Afghanistan
+Albania
+Algeria
+American Samoa
+Andorra
+Angola
+Anguilla
+Antigua and Barbuda
+Argentina
+Armenia
+Aruba
+Australia
+Austria
+Azerbaijan
+Bahamas
+Bangladesh
+Barbados
+Belarus
+Belgium
+Belize
+Benin
+Bermuda
+Bhutan
+Bolivia
+Bosnia and Herzegovina
+Botswana
+Bouvet Island
+Brazil
+British Indian Ocean Territory
+Brunei Darussalam
+Bulgaria
+Burkina Faso
+Burundi
+Cambodia
+Cameroon
+Canada
+Cape Verde
+Cayman Islands
+Central African Republic
+Chad
+Chile
+China
+Christmas Island
+Cocos  Islands
+Colombia
+Comoros
+Congo
+Cook Islands
+Costa Rica
+Croatia
+Côte d'Ivoire
+Cuba
+Cyprus
+Czech Republic
+Djibouti
+Dominica
+Dominican Republic
+Ecuador
+Egypt
+El Salvador
+Equatorial Guinea
+Eritrea
+Estonia
+Ethiopia
+Falkland Islands 
+Faroe Islands
+Fiji
+Finland
+France
+French Guiana
+French Polynesia
+French Southern Territories
+Gabon
+Georgia
+Germany
+Ghana
+Gibraltar
+Greece
+Greenland
+Grenada
+Guadeloupe
+Guam
+Guatemala
+Guernsey
+Guinea
+Guinea-Bissau
+Guyana
+Haiti
+Honduras
+Hong Kong
+Hungary
+Iceland
+India
+Indonesia
+Iran
+Iraq
+Ireland
+Isle of Man
+Israel
+Italy
+Japan
+Jersey
+Jordan
+Kazakhstan
+Kenya
+Kiribati
+Korea
+Kuwait
+Kyrgyzstan
+Latvia
+Lebanon
+Lesotho
+Liberia
+Liechtenstein
+Lithuania
+Luxembourg
+Macedonia
+Madagascar
+Malawi
+Malaysia
+Maldives
+Mali
+Malta
+Marshall Islands
+Martinique
+Mauritania
+Mauritius
+Mayotte
+Mexico
+Micronesia
+Moldova
+Monaco
+Mongolia
+Montenegro
+Montserrat
+Morocco
+Mozambique
+Myanmar
+Namibia
+Nauru
+Nepal
+Netherlands
+Netherlands Antilles
+New Caledonia
+New Zealand
+Nicaragua
+Niger
+Nigeria
+Niue
+Norfolk Island
+Northern Mariana Islands
+Norway
+Oman
+Pakistan
+Palau
+Palestinian Territory
+Panama
+Papua New Guinea
+Paraguay
+Peru
+Philippines
+Pitcairn
+Poland
+Portugal
+Puerto Rico
+Qatar
+Réunion
+Russian Federation
+Rwanda
+Saint Barthélemy
+Saint Helena
+Saint Kitts and Nevis
+Saint Lucia
+Saint Martin 
+Saint Pierre and Miquelon
+Saint Vincent and the Grenadines
+Samoa
+San Marino
+Sao Tome and Principe
+Saudi Arabia
+Senegal
+Serbia
+Seychelles
+Sierra Leone
+Singapore
+Slovakia
+Slovenia
+Solomon Islands
+Somalia
+South Africa
+South Georgia and the South Sandwich Islands
+Spain
+Sri Lanka
+Sudan
+Suriname
+Svalbard and Jan Mayen
+Swaziland
+Sweden
+Switzerland
+Syrian Arab Republic
+Taiwan
+Tanzania
+Thailand
+Timor-Leste
+Togo
+Tokelau
+Tonga
+Trinidad and Tobago
+Tunisia
+Turkey
+Turkmenistan
+Turks and Caicos Islands
+Tuvalu
+Ukraine
+United Arab Emirates
+United Kingdom
+United States
+United States Minor Outlying Islands
+Uruguay
+Uzbekistan
+Vanuatu
+Vatican 
+Venezuela
+Vietnam
+Virgin Islands
+Wallis and Futuna
+Yemen
+Zambia
+Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/country10.txt b/community/mahout-mr/mr-examples/bin/resources/country10.txt
new file mode 100644
index 0000000..97a63e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/country10.txt
@@ -0,0 +1,10 @@
+Australia
+Austria
+Bahamas
+Canada
+Colombia
+Cuba
+Panama
+Pakistan
+United Kingdom
+Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/country2.txt b/community/mahout-mr/mr-examples/bin/resources/country2.txt
new file mode 100644
index 0000000..f4b4f61
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/country2.txt
@@ -0,0 +1,2 @@
+United States
+United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/donut-test.csv b/community/mahout-mr/mr-examples/bin/resources/donut-test.csv
new file mode 100644
index 0000000..46ea564
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/donut-test.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","xx","xy","yy","c","a","b"
+0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
+0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
+0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
+0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
+0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
+0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
+0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
+0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
+0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
+0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
+0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
+0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
+0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
+0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
+0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
+0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
+0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
+0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
+0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
+0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
+0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
+0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
+0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
+0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
+0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
+0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
+0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
+0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
+0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
+0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
+0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
+0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
+0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
+0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
+0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
+0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
+0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
+0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
+0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
+0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/donut.csv b/community/mahout-mr/mr-examples/bin/resources/donut.csv
new file mode 100644
index 0000000..33ba3b7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/donut.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
+0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
+0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
+0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
+0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
+0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
+0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
+0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
+0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
+0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
+0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
+0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
+0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
+0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
+0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
+0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
+0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
+0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
+0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
+0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
+0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
+0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
+0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
+0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
+0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
+0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
+0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
+0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
+0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
+0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
+0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
+0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
+0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
+0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
+0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
+0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
+0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
+0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
+0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
+0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
+0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/test-data.csv b/community/mahout-mr/mr-examples/bin/resources/test-data.csv
new file mode 100644
index 0000000..ab683cd
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/test-data.csv
@@ -0,0 +1,61 @@
+"V1","V2","V3","V4","V5","V6","V7","V8","y"
+1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
+1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
+1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
+1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
+1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
+1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
+1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
+1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
+1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
+1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
+1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
+1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
+1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
+1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
+1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
+1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
+1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
+1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
+1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
+1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
+1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
+1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
+1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
+1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
+1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
+1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
+1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
+1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
+1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
+1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
+1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
+1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
+1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
+1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
+1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
+1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
+1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
+1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
+1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
+1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
+1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
+1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
+1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
+1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
+1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
+1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
+1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
+1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
+1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
+1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
+1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
+1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
+1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
+1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
+1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
+1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
+1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
+1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
+1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
+1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh b/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh
new file mode 100755
index 0000000..0ee5fe1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#   
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# 
+# Requires $HADOOP_HOME to be set.
+#
+# Figures out the major version of Hadoop we're using and sets commands
+# for dfs commands
+#
+# Run by each example script.
+
+# Find a hadoop shell
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+  HADOOP="${HADOOP_HOME}/bin/hadoop"
+  if [ ! -e "$HADOOP" ]; then
+    echo "Can't find hadoop in $HADOOP, exiting"
+    exit 1
+  fi
+fi
+
+# Check Hadoop version.  The dots in the version pattern are escaped so
+# they match literal dots rather than acting as regex wildcards.
+v=`${HADOOP_HOME}/bin/hadoop version | egrep "Hadoop [0-9]+\.[0-9]+\.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'`
+
+# $v is quoted everywhere below: it may be empty if version detection
+# failed, and an unquoted empty expansion would break the test syntax.
+if [ "$v" = "1" -o "$v" = "0" ]
+then
+  echo "Discovered Hadoop v0 or v1."
+  export DFS="${HADOOP_HOME}/bin/hadoop dfs"
+  export DFSRM="$DFS -rmr -skipTrash"
+elif [ "$v" -ge "2" ] 2>/dev/null
+then
+  # Hadoop 2 and later all use the hdfs command with -rm -r, so accept
+  # any major version >= 2 instead of failing on v3+.
+  echo "Discovered Hadoop v$v."
+  export DFS="${HADOOP_HOME}/bin/hdfs dfs"
+  export DFSRM="$DFS -rm -r -skipTrash"
+else
+  echo "Can't determine Hadoop version."
+  exit 1
+fi
+echo "Setting dfs command to $DFS, dfs rm to $DFSRM."
+
+export HVERSION=$v 

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/pom.xml b/community/mahout-mr/mr-examples/pom.xml
new file mode 100644
index 0000000..7627f23
--- /dev/null
+++ b/community/mahout-mr/mr-examples/pom.xml
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.mahout</groupId>
+    <artifactId>mahout-mr</artifactId>
+    <version>0.14.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <artifactId>mr-examples</artifactId>
+  <name>-- Mahout Classic: Examples</name>
+  <description>Scalable machine learning library examples</description>
+
+  <packaging>jar</packaging>
+  <properties>
+    <mahout.skip.example>false</mahout.skip.example>
+  </properties>
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>copy-dependencies</id>
+            <phase>package</phase>
+            <goals>
+              <goal>copy-dependencies</goal>
+            </goals>
+            <configuration>
+              <!-- configure the plugin here -->
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
+      <!-- create examples hadoop job jar -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>job</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <skipAssembly>${mahout.skip.example}</skipAssembly>
+              <descriptors>
+                <descriptor>src/main/assembly/job.xml</descriptor>
+              </descriptors>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-remote-resources-plugin</artifactId>
+        <configuration>
+          <appendedResourcesDirectory>../mr/src/main/appended-resources</appendedResourcesDirectory>
+          <resourceBundles>
+            <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
+          </resourceBundles>
+          <supplementalModels>
+            <supplementalModel>supplemental-models.xml</supplementalModel>
+          </supplementalModels>
+        </configuration>
+      </plugin>
+
+      <plugin>
+        <artifactId>maven-source-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <groupId>org.mortbay.jetty</groupId>
+        <artifactId>maven-jetty-plugin</artifactId>
+        <version>6.1.26</version>
+      </plugin>
+    </plugins>
+
+  </build>
+
+  <dependencies>
+
+
+
+
+  </dependencies>
+
+  <profiles>
+    <profile>
+      <id>release.prepare</id>
+      <properties>
+        <mahout.skip.example>true</mahout.skip.example>
+      </properties>
+    </profile>
+  </profiles>
+</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/assembly/job.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/assembly/job.xml b/community/mahout-mr/mr-examples/src/main/assembly/job.xml
new file mode 100644
index 0000000..0c41f3d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/assembly/job.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly
+  xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
+    http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+  <id>job</id>
+  <formats>
+   <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <dependencySets>
+    <dependencySet>
+      <unpack>true</unpack>
+      <unpackOptions>
+        <!-- MAHOUT-1126 -->
+        <excludes>
+          <exclude>META-INF/LICENSE</exclude>
+        </excludes>
+      </unpackOptions>
+      <scope>runtime</scope>
+      <outputDirectory>/</outputDirectory>
+      <useTransitiveFiltering>true</useTransitiveFiltering>
+      <excludes>
+        <exclude>org.apache.hadoop:hadoop-core</exclude>
+      </excludes>
+    </dependencySet>
+  </dependencySets>
+</assembly>
+  
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
new file mode 100644
index 0000000..6392b9f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example;
+
+import java.io.File;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+
+/**
+ * Shared command-line parsing for the Taste examples.  Every example
+ * currently needs at most a path to the recommendations file as input,
+ * so this single helper serves them all.
+ *
+ * <p>The class is stateless and therefore safe to use from multiple
+ * threads.</p>
+ */
+public final class TasteOptionParser {
+  
+  private TasteOptionParser() {
+  }
+  
+  /**
+   * Parses the given command line arguments.
+   *
+   * @param args the arguments as given to the application
+   * @return the input file named on the command line, or {@code null} when
+   *         no input was given or help was requested
+   * @throws OptionException if the arguments cannot be parsed
+   */
+  public static File getRatings(String[] args) throws OptionException {
+    // Build the single "--input"/"-i" option, which takes exactly one value.
+    Option inputOpt = new DefaultOptionBuilder()
+        .withLongName("input")
+        .withRequired(false)
+        .withShortName("i")
+        .withArgument(new ArgumentBuilder().withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Path for input data directory.")
+        .create();
+
+    Option helpOpt = DefaultOptionCreator.helpOption();
+
+    Group group = new GroupBuilder().withName("Options").withOption(inputOpt).withOption(helpOpt).create();
+
+    Parser parser = new Parser();
+    parser.setGroup(group);
+    CommandLine cmdLine = parser.parse(args);
+
+    // Help short-circuits everything else: print usage and report no input.
+    if (cmdLine.hasOption(helpOpt)) {
+      CommandLineUtil.printHelp(group);
+      return null;
+    }
+
+    if (!cmdLine.hasOption(inputOpt)) {
+      return null;
+    }
+    return new File(cmdLine.getValue(inputOpt).toString());
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
new file mode 100644
index 0000000..c908e5b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
+import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * A simple {@link Recommender} implemented for the Book Crossing demo.
+ * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
+ *
+ * <p>Internally this is a thin wrapper: the constructor assembles a
+ * {@link GenericBooleanPrefUserBasedRecommender} from a cached
+ * log-likelihood user similarity and a nearest-10 user neighborhood, and
+ * every {@link Recommender} method simply forwards to that delegate.</p>
+ */
+public final class BookCrossingBooleanRecommender implements Recommender {
+
+  /** The fully assembled recommender that all calls are forwarded to. */
+  private final Recommender recommender;
+
+  public BookCrossingBooleanRecommender(DataModel bcModel) throws TasteException {
+    // Cache similarity computations: log-likelihood over boolean prefs is
+    // recomputed often during neighborhood formation.
+    UserSimilarity llrSimilarity = new CachingUserSimilarity(new LogLikelihoodSimilarity(bcModel), bcModel);
+    // Nearest 10 users, with no minimum similarity cutoff and full sampling.
+    UserNeighborhood nearestTen =
+        new NearestNUserNeighborhood(10, Double.NEGATIVE_INFINITY, llrSimilarity, bcModel, 1.0);
+    recommender = new GenericBooleanPrefUserBasedRecommender(bcModel, nearestTen, llrSimilarity);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+    return recommender.recommend(userID, howMany);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+    // Route through the rescorer-aware overload with no rescorer.
+    return recommend(userID, howMany, null, includeKnownItems);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+    // Known items are excluded by default for this overload.
+    return recommender.recommend(userID, howMany, rescorer, false);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
+  }
+
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    return recommender.estimatePreference(userID, itemID);
+  }
+
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    recommender.setPreference(userID, itemID, value);
+  }
+
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    recommender.removePreference(userID, itemID);
+  }
+
+  @Override
+  public DataModel getDataModel() {
+    return recommender.getDataModel();
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    recommender.refresh(alreadyRefreshed);
+  }
+
+  @Override
+  public String toString() {
+    return "BookCrossingBooleanRecommender[recommender:" + recommender + ']';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
new file mode 100644
index 0000000..2219bce
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+/**
+ * {@link RecommenderBuilder} used by the evaluation framework to construct a fresh
+ * {@link BookCrossingBooleanRecommender} over each training {@link DataModel}.
+ */
+final class BookCrossingBooleanRecommenderBuilder implements RecommenderBuilder {
+
+  @Override
+  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+    return new BookCrossingBooleanRecommender(dataModel);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
new file mode 100644
index 0000000..b9814c7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.commons.cli2.OptionException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.IRStatistics;
+import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
+import org.apache.mahout.cf.taste.example.TasteOptionParser;
+import org.apache.mahout.cf.taste.impl.eval.GenericRecommenderIRStatsEvaluator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Command-line entry point that runs a precision/recall (IR) evaluation of
+ * {@link BookCrossingBooleanRecommender} on the BookCrossing data set, treating
+ * preferences as boolean (rating values ignored). A ratings file may be supplied
+ * via the command line; otherwise the bundled resource copy is used.
+ */
+public final class BookCrossingBooleanRecommenderEvaluatorRunner {
+
+  private static final Logger log = LoggerFactory.getLogger(BookCrossingBooleanRecommenderEvaluatorRunner.class);
+
+  private BookCrossingBooleanRecommenderEvaluatorRunner() {
+    // utility class; no instances
+  }
+
+  public static void main(String... args) throws IOException, TasteException, OptionException {
+    RecommenderIRStatsEvaluator evaluator = new GenericRecommenderIRStatsEvaluator();
+    File ratingsFile = TasteOptionParser.getRatings(args);
+    DataModel model =
+        ratingsFile == null ? new BookCrossingDataModel(true) : new BookCrossingDataModel(ratingsFile, true);
+
+    // "at" = 3, no relevance threshold (NEGATIVE_INFINITY), evaluate on 100% of users
+    // -- NOTE(review): parameter meanings per GenericRecommenderIRStatsEvaluator.evaluate
+    IRStatistics evaluation = evaluator.evaluate(
+        new BookCrossingBooleanRecommenderBuilder(),
+        new BookCrossingDataModelBuilder(),
+        model,
+        null,
+        3,
+        Double.NEGATIVE_INFINITY,
+        1.0);
+
+    // SLF4J parameterized logging: defers String conversion until INFO is enabled
+    log.info("{}", evaluation);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
new file mode 100644
index 0000000..3e2f8b5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;
+import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
+import org.apache.mahout.common.iterator.FileLineIterable;
+
+/**
+ * See <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip">download</a> for
+ * data needed by this class. The BX-Book-Ratings.csv file is needed.
+ */
+/**
+ * {@link FileDataModel} over the BookCrossing BX-Book-Ratings.csv dump. The native
+ * semicolon-delimited, quoted file is first converted to a plain comma-delimited
+ * numeric file in the system temp directory, then loaded by the superclass.
+ */
+public final class BookCrossingDataModel extends FileDataModel {
+
+  // Matches every character that is neither a digit nor the ';' field delimiter
+  private static final Pattern NON_DIGIT_SEMICOLON_PATTERN = Pattern.compile("[^0-9;]");
+
+  /**
+   * Loads the ratings file bundled as a classpath resource.
+   *
+   * @param ignoreRatings if true, rating values are dropped (boolean preferences)
+   * @throws IOException if the resource cannot be read or the temp file written
+   */
+  public BookCrossingDataModel(boolean ignoreRatings) throws IOException {
+    this(GroupLensDataModel.readResourceToTempFile(
+             "/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"),
+         ignoreRatings);
+  }
+  
+  /**
+   * @param ratingsFile BookCrossing ratings file in its native format
+   * @param ignoreRatings if true, rating values are dropped (boolean preferences)
+   * @throws IOException if an error occurs while reading or writing files
+   */
+  public BookCrossingDataModel(File ratingsFile, boolean ignoreRatings) throws IOException {
+    super(convertBCFile(ratingsFile, ignoreRatings));
+  }
+  
+  /**
+   * Converts the native BX file into a clean comma-delimited file usable by
+   * {@link FileDataModel}; the result is (re)written under java.io.tmpdir.
+   */
+  private static File convertBCFile(File originalFile, boolean ignoreRatings) throws IOException {
+    if (!originalFile.exists()) {
+      throw new FileNotFoundException(originalFile.toString());
+    }
+    // Fixed temp-file name: any previous conversion is overwritten
+    File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "taste.bookcrossing.txt");
+    resultFile.delete();
+    Writer writer = null;
+    try {
+      writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8);
+      // second arg 'true' -- presumably skips the header line; TODO confirm FileLineIterable contract
+      for (String line : new FileLineIterable(originalFile, true)) {
+        // 0 ratings are basically "no rating", ignore them (thanks h.9000)
+        if (line.endsWith("\"0\"")) {
+          continue;
+        }
+        // Delete anything that isn't numeric or a semicolon delimiter, then make comma the delimiter.
+        String convertedLine = NON_DIGIT_SEMICOLON_PATTERN.matcher(line)
+            .replaceAll("").replace(';', ',');
+        // If this means we deleted an entire ID -- few cases like that -- skip the line
+        if (convertedLine.contains(",,")) {
+          continue;
+        }
+        if (ignoreRatings) {
+          // drop rating (everything after the last comma)
+          convertedLine = convertedLine.substring(0, convertedLine.lastIndexOf(','));
+        }
+        writer.write(convertedLine);
+        writer.write('\n');
+      }
+      writer.flush();
+    } catch (IOException ioe) {
+      // Do not leave a half-written file behind
+      resultFile.delete();
+      throw ioe;
+    } finally {
+      Closeables.close(writer, false);
+    }
+    return resultFile;
+  }
+  
+  @Override
+  public String toString() {
+    return "BookCrossingDataModel";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
new file mode 100644
index 0000000..9ec2eaf
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+/**
+ * {@link DataModelBuilder} used by the IR evaluator to wrap training data in a
+ * boolean-preference model (rating values discarded).
+ */
+final class BookCrossingDataModelBuilder implements DataModelBuilder {
+
+  @Override
+  public DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData) {
+    return new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(trainingData));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
new file mode 100644
index 0000000..c06ca2f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
+import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * A simple {@link Recommender} implemented for the Book Crossing demo.
+ * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
+ */
+public final class BookCrossingRecommender implements Recommender {
+
+  /** Wrapped user-based recommender; all calls delegate to it. */
+  private final Recommender recommender;
+
+  /**
+   * @param bcModel BookCrossing data model to recommend from
+   * @throws TasteException if the underlying recommender cannot be constructed
+   */
+  public BookCrossingRecommender(DataModel bcModel) throws TasteException {
+    // Euclidean-distance user similarity (cached), 10 nearest neighbors with a
+    // 0.2 minimum similarity and 0.2 sampling rate -- TODO confirm the last two
+    // constants against NearestNUserNeighborhood's constructor documentation.
+    UserSimilarity similarity = new CachingUserSimilarity(new EuclideanDistanceSimilarity(bcModel), bcModel);
+    UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, 0.2, similarity, bcModel, 0.2);
+    recommender = new GenericUserBasedRecommender(bcModel, neighborhood, similarity);
+  }
+  
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+    return recommender.recommend(userID, howMany);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+    return recommend(userID, howMany, null, includeKnownItems);
+  }
+  
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+    // This overload excludes known items by design.
+    return recommender.recommend(userID, howMany, rescorer, false);
+  }
+  
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    // BUGFIX: forward the caller's includeKnownItems flag; it was previously
+    // hard-coded to false, silently ignoring the parameter (the boolean variant
+    // of this class already forwarded it correctly).
+    return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
+  }
+  
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    return recommender.estimatePreference(userID, itemID);
+  }
+  
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    recommender.setPreference(userID, itemID, value);
+  }
+  
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    recommender.removePreference(userID, itemID);
+  }
+  
+  @Override
+  public DataModel getDataModel() {
+    return recommender.getDataModel();
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    recommender.refresh(alreadyRefreshed);
+  }
+  
+  @Override
+  public String toString() {
+    return "BookCrossingRecommender[recommender:" + recommender + ']';
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
new file mode 100644
index 0000000..bb6d3e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+/**
+ * {@link RecommenderBuilder} used by the evaluation framework to construct a fresh
+ * {@link BookCrossingRecommender} over each training {@link DataModel}.
+ */
+final class BookCrossingRecommenderBuilder implements RecommenderBuilder {
+  
+  @Override
+  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+    return new BookCrossingRecommender(dataModel);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
new file mode 100644
index 0000000..97074d2
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.cli2.OptionException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
+import org.apache.mahout.cf.taste.example.TasteOptionParser;
+import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Command-line entry point that evaluates {@link BookCrossingRecommender} by
+ * average absolute difference between estimated and actual preferences on the
+ * BookCrossing data set. A ratings file may be supplied via the command line;
+ * otherwise the bundled resource copy is used.
+ */
+public final class BookCrossingRecommenderEvaluatorRunner {
+  
+  private static final Logger log = LoggerFactory.getLogger(BookCrossingRecommenderEvaluatorRunner.class);
+  
+  private BookCrossingRecommenderEvaluatorRunner() {
+    // utility class; no instances
+  }
+  
+  public static void main(String... args) throws IOException, TasteException, OptionException {
+    RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();
+    File ratingsFile = TasteOptionParser.getRatings(args);
+    DataModel model =
+        ratingsFile == null ? new BookCrossingDataModel(false) : new BookCrossingDataModel(ratingsFile, false);
+
+    // 0.9 training / 0.3 evaluation fractions -- NOTE(review): per
+    // RecommenderEvaluator.evaluate's (trainingPercentage, evaluationPercentage) contract
+    double evaluation = evaluator.evaluate(new BookCrossingRecommenderBuilder(),
+      null,
+      model,
+      0.9,
+      0.3);
+    // SLF4J parameterized logging: defers String conversion until INFO is enabled
+    log.info("{}", evaluation);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
new file mode 100644
index 0000000..9244fe3
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
@@ -0,0 +1,9 @@
+Code works with BookCrossing data set, which is not included in this distribution but is downloadable from
+http://www.informatik.uni-freiburg.de/~cziegler/BX/
+
+Data set originated from:
+
+Improving Recommendation Lists Through Topic Diversification,
+ Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen;
+ Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan.
+ Published May 2005.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
new file mode 100644
index 0000000..033daa2
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+/**
+ * Shared constants and helpers for the email-recommendation example jobs:
+ * address normalization, dictionary (string-to-int) loading from sequence files
+ * in the distributed cache, and References-header parsing.
+ */
+public final class EmailUtility {
+
+  // Configuration keys shared by the mapper/reducer jobs in this package
+  public static final String SEPARATOR = "separator";
+  public static final String MSG_IDS_PREFIX = "msgIdsPrefix";
+  public static final String FROM_PREFIX = "fromPrefix";
+  public static final String MSG_ID_DIMENSION = "msgIdDim";
+  public static final String FROM_INDEX = "fromIdx";
+  public static final String REFS_INDEX = "refsIdx";
+  private static final String[] EMPTY = new String[0];
+  // Noise commonly wrapped around addresses: "mailto:", angle/square brackets, "=20" (quoted-printable space)
+  private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20");
+  private static final Pattern ANGLE_BRACES = Pattern.compile("<|>");
+  private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+");
+  // Matches strings that are entirely (possibly zero) whitespace; used to detect empty addresses
+  public static final Pattern WHITESPACE = Pattern.compile("\\s*");
+
+  private EmailUtility() {
+  }
+
+  /**
+   * Strip off some spurious characters that make it harder to dedup
+   */
+  public static String cleanUpEmailAddress(CharSequence address) {
+    //do some cleanup to normalize some things, like: Key: karthik ananth <ka...@gmail.com>: Value: 178
+    //Key: karthik ananth [mailto:karthik.jcecs@gmail.com]=20: Value: 179
+    //TODO: is there more to clean up here?
+    return ADDRESS_CLEANUP.matcher(address).replaceAll("");
+  }
+
+  /**
+   * Populates the given dictionaries from sequence files found in the
+   * distributed cache; a file feeds fromDictionary or msgIdDictionary according
+   * to which prefix its name starts with (other files are ignored).
+   *
+   * @throws IOException if a cached file cannot be read
+   */
+  public static void loadDictionaries(Configuration conf, String fromPrefix,
+                                      OpenObjectIntHashMap<String> fromDictionary,
+                                      String msgIdPrefix,
+                                      OpenObjectIntHashMap<String> msgIdDictionary) throws IOException {
+
+    Path[] localFiles = HadoopUtil.getCachedFiles(conf);
+    FileSystem fs = FileSystem.getLocal(conf);
+    for (Path dictionaryFile : localFiles) {
+
+      // key is word value is id
+
+      OpenObjectIntHashMap<String> dictionary = null;
+      if (dictionaryFile.getName().startsWith(fromPrefix)) {
+        dictionary = fromDictionary;
+      } else if (dictionaryFile.getName().startsWith(msgIdPrefix)) {
+        dictionary = msgIdDictionary;
+      }
+      if (dictionary != null) {
+        dictionaryFile = fs.makeQualified(dictionaryFile);
+        for (Pair<Writable, IntWritable> record
+            : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
+          dictionary.put(record.getFirst().toString(), record.getSecond().get());
+        }
+      }
+    }
+
+  }
+
+  /**
+   * Splits a raw References header into individual message IDs, stripping angle
+   * brackets from each token. Returns an empty array for null/empty input.
+   */
+  public static String[] parseReferences(CharSequence rawRefs) {
+    String[] splits;
+    if (rawRefs != null && rawRefs.length() > 0) {
+      splits = SPACE_OR_CLOSE_ANGLE.split(rawRefs);
+      for (int i = 0; i < splits.length; i++) {
+        splits[i] = ANGLE_BRACES.matcher(splits[i]).replaceAll("");
+      }
+    } else {
+      splits = EMPTY;
+    }
+    return splits;
+  }
+
+  // Hadoop counters used to track records skipped for missing fields
+  public enum Counters {
+    NO_MESSAGE_ID, NO_FROM_ADDRESS
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
new file mode 100644
index 0000000..5cd308d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ *  Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
+ */
+/**
+ *  Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
+ *  <p>
+ *  Emits (from-address, 1) for each message so the reducer can build a
+ *  from-address dictionary; records without a parsable address are counted
+ *  under {@link EmailUtility.Counters#NO_FROM_ADDRESS}.
+ */
+public final class FromEmailToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
+
+  // Delimiter between the from-address and the rest of the value, set via job configuration
+  private String separator;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
+  }
+
+  @Override
+  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+    //From is in the value
+    String valStr = value.toString();
+    int idx = valStr.indexOf(separator);
+    if (idx == -1) {
+      // No separator found -> no from-address in this record
+      context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
+    } else {
+      String full = valStr.substring(0, idx);
+      //do some cleanup to normalize some things, like: Key: karthik ananth <ka...@gmail.com>: Value: 178
+      //Key: karthik ananth [mailto:karthik.jcecs@gmail.com]=20: Value: 179
+      //TODO: is there more to clean up here?
+      full = EmailUtility.cleanUpEmailAddress(full);
+
+      if (EmailUtility.WHITESPACE.matcher(full).matches()) {
+        // Cleanup left nothing but whitespace -> treat as missing address
+        context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
+      } else {
+        context.write(new Text(full), new VarIntWritable(1));
+      }
+    }
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
new file mode 100644
index 0000000..72fcde9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Key: the string id
+ * Value: the count
+ * Out Key: the string id
+ * Out Value: the sum of the counts
+ */
+public final class MailToDictionaryReducer extends Reducer<Text, VarIntWritable, Text, VarIntWritable> {
+
+  @Override
+  protected void reduce(Text key, Iterable<VarIntWritable> values, Context context)
+    throws IOException, InterruptedException {
+    int sum = 0;
+    for (VarIntWritable value : values) {
+      sum += value.get();
+    }
+    context.write(new Text(key), new VarIntWritable(sum));
+  }
+}


[19/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java
new file mode 100644
index 0000000..08aa5ae
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Produces random recommendations and preference estimates. This is likely only useful as a novelty and for
+ * benchmarking.
+ */
+public final class RandomRecommender extends AbstractRecommender {
+  
+  private final Random random = RandomUtils.getRandom();
+  private final float minPref;
+  private final float maxPref;
+  
+  public RandomRecommender(DataModel dataModel) throws TasteException {
+    super(dataModel);
+    float maxPref = Float.NEGATIVE_INFINITY;
+    float minPref = Float.POSITIVE_INFINITY;
+    LongPrimitiveIterator userIterator = dataModel.getUserIDs();
+    while (userIterator.hasNext()) {
+      long userID = userIterator.next();
+      PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+      for (int i = 0; i < prefs.length(); i++) {
+        float prefValue = prefs.getValue(i);
+        if (prefValue < minPref) {
+          minPref = prefValue;
+        }
+        if (prefValue > maxPref) {
+          maxPref = prefValue;
+        }
+      }
+    }
+    this.minPref = minPref;
+    this.maxPref = maxPref;
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    DataModel dataModel = getDataModel();
+    int numItems = dataModel.getNumItems();
+    List<RecommendedItem> result = new ArrayList<>(howMany);
+    while (result.size() < howMany) {
+      LongPrimitiveIterator it = dataModel.getItemIDs();
+      it.skip(random.nextInt(numItems));
+      long itemID = it.next();
+      if (includeKnownItems || dataModel.getPreferenceValue(userID, itemID) == null) {
+        result.add(new GenericRecommendedItem(itemID, randomPref()));
+      }
+    }
+    return result;
+  }
+  
+  @Override
+  public float estimatePreference(long userID, long itemID) {
+    return randomPref();
+  }
+  
+  private float randomPref() {
+    return minPref + random.nextFloat() * (maxPref - minPref);
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    getDataModel().refresh(alreadyRefreshed);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
new file mode 100644
index 0000000..623a60b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import com.google.common.base.Preconditions;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.FixedSizeSamplingIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Iterator;
+
+/**
+ * <p>Returns all items that have not been rated by the user <em>(3)</em> and that were preferred by another user
+ * <em>(2)</em> that has preferred at least one item <em>(1)</em> that the current user has preferred too.</p>
+ *
+ * <p>This strategy uses sampling to limit the number of items that are considered, by sampling three different
+ * things, noted above:</p>
+ *
+ * <ol>
+ *   <li>The items that the user has preferred</li>
+ *   <li>The users who also prefer each of those items</li>
+ *   <li>The items those users also prefer</li>
+ * </ol>
+ * 
+ * <p>There is a maximum associated with each of these three things; if the number of items or users exceeds
+ * that max, it is sampled so that the expected number of items or users actually used in that part of the
+ * computation is equal to the max.</p>
+ * 
+ * <p>Three arguments control these three maxima. Each is a "factor" f, which establishes the max at
+ * f * log2(n), where n is the number of users or items in the data. For example if factor #2 is 5,
+ * which controls the number of users sampled per item, then 5 * log2(# users) is the maximum for this
+ * part of the computation.</p>
+ * 
+ * <p>Each can be set to not do any limiting with value {@link #NO_LIMIT_FACTOR}.</p>
+ */
+public class SamplingCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  private static final Logger log = LoggerFactory.getLogger(SamplingCandidateItemsStrategy.class);
+
+  /**
+   * Default factor used if not otherwise specified, for all limits. (30).
+   */
+  public static final int DEFAULT_FACTOR = 30;
+  /**
+   * Specify this value as a factor to mean no limit.
+   */
+  public static final int NO_LIMIT_FACTOR = Integer.MAX_VALUE;
+  private static final int MAX_LIMIT = Integer.MAX_VALUE;
+  private static final double LOG2 = Math.log(2.0);
+
+  private final int maxItems;
+  private final int maxUsersPerItem;
+  private final int maxItemsPerUser;
+
+  /**
+   * Defaults to using no limit ({@link #NO_LIMIT_FACTOR}) for all factors, except 
+   * {@code candidatesPerUserFactor} which defaults to {@link #DEFAULT_FACTOR}.
+   *
+   * @see #SamplingCandidateItemsStrategy(int, int, int, int, int)
+   */
+  public SamplingCandidateItemsStrategy(int numUsers, int numItems) {
+    this(DEFAULT_FACTOR, DEFAULT_FACTOR, DEFAULT_FACTOR, numUsers, numItems);
+  }
+
+  /**
+   * @param itemsFactor factor controlling max items considered for a user
+   * @param usersPerItemFactor factor controlling max users considered for each of those items
+   * @param candidatesPerUserFactor factor controlling max candidate items considered from each of those users
+   * @param numUsers number of users currently in the data
+   * @param numItems number of items in the data
+   */
+  public SamplingCandidateItemsStrategy(int itemsFactor,
+                                        int usersPerItemFactor,
+                                        int candidatesPerUserFactor,
+                                        int numUsers,
+                                        int numItems) {
+    Preconditions.checkArgument(itemsFactor > 0, "itemsFactor must be greater then 0!");
+    Preconditions.checkArgument(usersPerItemFactor > 0, "usersPerItemFactor must be greater then 0!");
+    Preconditions.checkArgument(candidatesPerUserFactor > 0, "candidatesPerUserFactor must be greater then 0!");
+    Preconditions.checkArgument(numUsers > 0, "numUsers must be greater then 0!");
+    Preconditions.checkArgument(numItems > 0, "numItems must be greater then 0!");
+    maxItems = computeMaxFrom(itemsFactor, numItems);
+    maxUsersPerItem = computeMaxFrom(usersPerItemFactor, numUsers);
+    maxItemsPerUser = computeMaxFrom(candidatesPerUserFactor, numItems);
+    log.debug("maxItems {}, maxUsersPerItem {}, maxItemsPerUser {}", maxItems, maxUsersPerItem, maxItemsPerUser);
+  }
+
+  private static int computeMaxFrom(int factor, int numThings) {
+    if (factor == NO_LIMIT_FACTOR) {
+      return MAX_LIMIT;
+    }
+    long max = (long) (factor * (1.0 + Math.log(numThings) / LOG2));
+    return max > MAX_LIMIT ? MAX_LIMIT : (int) max;
+  }
+
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+    throws TasteException {
+    LongPrimitiveIterator preferredItemIDsIterator = new LongPrimitiveArrayIterator(preferredItemIDs);
+    if (preferredItemIDs.length > maxItems) {
+      double samplingRate = (double) maxItems / preferredItemIDs.length;
+//      log.info("preferredItemIDs.length {}, samplingRate {}", preferredItemIDs.length, samplingRate);
+      preferredItemIDsIterator = 
+          new SamplingLongPrimitiveIterator(preferredItemIDsIterator, samplingRate);
+    }
+    FastIDSet possibleItemsIDs = new FastIDSet();
+    while (preferredItemIDsIterator.hasNext()) {
+      long itemID = preferredItemIDsIterator.nextLong();
+      PreferenceArray prefs = dataModel.getPreferencesForItem(itemID);
+      int prefsLength = prefs.length();
+      if (prefsLength > maxUsersPerItem) {
+        Iterator<Preference> sampledPrefs =
+            new FixedSizeSamplingIterator<>(maxUsersPerItem, prefs.iterator());
+        while (sampledPrefs.hasNext()) {
+          addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(sampledPrefs.next().getUserID()));
+        }
+      } else {
+        for (int i = 0; i < prefsLength; i++) {
+          addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(prefs.getUserID(i)));
+        }
+      }
+    }
+    if (!includeKnownItems) {
+      possibleItemsIDs.removeAll(preferredItemIDs);
+    }
+    return possibleItemsIDs;
+  }
+
+  private void addSomeOf(FastIDSet possibleItemIDs, FastIDSet itemIDs) {
+    if (itemIDs.size() > maxItemsPerUser) {
+      LongPrimitiveIterator it =
+          new SamplingLongPrimitiveIterator(itemIDs.iterator(), (double) maxItemsPerUser / itemIDs.size());
+      while (it.hasNext()) {
+        possibleItemIDs.add(it.nextLong());
+      }
+    } else {
+      possibleItemIDs.addAll(itemIDs);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java
new file mode 100644
index 0000000..c6d417f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.common.RandomUtils;
+
+/** Simply encapsulates a user and a similarity value. */
+public final class SimilarUser implements Comparable<SimilarUser> {
+  
+  private final long userID;
+  private final double similarity;
+  
+  public SimilarUser(long userID, double similarity) {
+    this.userID = userID;
+    this.similarity = similarity;
+  }
+  
+  long getUserID() {
+    return userID;
+  }
+  
+  double getSimilarity() {
+    return similarity;
+  }
+  
+  @Override
+  public int hashCode() {
+    return (int) userID ^ RandomUtils.hashDouble(similarity);
+  }
+  
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof SimilarUser)) {
+      return false;
+    }
+    SimilarUser other = (SimilarUser) o;
+    return userID == other.getUserID() && similarity == other.getSimilarity();
+  }
+  
+  @Override
+  public String toString() {
+    return "SimilarUser[user:" + userID + ", similarity:" + similarity + ']';
+  }
+  
+  /** Defines an ordering from most similar to least similar. */
+  @Override
+  public int compareTo(SimilarUser other) {
+    double otherSimilarity = other.getSimilarity();
+    if (similarity > otherSimilarity) {
+      return -1;
+    }
+    if (similarity < otherSimilarity) {
+      return 1;
+    }
+    long otherUserID = other.getUserID();
+    if (userID < otherUserID) {
+      return -1;
+    }
+    if (userID > otherUserID) {
+      return 1;
+    }
+    return 0;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java
new file mode 100644
index 0000000..f7b4385
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Queue;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.GenericUserSimilarity;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+/**
+ * <p>
+ * A simple class that refactors the "find top N things" logic that is used in several places.
+ * </p>
+ */
public final class TopItems {

  // Shared empty result for getTopUsers; avoids allocating a new empty array.
  private static final long[] NO_IDS = new long[0];

  // Utility class; not instantiable.
  private TopItems() { }

  /**
   * Finds the {@code howMany} items with the highest estimated value.
   *
   * @param howMany maximum number of items to return
   * @param possibleItemIDs candidate item IDs to score; must not be null
   * @param rescorer optional filter/rescorer; may be null
   * @param estimator produces an estimate for each item ID; must not be null
   * @return items ordered from highest to lowest estimate (at most {@code howMany})
   * @throws TasteException if the estimator fails
   */
  public static List<RecommendedItem> getTopItems(int howMany,
                                                  LongPrimitiveIterator possibleItemIDs,
                                                  IDRescorer rescorer,
                                                  Estimator<Long> estimator) throws TasteException {
    Preconditions.checkArgument(possibleItemIDs != null, "possibleItemIDs is null");
    Preconditions.checkArgument(estimator != null, "estimator is null");

    // The comparator is reversed so the queue is a min-heap by value: its head
    // is always the weakest of the current top items, cheap to evict.
    Queue<RecommendedItem> topItems = new PriorityQueue<>(howMany + 1,
      Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance()));
    boolean full = false; // becomes true once howMany items are retained
    double lowestTopValue = Double.NEGATIVE_INFINITY; // value of the weakest retained item
    while (possibleItemIDs.hasNext()) {
      long itemID = possibleItemIDs.next();
      if (rescorer == null || !rescorer.isFiltered(itemID)) {
        double preference;
        try {
          preference = estimator.estimate(itemID);
        } catch (NoSuchItemException nsie) {
          continue; // item unknown to the estimator; skip it
        }
        double rescoredPref = rescorer == null ? preference : rescorer.rescore(itemID, preference);
        // Only candidates that beat the current weakest top item are considered.
        if (!Double.isNaN(rescoredPref) && (!full || rescoredPref > lowestTopValue)) {
          topItems.add(new GenericRecommendedItem(itemID, (float) rescoredPref));
          if (full) {
            topItems.poll(); // evict the weakest to keep exactly howMany
          } else if (topItems.size() > howMany) {
            full = true;
            topItems.poll();
          }
          lowestTopValue = topItems.peek().getValue();
        }
      }
    }
    int size = topItems.size();
    if (size == 0) {
      return Collections.emptyList();
    }
    // The heap is min-first; sort descending by value for the caller.
    List<RecommendedItem> result = new ArrayList<>(size);
    result.addAll(topItems);
    Collections.sort(result, ByValueRecommendedItemComparator.getInstance());
    return result;
  }

  /**
   * Finds the {@code howMany} users with the highest estimated similarity.
   *
   * @param howMany maximum number of user IDs to return
   * @param allUserIDs user IDs to consider
   * @param rescorer optional filter/rescorer; may be null
   * @param estimator produces a similarity estimate per user ID
   * @return user IDs ordered from most to least similar; {@link #NO_IDS} if none qualify
   * @throws TasteException if the estimator fails
   */
  public static long[] getTopUsers(int howMany,
                                   LongPrimitiveIterator allUserIDs,
                                   IDRescorer rescorer,
                                   Estimator<Long> estimator) throws TasteException {
    // SimilarUser's natural order is most-similar-first; reversed here so the
    // queue head is the least similar retained user.
    Queue<SimilarUser> topUsers = new PriorityQueue<>(howMany + 1, Collections.reverseOrder());
    boolean full = false;
    double lowestTopValue = Double.NEGATIVE_INFINITY;
    while (allUserIDs.hasNext()) {
      long userID = allUserIDs.next();
      if (rescorer != null && rescorer.isFiltered(userID)) {
        continue;
      }
      double similarity;
      try {
        similarity = estimator.estimate(userID);
      } catch (NoSuchUserException nsue) {
        continue; // user unknown to the estimator; skip it
      }
      double rescoredSimilarity = rescorer == null ? similarity : rescorer.rescore(userID, similarity);
      if (!Double.isNaN(rescoredSimilarity) && (!full || rescoredSimilarity > lowestTopValue)) {
        topUsers.add(new SimilarUser(userID, rescoredSimilarity));
        if (full) {
          topUsers.poll(); // evict the least similar retained user
        } else if (topUsers.size() > howMany) {
          full = true;
          topUsers.poll();
        }
        lowestTopValue = topUsers.peek().getSimilarity();
      }
    }
    int size = topUsers.size();
    if (size == 0) {
      return NO_IDS;
    }
    // Sort most-similar-first, then unwrap to a primitive ID array.
    List<SimilarUser> sorted = new ArrayList<>(size);
    sorted.addAll(topUsers);
    Collections.sort(sorted);
    long[] result = new long[size];
    int i = 0;
    for (SimilarUser similarUser : sorted) {
      result[i++] = similarUser.getUserID();
    }
    return result;
  }

  /**
   * Retains the {@code howMany} item-item similarities with the highest values.
   *
   * <p>
   * Thanks to tsmorton for suggesting this functionality and writing part of the code.
   * </p>
   *
   * @param howMany maximum number of similarities to return
   * @param allSimilarities similarities to consider
   * @return similarities sorted by descending value
   * @see GenericItemSimilarity#GenericItemSimilarity(Iterable, int)
   * @see GenericItemSimilarity#GenericItemSimilarity(org.apache.mahout.cf.taste.similarity.ItemSimilarity,
   *      org.apache.mahout.cf.taste.model.DataModel, int)
   */
  public static List<GenericItemSimilarity.ItemItemSimilarity> getTopItemItemSimilarities(
    int howMany, Iterator<GenericItemSimilarity.ItemItemSimilarity> allSimilarities) {

    // Reversed natural order makes the queue head the lowest-valued retained similarity.
    Queue<GenericItemSimilarity.ItemItemSimilarity> topSimilarities
      = new PriorityQueue<>(howMany + 1, Collections.reverseOrder());
    boolean full = false;
    double lowestTopValue = Double.NEGATIVE_INFINITY;
    while (allSimilarities.hasNext()) {
      GenericItemSimilarity.ItemItemSimilarity similarity = allSimilarities.next();
      double value = similarity.getValue();
      if (!Double.isNaN(value) && (!full || value > lowestTopValue)) {
        topSimilarities.add(similarity);
        if (full) {
          topSimilarities.poll(); // evict the lowest-valued retained similarity
        } else if (topSimilarities.size() > howMany) {
          full = true;
          topSimilarities.poll();
        }
        lowestTopValue = topSimilarities.peek().getValue();
      }
    }
    int size = topSimilarities.size();
    if (size == 0) {
      return Collections.emptyList();
    }
    List<GenericItemSimilarity.ItemItemSimilarity> result = new ArrayList<>(size);
    result.addAll(topSimilarities);
    Collections.sort(result);
    return result;
  }

  /**
   * Retains the {@code howMany} user-user similarities with the highest values.
   *
   * @param howMany maximum number of similarities to return
   * @param allSimilarities similarities to consider
   * @return similarities sorted by descending value
   */
  public static List<GenericUserSimilarity.UserUserSimilarity> getTopUserUserSimilarities(
    int howMany, Iterator<GenericUserSimilarity.UserUserSimilarity> allSimilarities) {

    // Same bounded-priority-queue scheme as getTopItemItemSimilarities.
    Queue<GenericUserSimilarity.UserUserSimilarity> topSimilarities
      = new PriorityQueue<>(howMany + 1, Collections.reverseOrder());
    boolean full = false;
    double lowestTopValue = Double.NEGATIVE_INFINITY;
    while (allSimilarities.hasNext()) {
      GenericUserSimilarity.UserUserSimilarity similarity = allSimilarities.next();
      double value = similarity.getValue();
      if (!Double.isNaN(value) && (!full || value > lowestTopValue)) {
        topSimilarities.add(similarity);
        if (full) {
          topSimilarities.poll(); // evict the lowest-valued retained similarity
        } else if (topSimilarities.size() > howMany) {
          full = true;
          topSimilarities.poll();
        }
        lowestTopValue = topSimilarities.peek().getValue();
      }
    }
    int size = topSimilarities.size();
    if (size == 0) {
      return Collections.emptyList();
    }
    List<GenericUserSimilarity.UserUserSimilarity> result = new ArrayList<>(size);
    result.addAll(topSimilarities);
    Collections.sort(result);
    return result;
  }

  /** Strategy for estimating a score for one thing (an item or user ID) at a time. */
  public interface Estimator<T> {
    double estimate(T thing) throws TasteException;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java
new file mode 100644
index 0000000..0ba5139
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java
@@ -0,0 +1,312 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
+import org.apache.mahout.math.als.ImplicitFeedbackAlternatingLeastSquaresSolver;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * factorizes the rating matrix using "Alternating-Least-Squares with Weighted-λ-Regularization" as described in
+ * <a href="http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf">
+ * "Large-scale Collaborative Filtering for the Netflix Prize"</a>
+ *
+ *  also supports the implicit feedback variant of this approach as described in "Collaborative Filtering for Implicit
+ *  Feedback Datasets" available at http://research.yahoo.com/pub/2433
+ */
+public class ALSWRFactorizer extends AbstractFactorizer {
+
+  private final DataModel dataModel;
+
+  /** number of features used to compute this factorization */
+  private final int numFeatures;
+  /** parameter to control the regularization */
+  private final double lambda;
+  /** number of iterations */
+  private final int numIterations;
+
+  /** if true, the implicit-feedback ALS variant (Hu/Koren/Volinsky) is used */
+  private final boolean usesImplicitFeedback;
+  /** confidence weighting parameter, only necessary when working with implicit feedback */
+  private final double alpha;
+
+  /** size of the worker pool used for each half-sweep of the factorization */
+  private final int numTrainingThreads;
+
+  private static final double DEFAULT_ALPHA = 40;
+
+  private static final Logger log = LoggerFactory.getLogger(ALSWRFactorizer.class);
+
+  /**
+   * @param dataModel preference data to factorize
+   * @param numFeatures number of latent features per user and item
+   * @param lambda regularization weight
+   * @param numIterations number of full ALS sweeps (each sweep solves U, then M)
+   * @param usesImplicitFeedback whether to use the implicit-feedback solver
+   * @param alpha confidence weighting for implicit feedback (unused otherwise)
+   * @param numTrainingThreads number of threads used to solve feature vectors in parallel
+   * @throws TasteException if the ID mappings cannot be built from the model
+   */
+  public ALSWRFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+      boolean usesImplicitFeedback, double alpha, int numTrainingThreads) throws TasteException {
+    super(dataModel);
+    this.dataModel = dataModel;
+    this.numFeatures = numFeatures;
+    this.lambda = lambda;
+    this.numIterations = numIterations;
+    this.usesImplicitFeedback = usesImplicitFeedback;
+    this.alpha = alpha;
+    this.numTrainingThreads = numTrainingThreads;
+  }
+
+  /** Convenience constructor defaulting the thread count to the number of available processors. */
+  public ALSWRFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+                         boolean usesImplicitFeedback, double alpha) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations, usesImplicitFeedback, alpha,
+        Runtime.getRuntime().availableProcessors());
+  }
+
+  /** Convenience constructor for explicit feedback with the default alpha. */
+  public ALSWRFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations, false, DEFAULT_ALPHA);
+  }
+
+  /**
+   * Holds the two dense factor matrices being alternately solved:
+   * U (users x features) and M (items x features), both indexed by the
+   * user/item indices provided by {@link AbstractFactorizer}.
+   */
+  static class Features {
+
+    private final DataModel dataModel;
+    private final int numFeatures;
+
+    /** item-features matrix, one row per item index */
+    private final double[][] M;
+    /** user-features matrix, one row per user index */
+    private final double[][] U;
+
+    Features(ALSWRFactorizer factorizer) throws TasteException {
+      dataModel = factorizer.dataModel;
+      numFeatures = factorizer.numFeatures;
+      Random random = RandomUtils.getRandom();
+      M = new double[dataModel.getNumItems()][numFeatures];
+      LongPrimitiveIterator itemIDsIterator = dataModel.getItemIDs();
+      while (itemIDsIterator.hasNext()) {
+        long itemID = itemIDsIterator.nextLong();
+        int itemIDIndex = factorizer.itemIndex(itemID);
+        // ALS-WR initialization: feature 0 of each item is its average rating,
+        // the remaining features are small random values in [0, 0.1)
+        M[itemIDIndex][0] = averateRating(itemID);
+        for (int feature = 1; feature < numFeatures; feature++) {
+          M[itemIDIndex][feature] = random.nextDouble() * 0.1;
+        }
+      }
+      // U is left zeroed; it is solved first in factorize() while M is held fixed
+      U = new double[dataModel.getNumUsers()][numFeatures];
+    }
+
+    double[][] getM() {
+      return M;
+    }
+
+    double[][] getU() {
+      return U;
+    }
+
+    /** @return a dense view wrapping the user's feature row (no copy) */
+    Vector getUserFeatureColumn(int index) {
+      return new DenseVector(U[index]);
+    }
+
+    /** @return a dense view wrapping the item's feature row (no copy) */
+    Vector getItemFeatureColumn(int index) {
+      return new DenseVector(M[index]);
+    }
+
+    void setFeatureColumnInU(int idIndex, Vector vector) {
+      setFeatureColumn(U, idIndex, vector);
+    }
+
+    void setFeatureColumnInM(int idIndex, Vector vector) {
+      setFeatureColumn(M, idIndex, vector);
+    }
+
+    /** Copies the first numFeatures entries of {@code vector} into row {@code idIndex} of {@code matrix}. */
+    protected void setFeatureColumn(double[][] matrix, int idIndex, Vector vector) {
+      for (int feature = 0; feature < numFeatures; feature++) {
+        matrix[idIndex][feature] = vector.get(feature);
+      }
+    }
+
+    // NOTE(review): method name is a typo for "averageRating"; kept as-is because it is
+    // protected and renaming would break potential subclass overrides.
+    /** @return the mean preference value over all preferences for {@code itemID} */
+    protected double averateRating(long itemID) throws TasteException {
+      PreferenceArray prefs = dataModel.getPreferencesForItem(itemID);
+      RunningAverage avg = new FullRunningAverage();
+      for (Preference pref : prefs) {
+        avg.addDatum(pref.getValue());
+      }
+      return avg.getAverage();
+    }
+  }
+
+  /**
+   * Runs {@code numIterations} alternating-least-squares sweeps. Each sweep first
+   * fixes M and solves every user's feature vector in parallel, then fixes U and
+   * solves every item's feature vector in parallel.
+   */
+  @Override
+  public Factorization factorize() throws TasteException {
+    log.info("starting to compute the factorization...");
+    final Features features = new Features(this);
+
+    /* feature maps necessary for solving for implicit feedback */
+    OpenIntObjectHashMap<Vector> userY = null;
+    OpenIntObjectHashMap<Vector> itemY = null;
+
+    if (usesImplicitFeedback) {
+      userY = userFeaturesMapping(dataModel.getUserIDs(), dataModel.getNumUsers(), features.getU());
+      itemY = itemFeaturesMapping(dataModel.getItemIDs(), dataModel.getNumItems(), features.getM());
+    }
+
+    for (int iteration = 0; iteration < numIterations; iteration++) {
+      log.info("iteration {}", iteration);
+
+      /* fix M - compute U */
+      ExecutorService queue = createQueue();
+      LongPrimitiveIterator userIDsIterator = dataModel.getUserIDs();
+      try {
+
+        final ImplicitFeedbackAlternatingLeastSquaresSolver implicitFeedbackSolver = usesImplicitFeedback
+            ? new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, itemY, numTrainingThreads)
+            : null;
+
+        while (userIDsIterator.hasNext()) {
+          final long userID = userIDsIterator.nextLong();
+          final LongPrimitiveIterator itemIDsFromUser = dataModel.getItemIDsFromUser(userID).iterator();
+          final PreferenceArray userPrefs = dataModel.getPreferencesFromUser(userID);
+          queue.execute(new Runnable() {
+            @Override
+            public void run() {
+              // gather the (fixed) feature vectors of all items this user rated
+              List<Vector> featureVectors = new ArrayList<>();
+              while (itemIDsFromUser.hasNext()) {
+                long itemID = itemIDsFromUser.nextLong();
+                featureVectors.add(features.getItemFeatureColumn(itemIndex(itemID)));
+              }
+
+              Vector userFeatures = usesImplicitFeedback
+                  ? implicitFeedbackSolver.solve(sparseUserRatingVector(userPrefs))
+                  : AlternatingLeastSquaresSolver.solve(featureVectors, ratingVector(userPrefs), lambda, numFeatures);
+
+              // NOTE(review): userIndex()/itemIndex() may lazily insert unknown IDs into
+              // shared maps (see AbstractFactorizer) while running on multiple worker
+              // threads — this is only safe if the mappings were fully built up front;
+              // confirm no new IDs appear during training.
+              features.setFeatureColumnInU(userIndex(userID), userFeatures);
+            }
+          });
+        }
+      } finally {
+        queue.shutdown();
+        try {
+          // waits up to one second per user for the submitted solves to finish
+          queue.awaitTermination(dataModel.getNumUsers(), TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+          log.warn("Error when computing user features", e);
+        }
+      }
+
+      /* fix U - compute M */
+      queue = createQueue();
+      LongPrimitiveIterator itemIDsIterator = dataModel.getItemIDs();
+      try {
+
+        final ImplicitFeedbackAlternatingLeastSquaresSolver implicitFeedbackSolver = usesImplicitFeedback
+            ? new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, userY, numTrainingThreads)
+            : null;
+
+        while (itemIDsIterator.hasNext()) {
+          final long itemID = itemIDsIterator.nextLong();
+          final PreferenceArray itemPrefs = dataModel.getPreferencesForItem(itemID);
+          queue.execute(new Runnable() {
+            @Override
+            public void run() {
+              // gather the (fixed) feature vectors of all users who rated this item
+              List<Vector> featureVectors = new ArrayList<>();
+              for (Preference pref : itemPrefs) {
+                long userID = pref.getUserID();
+                featureVectors.add(features.getUserFeatureColumn(userIndex(userID)));
+              }
+
+              Vector itemFeatures = usesImplicitFeedback
+                  ? implicitFeedbackSolver.solve(sparseItemRatingVector(itemPrefs))
+                  : AlternatingLeastSquaresSolver.solve(featureVectors, ratingVector(itemPrefs), lambda, numFeatures);
+
+              features.setFeatureColumnInM(itemIndex(itemID), itemFeatures);
+            }
+          });
+        }
+      } finally {
+        queue.shutdown();
+        try {
+          // waits up to one second per item for the submitted solves to finish
+          queue.awaitTermination(dataModel.getNumItems(), TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+          log.warn("Error when computing item features", e);
+        }
+      }
+    }
+
+    log.info("finished computation of the factorization...");
+    return createFactorization(features.getU(), features.getM());
+  }
+
+  /** Creates the fixed-size worker pool used for one half-sweep; overridable for testing. */
+  protected ExecutorService createQueue() {
+    return Executors.newFixedThreadPool(numTrainingThreads);
+  }
+
+  /** @return a dense vector of the raw preference values in {@code prefs}, in order */
+  protected static Vector ratingVector(PreferenceArray prefs) {
+    double[] ratings = new double[prefs.length()];
+    for (int n = 0; n < prefs.length(); n++) {
+      ratings[n] = prefs.get(n).getValue();
+    }
+    return new DenseVector(ratings, true);
+  }
+
+  //TODO find a way to get rid of the object overhead here
+  /** Maps each item index to a dense view of its feature row; used by the implicit-feedback solver. */
+  protected OpenIntObjectHashMap<Vector> itemFeaturesMapping(LongPrimitiveIterator itemIDs, int numItems,
+      double[][] featureMatrix) {
+    OpenIntObjectHashMap<Vector> mapping = new OpenIntObjectHashMap<>(numItems);
+    while (itemIDs.hasNext()) {
+      long itemID = itemIDs.next();
+      int itemIndex = itemIndex(itemID);
+      // the second itemIndex(itemID) call below is redundant (same value as itemIndex) but equivalent
+      mapping.put(itemIndex, new DenseVector(featureMatrix[itemIndex(itemID)], true));
+    }
+
+    return mapping;
+  }
+
+  /** Maps each user index to a dense view of its feature row; used by the implicit-feedback solver. */
+  protected OpenIntObjectHashMap<Vector> userFeaturesMapping(LongPrimitiveIterator userIDs, int numUsers,
+      double[][] featureMatrix) {
+    OpenIntObjectHashMap<Vector> mapping = new OpenIntObjectHashMap<>(numUsers);
+
+    while (userIDs.hasNext()) {
+      long userID = userIDs.next();
+      int userIndex = userIndex(userID);
+      // the second userIndex(userID) call below is redundant (same value as userIndex) but equivalent
+      mapping.put(userIndex, new DenseVector(featureMatrix[userIndex(userID)], true));
+    }
+
+    return mapping;
+  }
+
+  /** @return a sparse vector of an item's ratings keyed by user index (cardinality Integer.MAX_VALUE) */
+  protected Vector sparseItemRatingVector(PreferenceArray prefs) {
+    SequentialAccessSparseVector ratings = new SequentialAccessSparseVector(Integer.MAX_VALUE, prefs.length());
+    for (Preference preference : prefs) {
+      ratings.set(userIndex(preference.getUserID()), preference.getValue());
+    }
+    return ratings;
+  }
+
+  /** @return a sparse vector of a user's ratings keyed by item index (cardinality Integer.MAX_VALUE) */
+  protected Vector sparseUserRatingVector(PreferenceArray prefs) {
+    SequentialAccessSparseVector ratings = new SequentialAccessSparseVector(Integer.MAX_VALUE, prefs.length());
+    for (Preference preference : prefs) {
+      ratings.set(itemIndex(preference.getItemID()), preference.getValue());
+    }
+    return ratings;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java
new file mode 100644
index 0000000..0a39a1d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * base class for {@link Factorizer}s, provides ID to index mapping
+ */
+public abstract class AbstractFactorizer implements Factorizer {
+
+  private final DataModel dataModel;
+  /** maps each user ID to its row index in the user-features matrix */
+  private FastByIDMap<Integer> userIDMapping;
+  /** maps each item ID to its row index in the item-features matrix */
+  private FastByIDMap<Integer> itemIDMapping;
+  private final RefreshHelper refreshHelper;
+
+  /**
+   * Builds the initial user/item ID-to-index mappings from the model and registers
+   * a refresh callback that rebuilds them whenever the model is refreshed.
+   *
+   * @throws TasteException if the model's ID iterators fail
+   */
+  protected AbstractFactorizer(DataModel dataModel) throws TasteException {
+    this.dataModel = dataModel;
+    buildMappings();
+    refreshHelper = new RefreshHelper(new Callable<Object>() {
+      @Override
+      public Object call() throws TasteException {
+        buildMappings();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(dataModel);
+  }
+  
+  private void buildMappings() throws TasteException {
+    userIDMapping = createIDMapping(dataModel.getNumUsers(), dataModel.getUserIDs());
+    itemIDMapping = createIDMapping(dataModel.getNumItems(), dataModel.getItemIDs());
+  }
+
+  /** Wraps the given feature matrices together with the current ID mappings. */
+  protected Factorization createFactorization(double[][] userFeatures, double[][] itemFeatures) {
+    return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
+  }
+
+  /**
+   * @return the row index for {@code userID}; if the ID is unknown, a new index is
+   *     lazily assigned and recorded.
+   *     NOTE(review): this lazy insert mutates the shared mapping without
+   *     synchronization — confirm callers do not hit it from multiple threads
+   *     with previously-unseen IDs.
+   */
+  protected Integer userIndex(long userID) {
+    Integer userIndex = userIDMapping.get(userID);
+    if (userIndex == null) {
+      userIndex = userIDMapping.size();
+      userIDMapping.put(userID, userIndex);
+    }
+    return userIndex;
+  }
+
+  /**
+   * @return the row index for {@code itemID}; if the ID is unknown, a new index is
+   *     lazily assigned and recorded (same thread-safety caveat as {@link #userIndex}).
+   */
+  protected Integer itemIndex(long itemID) {
+    Integer itemIndex = itemIDMapping.get(itemID);
+    if (itemIndex == null) {
+      itemIndex = itemIDMapping.size();
+      itemIDMapping.put(itemID, itemIndex);
+    }
+    return itemIndex;
+  }
+
+  /** Assigns consecutive indices 0..size-1 to the IDs in iteration order. */
+  private static FastByIDMap<Integer> createIDMapping(int size, LongPrimitiveIterator idIterator) {
+    FastByIDMap<Integer> mapping = new FastByIDMap<>(size);
+    int index = 0;
+    while (idIterator.hasNext()) {
+      mapping.put(idIterator.nextLong(), index++);
+    }
+    return mapping;
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java
new file mode 100644
index 0000000..f169a60
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.Arrays;
+import java.util.Map;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+
+/**
+ * a factorization of the rating matrix
+ */
+public class Factorization {
+
+  /** used to find the rows in the user features matrix by userID */
+  private final FastByIDMap<Integer> userIDMapping;
+  /** used to find the rows in the item features matrix by itemID */
+  private final FastByIDMap<Integer> itemIDMapping;
+
+  /** user features matrix */
+  private final double[][] userFeatures;
+  /** item features matrix */
+  private final double[][] itemFeatures;
+
+  /**
+   * @param userIDMapping user ID to row index in {@code userFeatures}; must not be null
+   * @param itemIDMapping item ID to row index in {@code itemFeatures}; must not be null
+   * @param userFeatures user-features matrix, stored by reference (not copied)
+   * @param itemFeatures item-features matrix, stored by reference (not copied)
+   */
+  public Factorization(FastByIDMap<Integer> userIDMapping, FastByIDMap<Integer> itemIDMapping, double[][] userFeatures,
+      double[][] itemFeatures) {
+    this.userIDMapping = Preconditions.checkNotNull(userIDMapping);
+    this.itemIDMapping = Preconditions.checkNotNull(itemIDMapping);
+    this.userFeatures = userFeatures;
+    this.itemFeatures = itemFeatures;
+  }
+
+  /** @return the internal user-features matrix (no defensive copy) */
+  public double[][] allUserFeatures() {
+    return userFeatures;
+  }
+
+  /**
+   * @return the feature row for {@code userID} (no defensive copy)
+   * @throws NoSuchUserException if the ID is not in the mapping
+   */
+  public double[] getUserFeatures(long userID) throws NoSuchUserException {
+    Integer index = userIDMapping.get(userID);
+    if (index == null) {
+      throw new NoSuchUserException(userID);
+    }
+    return userFeatures[index];
+  }
+
+  /** @return the internal item-features matrix (no defensive copy) */
+  public double[][] allItemFeatures() {
+    return itemFeatures;
+  }
+
+  /**
+   * @return the feature row for {@code itemID} (no defensive copy)
+   * @throws NoSuchItemException if the ID is not in the mapping
+   */
+  public double[] getItemFeatures(long itemID) throws NoSuchItemException {
+    Integer index = itemIDMapping.get(itemID);
+    if (index == null) {
+      throw new NoSuchItemException(itemID);
+    }
+    return itemFeatures[index];
+  }
+
+  /**
+   * @return the matrix row index for {@code userID}
+   * @throws NoSuchUserException if the ID is not in the mapping
+   */
+  public int userIndex(long userID) throws NoSuchUserException {
+    Integer index = userIDMapping.get(userID);
+    if (index == null) {
+      throw new NoSuchUserException(userID);
+    }
+    return index;
+  }
+
+  /** @return all (userID, row index) pairs */
+  public Iterable<Map.Entry<Long,Integer>> getUserIDMappings() {
+    return userIDMapping.entrySet();
+  }
+  
+  public LongPrimitiveIterator getUserIDMappingKeys() {
+    return userIDMapping.keySetIterator();
+  }
+
+  /**
+   * @return the matrix row index for {@code itemID}
+   * @throws NoSuchItemException if the ID is not in the mapping
+   */
+  public int itemIndex(long itemID) throws NoSuchItemException {
+    Integer index = itemIDMapping.get(itemID);
+    if (index == null) {
+      throw new NoSuchItemException(itemID);
+    }
+    return index;
+  }
+
+  /** @return all (itemID, row index) pairs */
+  public Iterable<Map.Entry<Long,Integer>> getItemIDMappings() {
+    return itemIDMapping.entrySet();
+  }
+  
+  public LongPrimitiveIterator getItemIDMappingKeys() {
+    return itemIDMapping.keySetIterator();
+  }
+
+  /** @return the feature dimensionality, derived from the first user row; 0 if there are no users */
+  public int numFeatures() {
+    return userFeatures.length > 0 ? userFeatures[0].length : 0;
+  }
+
+  public int numUsers() {
+    return userIDMapping.size();
+  }
+
+  public int numItems() {
+    return itemIDMapping.size();
+  }
+
+  // uses instanceof, so subclasses compare equal to a Factorization with identical
+  // contents; deepEquals compares the matrices element-wise
+  @Override
+  public boolean equals(Object o) {
+    if (o instanceof Factorization) {
+      Factorization other = (Factorization) o;
+      return userIDMapping.equals(other.userIDMapping) && itemIDMapping.equals(other.itemIDMapping)
+          && Arrays.deepEquals(userFeatures, other.userFeatures) && Arrays.deepEquals(itemFeatures, other.itemFeatures);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    int hashCode = 31 * userIDMapping.hashCode() + itemIDMapping.hashCode();
+    hashCode = 31 * hashCode + Arrays.deepHashCode(userFeatures);
+    hashCode = 31 * hashCode + Arrays.deepHashCode(itemFeatures);
+    return hashCode;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java
new file mode 100644
index 0000000..2cabe73
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * Implementation must be able to create a factorization of a rating matrix
+ */
+public interface Factorizer extends Refreshable {
+
+  /**
+   * Computes a factorization of the rating matrix.
+   *
+   * @return the resulting user/item factor matrices with their ID mappings
+   * @throws TasteException if the underlying data access fails
+   */
+  Factorization factorize() throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java
new file mode 100644
index 0000000..08c038a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Map;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Provides a file-based persistent store. */
+public class FilePersistenceStrategy implements PersistenceStrategy {
+
+  private final File file;
+
+  private static final Logger log = LoggerFactory.getLogger(FilePersistenceStrategy.class);
+
+  /**
+   * @param file the file to use for storage. If the file does not exist it will be created when required.
+   */
+  public FilePersistenceStrategy(File file) {
+    this.file = Preconditions.checkNotNull(file);
+  }
+
+  /**
+   * Loads a previously persisted factorization from the file.
+   *
+   * @return the stored factorization, or {@code null} if the file does not exist yet
+   * @throws IOException if reading fails
+   */
+  @Override
+  public Factorization load() throws IOException {
+    if (!file.exists()) {
+      log.info("{} does not yet exist, no factorization found", file.getAbsolutePath());
+      return null;
+    }
+    try (DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(file)))){
+      log.info("Reading factorization from {}...", file.getAbsolutePath());
+      return readBinary(in);
+    }
+  }
+
+  /**
+   * Persists the factorization to the file, overwriting any previous contents.
+   * Despite the interface name, this implementation always persists.
+   */
+  @Override
+  public void maybePersist(Factorization factorization) throws IOException {
+    try (DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)))){
+      log.info("Writing factorization to {}...", file.getAbsolutePath());
+      writeBinary(factorization, out);
+    }
+  }
+
+  /**
+   * Binary layout: numFeatures, numUsers, numItems (ints), then per user
+   * [index:int, id:long, numFeatures doubles], then the same per item.
+   * {@link #readBinary(DataInput)} reads this exact layout back.
+   */
+  protected static void writeBinary(Factorization factorization, DataOutput out) throws IOException {
+    out.writeInt(factorization.numFeatures());
+    out.writeInt(factorization.numUsers());
+    out.writeInt(factorization.numItems());
+
+    for (Map.Entry<Long,Integer> mappingEntry : factorization.getUserIDMappings()) {
+      long userID = mappingEntry.getKey();
+      out.writeInt(mappingEntry.getValue());
+      out.writeLong(userID);
+      try {
+        double[] userFeatures = factorization.getUserFeatures(userID);
+        for (int feature = 0; feature < factorization.numFeatures(); feature++) {
+          out.writeDouble(userFeatures[feature]);
+        }
+      } catch (NoSuchUserException e) {
+        // cannot happen for IDs taken from the factorization's own mapping,
+        // but surfaced as an IOException to keep the contract simple
+        throw new IOException("Unable to persist factorization", e);
+      }
+    }
+
+    for (Map.Entry<Long,Integer> entry : factorization.getItemIDMappings()) {
+      long itemID = entry.getKey();
+      out.writeInt(entry.getValue());
+      out.writeLong(itemID);
+      try {
+        double[] itemFeatures = factorization.getItemFeatures(itemID);
+        for (int feature = 0; feature < factorization.numFeatures(); feature++) {
+          out.writeDouble(itemFeatures[feature]);
+        }
+      } catch (NoSuchItemException e) {
+        throw new IOException("Unable to persist factorization", e);
+      }
+    }
+  }
+
+  /**
+   * Reads a factorization written by {@link #writeBinary}.
+   * NOTE(review): indices read from the stream are used directly as array
+   * subscripts — a corrupted or untrusted file can trigger
+   * ArrayIndexOutOfBoundsException; only load trusted files.
+   */
+  public static Factorization readBinary(DataInput in) throws IOException {
+    int numFeatures = in.readInt();
+    int numUsers = in.readInt();
+    int numItems = in.readInt();
+
+    FastByIDMap<Integer> userIDMapping = new FastByIDMap<>(numUsers);
+    double[][] userFeatures = new double[numUsers][numFeatures];
+
+    for (int n = 0; n < numUsers; n++) {
+      int userIndex = in.readInt();
+      long userID = in.readLong();
+      userIDMapping.put(userID, userIndex);
+      for (int feature = 0; feature < numFeatures; feature++) {
+        userFeatures[userIndex][feature] = in.readDouble();
+      }
+    }
+
+    FastByIDMap<Integer> itemIDMapping = new FastByIDMap<>(numItems);
+    double[][] itemFeatures = new double[numItems][numFeatures];
+
+    for (int n = 0; n < numItems; n++) {
+      int itemIndex = in.readInt();
+      long itemID = in.readLong();
+      itemIDMapping.put(itemID, itemIndex);
+      for (int feature = 0; feature < numFeatures; feature++) {
+        itemFeatures[itemIndex][feature] = in.readDouble();
+      }
+    }
+
+    return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java
new file mode 100644
index 0000000..0d1aab0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.IOException;
+
+/**
+ * A {@link PersistenceStrategy} which does nothing.
+ */
+public class NoPersistenceStrategy implements PersistenceStrategy {
+
+  /** @return always {@code null}: nothing is ever stored, so nothing can be loaded */
+  @Override
+  public Factorization load() throws IOException {
+    return null;
+  }
+
+  /** Intentionally discards the factorization; recomputation is expected on restart. */
+  @Override
+  public void maybePersist(Factorization factorization) throws IOException {
+    // do nothing.
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java
new file mode 100644
index 0000000..8a6a702
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java
@@ -0,0 +1,340 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Minimalistic implementation of Parallel SGD factorizer based on
+ * <a href="http://www.sze.hu/~gtakacs/download/jmlr_2009.pdf">
+ * "Scalable Collaborative Filtering Approaches for Large Recommender Systems"</a>
+ * and
+ * <a href="hwww.cs.wisc.edu/~brecht/papers/hogwildTR.pdf">
+ * "Hogwild!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent"</a> */
+public class ParallelSGDFactorizer extends AbstractFactorizer {
+
+  private final DataModel dataModel;
+  /** Parameter used to prevent overfitting. */
+  private final double lambda;
+  /** Number of features used to compute this factorization */
+  private final int rank;
+  /** Number of iterations */
+  private final int numEpochs;
+
+  private int numThreads;
+
+  // these next two control decayFactor^steps exponential type of annealing learning rate and decay factor
+  private double mu0 = 0.01;
+  private double decayFactor = 1;
+  // these next two control 1/steps^forget type annealing
+  private int stepOffset = 0;
+  // -1 equals even weighting of all examples, 0 means only use exponential annealing
+  private double forgettingExponent = 0;
+
+  // The following two should be inversely proportional :)
+  private double biasMuRatio = 0.5;
+  private double biasLambdaRatio = 0.1;
+
+  /** TODO: this is not safe as += is not atomic on many processors, can be replaced with AtomicDoubleArray
+   * but it works just fine right now  */
+  /** user features */
+  protected volatile double[][] userVectors;
+  /** item features */
+  protected volatile double[][] itemVectors;
+
+  private final PreferenceShuffler shuffler;
+
+  private int epoch = 1;
+  /** place in user vector where the bias is stored */
+  private static final int USER_BIAS_INDEX = 1;
+  /** place in item vector where the bias is stored */
+  private static final int ITEM_BIAS_INDEX = 2;
+  private static final int FEATURE_OFFSET = 3;
+  /** Standard deviation for random initialization of features */
+  private static final double NOISE = 0.02;
+
+  private static final Logger logger = LoggerFactory.getLogger(ParallelSGDFactorizer.class);
+
+  protected static class PreferenceShuffler {
+
+    private Preference[] preferences;
+    private Preference[] unstagedPreferences;
+
+    protected final RandomWrapper random = RandomUtils.getRandom();
+
+    public PreferenceShuffler(DataModel dataModel) throws TasteException {
+      cachePreferences(dataModel);
+      shuffle();
+      stage();
+    }
+
+    private int countPreferences(DataModel dataModel) throws TasteException {
+      int numPreferences = 0;
+      LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+      while (userIDs.hasNext()) {
+        PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userIDs.nextLong());
+        numPreferences += preferencesFromUser.length();
+      }
+      return numPreferences;
+    }
+
+    private void cachePreferences(DataModel dataModel) throws TasteException {
+      int numPreferences = countPreferences(dataModel);
+      preferences = new Preference[numPreferences];
+
+      LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+      int index = 0;
+      while (userIDs.hasNext()) {
+        long userID = userIDs.nextLong();
+        PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userID);
+        for (Preference preference : preferencesFromUser) {
+          preferences[index++] = preference;
+        }
+      }
+    }
+
+    public final void shuffle() {
+      unstagedPreferences = preferences.clone();
+      /* Durstenfeld shuffle */
+      for (int i = unstagedPreferences.length - 1; i > 0; i--) {
+        int rand = random.nextInt(i + 1);
+        swapCachedPreferences(i, rand);
+      }
+    }
+
+    //merge this part into shuffle() will make compiler-optimizer do some real absurd stuff, test on OpenJDK7
+    private void swapCachedPreferences(int x, int y) {
+      Preference p = unstagedPreferences[x];
+
+      unstagedPreferences[x] = unstagedPreferences[y];
+      unstagedPreferences[y] = p;
+    }
+
+    public final void stage() {
+      preferences = unstagedPreferences;
+    }
+
+    public Preference get(int i) {
+      return preferences[i];
+    }
+
+    public int size() {
+      return preferences.length;
+    }
+
+  }
+
+  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numEpochs)
+    throws TasteException {
+    super(dataModel);
+    this.dataModel = dataModel;
+    this.rank = numFeatures + FEATURE_OFFSET;
+    this.lambda = lambda;
+    this.numEpochs = numEpochs;
+
+    shuffler = new PreferenceShuffler(dataModel);
+
+    //max thread num set to n^0.25 as suggested by hogwild! paper
+    numThreads = Math.min(Runtime.getRuntime().availableProcessors(), (int) Math.pow((double) shuffler.size(), 0.25));
+  }
+
+  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+      double mu0, double decayFactor, int stepOffset, double forgettingExponent) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations);
+
+    this.mu0 = mu0;
+    this.decayFactor = decayFactor;
+    this.stepOffset = stepOffset;
+    this.forgettingExponent = forgettingExponent;
+  }
+
+  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+      double mu0, double decayFactor, int stepOffset, double forgettingExponent, int numThreads) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent);
+
+    this.numThreads = numThreads;
+  }
+
+  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+      double mu0, double decayFactor, int stepOffset, double forgettingExponent,
+      double biasMuRatio, double biasLambdaRatio) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent);
+
+    this.biasMuRatio = biasMuRatio;
+    this.biasLambdaRatio = biasLambdaRatio;
+  }
+
+  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+      double mu0, double decayFactor, int stepOffset, double forgettingExponent,
+      double biasMuRatio, double biasLambdaRatio, int numThreads) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent, biasMuRatio,
+         biasLambdaRatio);
+
+    this.numThreads = numThreads;
+  }
+
+  protected void initialize() throws TasteException {
+    RandomWrapper random = RandomUtils.getRandom();
+    userVectors = new double[dataModel.getNumUsers()][rank];
+    itemVectors = new double[dataModel.getNumItems()][rank];
+
+    double globalAverage = getAveragePreference();
+    for (int userIndex = 0; userIndex < userVectors.length; userIndex++) {
+      userVectors[userIndex][0] = globalAverage;
+      userVectors[userIndex][USER_BIAS_INDEX] = 0; // will store user bias
+      userVectors[userIndex][ITEM_BIAS_INDEX] = 1; // corresponding item feature contains item bias
+      for (int feature = FEATURE_OFFSET; feature < rank; feature++) {
+        userVectors[userIndex][feature] = random.nextGaussian() * NOISE;
+      }
+    }
+    for (int itemIndex = 0; itemIndex < itemVectors.length; itemIndex++) {
+      itemVectors[itemIndex][0] = 1; // corresponding user feature contains global average
+      itemVectors[itemIndex][USER_BIAS_INDEX] = 1; // corresponding user feature contains user bias
+      itemVectors[itemIndex][ITEM_BIAS_INDEX] = 0; // will store item bias
+      for (int feature = FEATURE_OFFSET; feature < rank; feature++) {
+        itemVectors[itemIndex][feature] = random.nextGaussian() * NOISE;
+      }
+    }
+  }
+
+  //TODO: needs optimization
+  private double getMu(int i) {
+    return mu0 * Math.pow(decayFactor, i - 1) * Math.pow(i + stepOffset, forgettingExponent);
+  }
+
+  @Override
+  public Factorization factorize() throws TasteException {
+    initialize();
+
+    if (logger.isInfoEnabled()) {
+      logger.info("starting to compute the factorization...");
+    }
+
+    for (epoch = 1; epoch <= numEpochs; epoch++) {
+      shuffler.stage();
+
+      final double mu = getMu(epoch);
+      int subSize = shuffler.size() / numThreads + 1;
+
+      ExecutorService executor=Executors.newFixedThreadPool(numThreads);
+
+      try {
+        for (int t = 0; t < numThreads; t++) {
+          final int iStart = t * subSize;
+          final int iEnd = Math.min((t + 1) * subSize, shuffler.size());
+
+          executor.execute(new Runnable() {
+            @Override
+            public void run() {
+              for (int i = iStart; i < iEnd; i++) {
+                update(shuffler.get(i), mu);
+              }
+            }
+          });
+        }
+      } finally {
+        executor.shutdown();
+        shuffler.shuffle();
+
+        try {
+          boolean terminated = executor.awaitTermination(numEpochs * shuffler.size(), TimeUnit.MICROSECONDS);
+          if (!terminated) {
+            logger.error("subtasks takes forever, return anyway");
+          }
+        } catch (InterruptedException e) {
+          throw new TasteException("waiting fof termination interrupted", e);
+        }
+      }
+
+    }
+
+    return createFactorization(userVectors, itemVectors);
+  }
+
+  double getAveragePreference() throws TasteException {
+    RunningAverage average = new FullRunningAverage();
+    LongPrimitiveIterator it = dataModel.getUserIDs();
+    while (it.hasNext()) {
+      for (Preference pref : dataModel.getPreferencesFromUser(it.nextLong())) {
+        average.addDatum(pref.getValue());
+      }
+    }
+    return average.getAverage();
+  }
+
+  /** TODO: this is the vanilla sgd by Tacaks 2009, I speculate that using scaling technique proposed in:
+   * Towards Optimal One Pass Large Scale Learning with Averaged Stochastic Gradient Descent section 5, page 6
+   * can be beneficial in term s of both speed and accuracy.
+   *
+   * Tacaks' method doesn't calculate gradient of regularization correctly, which has non-zero elements everywhere of
+   * the matrix. While Tacaks' method can only updates a single row/column, if one user has a lot of recommendation,
+   * her vector will be more affected by regularization using an isolated scaling factor for both user vectors and
+   * item vectors can remove this issue without inducing more update cost it even reduces it a bit by only performing
+   * one addition and one multiplication.
+   *
+   * BAD SIDE1: the scaling factor decreases fast, it has to be scaled up from time to time before dropped to zero or
+   *            caused roundoff error
+   * BAD SIDE2: no body experiment on it before, and people generally use very small lambda
+   *            so it's impact on accuracy may still be unknown.
+   * BAD SIDE3: don't know how to make it work for L1-regularization or
+   *            "pseudorank?" (sum of singular values)-regularization */
+  protected void update(Preference preference, double mu) {
+    int userIndex = userIndex(preference.getUserID());
+    int itemIndex = itemIndex(preference.getItemID());
+
+    double[] userVector = userVectors[userIndex];
+    double[] itemVector = itemVectors[itemIndex];
+
+    double prediction = dot(userVector, itemVector);
+    double err = preference.getValue() - prediction;
+
+    // adjust features
+    for (int k = FEATURE_OFFSET; k < rank; k++) {
+      double userFeature = userVector[k];
+      double itemFeature = itemVector[k];
+
+      userVector[k] += mu * (err * itemFeature - lambda * userFeature);
+      itemVector[k] += mu * (err * userFeature - lambda * itemFeature);
+    }
+
+    // adjust user and item bias
+    userVector[USER_BIAS_INDEX] += biasMuRatio * mu * (err - biasLambdaRatio * lambda * userVector[USER_BIAS_INDEX]);
+    itemVector[ITEM_BIAS_INDEX] += biasMuRatio * mu * (err - biasLambdaRatio * lambda * itemVector[ITEM_BIAS_INDEX]);
+  }
+
+  private double dot(double[] userVector, double[] itemVector) {
+    double sum = 0;
+    for (int k = 0; k < rank; k++) {
+      sum += userVector[k] * itemVector[k];
+    }
+    return sum;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java
new file mode 100644
index 0000000..abf3eca
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.IOException;
+
+/**
+ * Provides storage for {@link Factorization}s
+ */
+public interface PersistenceStrategy {
+
+  /**
+   * Load a factorization from a persistent store.
+   *
+   * @return a Factorization or null if the persistent store is empty.
+   *
+   * @throws IOException
+   */
+  Factorization load() throws IOException;
+
+  /**
+   * Write a factorization to a persistent store unless it already
+   * contains an identical factorization.
+   *
+   * @param factorization
+   *
+   * @throws IOException
+   */
+  void maybePersist(Factorization factorization) throws IOException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java
new file mode 100644
index 0000000..2c9f0ae
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java
@@ -0,0 +1,221 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
+/** Matrix factorization with user and item biases for rating prediction, trained with plain vanilla SGD  */
+public class RatingSGDFactorizer extends AbstractFactorizer {
+
+  protected static final int FEATURE_OFFSET = 3;
+  
+  /** Multiplicative decay factor for learning_rate */
+  protected final double learningRateDecay;
+  /** Learning rate (step size) */
+  protected final double learningRate;
+  /** Parameter used to prevent overfitting. */
+  protected final double preventOverfitting;
+  /** Number of features used to compute this factorization */
+  protected final int numFeatures;
+  /** Number of iterations */
+  private final int numIterations;
+  /** Standard deviation for random initialization of features */
+  protected final double randomNoise;
+  /** User features */
+  protected double[][] userVectors;
+  /** Item features */
+  protected double[][] itemVectors;
+  protected final DataModel dataModel;
+  private long[] cachedUserIDs;
+  private long[] cachedItemIDs;
+
+  protected double biasLearningRate = 0.5;
+  protected double biasReg = 0.1;
+
+  /** place in user vector where the bias is stored */
+  protected static final int USER_BIAS_INDEX = 1;
+  /** place in item vector where the bias is stored */
+  protected static final int ITEM_BIAS_INDEX = 2;
+
+  public RatingSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations) throws TasteException {
+    this(dataModel, numFeatures, 0.01, 0.1, 0.01, numIterations, 1.0);
+  }
+
+  public RatingSGDFactorizer(DataModel dataModel, int numFeatures, double learningRate, double preventOverfitting,
+      double randomNoise, int numIterations, double learningRateDecay) throws TasteException {
+    super(dataModel);
+    this.dataModel = dataModel;
+    this.numFeatures = numFeatures + FEATURE_OFFSET;
+    this.numIterations = numIterations;
+
+    this.learningRate = learningRate;
+    this.learningRateDecay = learningRateDecay;
+    this.preventOverfitting = preventOverfitting;
+    this.randomNoise = randomNoise;
+  }
+
+  protected void prepareTraining() throws TasteException {
+    RandomWrapper random = RandomUtils.getRandom();
+    userVectors = new double[dataModel.getNumUsers()][numFeatures];
+    itemVectors = new double[dataModel.getNumItems()][numFeatures];
+
+    double globalAverage = getAveragePreference();
+    for (int userIndex = 0; userIndex < userVectors.length; userIndex++) {
+      userVectors[userIndex][0] = globalAverage;
+      userVectors[userIndex][USER_BIAS_INDEX] = 0; // will store user bias
+      userVectors[userIndex][ITEM_BIAS_INDEX] = 1; // corresponding item feature contains item bias
+      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+        userVectors[userIndex][feature] = random.nextGaussian() * randomNoise;
+      }
+    }
+    for (int itemIndex = 0; itemIndex < itemVectors.length; itemIndex++) {
+      itemVectors[itemIndex][0] = 1; // corresponding user feature contains global average
+      itemVectors[itemIndex][USER_BIAS_INDEX] = 1; // corresponding user feature contains user bias
+      itemVectors[itemIndex][ITEM_BIAS_INDEX] = 0; // will store item bias
+      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+        itemVectors[itemIndex][feature] = random.nextGaussian() * randomNoise;
+      }
+    }
+
+    cachePreferences();
+    shufflePreferences();
+  }
+
+  private int countPreferences() throws TasteException {
+    int numPreferences = 0;
+    LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+    while (userIDs.hasNext()) {
+      PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userIDs.nextLong());
+      numPreferences += preferencesFromUser.length();
+    }
+    return numPreferences;
+  }
+
+  private void cachePreferences() throws TasteException {
+    int numPreferences = countPreferences();
+    cachedUserIDs = new long[numPreferences];
+    cachedItemIDs = new long[numPreferences];
+
+    LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+    int index = 0;
+    while (userIDs.hasNext()) {
+      long userID = userIDs.nextLong();
+      PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userID);
+      for (Preference preference : preferencesFromUser) {
+        cachedUserIDs[index] = userID;
+        cachedItemIDs[index] = preference.getItemID();
+        index++;
+      }
+    }
+  }
+
+  protected void shufflePreferences() {
+    RandomWrapper random = RandomUtils.getRandom();
+    /* Durstenfeld shuffle */
+    for (int currentPos = cachedUserIDs.length - 1; currentPos > 0; currentPos--) {
+      int swapPos = random.nextInt(currentPos + 1);
+      swapCachedPreferences(currentPos, swapPos);
+    }
+  }
+
+  private void swapCachedPreferences(int posA, int posB) {
+    long tmpUserIndex = cachedUserIDs[posA];
+    long tmpItemIndex = cachedItemIDs[posA];
+
+    cachedUserIDs[posA] = cachedUserIDs[posB];
+    cachedItemIDs[posA] = cachedItemIDs[posB];
+
+    cachedUserIDs[posB] = tmpUserIndex;
+    cachedItemIDs[posB] = tmpItemIndex;
+  }
+
+  @Override
+  public Factorization factorize() throws TasteException {
+    prepareTraining();
+    double currentLearningRate = learningRate;
+
+
+    for (int it = 0; it < numIterations; it++) {
+      for (int index = 0; index < cachedUserIDs.length; index++) {
+        long userId = cachedUserIDs[index];
+        long itemId = cachedItemIDs[index];
+        float rating = dataModel.getPreferenceValue(userId, itemId);
+        updateParameters(userId, itemId, rating, currentLearningRate);
+      }
+      currentLearningRate *= learningRateDecay;
+    }
+    return createFactorization(userVectors, itemVectors);
+  }
+
+  double getAveragePreference() throws TasteException {
+    RunningAverage average = new FullRunningAverage();
+    LongPrimitiveIterator it = dataModel.getUserIDs();
+    while (it.hasNext()) {
+      for (Preference pref : dataModel.getPreferencesFromUser(it.nextLong())) {
+        average.addDatum(pref.getValue());
+      }
+    }
+    return average.getAverage();
+  }
+
+  protected void updateParameters(long userID, long itemID, float rating, double currentLearningRate) {
+    int userIndex = userIndex(userID);
+    int itemIndex = itemIndex(itemID);
+
+    double[] userVector = userVectors[userIndex];
+    double[] itemVector = itemVectors[itemIndex];
+    double prediction = predictRating(userIndex, itemIndex);
+    double err = rating - prediction;
+
+    // adjust user bias
+    userVector[USER_BIAS_INDEX] +=
+        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * userVector[USER_BIAS_INDEX]);
+
+    // adjust item bias
+    itemVector[ITEM_BIAS_INDEX] +=
+        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * itemVector[ITEM_BIAS_INDEX]);
+
+    // adjust features
+    for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+      double userFeature = userVector[feature];
+      double itemFeature = itemVector[feature];
+
+      double deltaUserFeature = err * itemFeature - preventOverfitting * userFeature;
+      userVector[feature] += currentLearningRate * deltaUserFeature;
+
+      double deltaItemFeature = err * userFeature - preventOverfitting * itemFeature;
+      itemVector[feature] += currentLearningRate * deltaItemFeature;
+    }
+  }
+
+  private double predictRating(int userID, int itemID) {
+    double sum = 0;
+    for (int feature = 0; feature < numFeatures; feature++) {
+      sum += userVectors[userID][feature] * itemVectors[itemID][feature];
+    }
+    return sum;
+  }
+}


[24/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java
new file mode 100644
index 0000000..f74511b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.preparation;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+public class ToItemVectorsReducer extends Reducer<IntWritable,VectorWritable,IntWritable,VectorWritable> {
+
+  private final VectorWritable merged = new VectorWritable();
+
+  @Override
+  protected void reduce(IntWritable row, Iterable<VectorWritable> vectors, Context ctx)
+    throws IOException, InterruptedException {
+
+    merged.setWritesLaxPrecision(true);
+    merged.set(VectorWritable.mergeToVector(vectors.iterator()));
+    ctx.write(row, merged);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
new file mode 100644
index 0000000..c50fa20
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
@@ -0,0 +1,233 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+
+/**
+ * <p>Distributed precomputation of the item-item-similarities for Itembased Collaborative Filtering</p>
+ *
+ * <p>Preferences in the input file should look like {@code userID,itemID[,preferencevalue]}</p>
+ *
+ * <p>
+ * Preference value is optional to accommodate applications that have no notion of a preference value (that is, the user
+ * simply expresses a preference for an item, but no degree of preference).
+ * </p>
+ *
+ * <p>
+ * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
+ * parsed as {@code long}s.
+ * </p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing one or more text files with the preference data</li>
+ * <li>--output (path): output path where similarity data should be written</li>
+ * <li>--similarityClassname (classname): Name of distributed similarity measure class to instantiate or a predefined
+ *  similarity from {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}</li>
+ * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)</li>
+ * <li>--maxPrefs (integer): max number of preferences to consider per user or item; users or items with more
+ *  preferences will be sampled down (500)</li>
+ * <li>--minPrefsPerUser (integer): ignore users with fewer preferences than this (1)</li>
+ * <li>--booleanData (boolean): Treat input data as having no pref values (false)</li>
+ * <li>--threshold (double): discard item pairs with a similarity value below this</li>
+ * </ol>
+ *
+ * <p>General command line options are documented in {@link AbstractJob}.</p>
+ *
+ * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.</p>
+ */
+public final class ItemSimilarityJob extends AbstractJob {
+
+  // Configuration keys used to hand the item-ID index path and the top-K cap to the mapper.
+  public static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr";
+  public static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem";
+
+  private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
+  private static final int DEFAULT_MAX_PREFS = 500;
+  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new ItemSimilarityJob(), args);
+  }
+  
+  /**
+   * Runs the three-phase pipeline:
+   * (1) {@link PreparePreferenceMatrixJob} turns the preference text input into a rating matrix,
+   * (2) {@link RowSimilarityJob} computes pairwise similarities of that matrix's rows, and
+   * (3) a final MapReduce job maps matrix indices back to item IDs and keeps the top-K
+   *     most similar items per item.
+   * Each phase is gated by {@code shouldRunNextPhase}, so an interrupted run can be resumed.
+   *
+   * @return 0 on success, -1 on argument-parsing failure or job failure
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOutputOption();
+    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " 
+        + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
+    addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number "
+        + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
+        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
+    addOption("maxPrefs", "mppu", "max number of preferences to consider per user or item, " 
+        + "users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')',
+        String.valueOf(DEFAULT_MAX_PREFS));
+    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
+        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
+    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
+    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
+    addOption("randomSeed", null, "use this seed for sampling", false);
+
+    Map<String,List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+
+    String similarityClassName = getOption("similarityClassname");
+    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
+    int maxPrefs = Integer.parseInt(getOption("maxPrefs"));
+    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
+    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
+
+    // Optional knobs fall back to RowSimilarityJob's "disabled" sentinels.
+    double threshold = hasOption("threshold")
+        ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
+    long randomSeed = hasOption("randomSeed")
+        ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED;
+
+    Path similarityMatrixPath = getTempPath("similarityMatrix");
+    Path prepPath = getTempPath("prepareRatingMatrix");
+
+    AtomicInteger currentPhase = new AtomicInteger();
+
+    // Phase 1: preference text -> rating matrix (+ item-ID index, user count).
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] {
+        "--input", getInputPath().toString(),
+        "--output", prepPath.toString(),
+        "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
+        "--booleanData", String.valueOf(booleanData),
+        "--tempDir", getTempPath().toString(),
+      });
+    }
+
+    // Phase 2: pairwise row similarities of the rating matrix.
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
+
+      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
+        "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
+        "--output", similarityMatrixPath.toString(),
+        "--numberOfColumns", String.valueOf(numberOfUsers),
+        "--similarityClassname", similarityClassName,
+        "--maxObservationsPerRow", String.valueOf(maxPrefs),
+        "--maxObservationsPerColumn", String.valueOf(maxPrefs),
+        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem),
+        "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
+        "--threshold", String.valueOf(threshold),
+        "--randomSeed", String.valueOf(randomSeed),
+        "--tempDir", getTempPath().toString(),
+      });
+    }
+
+    // Phase 3: translate matrix indices back to item IDs and emit the top-K pairs as text.
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
+          MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
+          MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);
+      Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
+      mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
+          new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
+      mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
+      boolean succeeded = mostSimilarItems.waitForCompletion(true);
+      if (!succeeded) {
+        return -1;
+      }
+    }
+
+    return 0;
+  }
+
+  /**
+   * Maps one row of the similarity matrix (one item) to its top-K most similar items,
+   * emitting each pair keyed with the smaller item ID first.
+   */
+  public static class MostSimilarItemPairsMapper
+      extends Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable> {
+
+    private OpenIntLongHashMap indexItemIDMap;
+    private int maxSimilarItemsPerItem;
+
+    @Override
+    protected void setup(Context ctx) {
+      Configuration conf = ctx.getConfiguration();
+      maxSimilarItemsPerItem = conf.getInt(MAX_SIMILARITIES_PER_ITEM, -1);
+      indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEM_ID_INDEX_PATH_STR), conf);
+
+      // NOTE(review): message typo, "then" should read "than".
+      Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem must be greater then 0!");
+    }
+
+    @Override
+    protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
+      throws IOException, InterruptedException {
+
+      int itemIDIndex = itemIDIndexWritable.get();
+
+      TopSimilarItemsQueue topKMostSimilarItems = new TopSimilarItemsQueue(maxSimilarItemsPerItem);
+
+      // Classic bounded top-K: a candidate only enters the queue if it beats the current minimum.
+      for (Vector.Element element : similarityVector.get().nonZeroes()) {
+        SimilarItem top = topKMostSimilarItems.top();
+        double candidateSimilarity = element.get();
+        if (candidateSimilarity > top.getSimilarity()) {
+          top.set(indexItemIDMap.get(element.index()), candidateSimilarity);
+          topKMostSimilarItems.updateTop();
+        }
+      }
+
+      // Order each pair (smaller ID, larger ID) so that the same pair emitted from both
+      // items' rows collapses onto a single reduce key.
+      long itemID = indexItemIDMap.get(itemIDIndex);
+      for (SimilarItem similarItem : topKMostSimilarItems.getTopItems()) {
+        long otherItemID = similarItem.getItemID();
+        if (itemID < otherItemID) {
+          ctx.write(new EntityEntityWritable(itemID, otherItemID), new DoubleWritable(similarItem.getSimilarity()));
+        } else {
+          ctx.write(new EntityEntityWritable(otherItemID, itemID), new DoubleWritable(similarItem.getSimilarity()));
+        }
+      }
+    }
+  }
+
+  /**
+   * Deduplicates pairs: the same (itemA, itemB) key may arrive from both items' rows with
+   * — presumably — identical similarity values, so only the first value is written.
+   */
+  public static class MostSimilarItemPairsReducer
+      extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {
+    @Override
+    protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)
+      throws IOException, InterruptedException {
+      ctx.write(pair, values.iterator().next());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java
new file mode 100644
index 0000000..acb6392
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
+
+public class TopSimilarItemsQueue extends PriorityQueue<SimilarItem> {
+
+  private static final long SENTINEL_ID = Long.MIN_VALUE;
+
+  private final int maxSize;
+
+  public TopSimilarItemsQueue(int maxSize) {
+    super(maxSize);
+    this.maxSize = maxSize;
+  }
+
+  public List<SimilarItem> getTopItems() {
+    List<SimilarItem> items = new ArrayList<>(maxSize);
+    while (size() > 0) {
+      SimilarItem topItem = pop();
+      // filter out "sentinel" objects necessary for maintaining an efficient priority queue
+      if (topItem.getItemID() != SENTINEL_ID) {
+        items.add(topItem);
+      }
+    }
+    Collections.reverse(items);
+    return items;
+  }
+
+  @Override
+  protected boolean lessThan(SimilarItem one, SimilarItem two) {
+    return one.getSimilarity() < two.getSimilarity();
+  }
+
+  @Override
+  protected SimilarItem getSentinelObject() {
+    return new SimilarItem(SENTINEL_ID, Double.MIN_VALUE);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java
new file mode 100644
index 0000000..f46785c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * Skeleton implementation of {@link LongPrimitiveIterator} that adapts the primitive
+ * {@code nextLong()} to the boxed {@code Iterator<Long>.next()}.
+ */
+public abstract class AbstractLongPrimitiveIterator implements LongPrimitiveIterator {
+  
+  @Override
+  public Long next() {
+    // Boxes the primitive value; allocation-sensitive callers should use nextLong() directly.
+    return nextLong();
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java
new file mode 100644
index 0000000..c46b4b6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
+/** A simplified and streamlined version of {@link java.util.BitSet}. */
/** A simplified and streamlined version of {@link java.util.BitSet}. */
final class BitSet implements Serializable, Cloneable {

  // One long word stores 64 bits: bit i lives in word (i >>> 6) at position (i & 0x3F).
  private final long[] bits;

  /**
   * Creates a bit set able to hold at least {@code numBits} bits, all initially clear.
   *
   * @param numBits desired capacity in bits
   */
  BitSet(int numBits) {
    int numLongs = numBits >>> 6;
    if ((numBits & 0x3F) != 0) {
      numLongs++; // round up to cover a partial final word
    }
    bits = new long[numLongs];
  }

  /** Shares (does not copy) the given word array; used by {@link #clone()}. */
  private BitSet(long[] bits) {
    this.bits = bits;
  }

  /** Returns whether bit {@code index} is set. Skips range checking for speed. */
  boolean get(int index) {
    return (bits[index >>> 6] & 1L << (index & 0x3F)) != 0L;
  }

  /** Sets bit {@code index}. Skips range checking for speed. */
  void set(int index) {
    bits[index >>> 6] |= 1L << (index & 0x3F);
  }

  /** Clears bit {@code index}. Skips range checking for speed. */
  void clear(int index) {
    bits[index >>> 6] &= ~(1L << (index & 0x3F));
  }

  /** Clears all bits. */
  void clear() {
    // Arrays.fill is clearer than (and at least as fast as) a manual loop.
    Arrays.fill(bits, 0L);
  }

  @Override
  public BitSet clone() {
    return new BitSet(bits.clone());
  }

  @Override
  public int hashCode() {
    return Arrays.hashCode(bits);
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof BitSet)) {
      return false;
    }
    BitSet other = (BitSet) o;
    return Arrays.equals(bits, other.bits);
  }

  /** Renders each word as 64 '0'/'1' characters (lowest bit first) followed by a space. */
  @Override
  public String toString() {
    // 65 chars per word: 64 bit characters plus the trailing separator space.
    StringBuilder result = new StringBuilder(65 * bits.length);
    for (long l : bits) {
      for (int j = 0; j < 64; j++) {
        result.append((l & 1L << j) == 0 ? '0' : '1');
      }
      result.append(' ');
    }
    return result.toString();
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java
new file mode 100755
index 0000000..b2d9b36
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+import java.util.Iterator;
+
+/**
+ * <p>
+ * An efficient Map-like class which caches values for keys. Values are not "put" into a {@link Cache};
+ * instead the caller supplies the instance with an implementation of {@link Retriever} which can load the
+ * value for a given key.
+ * </p>
+ *
+ * <p>
+ * The cache does not support {@code null} keys.
+ * </p>
+ *
+ * <p>
+ * Thanks to Amila Jayasooriya for helping evaluate performance of the rewrite of this class, as part of a
+ * Google Summer of Code 2007 project.
+ * </p>
+ */
+public final class Cache<K,V> implements Retriever<K,V> {
+
+  // Sentinel stored in place of a null value so that "retriever returned null" can be
+  // cached and distinguished from "not cached yet".
+  private static final Object NULL = new Object();
+  
+  // All access to the underlying map is guarded by synchronizing on the map itself.
+  private final FastMap<K,V> cache;
+  private final Retriever<? super K,? extends V> retriever;
+  
+  /**
+   * <p>
+   * Creates a new cache based on the given {@link Retriever}.
+   * </p>
+   * 
+   * @param retriever
+   *          object which can retrieve values for keys
+   */
+  public Cache(Retriever<? super K,? extends V> retriever) {
+    this(retriever, FastMap.NO_MAX_SIZE);
+  }
+  
+  /**
+   * <p>
+   * Creates a new cache based on the given {@link Retriever} and with given maximum size.
+   * </p>
+   * 
+   * @param retriever
+   *          object which can retrieve values for keys
+   * @param maxEntries
+   *          maximum number of entries the cache will store before evicting some
+   */
+  public Cache(Retriever<? super K,? extends V> retriever, int maxEntries) {
+    Preconditions.checkArgument(retriever != null, "retriever is null");
+    Preconditions.checkArgument(maxEntries >= 1, "maxEntries must be at least 1");
+    cache = new FastMap<>(11, maxEntries);
+    this.retriever = retriever;
+  }
+  
+  /**
+   * <p>
+   * Returns cached value for a key. If it does not exist, it is loaded using a {@link Retriever}.
+   * </p>
+   * 
+   * @param key
+   *          cache key
+   * @return value for that key
+   * @throws TasteException
+   *           if an exception occurs while retrieving a new cached value
+   */
+  @Override
+  public V get(K key) throws TasteException {
+    V value;
+    synchronized (cache) {
+      value = cache.get(key);
+    }
+    if (value == null) {
+      // Deliberately retrieved outside the lock so a slow retriever does not block readers;
+      // two threads may race here and both retrieve — the second result simply overwrites.
+      return getAndCacheValue(key);
+    }
+    // Unwrap the NULL sentinel back into an actual null.
+    return value == NULL ? null : value;
+  }
+  
+  /**
+   * <p>
+   * Uncaches any existing value for a given key.
+   * </p>
+   * 
+   * @param key
+   *          cache key
+   */
+  public void remove(K key) {
+    synchronized (cache) {
+      cache.remove(key);
+    }
+  }
+
+  /**
+   * Clears all cache entries whose key matches the given predicate.
+   */
+  public void removeKeysMatching(MatchPredicate<K> predicate) {
+    synchronized (cache) {
+      Iterator<K> it = cache.keySet().iterator();
+      while (it.hasNext()) {
+        K key = it.next();
+        if (predicate.matches(key)) {
+          it.remove();
+        }
+      }
+    }
+  }
+
+  /**
+   * Clears all cache entries whose value matches the given predicate.
+   */
+  public void removeValueMatching(MatchPredicate<V> predicate) {
+    synchronized (cache) {
+      Iterator<V> it = cache.values().iterator();
+      while (it.hasNext()) {
+        V value = it.next();
+        if (predicate.matches(value)) {
+          it.remove();
+        }
+      }
+    }
+  }
+  
+  /**
+   * <p>
+   * Clears the cache.
+   * </p>
+   */
+  public void clear() {
+    synchronized (cache) {
+      cache.clear();
+    }
+  }
+  
+  // Loads the value via the retriever and stores it, mapping null to the NULL sentinel.
+  private V getAndCacheValue(K key) throws TasteException {
+    V value = retriever.get(key);
+    if (value == null) {
+      // Unchecked but safe: NULL is only ever compared by identity, never used as a V.
+      value = (V) NULL;
+    }
+    synchronized (cache) {
+      cache.put(key, value);
+    }
+    return value;
+  }
+  
+  @Override
+  public String toString() {
+    return "Cache[retriever:" + retriever + ']';
+  }
+
+  /**
+   * Used by {#link #removeKeysMatching(Object)} to decide things that are matching.
+   */
+  public interface MatchPredicate<T> {
+    boolean matches(T thing);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java
new file mode 100644
index 0000000..fde8958
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java
@@ -0,0 +1,661 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.AbstractCollection;
+import java.util.AbstractSet;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * @see FastMap
+ * @see FastIDSet
+ */
+public final class FastByIDMap<V> implements Serializable, Cloneable {
+  
+  public static final int NO_MAX_SIZE = Integer.MAX_VALUE;
+  private static final float DEFAULT_LOAD_FACTOR = 1.5f;
+  
+  /** Dummy object used to represent a key that has been removed. */
+  private static final long REMOVED = Long.MAX_VALUE;
+  private static final long NULL = Long.MIN_VALUE;
+  // NOTE: Long.MIN_VALUE marks an empty slot and Long.MAX_VALUE marks a tombstone,
+  // so neither value can be stored as a key in this map.
+  
+  private long[] keys;            // open-addressed key table; NULL = empty, REMOVED = tombstone
+  private V[] values;             // parallel value table; values[i] belongs to keys[i]
+  private float loadFactor;
+  private int numEntries;         // live entries
+  private int numSlotsUsed;       // live entries + tombstones
+  private final int maxSize;
+  private BitSet recentlyAccessed;      // per-slot recency bits, only when maxSize is bounded
+  private final boolean countingAccesses;
+  
+  /** Creates a new {@link FastByIDMap} with default capacity. */
+  public FastByIDMap() {
+    this(2, NO_MAX_SIZE);
+  }
+  
+  public FastByIDMap(int size) {
+    this(size, NO_MAX_SIZE);
+  }
+
+  public FastByIDMap(int size, float loadFactor) {
+    this(size, NO_MAX_SIZE, loadFactor);
+  }
+
+  public FastByIDMap(int size, int maxSize) {
+    this(size, maxSize, DEFAULT_LOAD_FACTOR);
+  }
+
+  /**
+   * Creates a new {@link FastByIDMap} whose capacity can accommodate the given number of entries without rehash.
+   * 
+   * @param size desired capacity
+   * @param maxSize max capacity
+   * @param loadFactor ratio of internal hash table size to current size
+   * @throws IllegalArgumentException if size is less than 0, maxSize is less than 1
+   *  or at least half of {@link RandomUtils#MAX_INT_SMALLER_TWIN_PRIME}, or
+   *  loadFactor is less than 1
+   */
+  public FastByIDMap(int size, int maxSize, float loadFactor) {
+    Preconditions.checkArgument(size >= 0, "size must be at least 0");
+    Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0");
+    this.loadFactor = loadFactor;
+    int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor);
+    Preconditions.checkArgument(size < max, "size must be less than " + max);
+    Preconditions.checkArgument(maxSize >= 1, "maxSize must be at least 1");
+    // Table size is a twin prime, which the double-hash probe in find() relies on.
+    int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size));
+    keys = new long[hashSize];
+    Arrays.fill(keys, NULL);
+    values = (V[]) new Object[hashSize];
+    this.maxSize = maxSize;
+    // Only pay for recency tracking when the map is actually size-bounded.
+    this.countingAccesses = maxSize != Integer.MAX_VALUE;
+    this.recentlyAccessed = countingAccesses ? new BitSet(hashSize) : null;
+  }
+  
+  /**
+   * Locates the slot for {@code key} using double hashing: the probe starts at
+   * {@code hash % hashSize} and repeatedly steps back by {@code jump}, wrapping modulo
+   * the table size. Stops at the key itself or at the first truly-empty (NULL) slot;
+   * tombstones (REMOVED) are probed past.
+   *
+   * @see #findForAdd(long)
+   */
+  private int find(long key) {
+    int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+    long[] keys = this.keys;
+    int hashSize = keys.length;
+    int jump = 1 + theHashCode % (hashSize - 2);
+    int index = theHashCode % hashSize;
+    long currentKey = keys[index];
+    while (currentKey != NULL && key != currentKey) {
+      // Branch-free wrap: equivalent to (index - jump) mod hashSize without going negative.
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    return index;
+  }
+  
+  /**
+   * Like {@link #find(long)}, but intended for insertion: it remembers the first reusable
+   * tombstone (REMOVED) slot, then keeps scanning in case the key already exists later in
+   * the probe sequence, so an existing mapping is updated rather than duplicated.
+   *
+   * @see #find(long)
+   */
+  private int findForAdd(long key) {
+    int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+    long[] keys = this.keys;
+    int hashSize = keys.length;
+    int jump = 1 + theHashCode % (hashSize - 2);
+    int index = theHashCode % hashSize;
+    long currentKey = keys[index];
+    while (currentKey != NULL && currentKey != REMOVED && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    if (currentKey != REMOVED) {
+      return index;
+    }
+    // If we're adding, it's here, but, the key might have a value already later
+    int addIndex = index;
+    while (currentKey != NULL && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    return key == currentKey ? index : addIndex;
+  }
+  
+  /** Returns the value for {@code key}, or {@code null} if absent (or key is the reserved NULL marker). */
+  public V get(long key) {
+    if (key == NULL) {
+      return null;
+    }
+    int index = find(key);
+    if (countingAccesses) {
+      // Mark the slot as recently used so clearStaleEntry() spares it on eviction.
+      recentlyAccessed.set(index);
+    }
+    return values[index];
+  }
+  
+  /** Returns the number of live entries. */
+  public int size() {
+    return numEntries;
+  }
+  
+  public boolean isEmpty() {
+    return numEntries == 0;
+  }
+  
+  public boolean containsKey(long key) {
+    // Reserved marker values can never be keys; otherwise check the probed slot is occupied.
+    return key != NULL && key != REMOVED && keys[find(key)] != NULL;
+  }
+  
+  /** Linear scan over the value table; O(capacity). */
+  public boolean containsValue(Object value) {
+    if (value == null) {
+      return false;
+    }
+    for (V theValue : values) {
+      if (theValue != null && value.equals(theValue)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  /**
+   * Associates {@code value} with {@code key} and returns the previous value, or
+   * {@code null} if there was none. Keys may not be the reserved marker values and
+   * values may not be null. May rehash (or grow) first, and may evict a
+   * not-recently-accessed entry when the map is size-bounded and full.
+   */
+  public V put(long key, V value) {
+    Preconditions.checkArgument(key != NULL && key != REMOVED);
+    Preconditions.checkNotNull(value);
+    // If less than half the slots are open, let's clear it up
+    if (numSlotsUsed * loadFactor >= keys.length) {
+      // If over half the slots used are actual entries, let's grow
+      if (numEntries * loadFactor >= numSlotsUsed) {
+        growAndRehash();
+      } else {
+        // Otherwise just rehash to clear REMOVED entries and don't grow
+        rehash();
+      }
+    }
+    // Here we may later consider implementing Brent's variation described on page 532
+    int index = findForAdd(key);
+    long keyIndex = keys[index];   // NOTE(review): actually the key at the slot, not an index
+    if (keyIndex == key) {
+      // Key already present: replace the value in place.
+      V oldValue = values[index];
+      values[index] = value;
+      return oldValue;
+    }
+    // If size is limited,
+    if (countingAccesses && numEntries >= maxSize) {
+      // and we're too large, clear some old-ish entry
+      clearStaleEntry(index);
+    }
+    keys[index] = key;
+    values[index] = value;
+    numEntries++;
+    // Only a truly empty slot increases slot usage; reusing a tombstone does not.
+    if (keyIndex == NULL) {
+      numSlotsUsed++;
+    }
+    return null;
+  }
+  
+  /**
+   * Evicts one entry to make room, scanning backwards from {@code index} in a
+   * second-chance fashion: entries whose recently-accessed bit is set are spared
+   * (their bit is cleared); the first entry without the bit is tombstoned.
+   */
+  private void clearStaleEntry(int index) {
+    while (true) {
+      long currentKey;
+      do {
+        if (index == 0) {
+          index = keys.length - 1;
+        } else {
+          index--;
+        }
+        currentKey = keys[index];
+      } while (currentKey == NULL || currentKey == REMOVED);
+      if (recentlyAccessed.get(index)) {
+        recentlyAccessed.clear(index);
+      } else {
+        break;
+      }
+    }
+    // Delete the entry
+    keys[index] = REMOVED;
+    numEntries--;
+    values[index] = null;
+  }
+  
+  public V remove(long key) {
+    if (key == NULL || key == REMOVED) {
+      return null;
+    }
+    int index = find(key);
+    if (keys[index] == NULL) {
+      return null;
+    } else {
+      keys[index] = REMOVED;
+      numEntries--;
+      V oldValue = values[index];
+      values[index] = null;
+      // don't decrement numSlotsUsed
+      return oldValue;
+    }
+    // Could un-set recentlyAccessed's bit but doesn't matter
+  }
+  
+  public void clear() {
+    numEntries = 0;
+    numSlotsUsed = 0;
+    Arrays.fill(keys, NULL);
+    Arrays.fill(values, null);
+    if (countingAccesses) {
+      recentlyAccessed.clear();
+    }
+  }
+  
  /** @return a primitive-long iterator over the keys currently in the map. */
  public LongPrimitiveIterator keySetIterator() {
    return new KeyIterator();
  }
  
  /** @return a live entry view; structural mutation only via clear() and iterator remove. */
  public Set<Map.Entry<Long,V>> entrySet() {
    return new EntrySet();
  }
  
  /** @return a live value view; structural mutation only via clear() and iterator remove. */
  public Collection<V> values() {
    return new ValueCollection();
  }
+  
  /** Rehashes into a table sized for the current entry count, discarding REMOVED tombstones. */
  public void rehash() {
    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries)));
  }
  
  /** Rehashes into a larger table; fails once the next size would exceed the largest usable prime. */
  private void growAndRehash() {
    if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) {
      throw new IllegalStateException("Can't grow any more");
    }
    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length)));
  }
+  
  /**
   * Rebuilds the hash table at {@code newHashSize} capacity by re-inserting every live
   * entry via put(); REMOVED tombstones and stale access bits are discarded in the process.
   */
  private void rehash(int newHashSize) {
    long[] oldKeys = keys;
    V[] oldValues = values;
    numEntries = 0;
    numSlotsUsed = 0;
    if (countingAccesses) {
      recentlyAccessed = new BitSet(newHashSize);
    }
    keys = new long[newHashSize];
    Arrays.fill(keys, NULL);
    values = (V[]) new Object[newHashSize];
    int length = oldKeys.length;
    for (int i = 0; i < length; i++) {
      long key = oldKeys[i];
      if (key != NULL && key != REMOVED) {
        put(key, oldValues[i]);
      }
    }
  }
+  
+  void iteratorRemove(int lastNext) {
+    if (lastNext >= values.length) {
+      throw new NoSuchElementException();
+    }
+    if (lastNext < 0) {
+      throw new IllegalStateException();
+    }
+    values[lastNext] = null;
+    keys[lastNext] = REMOVED;
+    numEntries--;
+  }
+  
+  @Override
+  public FastByIDMap<V> clone() {
+    FastByIDMap<V> clone;
+    try {
+      clone = (FastByIDMap<V>) super.clone();
+    } catch (CloneNotSupportedException cnse) {
+      throw new AssertionError();
+    }
+    clone.keys = keys.clone();
+    clone.values = values.clone();
+    clone.recentlyAccessed = countingAccesses ? new BitSet(keys.length) : null;
+    return clone;
+  }
+  
+  @Override
+  public String toString() {
+    if (isEmpty()) {
+      return "{}";
+    }
+    StringBuilder result = new StringBuilder();
+    result.append('{');
+    for (int i = 0; i < keys.length; i++) {
+      long key = keys[i];
+      if (key != NULL && key != REMOVED) {
+        result.append(key).append('=').append(values[i]).append(',');
+      }
+    }
+    result.setCharAt(result.length() - 1, '}');
+    return result.toString();
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = 0;
+    long[] keys = this.keys;
+    int max = keys.length;
+    for (int i = 0; i < max; i++) {
+      long key = keys[i];
+      if (key != NULL && key != REMOVED) {
+        hash = 31 * hash + ((int) (key >> 32) ^ (int) key);
+        hash = 31 * hash + values[i].hashCode();
+      }
+    }
+    return hash;
+  }
+
  /**
   * Two maps compare equal when their hash tables hold the same key/value pairs at the
   * same slot positions, and any extra table length on either side is empty.
   * NOTE(review): this is layout-sensitive — maps with identical contents but different
   * capacities or rehash histories can compare unequal; confirm callers expect that.
   */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof FastByIDMap)) {
      return false;
    }
    FastByIDMap<V> otherMap = (FastByIDMap<V>) other;
    long[] otherKeys = otherMap.keys;
    V[] otherValues = otherMap.values;
    int length = keys.length;
    int otherLength = otherKeys.length;
    int max = Math.min(length, otherLength);

    // Compare the overlapping portion of the two tables slot by slot.
    int i = 0;
    while (i < max) {
      long key = keys[i];
      long otherKey = otherKeys[i];
      if (key == NULL || key == REMOVED) {
        if (otherKey != NULL && otherKey != REMOVED) {
          return false;
        }
      } else {
        if (key != otherKey || !values[i].equals(otherValues[i])) {
          return false;
        }
      }
      i++;
    }
    // Whichever table is longer must be empty beyond the shared length.
    while (i < length) {
      long key = keys[i];
      if (key != NULL && key != REMOVED) {
        return false;
      }
      i++;
    }
    while (i < otherLength) {
      long key = otherKeys[i];
      if (key != NULL && key != REMOVED) {
        return false;
      }
      i++;
    }
    return true;
  }
+  
  /**
   * Iterates the enclosing map's keys as primitive longs. A slot is occupied iff its
   * value is non-null, so the scan skips null-valued slots.
   */
  private final class KeyIterator extends AbstractLongPrimitiveIterator {
    
    private int position;      // next slot to examine
    private int lastNext = -1; // slot returned by the last nextLong(), for remove()
    
    @Override
    public boolean hasNext() {
      goToNext();
      return position < keys.length;
    }
    
    @Override
    public long nextLong() {
      goToNext();
      lastNext = position;
      if (position >= keys.length) {
        throw new NoSuchElementException();
      }
      return keys[position++];
    }
    
    /** Like nextLong() but does not advance the iterator. */
    @Override
    public long peek() {
      goToNext();
      if (position >= keys.length) {
        throw new NoSuchElementException();
      }
      return keys[position];
    }
    
    /** Advances {@code position} to the next occupied slot (or past the end). */
    private void goToNext() {
      int length = values.length;
      while (position < length && values[position] == null) {
        position++;
      }
    }
    
    @Override
    public void remove() {
      iteratorRemove(lastNext);
    }
    
    @Override
    public void skip(int n) {
      position += n;
    }
    
  }
+  
+  private final class EntrySet extends AbstractSet<Map.Entry<Long,V>> {
+    
+    @Override
+    public int size() {
+      return FastByIDMap.this.size();
+    }
+    
+    @Override
+    public boolean isEmpty() {
+      return FastByIDMap.this.isEmpty();
+    }
+    
+    @Override
+    public boolean contains(Object o) {
+      return containsKey((Long) o);
+    }
+    
+    @Override
+    public Iterator<Map.Entry<Long,V>> iterator() {
+      return new EntryIterator();
+    }
+    
+    @Override
+    public boolean add(Map.Entry<Long,V> t) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean addAll(Collection<? extends Map.Entry<Long,V>> ts) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean retainAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean removeAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public void clear() {
+      FastByIDMap.this.clear();
+    }
+    
+    private final class MapEntry implements Map.Entry<Long,V> {
+      
+      private final int index;
+      
+      private MapEntry(int index) {
+        this.index = index;
+      }
+      
+      @Override
+      public Long getKey() {
+        return keys[index];
+      }
+      
+      @Override
+      public V getValue() {
+        return values[index];
+      }
+      
+      @Override
+      public V setValue(V value) {
+        Preconditions.checkArgument(value != null);
+
+        V oldValue = values[index];
+        values[index] = value;
+        return oldValue;
+      }
+    }
+    
+    private final class EntryIterator implements Iterator<Map.Entry<Long,V>> {
+      
+      private int position;
+      private int lastNext = -1;
+      
+      @Override
+      public boolean hasNext() {
+        goToNext();
+        return position < keys.length;
+      }
+      
+      @Override
+      public Map.Entry<Long,V> next() {
+        goToNext();
+        lastNext = position;
+        if (position >= keys.length) {
+          throw new NoSuchElementException();
+        }
+        return new MapEntry(position++);
+      }
+      
+      private void goToNext() {
+        int length = values.length;
+        while (position < length && values[position] == null) {
+          position++;
+        }
+      }
+      
+      @Override
+      public void remove() {
+        iteratorRemove(lastNext);
+      }
+    }
+    
+  }
+  
  /**
   * Live {@link java.util.Collection} view over the map's values. Structural
   * modification through the view is unsupported except for {@link #clear()} and the
   * iterator's remove().
   */
  private final class ValueCollection extends AbstractCollection<V> {
    
    @Override
    public int size() {
      return FastByIDMap.this.size();
    }
    
    @Override
    public boolean isEmpty() {
      return FastByIDMap.this.isEmpty();
    }
    
    @Override
    public boolean contains(Object o) {
      return containsValue(o);
    }
    
    @Override
    public Iterator<V> iterator() {
      return new ValueIterator();
    }
    
    @Override
    public boolean add(V v) {
      throw new UnsupportedOperationException();
    }
    
    @Override
    public boolean remove(Object o) {
      throw new UnsupportedOperationException();
    }
    
    @Override
    public boolean addAll(Collection<? extends V> vs) {
      throw new UnsupportedOperationException();
    }
    
    @Override
    public boolean removeAll(Collection<?> objects) {
      throw new UnsupportedOperationException();
    }
    
    @Override
    public boolean retainAll(Collection<?> objects) {
      throw new UnsupportedOperationException();
    }
    
    @Override
    public void clear() {
      FastByIDMap.this.clear();
    }
    
    /** Iterates occupied slots (non-null values) in table order. */
    private final class ValueIterator implements Iterator<V> {
      
      private int position;      // next slot to examine
      private int lastNext = -1; // slot of the last returned value, for remove()
      
      @Override
      public boolean hasNext() {
        goToNext();
        return position < values.length;
      }
      
      @Override
      public V next() {
        goToNext();
        lastNext = position;
        if (position >= values.length) {
          throw new NoSuchElementException();
        }
        return values[position++];
      }
      
      /** Advances {@code position} to the next occupied slot (or past the end). */
      private void goToNext() {
        int length = values.length;
        while (position < length && values[position] == null) {
          position++;
        }
      }
      
      @Override
      public void remove() {
        iteratorRemove(lastNext);
      }
      
    }
    
  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java
new file mode 100644
index 0000000..5908270
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java
@@ -0,0 +1,426 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * @see FastByIDMap
+ */
+public final class FastIDSet implements Serializable, Cloneable, Iterable<Long> {
+  
+  private static final float DEFAULT_LOAD_FACTOR = 1.5f;
+  
+  /** Dummy object used to represent a key that has been removed. */
+  private static final long REMOVED = Long.MAX_VALUE;
+  private static final long NULL = Long.MIN_VALUE;
+  
+  private long[] keys;
+  private float loadFactor;
+  private int numEntries;
+  private int numSlotsUsed;
+  
+  /** Creates a new {@link FastIDSet} with default capacity. */
+  public FastIDSet() {
+    this(2);
+  }
+
+  public FastIDSet(long[] initialKeys) {
+    this(initialKeys.length);
+    addAll(initialKeys);
+  }
+
+  public FastIDSet(int size) {
+    this(size, DEFAULT_LOAD_FACTOR);
+  }
+
+  public FastIDSet(int size, float loadFactor) {
+    Preconditions.checkArgument(size >= 0, "size must be at least 0");
+    Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0");
+    this.loadFactor = loadFactor;
+    int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor);
+    Preconditions.checkArgument(size < max, "size must be less than %d", max);
+    int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size));
+    keys = new long[hashSize];
+    Arrays.fill(keys, NULL);
+  }
+  
+  /**
+   * @see #findForAdd(long)
+   */
+  private int find(long key) {
+    int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+    long[] keys = this.keys;
+    int hashSize = keys.length;
+    int jump = 1 + theHashCode % (hashSize - 2);
+    int index = theHashCode % hashSize;
+    long currentKey = keys[index];
+    while (currentKey != NULL && key != currentKey) { // note: true when currentKey == REMOVED
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    return index;
+  }
+  
+  /**
+   * @see #find(long)
+   */
+  private int findForAdd(long key) {
+    int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+    long[] keys = this.keys;
+    int hashSize = keys.length;
+    int jump = 1 + theHashCode % (hashSize - 2);
+    int index = theHashCode % hashSize;
+    long currentKey = keys[index];
+    while (currentKey != NULL && currentKey != REMOVED && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    if (currentKey != REMOVED) {
+      return index;
+    }
+    // If we're adding, it's here, but, the key might have a value already later
+    int addIndex = index;
+    while (currentKey != NULL && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    return key == currentKey ? index : addIndex;
+  }
+  
+  public int size() {
+    return numEntries;
+  }
+  
+  public boolean isEmpty() {
+    return numEntries == 0;
+  }
+  
+  public boolean contains(long key) {
+    return key != NULL && key != REMOVED && keys[find(key)] != NULL;
+  }
+  
+  public boolean add(long key) {
+    Preconditions.checkArgument(key != NULL && key != REMOVED);
+
+    // If less than half the slots are open, let's clear it up
+    if (numSlotsUsed * loadFactor >= keys.length) {
+      // If over half the slots used are actual entries, let's grow
+      if (numEntries * loadFactor >= numSlotsUsed) {
+        growAndRehash();
+      } else {
+        // Otherwise just rehash to clear REMOVED entries and don't grow
+        rehash();
+      }
+    }
+    // Here we may later consider implementing Brent's variation described on page 532
+    int index = findForAdd(key);
+    long keyIndex = keys[index];
+    if (keyIndex != key) {
+      keys[index] = key;
+      numEntries++;
+      if (keyIndex == NULL) {
+        numSlotsUsed++;
+      }
+      return true;
+    }
+    return false;
+  }
+  
+  @Override
+  public LongPrimitiveIterator iterator() {
+    return new KeyIterator();
+  }
+  
+  public long[] toArray() {
+    long[] result = new long[numEntries];
+    for (int i = 0, position = 0; i < result.length; i++) {
+      while (keys[position] == NULL || keys[position] == REMOVED) {
+        position++;
+      }
+      result[i] = keys[position++];
+    }
+    return result;
+  }
+  
+  public boolean remove(long key) {
+    if (key == NULL || key == REMOVED) {
+      return false;
+    }
+    int index = find(key);
+    if (keys[index] == NULL) {
+      return false;
+    } else {
+      keys[index] = REMOVED;
+      numEntries--;
+      return true;
+    }
+  }
+  
+  public boolean addAll(long[] c) {
+    boolean changed = false;
+    for (long k : c) {
+      if (add(k)) {
+        changed = true;
+      }
+    }
+    return changed;
+  }
+  
+  public boolean addAll(FastIDSet c) {
+    boolean changed = false;
+    for (long k : c.keys) {
+      if (k != NULL && k != REMOVED && add(k)) {
+        changed = true;
+      }
+    }
+    return changed;
+  }
+  
+  public boolean removeAll(long[] c) {
+    boolean changed = false;
+    for (long o : c) {
+      if (remove(o)) {
+        changed = true;
+      }
+    }
+    return changed;
+  }
+  
+  public boolean removeAll(FastIDSet c) {
+    boolean changed = false;
+    for (long k : c.keys) {
+      if (k != NULL && k != REMOVED && remove(k)) {
+        changed = true;
+      }
+    }
+    return changed;
+  }
+  
+  public boolean retainAll(FastIDSet c) {
+    boolean changed = false;
+    for (int i = 0; i < keys.length; i++) {
+      long k = keys[i];
+      if (k != NULL && k != REMOVED && !c.contains(k)) {
+        keys[i] = REMOVED;
+        numEntries--;
+        changed = true;
+      }
+    }
+    return changed;
+  }
+  
+  public void clear() {
+    numEntries = 0;
+    numSlotsUsed = 0;
+    Arrays.fill(keys, NULL);
+  }
+  
+  private void growAndRehash() {
+    if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) {
+      throw new IllegalStateException("Can't grow any more");
+    }
+    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length)));
+  }
+  
+  public void rehash() {
+    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries)));
+  }
+  
+  private void rehash(int newHashSize) {
+    long[] oldKeys = keys;
+    numEntries = 0;
+    numSlotsUsed = 0;
+    keys = new long[newHashSize];
+    Arrays.fill(keys, NULL);
+    for (long key : oldKeys) {
+      if (key != NULL && key != REMOVED) {
+        add(key);
+      }
+    }
+  }
+  
+  /**
+   * Convenience method to quickly compute just the size of the intersection with another {@link FastIDSet}.
+   * 
+   * @param other
+   *          {@link FastIDSet} to intersect with
+   * @return number of elements in intersection
+   */
+  public int intersectionSize(FastIDSet other) {
+    int count = 0;
+    for (long key : other.keys) {
+      if (key != NULL && key != REMOVED && keys[find(key)] != NULL) {
+        count++;
+      }
+    }
+    return count;
+  }
+  
+  @Override
+  public FastIDSet clone() {
+    FastIDSet clone;
+    try {
+      clone = (FastIDSet) super.clone();
+    } catch (CloneNotSupportedException cnse) {
+      throw new AssertionError();
+    }
+    clone.keys = keys.clone();
+    return clone;
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = 0;
+    long[] keys = this.keys;
+    for (long key : keys) {
+      if (key != NULL && key != REMOVED) {
+        hash = 31 * hash + ((int) (key >> 32) ^ (int) key);
+      }
+    }
+    return hash;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof FastIDSet)) {
+      return false;
+    }
+    FastIDSet otherMap = (FastIDSet) other;
+    long[] otherKeys = otherMap.keys;
+    int length = keys.length;
+    int otherLength = otherKeys.length;
+    int max = Math.min(length, otherLength);
+
+    int i = 0;
+    while (i < max) {
+      long key = keys[i];
+      long otherKey = otherKeys[i];
+      if (key == NULL || key == REMOVED) {
+        if (otherKey != NULL && otherKey != REMOVED) {
+          return false;
+        }
+      } else {
+        if (key != otherKey) {
+          return false;
+        }
+      }
+      i++;
+    }
+    while (i < length) {
+      long key = keys[i];
+      if (key != NULL && key != REMOVED) {
+        return false;
+      }
+      i++;
+    }
+    while (i < otherLength) {
+      long key = otherKeys[i];
+      if (key != NULL && key != REMOVED) {
+        return false;
+      }
+      i++;
+    }
+    return true;
+  }
+  
+  @Override
+  public String toString() {
+    if (isEmpty()) {
+      return "[]";
+    }
+    StringBuilder result = new StringBuilder();
+    result.append('[');
+    for (long key : keys) {
+      if (key != NULL && key != REMOVED) {
+        result.append(key).append(',');
+      }
+    }
+    result.setCharAt(result.length() - 1, ']');
+    return result.toString();
+  }
+  
+  private final class KeyIterator extends AbstractLongPrimitiveIterator {
+    
+    private int position;
+    private int lastNext = -1;
+    
+    @Override
+    public boolean hasNext() {
+      goToNext();
+      return position < keys.length;
+    }
+    
+    @Override
+    public long nextLong() {
+      goToNext();
+      lastNext = position;
+      if (position >= keys.length) {
+        throw new NoSuchElementException();
+      }
+      return keys[position++];
+    }
+    
+    @Override
+    public long peek() {
+      goToNext();
+      if (position >= keys.length) {
+        throw new NoSuchElementException();
+      }
+      return keys[position];
+    }
+    
+    private void goToNext() {
+      int length = keys.length;
+      while (position < length
+             && (keys[position] == NULL || keys[position] == REMOVED)) {
+        position++;
+      }
+    }
+    
+    @Override
+    public void remove() {
+      if (lastNext >= keys.length) {
+        throw new NoSuchElementException();
+      }
+      if (lastNext < 0) {
+        throw new IllegalStateException();
+      }
+      keys[lastNext] = REMOVED;
+      numEntries--;
+    }
+    
+    public Iterator<Long> iterator() {
+      return new KeyIterator();
+    }
+    
+    @Override
+    public void skip(int n) {
+      position += n;
+    }
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java
new file mode 100644
index 0000000..7c64b44
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java
@@ -0,0 +1,729 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.AbstractCollection;
+import java.util.AbstractSet;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * This is an optimized {@link Map} implementation, based on algorithms described in Knuth's "Art of Computer
+ * Programming", Vol. 3, p. 529.
+ * </p>
+ *
+ * <p>
+ * It should be faster than {@link java.util.HashMap} in some cases, but not all. Its main feature is a
+ * "max size" and the ability to transparently, efficiently and semi-intelligently evict old entries when max
+ * size is exceeded.
+ * </p>
+ *
+ * <p>
+ * This class is not a bit thread-safe.
+ * </p>
+ *
+ * <p>
+ * This implementation does not allow {@code null} as a key or value.
+ * </p>
+ */
+public final class FastMap<K,V> implements Map<K,V>, Serializable, Cloneable {
+  
+  public static final int NO_MAX_SIZE = Integer.MAX_VALUE;
+  private static final float DEFAULT_LOAD_FACTOR = 1.5f;
+  
+  /** Dummy object used to represent a key that has been removed. */
+  private static final Object REMOVED = new Object();
+  
+  private K[] keys;
+  private V[] values;
+  private float loadFactor;
+  private int numEntries;
+  private int numSlotsUsed;
+  private final int maxSize;
+  private BitSet recentlyAccessed;
+  private final boolean countingAccesses;
+  
  /** Creates a new {@link FastMap} with default capacity and no maximum size. */
  public FastMap() {
    this(2, NO_MAX_SIZE);
  }
  
  /** Creates a map pre-sized for {@code size} entries, with no maximum size. */
  public FastMap(int size) {
    this(size, NO_MAX_SIZE);
  }
  
  /** Copy constructor: pre-sizes for {@code other} and copies all of its entries. */
  public FastMap(Map<K,V> other) {
    this(other.size());
    putAll(other);
  }

  /** Pre-sized map with an explicit load factor and no maximum size. */
  public FastMap(int size, float loadFactor) {
    this(size, NO_MAX_SIZE, loadFactor);
  }

  /** Pre-sized, size-capped map with the default load factor. */
  public FastMap(int size, int maxSize) {
    this(size, maxSize, DEFAULT_LOAD_FACTOR);
  }
+  
  /**
   * Creates a new {@link FastMap} whose capacity can accommodate the given number of
   * entries without rehash.
   * 
   * @param size desired capacity
   * @param maxSize max capacity; a bounded value enables access tracking so stale
   *          entries can be evicted when the map overflows
   * @param loadFactor ratio of table size to capacity; at least 1.0
   * @throws IllegalArgumentException if size is less than 0 or too large for the load
   *          factor, maxSize is less than 1, or loadFactor is less than 1
   */
  public FastMap(int size, int maxSize, float loadFactor) {
    Preconditions.checkArgument(size >= 0, "size must be at least 0");
    Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0");
    this.loadFactor = loadFactor;
    int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor);
    Preconditions.checkArgument(size < max, "size must be less than " + max);
    Preconditions.checkArgument(maxSize >= 1, "maxSize must be at least 1");
    int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size));
    keys = (K[]) new Object[hashSize];
    values = (V[]) new Object[hashSize];
    this.maxSize = maxSize;
    this.countingAccesses = maxSize != Integer.MAX_VALUE;
    this.recentlyAccessed = countingAccesses ? new BitSet(hashSize) : null;
  }
+  
  /**
   * Double-hash probe: returns the slot holding a key equal to {@code key}, or the first
   * never-used (null) slot on its probe path if absent. REMOVED tombstones do not stop
   * the probe.
   */
  private int find(Object key) {
    int theHashCode = key.hashCode() & 0x7FFFFFFF; // make sure it's positive
    K[] keys = this.keys;
    int hashSize = keys.length;
    int jump = 1 + theHashCode % (hashSize - 2);
    int index = theHashCode % hashSize;
    K currentKey = keys[index];
    while (currentKey != null && !key.equals(currentKey)) {
      index -= index < jump ? jump - hashSize : jump;
      currentKey = keys[index];
    }
    return index;
  }
+
+  private int findForAdd(Object key) {
+    int theHashCode = key.hashCode() & 0x7FFFFFFF; // make sure it's positive
+    K[] keys = this.keys;
+    int hashSize = keys.length;
+    int jump = 1 + theHashCode % (hashSize - 2);
+    int index = theHashCode % hashSize;
+    K currentKey = keys[index];
+    while (currentKey != null && currentKey != REMOVED && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    if (currentKey != REMOVED) {
+      return index;
+    }
+    // If we're adding, it's here, but, the key might have a value already later
+    int addIndex = index;
+    while (currentKey != null && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    return key == currentKey ? index : addIndex;
+  }
+  
+  @Override
+  public V get(Object key) {
+    if (key == null) {
+      return null;
+    }
+    int index = find(key);
+    if (countingAccesses) {
+      recentlyAccessed.set(index);
+    }
+    return values[index];
+  }
+  
  /** @return the number of live mappings. */
  @Override
  public int size() {
    return numEntries;
  }
  
  /** @return true iff no mappings are present. */
  @Override
  public boolean isEmpty() {
    return numEntries == 0;
  }
  
  /** A {@code null} key is never contained. */
  @Override
  public boolean containsKey(Object key) {
    return key != null && keys[find(key)] != null;
  }
+  
+  @Override
+  public boolean containsValue(Object value) {
+    if (value == null) {
+      return false;
+    }
+    for (V theValue : values) {
+      if (theValue != null && value.equals(theValue)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  /**
+   * @throws NullPointerException
+   *           if key or value is null
+   */
+  @Override
+  public V put(K key, V value) {
+    Preconditions.checkNotNull(key);
+    Preconditions.checkNotNull(value);
+    // If less than half the slots are open, let's clear it up
+    if (numSlotsUsed * loadFactor >= keys.length) {
+      // If over half the slots used are actual entries, let's grow
+      if (numEntries * loadFactor >= numSlotsUsed) {
+        growAndRehash();
+      } else {
+        // Otherwise just rehash to clear REMOVED entries and don't grow
+        rehash();
+      }
+    }
+    // Here we may later consider implementing Brent's variation described on page 532
+    int index = findForAdd(key);
+    if (keys[index] == key) {
+      V oldValue = values[index];
+      values[index] = value;
+      return oldValue;
+    }
+    // If size is limited,
+    if (countingAccesses && numEntries >= maxSize) {
+      // and we're too large, clear some old-ish entry
+      clearStaleEntry(index);
+    }
+    keys[index] = key;
+    values[index] = value;
+    numEntries++;
+    numSlotsUsed++;
+    return null;
+  }
+  
  /**
   * Evicts one entry to make room once the map has reached {@code maxSize}. Walks
   * backwards (with wraparound) from {@code index}, skipping entries whose
   * recently-accessed bit is set while clearing that bit, and deletes the first entry
   * found without the bit — a clock / second-chance style eviction.
   */
  private void clearStaleEntry(int index) {
    while (true) {
      K currentKey;
      do {
        // Step back one slot, wrapping from 0 to the end of the table.
        if (index == 0) {
          index = keys.length - 1;
        } else {
          index--;
        }
        currentKey = keys[index];
      } while (currentKey == null || currentKey == REMOVED); // skip empty/tombstone slots
      if (recentlyAccessed.get(index)) {
        // Second chance: clear the bit and keep scanning.
        recentlyAccessed.clear(index);
      } else {
        break;
      }
    }
    // Delete the entry; the raw-array cast stores the untyped REMOVED sentinel into K[].
    ((Object[])keys)[index] = REMOVED;
    numEntries--;
    values[index] = null;
  }
+  
+  @Override
+  public void putAll(Map<? extends K,? extends V> map) {
+    for (Entry<? extends K,? extends V> entry : map.entrySet()) {
+      put(entry.getKey(), entry.getValue());
+    }
+  }
+  
+  @Override
+  public V remove(Object key) {
+    if (key == null) {
+      return null;
+    }
+    int index = find(key);
+    if (keys[index] == null) {
+      return null;
+    } else {
+      ((Object[])keys)[index] = REMOVED;
+      numEntries--;
+      V oldValue = values[index];
+      values[index] = null;
+      // don't decrement numSlotsUsed
+      return oldValue;
+    }
+    // Could un-set recentlyAccessed's bit but doesn't matter
+  }
+  
+  @Override
+  public void clear() {
+    numEntries = 0;
+    numSlotsUsed = 0;
+    Arrays.fill(keys, null);
+    Arrays.fill(values, null);
+    if (countingAccesses) {
+      recentlyAccessed.clear();
+    }
+  }
+  
  /** @return a live key view backed by this map. */
  @Override
  public Set<K> keySet() {
    return new KeySet();
  }
  
  /** @return a live value view backed by this map. */
  @Override
  public Collection<V> values() {
    return new ValueCollection();
  }
  
  /** @return a live entry view backed by this map. */
  @Override
  public Set<Entry<K,V>> entrySet() {
    return new EntrySet();
  }
+  
+  /**
+   * Resizes the table to a twin-prime capacity proportional to the number of
+   * live entries, which also purges accumulated REMOVED tombstones.
+   */
+  public void rehash() {
+    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries)));
+  }
+  
+  /**
+   * Grows the table to the next twin prime past loadFactor * current capacity.
+   *
+   * @throws IllegalStateException if the table is already at the maximum capacity
+   */
+  private void growAndRehash() {
+    if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) {
+      throw new IllegalStateException("Can't grow any more");
+    }
+    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length)));
+  }
+  
+  /**
+   * Rebuilds the table at the given capacity by re-inserting every live entry.
+   * Tombstones are dropped, so numSlotsUsed is reset along with numEntries.
+   *
+   * @param newHashSize new backing-array length (expected to be prime)
+   */
+  private void rehash(int newHashSize) {
+    K[] oldKeys = keys;
+    V[] oldValues = values;
+    numEntries = 0;
+    numSlotsUsed = 0;
+    if (countingAccesses) {
+      recentlyAccessed = new BitSet(newHashSize); // access history is reset, not remapped
+    }
+    keys = (K[]) new Object[newHashSize];
+    values = (V[]) new Object[newHashSize];
+    int length = oldKeys.length;
+    for (int i = 0; i < length; i++) {
+      K key = oldKeys[i];
+      if (key != null && key != REMOVED) {
+        put(key, oldValues[i]); // re-hashes into the new arrays
+      }
+    }
+  }
+  
+  /**
+   * Back-door used by the view iterators to delete the entry at a given slot.
+   *
+   * @param lastNext slot index recorded by the iterator's last next() call
+   * @throws NoSuchElementException if the index is past the end of the table
+   * @throws IllegalStateException if negative (next() has not been called yet)
+   */
+  void iteratorRemove(int lastNext) {
+    if (lastNext >= values.length) {
+      throw new NoSuchElementException();
+    }
+    if (lastNext < 0) {
+      throw new IllegalStateException();
+    }
+    values[lastNext] = null;
+    ((Object[])keys)[lastNext] = REMOVED; // tombstone, same as remove(Object)
+    numEntries--;
+  }
+  
+  /**
+   * Returns a shallow copy: the backing arrays are duplicated, but keys and
+   * values themselves are shared with this map. The clone's access history
+   * starts empty (a fresh BitSet) rather than copying this map's bits.
+   */
+  @Override
+  public FastMap<K,V> clone() {
+    FastMap<K,V> clone;
+    try {
+      clone = (FastMap<K,V>) super.clone();
+    } catch (CloneNotSupportedException cnse) {
+      // Should be impossible -- presumably FastMap implements Cloneable (TODO
+      // confirm). Fix: carry the exception so a failure is still diagnosable;
+      // AssertionError(Object) records a Throwable argument as the cause.
+      throw new AssertionError(cnse);
+    }
+    clone.keys = keys.clone();
+    clone.values = values.clone();
+    clone.recentlyAccessed = countingAccesses ? new BitSet(keys.length) : null;
+    return clone;
+  }
+
+  /**
+   * Hash over live entries, mixed in slot order (31 * h + key, then value).
+   * Slot-order dependence is consistent with the positional equals() below,
+   * though it differs from the order-independent java.util.Map hash contract.
+   */
+  @Override
+  public int hashCode() {
+    int hash = 0;
+    K[] keys = this.keys; // local copy of the field for the scan
+    int max = keys.length;
+    for (int i = 0; i < max; i++) {
+      K key = keys[i];
+      if (key != null && key != REMOVED) {
+        hash = 31 * hash + key.hashCode();
+        hash = 31 * hash + values[i].hashCode();
+      }
+    }
+    return hash;
+  }
+
+  /**
+   * Positional equality: two FastMaps are equal only when their live entries
+   * occupy matching slot indexes with matching contents (extra slots in the
+   * longer table must all be empty or tombstoned).
+   * NOTE(review): keys are compared with == (reference identity) while values
+   * use equals(); equal-but-distinct key objects in the same slot would compare
+   * unequal -- confirm identity comparison of keys is intentional.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof FastMap)) {
+      return false;
+    }
+    FastMap<K,V> otherMap = (FastMap<K,V>) other;
+    K[] otherKeys = otherMap.keys;
+    V[] otherValues = otherMap.values;
+    int length = keys.length;
+    int otherLength = otherKeys.length;
+    int max = Math.min(length, otherLength);
+
+    // Compare the overlapping prefix of the two tables slot by slot
+    int i = 0;
+    while (i < max) {
+      K key = keys[i];
+      K otherKey = otherKeys[i];
+      if (key == null || key == REMOVED) {
+        if (otherKey != null && otherKey != REMOVED) {
+          return false; // live entry on one side, empty/tombstone on the other
+        }
+      } else {
+        if (key != otherKey || !values[i].equals(otherValues[i])) {
+          return false;
+        }
+      }
+      i++;
+    }
+    // Any remaining slots in the longer table must hold no live entries
+    while (i < length) {
+      K key = keys[i];
+      if (key != null && key != REMOVED) {
+        return false;
+      }
+      i++;
+    }
+    while (i < otherLength) {
+      K key = otherKeys[i];
+      if (key != null && key != REMOVED) {
+        return false;
+      }
+      i++;
+    }
+    return true;
+  }
+  
+  /**
+   * Renders the map as {k1=v1,k2=v2} in slot order; "{}" when empty.
+   */
+  @Override
+  public String toString() {
+    if (isEmpty()) {
+      return "{}";
+    }
+    StringBuilder result = new StringBuilder();
+    result.append('{');
+    for (int i = 0; i < keys.length; i++) {
+      K key = keys[i];
+      if (key != null && key != REMOVED) {
+        result.append(key).append('=').append(values[i]).append(',');
+      }
+    }
+    // Overwrite the trailing ',' with '}'; safe because isEmpty() was false
+    result.setCharAt(result.length() - 1, '}');
+    return result.toString();
+  }
+  
+  /**
+   * Live view of this map's entries, backing {@link FastMap#entrySet()}.
+   * Read-only except for {@link #clear()} and iterator removal; structural
+   * mutators throw {@link UnsupportedOperationException}.
+   */
+  private final class EntrySet extends AbstractSet<Entry<K,V>> {
+    
+    @Override
+    public int size() {
+      return FastMap.this.size();
+    }
+    
+    @Override
+    public boolean isEmpty() {
+      return FastMap.this.isEmpty();
+    }
+    
+    // NOTE(review): delegates to containsKey, so this tests whether o is a
+    // *key* of the map rather than a key/value pair as the Set<Entry> contract
+    // specifies -- confirm this asymmetry is intentional.
+    @Override
+    public boolean contains(Object o) {
+      return containsKey(o);
+    }
+    
+    @Override
+    public Iterator<Entry<K,V>> iterator() {
+      return new EntryIterator();
+    }
+    
+    @Override
+    public boolean add(Entry<K,V> t) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean addAll(Collection<? extends Entry<K,V>> ts) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean retainAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean removeAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public void clear() {
+      FastMap.this.clear();
+    }
+    
+    /** Entry backed directly by a table slot; setValue writes through to the map. */
+    private final class MapEntry implements Entry<K,V> {
+      
+      private final int index; // slot in the backing arrays
+      
+      private MapEntry(int index) {
+        this.index = index;
+      }
+      
+      @Override
+      public K getKey() {
+        return keys[index];
+      }
+      
+      @Override
+      public V getValue() {
+        return values[index];
+      }
+      
+      @Override
+      public V setValue(V value) {
+        Preconditions.checkArgument(value != null);
+        V oldValue = values[index];
+        values[index] = value;
+        return oldValue;
+      }
+    }
+    
+    /** Scans the table left to right, skipping slots with no live value. */
+    private final class EntryIterator implements Iterator<Entry<K,V>> {
+      
+      private int position;      // next slot to examine
+      private int lastNext = -1; // slot returned by the last next(), for remove()
+      
+      @Override
+      public boolean hasNext() {
+        goToNext();
+        return position < keys.length;
+      }
+      
+      @Override
+      public Entry<K,V> next() {
+        goToNext();
+        lastNext = position;
+        if (position >= keys.length) {
+          throw new NoSuchElementException();
+        }
+        return new MapEntry(position++);
+      }
+      
+      // Advance position to the next slot holding a live value (or table end)
+      private void goToNext() {
+        int length = values.length;
+        while (position < length && values[position] == null) {
+          position++;
+        }
+      }
+      
+      @Override
+      public void remove() {
+        iteratorRemove(lastNext);
+      }
+    }
+    
+  }
+  
+  /**
+   * Live view of this map's keys, backing {@link FastMap#keySet()}.
+   * Read-only except for {@link #clear()} and iterator removal; structural
+   * mutators throw {@link UnsupportedOperationException}.
+   */
+  private final class KeySet extends AbstractSet<K> {
+    
+    @Override
+    public int size() {
+      return FastMap.this.size();
+    }
+    
+    @Override
+    public boolean isEmpty() {
+      return FastMap.this.isEmpty();
+    }
+    
+    @Override
+    public boolean contains(Object o) {
+      return containsKey(o);
+    }
+    
+    @Override
+    public Iterator<K> iterator() {
+      return new KeyIterator();
+    }
+    
+    @Override
+    public boolean add(K t) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean addAll(Collection<? extends K> ts) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean retainAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean removeAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public void clear() {
+      FastMap.this.clear();
+    }
+    
+    /** Scans the table left to right, skipping slots with no live value. */
+    private final class KeyIterator implements Iterator<K> {
+      
+      private int position;      // next slot to examine
+      private int lastNext = -1; // slot returned by the last next(), for remove()
+      
+      @Override
+      public boolean hasNext() {
+        goToNext();
+        return position < keys.length;
+      }
+      
+      @Override
+      public K next() {
+        goToNext();
+        lastNext = position;
+        if (position >= keys.length) {
+          throw new NoSuchElementException();
+        }
+        return keys[position++];
+      }
+      
+      // Advance position to the next slot holding a live value (or table end)
+      private void goToNext() {
+        int length = values.length;
+        while (position < length && values[position] == null) {
+          position++;
+        }
+      }
+      
+      @Override
+      public void remove() {
+        iteratorRemove(lastNext);
+      }
+    }
+    
+  }
+  
+  /**
+   * Live view of this map's values, backing {@link FastMap#values()}.
+   * Read-only except for {@link #clear()} and iterator removal; structural
+   * mutators throw {@link UnsupportedOperationException}.
+   */
+  private final class ValueCollection extends AbstractCollection<V> {
+    
+    @Override
+    public int size() {
+      return FastMap.this.size();
+    }
+    
+    @Override
+    public boolean isEmpty() {
+      return FastMap.this.isEmpty();
+    }
+    
+    @Override
+    public boolean contains(Object o) {
+      return containsValue(o);
+    }
+    
+    @Override
+    public Iterator<V> iterator() {
+      return new ValueIterator();
+    }
+    
+    @Override
+    public boolean add(V v) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean addAll(Collection<? extends V> vs) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean removeAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public boolean retainAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+    
+    @Override
+    public void clear() {
+      FastMap.this.clear();
+    }
+    
+    /** Scans the table left to right, skipping slots with no live value. */
+    private final class ValueIterator implements Iterator<V> {
+      
+      private int position;      // next slot to examine
+      private int lastNext = -1; // slot returned by the last next(), for remove()
+      
+      @Override
+      public boolean hasNext() {
+        goToNext();
+        return position < values.length;
+      }
+      
+      @Override
+      public V next() {
+        goToNext();
+        lastNext = position;
+        if (position >= values.length) {
+          throw new NoSuchElementException();
+        }
+        return values[position++];
+      }
+      
+      // Advance position to the next slot holding a live value (or table end)
+      private void goToNext() {
+        int length = values.length;
+        while (position < length && values[position] == null) {
+          position++;
+        }
+      }
+      
+      @Override
+      public void remove() {
+        iteratorRemove(lastNext);
+      }
+      
+    }
+    
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java
new file mode 100644
index 0000000..1863d2b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+
+/**
+ * <p>
+ * A simple class that represents a fixed value of an average and count. This is useful
+ * when an API needs to return {@link RunningAverage} but is not in a position to accept
+ * updates to it.
+ * </p>
+ */
+public class FixedRunningAverage implements RunningAverage, Serializable {
+
+  // Both fields are final: an instance is an immutable snapshot. The
+  // synchronized modifiers below are therefore redundant for state protection;
+  // they are kept unchanged here to avoid altering the class's locking behavior.
+  // NOTE(review): renaming these fields would change the serialized form.
+  private final double average;
+  private final int count;
+
+  /**
+   * @param average fixed average value to report
+   * @param count fixed datum count to report
+   */
+  public FixedRunningAverage(double average, int count) {
+    this.average = average;
+    this.count = count;
+  }
+
+  /**
+   * Unsupported: this average is fixed and accepts no new data.
+   *
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public synchronized void addDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Unsupported: this average is fixed and accepts no removals.
+   *
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public synchronized void removeDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Unsupported: this average is fixed and accepts no changes.
+   *
+   * @throws UnsupportedOperationException
+   */
+  @Override
+  public synchronized void changeDatum(double delta) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public synchronized int getCount() {
+    return count;
+  }
+
+  @Override
+  public synchronized double getAverage() {
+    return average;
+  }
+
+  /** @return a view of this average with the sign of the mean negated */
+  @Override
+  public RunningAverage inverse() {
+    return new InvertedRunningAverage(this);
+  }
+
+  @Override
+  public synchronized String toString() {
+    return String.valueOf(average);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java
new file mode 100644
index 0000000..619b6b7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * <p>
+ * A simple class that represents a fixed value of an average, count and standard deviation. This is useful
+ * when an API needs to return {@link RunningAverageAndStdDev} but is not in a position to accept
+ * updates to it.
+ * </p>
+ */
+public final class FixedRunningAverageAndStdDev extends FixedRunningAverage implements RunningAverageAndStdDev {
+
+  // Final: the standard deviation is a fixed snapshot, like average and count.
+  private final double stdDev;
+
+  /**
+   * @param average fixed average value to report
+   * @param stdDev fixed standard deviation to report
+   * @param count fixed datum count to report
+   */
+  public FixedRunningAverageAndStdDev(double average, double stdDev, int count) {
+    super(average, count);
+    this.stdDev = stdDev;
+  }
+
+  /** @return a view with the mean's sign negated (std dev is sign-invariant) */
+  @Override
+  public RunningAverageAndStdDev inverse() {
+    return new InvertedRunningAverageAndStdDev(this);
+  }
+
+  /** Formats as "average,stdDev". */
+  @Override
+  public synchronized String toString() {
+    return super.toString() + ',' + stdDev;
+  }
+
+  @Override
+  public double getStandardDeviation() {
+    return stdDev;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java
new file mode 100644
index 0000000..00d828f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+
+/**
+ * <p>
+ * A simple class that can keep track of a running average of a series of numbers. One can add to or remove
+ * from the series, as well as update a datum in the series. The class does not actually keep track of the
+ * series of values, just its running average, so it doesn't even matter if you remove/change a value that
+ * wasn't added.
+ * </p>
+ */
+public class FullRunningAverage implements RunningAverage, Serializable {
+  
+  // Mutable state, guarded by synchronization on this instance.
+  // NOTE(review): renaming these fields would change the serialized form.
+  private int count;
+  private double average;
+  
+  /** Starts empty: count 0 and average NaN until the first datum arrives. */
+  public FullRunningAverage() {
+    this(0, Double.NaN);
+  }
+
+  /**
+   * Restores a running average from previously captured state.
+   *
+   * @param count number of data points seen so far
+   * @param average running mean of those points
+   */
+  public FullRunningAverage(int count, double average) {
+    this.count = count;
+    this.average = average;
+  }
+
+  /**
+   * @param datum
+   *          new item to add to the running average
+   */
+  @Override
+  public synchronized void addDatum(double datum) {
+    if (++count == 1) {
+      average = datum; // first datum replaces the initial NaN
+    } else {
+      // Incremental mean update: avg' = avg * (n-1)/n + x/n
+      average = average * (count - 1) / count + datum / count;
+    }
+  }
+  
+  /**
+   * @param datum
+   *          item to remove to the running average
+   * @throws IllegalStateException
+   *           if count is 0
+   */
+  @Override
+  public synchronized void removeDatum(double datum) {
+    if (count == 0) {
+      throw new IllegalStateException();
+    }
+    if (--count == 0) {
+      average = Double.NaN; // empty again: mean is undefined
+    } else {
+      // Inverse of the addDatum update
+      average = average * (count + 1) / count - datum / count;
+    }
+  }
+  
+  /**
+   * @param delta
+   *          amount by which to change a datum in the running average
+   * @throws IllegalStateException
+   *           if count is 0
+   */
+  @Override
+  public synchronized void changeDatum(double delta) {
+    if (count == 0) {
+      throw new IllegalStateException();
+    }
+    // Changing one datum by delta moves the mean of n points by delta/n
+    average += delta / count;
+  }
+  
+  @Override
+  public synchronized int getCount() {
+    return count;
+  }
+  
+  @Override
+  public synchronized double getAverage() {
+    return average;
+  }
+
+  /** @return a view of this average with the sign of the mean negated */
+  @Override
+  public RunningAverage inverse() {
+    return new InvertedRunningAverage(this);
+  }
+  
+  @Override
+  public synchronized String toString() {
+    return String.valueOf(average);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java
new file mode 100644
index 0000000..6212e66
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * <p>
+ * Extends {@link FullRunningAverage} to add a running standard deviation computation.
+ * Uses Welford's method, as described at http://www.johndcook.com/standard_deviation.html
+ * </p>
+ */
+public final class FullRunningAverageAndStdDev extends FullRunningAverage implements RunningAverageAndStdDev {
+
+  /** Cached sample standard deviation, recomputed after every mutation. */
+  private double stdDev;
+  /** Welford running mean accumulator. */
+  private double mk;
+  /** Welford running sum of squared differences from the mean. */
+  private double sk;
+  
+  public FullRunningAverageAndStdDev() {
+    mk = 0.0;
+    sk = 0.0;
+    recomputeStdDev();
+  }
+  
+  /**
+   * Restores an average/std-dev from previously captured Welford state.
+   *
+   * @param count number of data points seen so far
+   * @param average running mean of those points
+   * @param mk Welford mean accumulator
+   * @param sk Welford squared-difference accumulator
+   */
+  public FullRunningAverageAndStdDev(int count, double average, double mk, double sk) {
+    super(count, average);
+    this.mk = mk;
+    this.sk = sk;
+    recomputeStdDev();
+  }
+
+  // Fix: synchronized added. mk/sk are written under the object lock in
+  // addDatum/removeDatum; unsynchronized reads had no visibility guarantee.
+  public synchronized double getMk() {
+    return mk;
+  }
+  
+  public synchronized double getSk() {
+    return sk;
+  }
+
+  @Override
+  public synchronized double getStandardDeviation() {
+    return stdDev;
+  }
+  
+  @Override
+  public synchronized void addDatum(double datum) {
+    super.addDatum(datum);
+    int count = getCount();
+    if (count == 1) {
+      mk = datum;
+      sk = 0.0;
+    } else {
+      // Welford's update: mk tracks the mean, sk accumulates squared diffs
+      double oldmk = mk;
+      double diff = datum - oldmk;
+      mk += diff / count;
+      sk += diff * (datum - mk);
+    }
+    recomputeStdDev();
+  }
+  
+  @Override
+  public synchronized void removeDatum(double datum) {
+    int oldCount = getCount();
+    super.removeDatum(datum);
+    // Inverse Welford update. NOTE(review): when oldCount == 1 this divides by
+    // zero, yielding NaN/Infinity rather than throwing -- confirm callers never
+    // remove the final datum, or guard here.
+    double oldmk = mk;
+    mk = (oldCount * oldmk - datum) / (oldCount - 1);
+    sk -= (datum - mk) * (datum - oldmk);
+    recomputeStdDev();
+  }
+  
+  /**
+   * @throws UnsupportedOperationException always; a bare delta cannot update sk
+   */
+  @Override
+  public void changeDatum(double delta) {
+    throw new UnsupportedOperationException();
+  }
+  
+  private synchronized void recomputeStdDev() {
+    int count = getCount();
+    // Sample standard deviation; undefined (NaN) for fewer than two points
+    stdDev = count > 1 ? Math.sqrt(sk / (count - 1)) : Double.NaN;
+  }
+
+  @Override
+  public RunningAverageAndStdDev inverse() {
+    return new InvertedRunningAverageAndStdDev(this);
+  }
+  
+  /** Formats as "average,stdDev". (Fix: dropped a redundant nested String.valueOf.) */
+  @Override
+  public synchronized String toString() {
+    return String.valueOf(getAverage()) + ',' + stdDev;
+  }
+  
+}


[36/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
new file mode 100644
index 0000000..752bb48
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
@@ -0,0 +1,274 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VarIntWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Convert the Mail archives (see {@link org.apache.mahout.text.SequenceFilesFromMailArchives}) to a preference
+ * file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.item.RecommenderJob}.
+ * <p/>
+ * This assumes the input is a Sequence File, that the key is: filename/message id and the value is a list
+ * (separated by the user's choosing) containing the from email and any references
+ * <p/>
+ * The output is a matrix where either the from or to are the rows (represented as longs) and the columns are the
+ * message ids that the user has interacted with (as a VectorWritable).  This class currently does not account for
+ * thread hijacking.
+ * <p/>
+ * It also outputs a side table mapping the row ids to their original and the message ids to the message thread id
+ */
+public final class MailToPrefsDriver extends AbstractJob {
+
+  private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);
+
+  private static final String OUTPUT_FILES_PATTERN = "part-*";
+  private static final int DICTIONARY_BYTE_OVERHEAD = 4;
+
+  /**
+   * Entry point. Delegates to Hadoop's ToolRunner so the standard generic
+   * options (-D, -conf, -fs, ...) are parsed before run(String[]) is invoked.
+   * NOTE(review): the exit status returned by run() is discarded rather than
+   * passed to System.exit -- confirm nothing relies on the process exit code.
+   */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new MailToPrefsDriver(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption("chunkSize", "cs", "The size of chunks to write.  Default is 100 mb", "100");
+    addOption("separator", "sep", "The separator used in the input file to separate to, from, subject.  Default is \\n",
+        "\n");
+    addOption("from", "f", "The position in the input text (value) where the from email is located, starting from "
+        + "zero (0).", "0");
+    addOption("refs", "r", "The position in the input text (value) where the reference ids are located, "
+        + "starting from zero (0).", "1");
+    addOption(buildOption("useCounts", "u", "If set, then use the number of times the user has interacted with a "
+        + "thread as an indication of their preference.  Otherwise, use boolean preferences.", false, false,
+        String.valueOf(true)));
+    Map<String, List<String>> parsedArgs = parseArguments(args);
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    int chunkSize = Integer.parseInt(getOption("chunkSize"));
+    String separator = getOption("separator");
+    Configuration conf = getConf();
+    boolean useCounts = hasOption("useCounts");
+    AtomicInteger currentPhase = new AtomicInteger();
+    int[] msgDim = new int[1];
+    //TODO: mod this to not do so many passes over the data.  Dictionary creation could probably be a chain mapper
+    List<Path> msgIdChunks = null;
+    boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
+    // create the dictionary between message ids and longs
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      //TODO: there seems to be a pattern emerging for dictionary creation
+      // -- sparse vectors from seq files also has this.
+      Path msgIdsPath = new Path(output, "msgIds");
+      if (overwrite) {
+        HadoopUtil.delete(conf, msgIdsPath);
+      }
+      log.info("Creating Msg Id Dictionary");
+      Job createMsgIdDictionary = prepareJob(input,
+              msgIdsPath,
+              SequenceFileInputFormat.class,
+              MsgIdToDictionaryMapper.class,
+              Text.class,
+              VarIntWritable.class,
+              MailToDictionaryReducer.class,
+              Text.class,
+              VarIntWritable.class,
+              SequenceFileOutputFormat.class);
+
+      boolean succeeded = createMsgIdDictionary.waitForCompletion(true);
+      if (!succeeded) {
+        return -1;
+      }
+      //write out the dictionary at the top level
+      msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
+          createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
+    }
+    //create the dictionary between from email addresses and longs
+    List<Path> fromChunks = null;
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Path fromIdsPath = new Path(output, "fromIds");
+      if (overwrite) {
+        HadoopUtil.delete(conf, fromIdsPath);
+      }
+      log.info("Creating From Id Dictionary");
+      Job createFromIdDictionary = prepareJob(input,
+              fromIdsPath,
+              SequenceFileInputFormat.class,
+              FromEmailToDictionaryMapper.class,
+              Text.class,
+              VarIntWritable.class,
+              MailToDictionaryReducer.class,
+              Text.class,
+              VarIntWritable.class,
+              SequenceFileOutputFormat.class);
+      createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
+      boolean succeeded = createFromIdDictionary.waitForCompletion(true);
+      if (!succeeded) {
+        return -1;
+      }
+      //write out the dictionary at the top level
+      int[] fromDim = new int[1];
+      fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
+          createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
+    }
+    //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
+    if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
+      //Job map
+      //may be a way to do this so that we can load the from ids in memory, if they are small enough so that
+      // we don't need the double loop
+      log.info("Creating recommendation matrix");
+      Path vecPath = new Path(output, "recInput");
+      if (overwrite) {
+        HadoopUtil.delete(conf, vecPath);
+      }
+      //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
+      conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
+      conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
+      conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
+      conf.set(EmailUtility.FROM_INDEX, getOption("from"));
+      conf.set(EmailUtility.REFS_INDEX, getOption("refs"));
+      conf.set(EmailUtility.SEPARATOR, separator);
+      conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
+      int j = 0;
+      int i = 0;
+      for (Path fromChunk : fromChunks) {
+        for (Path idChunk : msgIdChunks) {
+          Path out = new Path(vecPath, "tmp-" + i + '-' + j);
+          DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), idChunk.toUri()}, conf);
+          Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
+                  MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class, Text.class,
+                  NullWritable.class, TextOutputFormat.class);
+          createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
+          boolean succeeded = createRecMatrix.waitForCompletion(true);
+          if (!succeeded) {
+            return -1;
+          }
+          //copy the results up a level
+          //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true,
+          // conf, "");
+          FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null,
+              conf);
+          for (int k = 0; k < fs.length; k++) {
+            FileStatus f = fs[k];
+            Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
+            FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf), outPath, true,
+                overwrite, conf);
+          }
+          HadoopUtil.delete(conf, out);
+          j++;
+        }
+        i++;
+      }
+      //concat the files together
+      /*Path mergePath = new Path(output, "vectors.dat");
+      if (overwrite) {
+        HadoopUtil.delete(conf, mergePath);
+      }
+      log.info("Merging together output vectors to vectors.dat in {}", output);*/
+      //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath,
+      // false, conf, "\n");
+    }
+
+    return 0;
+  }
+
+  /**
+   * Splits the dictionary entries found under {@code inputPath} into a series of
+   * SequenceFile "chunk" files under {@code dictionaryPathBase}, assigning every
+   * key a sequential int id as it is written.
+   *
+   * @param inputPath directory whose files (matched by OUTPUT_FILES_PATTERN) hold the
+   *        dictionary entries to re-chunk; only the keys are used
+   * @param dictionaryPathBase directory in which the chunk files are created
+   * @param name file-name prefix for each chunk; the chunk index is appended
+   * @param baseConf configuration cloned for filesystem and reader/writer access
+   * @param chunkSizeInMegabytes soft cap on each chunk's size; a chunk may exceed it
+   *        by one entry because the limit is checked before each append
+   * @param maxTermDimension single-element out-parameter: on return, element 0 holds
+   *        one past the largest id assigned (ids start at 1 so that a miss in an
+   *        OpenObjectIntHashMap, which returns 0, is distinguishable from a real id)
+   * @return the paths of all chunk files written, in creation order
+   */
+  private static List<Path> createDictionaryChunks(Path inputPath,
+                                                   Path dictionaryPathBase,
+                                                   String name,
+                                                   Configuration baseConf,
+                                                   int chunkSizeInMegabytes, int[] maxTermDimension)
+    throws IOException {
+    List<Path> chunkPaths = new ArrayList<>();
+
+    Configuration conf = new Configuration(baseConf);
+
+    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
+    int chunkIndex = 0;
+    Path chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
+    chunkPaths.add(chunkPath);
+
+    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
+
+    try {
+      long currentChunkSize = 0;
+      Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
+      int i = 1; //start at 1, since a miss in the OpenObjectIntHashMap returns a 0
+      for (Pair<Writable, Writable> record
+              : new SequenceFileDirIterable<>(filesPattern, PathType.GLOB, null, null, true, conf)) {
+        // Rotate to a new chunk file once the current one has passed the size cap.
+        if (currentChunkSize > chunkSizeLimit) {
+          Closeables.close(dictWriter, false);
+          chunkIndex++;
+
+          chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
+          chunkPaths.add(chunkPath);
+
+          dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
+          currentChunkSize = 0;
+        }
+
+        Writable key = record.getFirst();
+        // Rough per-entry size estimate: fixed overhead + UTF-16 chars + the int value.
+        int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
+        currentChunkSize += fieldSize;
+        dictWriter.append(key, new IntWritable(i++));
+      }
+      // i is now one past the last id handed out; callers use it as the dimension.
+      maxTermDimension[0] = i;
+    } finally {
+      // Safe even if the loop already closed this writer on a rotation boundary.
+      Closeables.close(dictWriter, false);
+    }
+
+    return chunkPaths;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
new file mode 100644
index 0000000..91bbd17
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+/**
+ * Maps a (path, mail-body) pair to a ("fromId,msgId", 1) preference record, resolving
+ * both the sender address and the message id through the dictionaries built earlier
+ * in the pipeline. A dictionary miss returns 0 (ids start at 1), so 0 is never a
+ * valid id and such records are dropped.
+ */
+public final class MailToRecMapper extends Mapper<Text, Text, Text, LongWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(MailToRecMapper.class);
+
+  private final OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<>();
+  private final OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<>();
+  private String separator = "\n";
+  private int fromIdx;   // index of the "from" field within the split value
+  private int refsIdx;   // index of the "references" field within the split value
+
+  public enum Counters {
+    REFERENCE, ORIGINAL
+  }
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Configuration conf = context.getConfiguration();
+    String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
+    String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
+    fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
+    refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
+    EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix, msgIdDictionary);
+    log.info("From Dictionary size: {} Msg Id Dictionary size: {}", fromDictionary.size(), msgIdDictionary.size());
+    separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
+  }
+
+  @Override
+  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+
+    // 0 doubles as "not found" for dictionary lookups; real ids start at 1.
+    int msgIdKey = 0;
+    int fromKey = 0;
+    String valStr = value.toString();
+    String[] splits = StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
+
+    if (splits != null && splits.length > 0) {
+      // BUG FIX: guard on fromIdx, not refsIdx -- the "from" field can be present
+      // even when the record is too short to carry the references field.
+      if (splits.length > fromIdx) {
+        String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
+        fromKey = fromDictionary.get(from);
+      }
+      //get the references
+      if (splits.length > refsIdx) {
+        String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
+        if (theRefs != null && theRefs.length > 0) {
+          //we have a reference, the first one is the original message id, so map to that one if it exists
+          msgIdKey = msgIdDictionary.get(theRefs[0]);
+          context.getCounter(Counters.REFERENCE).increment(1);
+        }
+      }
+    }
+    // No usable reference (missing, or not in the dictionary): fall back to the
+    // message's own id, which lives after the last '/' in the key path.
+    if (msgIdKey <= 0) {
+      String keyStr = key.toString();
+      int idx = keyStr.lastIndexOf('/');
+      if (idx != -1) {
+        String msgId = keyStr.substring(idx + 1);
+        msgIdKey = msgIdDictionary.get(msgId);
+        context.getCounter(Counters.ORIGINAL).increment(1);
+      }
+    }
+
+    // BUG FIX: previously the filter only excluded Integer.MIN_VALUE, so a
+    // dictionary miss (0) slipped through and emitted a nonexistent id 0.
+    if (msgIdKey > 0 && fromKey > 0) {
+      context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
new file mode 100644
index 0000000..ee36a41
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ * Emits one line per (user,item) pair, optionally appending the interaction count.
+ * Output key is "fromId,msgId[,count]" with a null value, i.e. plain text lines.
+ */
+public class MailToRecReducer extends Reducer<Text, LongWritable, Text, NullWritable> {
+  //if true, then output weight
+  private boolean useCounts = true;
+  /**
+   * We can either ignore how many times the user interacted (boolean) or output the number of times they interacted.
+   */
+  public static final String USE_COUNTS_PREFERENCE = "useBooleanPreferences";
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    useCounts = context.getConfiguration().getBoolean(USE_COUNTS_PREFERENCE, true);
+  }
+
+  @Override
+  protected void reduce(Text key, Iterable<LongWritable> values, Context context)
+    throws IOException, InterruptedException {
+    if (useCounts) {
+      // BUG FIX: sum the emitted values instead of merely counting them (the old
+      // "sum++" ignored each value). The result is identical while the mapper
+      // emits 1s, but stays correct if a combiner pre-aggregates counts.
+      long sum = 0;
+      for (LongWritable value : values) {
+        sum += value.get();
+      }
+      context.write(new Text(key.toString() + ',' + sum), null);
+    } else {
+      context.write(new Text(key.toString()), null);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
new file mode 100644
index 0000000..f3de847
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
+ */
+/**
+ * Extracts the message id from each input key and emits (msgId, 1) so a reducer can
+ * build the message-id dictionary. Assumes the input was produced by
+ * {@link org.apache.mahout.text.SequenceFilesFromMailArchives}.
+ */
+public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
+
+  @Override
+  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+    // The message id lives in the key, e.g. /201008/AANLkTik...@mail.gmail.com
+    String path = key.toString();
+    int atPos = path.lastIndexOf('@');
+    if (atPos == -1) {
+      // No '@' anywhere: this key carries no recognizable message id.
+      context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
+      return;
+    }
+    // Take everything after the last '/' that precedes the '@' (the whole key if none).
+    String msgId = path.substring(path.lastIndexOf('/', atPos) + 1);
+    if (EmailUtility.WHITESPACE.matcher(msgId).matches()) {
+      // A blank id is as useless as a missing one.
+      context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
+    } else {
+      context.write(new Text(msgId), new VarIntWritable(1));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
new file mode 100644
index 0000000..c358021
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+
+public final class DataFileIterable implements Iterable<Pair<PreferenceArray,long[]>> {
+
+  private final File dataFile;
+
+  public DataFileIterable(File dataFile) {
+    this.dataFile = dataFile;
+  }
+
+  @Override
+  public Iterator<Pair<PreferenceArray, long[]>> iterator() {
+    try {
+      return new DataFileIterator(dataFile);
+    } catch (IOException ioe) {
+      throw new IllegalStateException(ioe);
+    }
+  }
+ 
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
new file mode 100644
index 0000000..786e080
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>An {@link java.util.Iterator} which iterates over any of the KDD Cup's rating files. These include the files
+ * {train,test,validation}Idx{1,2}}.txt. See http://kddcup.yahoo.com/. Each element in the iteration corresponds
+ * to one user's ratings as a {@link PreferenceArray} and corresponding timestamps as a parallel {@code long}
+ * array.</p>
+ *
+ * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
+ * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
+ */
+public final class DataFileIterator
+    extends AbstractIterator<Pair<PreferenceArray,long[]>>
+    implements SkippingIterator<Pair<PreferenceArray,long[]>>, Closeable {
+
+  private static final Pattern COLON_PATTERN = Pattern.compile(":");
+  private static final Pattern PIPE_PATTERN = Pattern.compile("\\|");
+  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
+
+  private final FileLineIterator lineIterator;
+
+  private static final Logger log = LoggerFactory.getLogger(DataFileIterator.class);
+
+  public DataFileIterator(File dataFile) throws IOException {
+    if (dataFile == null || dataFile.isDirectory() || !dataFile.exists()) {
+      throw new IllegalArgumentException("Bad data file: " + dataFile);
+    }
+    lineIterator = new FileLineIterator(dataFile);
+  }
+
+  /**
+   * Reads one user's block: a "userID|ratingsCount" header line followed by
+   * ratingsCount data lines, and returns the preferences plus parallel timestamps.
+   */
+  @Override
+  protected Pair<PreferenceArray, long[]> computeNext() {
+
+    if (!lineIterator.hasNext()) {
+      return endOfData();
+    }
+
+    String line = lineIterator.next();
+    // First a userID|ratingsCount line
+    String[] tokens = PIPE_PATTERN.split(line);
+
+    long userID = Long.parseLong(tokens[0]);
+    int ratingsLeftToRead = Integer.parseInt(tokens[1]);
+    int ratingsRead = 0;
+
+    PreferenceArray currentUserPrefs = new GenericUserPreferenceArray(ratingsLeftToRead);
+    long[] timestamps = new long[ratingsLeftToRead];
+
+    while (ratingsLeftToRead > 0) {
+
+      line = lineIterator.next();
+
+      // Then a data line. May be 1-4 tokens depending on whether preference info is included (it's not in test data)
+      // or whether date info is included (not included in track 2). Item ID is always first, and date is the last
+      // two fields if it exists.
+      tokens = TAB_PATTERN.split(line);
+      boolean hasPref = tokens.length == 2 || tokens.length == 4;
+      boolean hasDate = tokens.length > 2;
+
+      long itemID = Long.parseLong(tokens[0]);
+
+      currentUserPrefs.setUserID(0, userID);
+      currentUserPrefs.setItemID(ratingsRead, itemID);
+      if (hasPref) {
+        float preference = Float.parseFloat(tokens[1]);
+        currentUserPrefs.setValue(ratingsRead, preference);
+      }
+
+      if (hasDate) {
+        // Date occupies the last two tokens; its position shifts if a preference is present.
+        long timestamp;
+        if (hasPref) {
+          timestamp = parseFakeTimestamp(tokens[2], tokens[3]);
+        } else {
+          timestamp = parseFakeTimestamp(tokens[1], tokens[2]);
+        }
+        timestamps[ratingsRead] = timestamp;
+      }
+
+      ratingsRead++;
+      ratingsLeftToRead--;
+    }
+
+    return new Pair<>(currentUserPrefs, timestamps);
+  }
+
+  /** Skips the next {@code n} users' blocks (header line plus its data lines each). */
+  @Override
+  public void skip(int n) {
+    for (int i = 0; i < n; i++) {
+      if (lineIterator.hasNext()) {
+        String line = lineIterator.next();
+        // First a userID|ratingsCount line
+        String[] tokens = PIPE_PATTERN.split(line);
+        int linesToSkip = Integer.parseInt(tokens[1]);
+        lineIterator.skip(linesToSkip);
+      } else {
+        break;
+      }
+    }
+  }
+
+  @Override
+  public void close() {
+    endOfData();
+    try {
+      Closeables.close(lineIterator, true);
+    } catch (IOException e) {
+      log.error(e.getMessage(), e);
+    }
+  }
+
+  /**
+   * @param dateString "date" in days since some undisclosed date, which we will arbitrarily assume to be the
+   *  epoch, January 1 1970.
+   * @param timeString time of day in HH:mm:ss format
+   * @return the UNIX timestamp for this moment in time
+   */
+  private static long parseFakeTimestamp(String dateString, CharSequence timeString) {
+    int days = Integer.parseInt(dateString);
+    String[] timeTokens = COLON_PATTERN.split(timeString);
+    int hours = Integer.parseInt(timeTokens[0]);
+    int minutes = Integer.parseInt(timeTokens[1]);
+    int seconds = Integer.parseInt(timeTokens[2]);
+    // BUG FIX: hours were previously ADDED to 3600 ("3600L + hours") instead of
+    // multiplied, corrupting every parsed timestamp. HH:mm:ss converts as
+    // 3600*hours + 60*minutes + seconds on top of 86400 seconds per day.
+    return 86400L * days + 3600L * hours + 60L * minutes + seconds;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
new file mode 100644
index 0000000..4b62050
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
@@ -0,0 +1,231 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.SamplingIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>An {@link DataModel} which reads into memory any of the KDD Cup's rating files; it is really
+ * meant for use with training data in the files trainIdx{1,2}}.txt.
+ * See http://kddcup.yahoo.com/.</p>
+ *
+ * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
+ * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
+ */
+public final class KDDCupDataModel implements DataModel {
+
+  private static final Logger log = LoggerFactory.getLogger(KDDCupDataModel.class);
+
+  private final File dataFileDirectory;
+  private final DataModel delegate;
+
+  /**
+   * @param dataFile training rating file
+   */
+  public KDDCupDataModel(File dataFile) throws IOException {
+    this(dataFile, false, 1.0);
+  }
+
+  /**
+   * @param dataFile training rating file
+   * @param storeDates if true, dates are parsed and stored, otherwise not
+   * @param samplingRate percentage of users to keep; can be used to reduce memory requirements
+   */
+  public KDDCupDataModel(File dataFile, boolean storeDates, double samplingRate) throws IOException {
+
+    Preconditions.checkArgument(!Double.isNaN(samplingRate) && samplingRate > 0.0 && samplingRate <= 1.0,
+        "Must be: 0.0 < samplingRate <= 1.0");
+
+    dataFileDirectory = dataFile.getParentFile();
+
+    Iterator<Pair<PreferenceArray,long[]>> dataIterator = new DataFileIterator(dataFile);
+    if (samplingRate < 1.0) {
+      dataIterator = new SamplingIterator<>(dataIterator, samplingRate);
+    }
+
+    FastByIDMap<PreferenceArray> userData = new FastByIDMap<>();
+    FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();
+
+    while (dataIterator.hasNext()) {
+
+      Pair<PreferenceArray,long[]> pair = dataIterator.next();
+      PreferenceArray userPrefs = pair.getFirst();
+      long[] timestampsForPrefs = pair.getSecond();
+
+      userData.put(userPrefs.getUserID(0), userPrefs);
+      if (storeDates) {
+        FastByIDMap<Long> itemTimestamps = new FastByIDMap<>();
+        for (int i = 0; i < timestampsForPrefs.length; i++) {
+          long timestamp = timestampsForPrefs[i];
+          if (timestamp > 0L) {
+            itemTimestamps.put(userPrefs.getItemID(i), timestamp);
+          }
+        }
+      }
+
+    }
+
+    if (storeDates) {
+      delegate = new GenericDataModel(userData, timestamps);
+    } else {
+      delegate = new GenericDataModel(userData);
+    }
+
+    Runtime runtime = Runtime.getRuntime();
+    log.info("Loaded data model in about {}MB heap", (runtime.totalMemory() - runtime.freeMemory()) / 1000000);
+  }
+
+  public File getDataFileDirectory() {
+    return dataFileDirectory;
+  }
+
+  public static File getTrainingFile(File dataFileDirectory) {
+    return getFile(dataFileDirectory, "trainIdx");
+  }
+
+  public static File getValidationFile(File dataFileDirectory) {
+    return getFile(dataFileDirectory, "validationIdx");
+  }
+
+  public static File getTestFile(File dataFileDirectory) {
+    return getFile(dataFileDirectory, "testIdx");
+  }
+
+  public static File getTrackFile(File dataFileDirectory) {
+    return getFile(dataFileDirectory, "trackData");
+  }
+
+  private static File getFile(File dataFileDirectory, String prefix) {
+    // Works on set 1 or 2
+    for (int set : new int[] {1,2}) {
+      // Works on sample data from before contest or real data
+      for (String firstLinesOrNot : new String[] {"", ".firstLines"}) {
+        for (String gzippedOrNot : new String[] {".gz", ""}) {
+          File dataFile = new File(dataFileDirectory, prefix + set + firstLinesOrNot + ".txt" + gzippedOrNot);
+          if (dataFile.exists()) {
+            return dataFile;
+          }
+        }
+      }
+    }
+    throw new IllegalArgumentException("Can't find " + prefix + " file in " + dataFileDirectory);
+  }
+
+  @Override
+  public LongPrimitiveIterator getUserIDs() throws TasteException {
+    return delegate.getUserIDs();
+  }
+
+  @Override
+  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+    return delegate.getPreferencesFromUser(userID);
+  }
+
+  @Override
+  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+    return delegate.getItemIDsFromUser(userID);
+  }
+
+  @Override
+  public LongPrimitiveIterator getItemIDs() throws TasteException {
+    return delegate.getItemIDs();
+  }
+
+  @Override
+  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+    return delegate.getPreferencesForItem(itemID);
+  }
+
+  @Override
+  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+    return delegate.getPreferenceValue(userID, itemID);
+  }
+
+  @Override
+  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+    return delegate.getPreferenceTime(userID, itemID);
+  }
+
+  @Override
+  public int getNumItems() throws TasteException {
+    return delegate.getNumItems();
+  }
+
+  @Override
+  public int getNumUsers() throws TasteException {
+    return delegate.getNumUsers();
+  }
+
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+    return delegate.getNumUsersWithPreferenceFor(itemID);
+  }
+
+  @Override
+  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+    return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
+  }
+
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    delegate.setPreference(userID, itemID, value);
+  }
+
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    delegate.removePreference(userID, itemID);
+  }
+
+  @Override
+  public boolean hasPreferenceValues() {
+    return delegate.hasPreferenceValues();
+  }
+
+  @Override
+  public float getMaxPreference() {
+    return 100.0f;
+  }
+
+  @Override
+  public float getMinPreference() {
+    return 0.0f;
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    // do nothing
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
new file mode 100644
index 0000000..3f4a732
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * <p>This class converts a KDD Cup input file into a compressed CSV format. The output format is
+ * {@code userID,itemID,score,timestamp}. It can optionally restrict its output to exclude
+ * score and/or timestamp.</p>
+ *
+ * <p>Run as: {@code ToCSV (input file) (output file) [num columns to output]}</p>
+ */
+public final class ToCSV {
+
+  private ToCSV() {
+  }
+
+  /**
+   * Converts a KDD Cup input file into a gzipped CSV file.
+   *
+   * @param args args[0] = input file, args[1] = output file; optional args[2] =
+   *             number of columns to write (default 4: userID,itemID,score,timestamp)
+   * @throws IllegalArgumentException if fewer than two arguments are supplied
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length < 2) {
+      throw new IllegalArgumentException("Usage: ToCSV (input file) (output file) [num columns to output]");
+    }
+    File inputFile = new File(args[0]);
+    File outputFile = new File(args[1]);
+    int columnsToOutput = 4;
+    if (args.length >= 3) {
+      columnsToOutput = Integer.parseInt(args[2]);
+    }
+
+    // Construct the whole stream chain inside try-with-resources so the
+    // underlying gzip stream is closed even if a later constructor throws.
+    try (Writer outWriter = new BufferedWriter(new OutputStreamWriter(
+        new GZIPOutputStream(new FileOutputStream(outputFile)), Charsets.UTF_8))) {
+      for (Pair<PreferenceArray,long[]> user : new DataFileIterable(inputFile)) {
+        PreferenceArray prefs = user.getFirst();
+        long[] timestamps = user.getSecond();
+        for (int i = 0; i < prefs.length(); i++) {
+          // userID and itemID are always written; score and timestamp only
+          // when columnsToOutput permits
+          outWriter.write(String.valueOf(prefs.getUserID(i)));
+          outWriter.write(',');
+          outWriter.write(String.valueOf(prefs.getItemID(i)));
+          if (columnsToOutput > 2) {
+            outWriter.write(',');
+            outWriter.write(String.valueOf(prefs.getValue(i)));
+          }
+          if (columnsToOutput > 3) {
+            outWriter.write(',');
+            outWriter.write(String.valueOf(timestamps[i]));
+          }
+          outWriter.write('\n');
+        }
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
new file mode 100644
index 0000000..0112ab9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class EstimateConverter {
+
+  private static final Logger log = LoggerFactory.getLogger(EstimateConverter.class);
+
+  private EstimateConverter() {}
+
+  /**
+   * Scales an estimate in [0,100] to a single byte in [0,255].
+   * A NaN estimate (no estimate could be computed) maps to 0x7F.
+   *
+   * @param estimate raw preference estimate, expected on a 0-100 scale
+   * @param userID   user the estimate was computed for (used only for logging)
+   * @param itemID   item the estimate was computed for (used only for logging)
+   * @return the scaled, clamped byte value
+   */
+  public static byte convert(double estimate, long userID, long itemID) {
+    if (Double.isNaN(estimate)) {
+      log.warn("Unable to compute estimate for user {}, item {}", userID, itemID);
+      return 0x7F;
+    }
+    // Scale 0-100 up to 0-255 and clamp into the valid unsigned-byte range
+    int scaled = Math.max(0, Math.min(255, (int) (estimate * 2.55)));
+    return (byte) scaled;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
new file mode 100644
index 0000000..72056da
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+final class Track1Callable implements Callable<byte[]> {
+
+  private static final Logger log = LoggerFactory.getLogger(Track1Callable.class);
+  // Shared across all callables: counts users processed so far, for progress logging
+  private static final AtomicInteger COUNT = new AtomicInteger();
+
+  private final Recommender recommender;
+  private final PreferenceArray userTest;
+
+  Track1Callable(Recommender recommender, PreferenceArray userTest) {
+    this.recommender = recommender;
+    this.userTest = userTest;
+  }
+
+  /**
+   * Estimates a preference for every item in this user's test array and encodes
+   * each estimate as one byte. Items unknown to the model leave their byte at 0.
+   *
+   * @return one encoded estimate byte per test item, in the test array's order
+   */
+  @Override
+  public byte[] call() throws TasteException {
+    long userID = userTest.get(0).getUserID();
+    int numItems = userTest.length();
+    byte[] estimates = new byte[numItems];
+    for (int i = 0; i < numItems; i++) {
+      long itemID = userTest.getItemID(i);
+      double estimate;
+      try {
+        estimate = recommender.estimatePreference(userID, itemID);
+      } catch (NoSuchItemException nsie) {
+        // OK in the sample data provided before the contest, should never happen otherwise
+        log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
+        continue;
+      }
+      estimates[i] = EstimateConverter.convert(estimate, userID, itemID);
+    }
+
+    // Log progress every 10,000 completed users
+    if (COUNT.incrementAndGet() % 10000 == 0) {
+      log.info("Completed {} users", COUNT.get());
+    }
+
+    return estimates;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
new file mode 100644
index 0000000..067daf5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.UncenteredCosineSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+public final class Track1Recommender implements Recommender {
+
+  /** Underlying recommender to which every operation is delegated. */
+  private final Recommender delegate;
+
+  /**
+   * Builds the delegate: an item-based recommender using uncentered cosine
+   * similarity over the given data model.
+   */
+  public Track1Recommender(DataModel dataModel) throws TasteException {
+    // Change this to whatever you like!
+    delegate = new GenericItemBasedRecommender(dataModel, new UncenteredCosineSimilarity(dataModel));
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+    return delegate.recommend(userID, howMany);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+    return recommend(userID, howMany, null, includeKnownItems);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+    // This overload excludes items the user already knows
+    return delegate.recommend(userID, howMany, rescorer, false);
+  }
+
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    return delegate.recommend(userID, howMany, rescorer, includeKnownItems);
+  }
+
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    return delegate.estimatePreference(userID, itemID);
+  }
+
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    delegate.setPreference(userID, itemID, value);
+  }
+
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    delegate.removePreference(userID, itemID);
+  }
+
+  @Override
+  public DataModel getDataModel() {
+    return delegate.getDataModel();
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    delegate.refresh(alreadyRefreshed);
+  }
+
+  @Override
+  public String toString() {
+    return "Track1Recommender[recommender:" + delegate + ']';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
new file mode 100644
index 0000000..6b9fe1b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+final class Track1RecommenderBuilder implements RecommenderBuilder {
+  
+  // Builds a fresh Track1Recommender for each evaluation run over the given model.
+  @Override
+  public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+    return new Track1Recommender(dataModel);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
new file mode 100644
index 0000000..bcd0a3d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.eval.AbstractDifferenceRecommenderEvaluator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Attempts to run an evaluation just like that dictated for Yahoo's KDD Cup, Track 1.
+ * It will compute the RMSE of a validation data set against the predicted ratings from
+ * the training data set.
+ */
+public final class Track1RecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator {
+
+  private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluator.class);
+
+  // Running average of squared differences; reset() replaces it per evaluation run
+  private RunningAverage average;
+  // Directory containing the KDD Cup training and validation files
+  private final File dataFileDirectory;
+
+  public Track1RecommenderEvaluator(File dataFileDirectory) {
+    // Contest ratings are on a 0-100 scale; the parent clamps estimates to this range
+    setMaxPreference(100.0f);
+    setMinPreference(0.0f);
+    average = new FullRunningAverage();
+    this.dataFileDirectory = dataFileDirectory;
+  }
+
+  /**
+   * Evaluates the recommender built from {@code dataModel} against the fixed
+   * validation file in the data directory, returning the RMSE of its estimates.
+   *
+   * NOTE(review): trainingPercentage and evaluationPercentage are never read here —
+   * the contest's validation file defines the held-out set, so callers pass NaN.
+   * dataModelBuilder is likewise unused.
+   */
+  @Override
+  public double evaluate(RecommenderBuilder recommenderBuilder,
+                         DataModelBuilder dataModelBuilder,
+                         DataModel dataModel,
+                         double trainingPercentage,
+                         double evaluationPercentage) throws TasteException {
+
+    Recommender recommender = recommenderBuilder.buildRecommender(dataModel);
+
+    // One callable per validation user; each estimates that user's held-out prefs
+    Collection<Callable<Void>> estimateCallables = Lists.newArrayList();
+    AtomicInteger noEstimateCounter = new AtomicInteger();
+    for (Pair<PreferenceArray,long[]> userData
+        : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
+      PreferenceArray validationPrefs = userData.getFirst();
+      long userID = validationPrefs.get(0).getUserID();
+      estimateCallables.add(
+          new PreferenceEstimateCallable(recommender, userID, validationPrefs, noEstimateCounter));
+    }
+
+    // execute() is inherited from the parent; it runs the callables and feeds
+    // each estimate through processOneEstimate() below
+    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
+    execute(estimateCallables, noEstimateCounter, timing);
+
+    double result = computeFinalEvaluation();
+    log.info("Evaluation result: {}", result);
+    return result;
+  }
+
+  // Use RMSE scoring:
+
+  @Override
+  protected void reset() {
+    average = new FullRunningAverage();
+  }
+
+  @Override
+  protected void processOneEstimate(float estimatedPreference, Preference realPref) {
+    // Accumulate squared error; the square root is taken at the end
+    double diff = realPref.getValue() - estimatedPreference;
+    average.addDatum(diff * diff);
+  }
+
+  @Override
+  protected double computeFinalEvaluation() {
+    return Math.sqrt(average.getAverage());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
new file mode 100644
index 0000000..deadc00
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.cli2.OptionException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.example.TasteOptionParser;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class Track1RecommenderEvaluatorRunner {
+
+  private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluatorRunner.class);
+
+  private Track1RecommenderEvaluatorRunner() {
+  }
+
+  /**
+   * Parses the data directory from the command line, runs the Track 1 RMSE
+   * evaluation against the training file, and logs the result.
+   */
+  public static void main(String... args) throws IOException, TasteException, OptionException {
+    File dir = checkDataFileDirectory(TasteOptionParser.getRatings(args));
+    Track1RecommenderEvaluator evaluator = new Track1RecommenderEvaluator(dir);
+    DataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dir));
+    // The percentage arguments are unused by this evaluator, hence NaN
+    double evaluation = evaluator.evaluate(new Track1RecommenderBuilder(),
+      null,
+      model,
+      Float.NaN,
+      Float.NaN);
+    log.info(String.valueOf(evaluation));
+  }
+
+  /** Ensures the parsed path is non-null, exists, and is a directory. */
+  private static File checkDataFileDirectory(File dataFileDirectory) {
+    if (dataFileDirectory == null) {
+      throw new IllegalArgumentException("No data directory");
+    }
+    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+    }
+    return dataFileDirectory;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
new file mode 100644
index 0000000..a0ff126
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+/**
+ * <p>Runs "track 1" of the KDD Cup competition using whatever recommender is inside {@link Track1Recommender}
+ * and attempts to output the result in the correct contest format.</p>
+ *
+ * <p>Run as: {@code Track1Runner [track 1 data file directory] [output file]}</p>
+ */
+public final class Track1Runner {
+
+  private static final Logger log = LoggerFactory.getLogger(Track1Runner.class);
+
+  private Track1Runner() {
+  }
+
+  /**
+   * Loads the training data, estimates a preference byte for every test item of
+   * every test user in parallel, and writes the raw bytes to the output file in
+   * the contest's expected order.
+   *
+   * args[0] = data file directory, args[1] = output file
+   */
+  public static void main(String[] args) throws Exception {
+
+    File dataFileDirectory = new File(args[0]);
+    if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+      throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+    }
+
+    long start = System.currentTimeMillis();
+
+    KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
+    Track1Recommender recommender = new Track1Recommender(model);
+
+    long end = System.currentTimeMillis();
+    log.info("Loaded model in {}s", (end - start) / 1000);
+    start = end;
+
+    // One callable per test user, in file order (invokeAll preserves this order)
+    Collection<Track1Callable> callables = new ArrayList<>();
+    for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
+      PreferenceArray userTest = tests.getFirst();
+      callables.add(new Track1Callable(recommender, userTest));
+    }
+
+    int cores = Runtime.getRuntime().availableProcessors();
+    log.info("Running on {} cores", cores);
+    ExecutorService executor = Executors.newFixedThreadPool(cores);
+    // invokeAll blocks until every callable has completed
+    List<Future<byte[]>> results = executor.invokeAll(callables);
+    executor.shutdown();
+
+    end = System.currentTimeMillis();
+    log.info("Ran recommendations in {}s", (end - start) / 1000);
+    start = end;
+
+    // Write each user's estimate bytes consecutively, preserving input order
+    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
+      for (Future<byte[]> result : results) {
+        for (byte estimate : result.get()) {
+          out.write(estimate);
+        }
+      }
+    }
+
+    end = System.currentTimeMillis();
+    log.info("Wrote output in {}s", (end - start) / 1000);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
new file mode 100644
index 0000000..022d78c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * can be used to drop {@link DataModel}s into {@link ParallelArraysSGDFactorizer}
+ */
+public class DataModelFactorizablePreferences implements FactorizablePreferences {
+
+  /** All user IDs seen in the data model. */
+  private final FastIDSet userIDs;
+  /** All item IDs seen in the data model. */
+  private final FastIDSet itemIDs;
+
+  /** Flat list of every (user, item, value) preference. */
+  private final List<Preference> preferences;
+
+  private final float minPreference;
+  private final float maxPreference;
+
+  /**
+   * Snapshots the given {@link DataModel} into the flat structures required by
+   * {@link ParallelArraysSGDFactorizer}.
+   *
+   * @throws IllegalStateException if reading the data model fails
+   */
+  public DataModelFactorizablePreferences(DataModel dataModel) {
+
+    minPreference = dataModel.getMinPreference();
+    maxPreference = dataModel.getMaxPreference();
+
+    try {
+      userIDs = new FastIDSet(dataModel.getNumUsers());
+      itemIDs = new FastIDSet(dataModel.getNumItems());
+      preferences = new ArrayList<>();
+
+      LongPrimitiveIterator allUserIDs = dataModel.getUserIDs();
+      while (allUserIDs.hasNext()) {
+        long userID = allUserIDs.nextLong();
+        userIDs.add(userID);
+        for (Preference pref : dataModel.getPreferencesFromUser(userID)) {
+          long itemID = pref.getItemID();
+          itemIDs.add(itemID);
+          preferences.add(new GenericPreference(userID, itemID, pref.getValue()));
+        }
+      }
+    } catch (TasteException te) {
+      throw new IllegalStateException("Unable to create factorizable preferences!", te);
+    }
+  }
+
+  @Override
+  public LongPrimitiveIterator getUserIDs() {
+    return userIDs.iterator();
+  }
+
+  @Override
+  public LongPrimitiveIterator getItemIDs() {
+    return itemIDs.iterator();
+  }
+
+  @Override
+  public Iterable<Preference> getPreferences() {
+    return preferences;
+  }
+
+  @Override
+  public float getMinPreference() {
+    return minPreference;
+  }
+
+  @Override
+  public float getMaxPreference() {
+    return maxPreference;
+  }
+
+  @Override
+  public int numUsers() {
+    return userIDs.size();
+  }
+
+  @Override
+  public int numItems() {
+    return itemIDs.size();
+  }
+
+  @Override
+  public int numPreferences() {
+    return preferences.size();
+  }
+}
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
new file mode 100644
index 0000000..a126dec
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.Preference;
+
/**
 * Models the necessary input for {@link ParallelArraysSGDFactorizer}: the sets of user and
 * item IDs, the preferences themselves, and the bounds of the preference value scale.
 */
public interface FactorizablePreferences {

  /** @return an iterator over all user IDs in this data */
  LongPrimitiveIterator getUserIDs();

  /** @return an iterator over all item IDs in this data */
  LongPrimitiveIterator getItemIDs();

  /** @return all (user, item, value) preferences */
  Iterable<Preference> getPreferences();

  /** @return the smallest possible preference value */
  float getMinPreference();

  /** @return the largest possible preference value */
  float getMaxPreference();

  /** @return the number of distinct users */
  int numUsers();

  /** @return the number of distinct items */
  int numItems();

  /** @return the total number of preferences */
  int numPreferences();

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
new file mode 100644
index 0000000..6dcef6b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+
+import java.io.File;
+
+public class KDDCupFactorizablePreferences implements FactorizablePreferences {
+
+  private final File dataFile;
+
+  public KDDCupFactorizablePreferences(File dataFile) {
+    this.dataFile = dataFile;
+  }
+
+  @Override
+  public LongPrimitiveIterator getUserIDs() {
+    return new FixedSizeLongIterator(numUsers());
+  }
+
+  @Override
+  public LongPrimitiveIterator getItemIDs() {
+    return new FixedSizeLongIterator(numItems());
+  }
+
+  @Override
+  public Iterable<Preference> getPreferences() {
+    Iterable<Iterable<Preference>> prefIterators =
+        Iterables.transform(new DataFileIterable(dataFile),
+          new Function<Pair<PreferenceArray,long[]>,Iterable<Preference>>() {
+            @Override
+            public Iterable<Preference> apply(Pair<PreferenceArray,long[]> from) {
+              return from.getFirst();
+            }
+          });
+    return Iterables.concat(prefIterators);
+  }
+
+  @Override
+  public float getMinPreference() {
+    return 0;
+  }
+
+  @Override
+  public float getMaxPreference() {
+    return 100;
+  }
+
+  @Override
+  public int numUsers() {
+    return 1000990;
+  }
+
+  @Override
+  public int numItems() {
+    return 624961;
+  }
+
+  @Override
+  public int numPreferences() {
+    return 252800275;
+  }
+
+  static class FixedSizeLongIterator extends AbstractLongPrimitiveIterator {
+
+    private long currentValue;
+    private final long maximum;
+
+    FixedSizeLongIterator(long maximum) {
+      this.maximum = maximum;
+      currentValue = 0;
+    }
+
+    @Override
+    public long nextLong() {
+      return currentValue++;
+    }
+
+    @Override
+    public long peek() {
+      return currentValue;
+    }
+
+    @Override
+    public void skip(int n) {
+      currentValue += n;
+    }
+
+    @Override
+    public boolean hasNext() {
+      return currentValue < maximum;
+    }
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+}


[44/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
deleted file mode 100644
index f4b8bcb..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.io.Resources;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-/**
- * Train a logistic regression for the examples from Chapter 13 of Mahout in Action
- */
-public final class TrainLogistic {
-
-  private static String inputFile;
-  private static String outputFile;
-  private static LogisticModelParameters lmp;
-  private static int passes;
-  private static boolean scores;
-  private static OnlineLogisticRegression model;
-
-  private TrainLogistic() {
-  }
-
-  public static void main(String[] args) throws Exception {
-    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-  }
-
-  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
-    if (parseArgs(args)) {
-      double logPEstimate = 0;
-      int samples = 0;
-
-      CsvRecordFactory csv = lmp.getCsvRecordFactory();
-      OnlineLogisticRegression lr = lmp.createRegression();
-      for (int pass = 0; pass < passes; pass++) {
-        try (BufferedReader in = open(inputFile)) {
-          // read variable names
-          csv.firstLine(in.readLine());
-
-          String line = in.readLine();
-          while (line != null) {
-            // for each new line, get target and predictors
-            Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
-            int targetValue = csv.processLine(line, input);
-
-            // check performance while this is still news
-            double logP = lr.logLikelihood(targetValue, input);
-            if (!Double.isInfinite(logP)) {
-              if (samples < 20) {
-                logPEstimate = (samples * logPEstimate + logP) / (samples + 1);
-              } else {
-                logPEstimate = 0.95 * logPEstimate + 0.05 * logP;
-              }
-              samples++;
-            }
-            double p = lr.classifyScalar(input);
-            if (scores) {
-              output.printf(Locale.ENGLISH, "%10d %2d %10.2f %2.4f %10.4f %10.4f%n",
-                samples, targetValue, lr.currentLearningRate(), p, logP, logPEstimate);
-            }
-
-            // now update model
-            lr.train(targetValue, input);
-
-            line = in.readLine();
-          }
-        }
-      }
-
-      try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
-        lmp.saveTo(modelOutput);
-      }
-
-      output.println(lmp.getNumFeatures());
-      output.println(lmp.getTargetVariable() + " ~ ");
-      String sep = "";
-      for (String v : csv.getTraceDictionary().keySet()) {
-        double weight = predictorWeight(lr, 0, csv, v);
-        if (weight != 0) {
-          output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
-          sep = " + ";
-        }
-      }
-      output.printf("%n");
-      model = lr;
-      for (int row = 0; row < lr.getBeta().numRows(); row++) {
-        for (String key : csv.getTraceDictionary().keySet()) {
-          double weight = predictorWeight(lr, row, csv, key);
-          if (weight != 0) {
-            output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
-          }
-        }
-        for (int column = 0; column < lr.getBeta().numCols(); column++) {
-          output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
-        }
-        output.println();
-      }
-    }
-  }
-
-  private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
-    double weight = 0;
-    for (Integer column : csv.getTraceDictionary().get(predictor)) {
-      weight += lr.getBeta().get(row, column);
-    }
-    return weight;
-  }
-
-  private static boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help").withDescription("print this list").create();
-
-    Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
-    Option scores = builder.withLongName("scores").withDescription("output score diagnostics during training").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFile = builder.withLongName("input")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
-            .withDescription("where to get training data")
-            .create();
-
-    Option outputFile = builder.withLongName("output")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
-            .withDescription("where to get training data")
-            .create();
-
-    Option predictors = builder.withLongName("predictors")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("p").create())
-            .withDescription("a list of predictor variables")
-            .create();
-
-    Option types = builder.withLongName("types")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("t").create())
-            .withDescription("a list of predictor variable types (numeric, word, or text)")
-            .create();
-
-    Option target = builder.withLongName("target")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("target").withMaximum(1).create())
-            .withDescription("the name of the target variable")
-            .create();
-
-    Option features = builder.withLongName("features")
-            .withArgument(
-                    argumentBuilder.withName("numFeatures")
-                            .withDefault("1000")
-                            .withMaximum(1).create())
-            .withDescription("the number of internal hashed features to use")
-            .create();
-
-    Option passes = builder.withLongName("passes")
-            .withArgument(
-                    argumentBuilder.withName("passes")
-                            .withDefault("2")
-                            .withMaximum(1).create())
-            .withDescription("the number of times to pass over the input data")
-            .create();
-
-    Option lambda = builder.withLongName("lambda")
-            .withArgument(argumentBuilder.withName("lambda").withDefault("1e-4").withMaximum(1).create())
-            .withDescription("the amount of coefficient decay to use")
-            .create();
-
-    Option rate = builder.withLongName("rate")
-            .withArgument(argumentBuilder.withName("learningRate").withDefault("1e-3").withMaximum(1).create())
-            .withDescription("the learning rate")
-            .create();
-
-    Option noBias = builder.withLongName("noBias")
-            .withDescription("don't include a bias term")
-            .create();
-
-    Option targetCategories = builder.withLongName("categories")
-            .withRequired(true)
-            .withArgument(argumentBuilder.withName("number").withMaximum(1).create())
-            .withDescription("the number of target categories to be considered")
-            .create();
-
-    Group normalArgs = new GroupBuilder()
-            .withOption(help)
-            .withOption(quiet)
-            .withOption(inputFile)
-            .withOption(outputFile)
-            .withOption(target)
-            .withOption(targetCategories)
-            .withOption(predictors)
-            .withOption(types)
-            .withOption(passes)
-            .withOption(lambda)
-            .withOption(rate)
-            .withOption(noBias)
-            .withOption(features)
-            .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    TrainLogistic.inputFile = getStringArgument(cmdLine, inputFile);
-    TrainLogistic.outputFile = getStringArgument(cmdLine, outputFile);
-
-    List<String> typeList = new ArrayList<>();
-    for (Object x : cmdLine.getValues(types)) {
-      typeList.add(x.toString());
-    }
-
-    List<String> predictorList = new ArrayList<>();
-    for (Object x : cmdLine.getValues(predictors)) {
-      predictorList.add(x.toString());
-    }
-
-    lmp = new LogisticModelParameters();
-    lmp.setTargetVariable(getStringArgument(cmdLine, target));
-    lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
-    lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
-    lmp.setUseBias(!getBooleanArgument(cmdLine, noBias));
-    lmp.setTypeMap(predictorList, typeList);
-
-    lmp.setLambda(getDoubleArgument(cmdLine, lambda));
-    lmp.setLearningRate(getDoubleArgument(cmdLine, rate));
-
-    TrainLogistic.scores = getBooleanArgument(cmdLine, scores);
-    TrainLogistic.passes = getIntegerArgument(cmdLine, passes);
-
-    return true;
-  }
-
-  private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
-    return (String) cmdLine.getValue(inputFile);
-  }
-
-  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
-    return cmdLine.hasOption(option);
-  }
-
-  private static int getIntegerArgument(CommandLine cmdLine, Option features) {
-    return Integer.parseInt((String) cmdLine.getValue(features));
-  }
-
-  private static double getDoubleArgument(CommandLine cmdLine, Option op) {
-    return Double.parseDouble((String) cmdLine.getValue(op));
-  }
-
-  public static OnlineLogisticRegression getModel() {
-    return model;
-  }
-
-  public static LogisticModelParameters getParameters() {
-    return lmp;
-  }
-
-  static BufferedReader open(String inputFile) throws IOException {
-    InputStream in;
-    try {
-      in = Resources.getResource(inputFile).openStream();
-    } catch (IllegalArgumentException e) {
-      in = new FileInputStream(new File(inputFile));
-    }
-    return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
deleted file mode 100644
index 632b32c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
-import org.apache.mahout.classifier.NewsgroupHelper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Reads and trains an adaptive logistic regression model on the 20 newsgroups data.
- * The first command line argument gives the path of the directory holding the training
- * data.  The optional second argument, leakType, defines which classes of features to use.
- * Importantly, leakType controls whether a synthetic date is injected into the data as
- * a target leak and if so, how.
- * <p/>
- * The value of leakType % 3 determines whether the target leak is injected according to
- * the following table:
- * <p/>
- * <table>
- * <tr><td valign='top'>0</td><td>No leak injected</td></tr>
- * <tr><td valign='top'>1</td><td>Synthetic date injected in MMM-yyyy format. This will be a single token and
- * is a perfect target leak since each newsgroup is given a different month</td></tr>
- * <tr><td valign='top'>2</td><td>Synthetic date injected in dd-MMM-yyyy HH:mm:ss format.  The day varies
- * and thus there are more leak symbols that need to be learned.  Ultimately this is just
- * as big a leak as case 1.</td></tr>
- * </table>
- * <p/>
- * Leaktype also determines what other text will be indexed.  If leakType is greater
- * than or equal to 6, then neither headers nor text body will be used for features and the leak is the only
- * source of data.  If leakType is greater than or equal to 3, then subject words will be used as features.
- * If leakType is less than 3, then both subject and body text will be used as features.
- * <p/>
- * A leakType of 0 gives no leak and all textual features.
- * <p/>
- * See the following table for a summary of commonly used values for leakType
- * <p/>
- * <table>
- * <tr><td><b>leakType</b></td><td><b>Leak?</b></td><td><b>Subject?</b></td><td><b>Body?</b></td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>0</td><td>no</td><td>yes</td><td>yes</td></tr>
- * <tr><td>1</td><td>mmm-yyyy</td><td>yes</td><td>yes</td></tr>
- * <tr><td>2</td><td>dd-mmm-yyyy</td><td>yes</td><td>yes</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>3</td><td>no</td><td>yes</td><td>no</td></tr>
- * <tr><td>4</td><td>mmm-yyyy</td><td>yes</td><td>no</td></tr>
- * <tr><td>5</td><td>dd-mmm-yyyy</td><td>yes</td><td>no</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>6</td><td>no</td><td>no</td><td>no</td></tr>
- * <tr><td>7</td><td>mmm-yyyy</td><td>no</td><td>no</td></tr>
- * <tr><td>8</td><td>dd-mmm-yyyy</td><td>no</td><td>no</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * </table>
- */
-public final class TrainNewsGroups {
-
-  private TrainNewsGroups() {
-  }
-
-  public static void main(String[] args) throws IOException {
-    File base = new File(args[0]);
-
-    Multiset<String> overallCounts = HashMultiset.create();
-
-    int leakType = 0;
-    if (args.length > 1) {
-      leakType = Integer.parseInt(args[1]);
-    }
-
-    Dictionary newsGroups = new Dictionary();
-
-    NewsgroupHelper helper = new NewsgroupHelper();
-    helper.getEncoder().setProbes(2);
-    AdaptiveLogisticRegression learningAlgorithm =
-        new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
-    learningAlgorithm.setInterval(800);
-    learningAlgorithm.setAveragingWindow(500);
-
-    List<File> files = new ArrayList<>();
-    for (File newsgroup : base.listFiles()) {
-      if (newsgroup.isDirectory()) {
-        newsGroups.intern(newsgroup.getName());
-        files.addAll(Arrays.asList(newsgroup.listFiles()));
-      }
-    }
-    Collections.shuffle(files);
-    System.out.println(files.size() + " training files");
-    SGDInfo info = new SGDInfo();
-
-    int k = 0;
-
-    for (File file : files) {
-      String ng = file.getParentFile().getName();
-      int actual = newsGroups.intern(ng);
-
-      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
-      learningAlgorithm.train(actual, v);
-
-      k++;
-      State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
-
-      SGDHelper.analyzeState(info, leakType, k, best);
-    }
-    learningAlgorithm.close();
-    SGDHelper.dissect(leakType, newsGroups, learningAlgorithm, files, overallCounts);
-    System.out.println("exiting main");
-
-    File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group.model");
-    ModelSerializer.writeBinary(modelFile.getAbsolutePath(),
-        learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
-
-    List<Integer> counts = new ArrayList<>();
-    System.out.println("Word counts");
-    for (String count : overallCounts.elementSet()) {
-      counts.add(overallCounts.count(count));
-    }
-    Collections.sort(counts, Ordering.natural().reverse());
-    k = 0;
-    for (Integer count : counts) {
-      System.out.println(k + "\t" + count);
-      k++;
-      if (k > 1000) {
-        break;
-      }
-    }
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
deleted file mode 100644
index 7a74289..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.Locale;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.ConfusionMatrix;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-
/*
 * AUC and average log-likelihood are always shown if possible. If the number of target
 * values is more than 2, then AUC and the entropy matrix are not shown regardless of the
 * flags the user passes, because the current implementation only supports them for
 * two-value targets.
 */
-public final class ValidateAdaptiveLogistic {
-
-  private static String inputFile;
-  private static String modelFile;
-  private static String defaultCategory;
-  private static boolean showAuc;
-  private static boolean showScores;
-  private static boolean showConfusion;
-
-  private ValidateAdaptiveLogistic() {
-  }
-
-  public static void main(String[] args) throws IOException {
-    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-  }
-
-  static void mainToOutput(String[] args, PrintWriter output) throws IOException {
-    if (parseArgs(args)) {
-      if (!showAuc && !showConfusion && !showScores) {
-        showAuc = true;
-        showConfusion = true;
-      }
-
-      Auc collector = null;
-      AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
-          .loadFromFile(new File(modelFile));
-      CsvRecordFactory csv = lmp.getCsvRecordFactory();
-      AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();      
-
-      if (lmp.getTargetCategories().size() <= 2) {
-        collector = new Auc();
-      }
-
-      OnlineSummarizer slh = new OnlineSummarizer();
-      ConfusionMatrix cm = new ConfusionMatrix(lmp.getTargetCategories(), defaultCategory);
-
-      State<Wrapper, CrossFoldLearner> best = lr.getBest();
-      if (best == null) {
-        output.println("AdaptiveLogisticRegression has not be trained probably.");
-        return;
-      }
-      CrossFoldLearner learner = best.getPayload().getLearner();
-
-      BufferedReader in = TrainLogistic.open(inputFile);
-      String line = in.readLine();
-      csv.firstLine(line);
-      line = in.readLine();
-      if (showScores) {
-        output.println("\"target\", \"model-output\", \"log-likelihood\", \"average-likelihood\"");
-      }
-      while (line != null) {
-        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
-        //TODO: How to avoid extra target values not shown in the training process.
-        int target = csv.processLine(line, v);
-        double likelihood = learner.logLikelihood(target, v);
-        double score = learner.classifyFull(v).maxValue();
-
-        slh.add(likelihood);
-        cm.addInstance(csv.getTargetString(line), csv.getTargetLabel(target));        
-
-        if (showScores) {
-          output.printf(Locale.ENGLISH, "%8d, %.12f, %.13f, %.13f%n", target,
-              score, learner.logLikelihood(target, v), slh.getMean());
-        }
-        if (collector != null) {
-          collector.add(target, score);
-        }
-        line = in.readLine();
-      }
-
-      output.printf(Locale.ENGLISH,"\nLog-likelihood:");
-      output.printf(Locale.ENGLISH, "Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f%n",
-          slh.getMin(), slh.getMax(), slh.getMean(), slh.getMedian());
-
-      if (collector != null) {        
-        output.printf(Locale.ENGLISH, "%nAUC = %.2f%n", collector.auc());
-      }
-
-      if (showConfusion) {
-        output.printf(Locale.ENGLISH, "%n%s%n%n", cm.toString());
-
-        if (collector != null) {
-          Matrix m = collector.entropy();
-          output.printf(Locale.ENGLISH,
-              "Entropy Matrix: [[%.1f, %.1f], [%.1f, %.1f]]%n", m.get(0, 0),
-              m.get(1, 0), m.get(0, 1), m.get(1, 1));
-        }        
-      }
-
-    }
-  }
-
-  private static boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help")
-        .withDescription("print this list").create();
-
-    Option quiet = builder.withLongName("quiet")
-        .withDescription("be extra quiet").create();
-
-    Option auc = builder.withLongName("auc").withDescription("print AUC")
-        .create();
-    Option confusion = builder.withLongName("confusion")
-        .withDescription("print confusion matrix").create();
-
-    Option scores = builder.withLongName("scores")
-        .withDescription("print scores").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder
-        .withLongName("input")
-        .withRequired(true)
-        .withArgument(
-            argumentBuilder.withName("input").withMaximum(1)
-                .create())
-        .withDescription("where to get validate data").create();
-
-    Option modelFileOption = builder
-        .withLongName("model")
-        .withRequired(true)
-        .withArgument(
-            argumentBuilder.withName("model").withMaximum(1)
-                .create())
-        .withDescription("where to get the trained model").create();
-
-    Option defaultCagetoryOption = builder
-      .withLongName("defaultCategory")
-      .withRequired(false)
-      .withArgument(
-          argumentBuilder.withName("defaultCategory").withMaximum(1).withDefault("unknown")
-          .create())
-      .withDescription("the default category value to use").create();
-
-    Group normalArgs = new GroupBuilder().withOption(help)
-        .withOption(quiet).withOption(auc).withOption(scores)
-        .withOption(confusion).withOption(inputFileOption)
-        .withOption(modelFileOption).withOption(defaultCagetoryOption).create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    inputFile = getStringArgument(cmdLine, inputFileOption);
-    modelFile = getStringArgument(cmdLine, modelFileOption);
-    defaultCategory = getStringArgument(cmdLine, defaultCagetoryOption);
-    showAuc = getBooleanArgument(cmdLine, auc);
-    showScores = getBooleanArgument(cmdLine, scores);
-    showConfusion = getBooleanArgument(cmdLine, confusion);
-
-    return true;
-  }
-
-  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
-    return cmdLine.hasOption(option);
-  }
-
-  private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
-    return (String) cmdLine.getValue(inputFile);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
deleted file mode 100644
index ab3c861..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import com.google.common.collect.Lists;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.classifier.sgd.L1;
-import org.apache.mahout.classifier.sgd.OnlineLogisticRegression;
-
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Uses the SGD classifier on the 'Bank marketing' dataset from UCI.
- *
- * See http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
- *
- * Learn when people accept or reject an offer from the bank via telephone based on income, age, education and more.
- */
-public class BankMarketingClassificationMain {
-
-  public static final int NUM_CATEGORIES = 2;
-
-  public static void main(String[] args) throws Exception {
-    List<TelephoneCall> calls = Lists.newArrayList(new TelephoneCallParser("bank-full.csv"));
-
-    double heldOutPercentage = 0.10;
-
-    for (int run = 0; run < 20; run++) {
-      Collections.shuffle(calls);
-      int cutoff = (int) (heldOutPercentage * calls.size());
-      List<TelephoneCall> test = calls.subList(0, cutoff);
-      List<TelephoneCall> train = calls.subList(cutoff, calls.size());
-
-      OnlineLogisticRegression lr = new OnlineLogisticRegression(NUM_CATEGORIES, TelephoneCall.FEATURES, new L1())
-        .learningRate(1)
-        .alpha(1)
-        .lambda(0.000001)
-        .stepOffset(10000)
-        .decayExponent(0.2);
-      for (int pass = 0; pass < 20; pass++) {
-        for (TelephoneCall observation : train) {
-          lr.train(observation.getTarget(), observation.asVector());
-        }
-        if (pass % 5 == 0) {
-          Auc eval = new Auc(0.5);
-          for (TelephoneCall testCall : test) {
-            eval.add(testCall.getTarget(), lr.classifyScalar(testCall.asVector()));
-          }
-          System.out.printf("%d, %.4f, %.4f\n", pass, lr.currentLearningRate(), eval.auc());
-        }
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
deleted file mode 100644
index 728ec20..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
-import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
-import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
-
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-public class TelephoneCall {
-  public static final int FEATURES = 100;
-  private static final ConstantValueEncoder interceptEncoder = new ConstantValueEncoder("intercept");
-  private static final FeatureVectorEncoder featureEncoder = new StaticWordValueEncoder("feature");
-
-  private RandomAccessSparseVector vector;
-
-  private Map<String, String> fields = new LinkedHashMap<>();
-
-  public TelephoneCall(Iterable<String> fieldNames, Iterable<String> values) {
-    vector = new RandomAccessSparseVector(FEATURES);
-    Iterator<String> value = values.iterator();
-    interceptEncoder.addToVector("1", vector);
-    for (String name : fieldNames) {
-      String fieldValue = value.next();
-      fields.put(name, fieldValue);
-
-      switch (name) {
-        case "age": {
-          double v = Double.parseDouble(fieldValue);
-          featureEncoder.addToVector(name, Math.log(v), vector);
-          break;
-        }
-        case "balance": {
-          double v;
-          v = Double.parseDouble(fieldValue);
-          if (v < -2000) {
-            v = -2000;
-          }
-          featureEncoder.addToVector(name, Math.log(v + 2001) - 8, vector);
-          break;
-        }
-        case "duration": {
-          double v;
-          v = Double.parseDouble(fieldValue);
-          featureEncoder.addToVector(name, Math.log(v + 1) - 5, vector);
-          break;
-        }
-        case "pdays": {
-          double v;
-          v = Double.parseDouble(fieldValue);
-          featureEncoder.addToVector(name, Math.log(v + 2), vector);
-          break;
-        }
-        case "job":
-        case "marital":
-        case "education":
-        case "default":
-        case "housing":
-        case "loan":
-        case "contact":
-        case "campaign":
-        case "previous":
-        case "poutcome":
-          featureEncoder.addToVector(name + ":" + fieldValue, 1, vector);
-          break;
-        case "day":
-        case "month":
-        case "y":
-          // ignore these for vectorizing
-          break;
-        default:
-          throw new IllegalArgumentException(String.format("Bad field name: %s", name));
-      }
-    }
-  }
-
-  public Vector asVector() {
-    return vector;
-  }
-
-  public int getTarget() {
-    return fields.get("y").equals("no") ? 0 : 1;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
deleted file mode 100644
index 5ef6490..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import com.google.common.base.CharMatcher;
-import com.google.common.base.Splitter;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Resources;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.Iterator;
-
-/** Parses semi-colon separated data as TelephoneCalls  */
-public class TelephoneCallParser implements Iterable<TelephoneCall> {
-
-  private final Splitter onSemi = Splitter.on(";").trimResults(CharMatcher.anyOf("\" ;"));
-  private String resourceName;
-
-  public TelephoneCallParser(String resourceName) throws IOException {
-    this.resourceName = resourceName;
-  }
-
-  @Override
-  public Iterator<TelephoneCall> iterator() {
-    try {
-      return new AbstractIterator<TelephoneCall>() {
-        BufferedReader input =
-            new BufferedReader(new InputStreamReader(Resources.getResource(resourceName).openStream()));
-        Iterable<String> fieldNames = onSemi.split(input.readLine());
-
-          @Override
-          protected TelephoneCall computeNext() {
-            try {
-              String line = input.readLine();
-              if (line == null) {
-                return endOfData();
-              }
-
-              return new TelephoneCall(fieldNames, onSemi.split(line));
-            } catch (IOException e) {
-              throw new RuntimeException("Error reading data", e);
-            }
-          }
-        };
-      } catch (IOException e) {
-        throw new RuntimeException("Error reading data", e);
-      }
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
deleted file mode 100644
index a0b845f..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-final class ClustersFilter implements PathFilter {
-
-  @Override
-  public boolean accept(Path path) {
-    String pathString = path.toString();
-    return pathString.contains("/clusters-");
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
deleted file mode 100644
index 50dba99..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.BasicStroke;
-import java.awt.Color;
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-
-/**
- * Java desktop graphics class that runs canopy clustering and displays the results.
- * This class generates random data and clusters it.
- */
-@Deprecated
-public class DisplayCanopy extends DisplayClustering {
-
-  DisplayCanopy() {
-    initialize();
-    this.setTitle("Canopy Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-
-  @Override
-  public void paint(Graphics g) {
-    plotSampleData((Graphics2D) g);
-    plotClusters((Graphics2D) g);
-  }
-
-  protected static void plotClusters(Graphics2D g2) {
-    int cx = CLUSTERS.size() - 1;
-    for (List<Cluster> clusters : CLUSTERS) {
-      for (Cluster cluster : clusters) {
-        if (isSignificant(cluster)) {
-          g2.setStroke(new BasicStroke(1));
-          g2.setColor(Color.BLUE);
-          double[] t1 = {T1, T1};
-          plotEllipse(g2, cluster.getCenter(), new DenseVector(t1));
-          double[] t2 = {T2, T2};
-          plotEllipse(g2, cluster.getCenter(), new DenseVector(t2));
-          g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx)]);
-          g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
-          plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
-        }
-      }
-      cx--;
-    }
-  }
-
-  public static void main(String[] args) throws Exception {
-    Path samples = new Path("samples");
-    Path output = new Path("output");
-    Configuration conf = new Configuration();
-    HadoopUtil.delete(conf, samples);
-    HadoopUtil.delete(conf, output);
-    RandomUtils.useTestSeed();
-    generateSamples();
-    writeSampleData(samples);
-    CanopyDriver.buildClusters(conf, samples, output, new ManhattanDistanceMeasure(), T1, T2, 0, true);
-    loadClustersWritable(output);
-
-    new DisplayCanopy();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
deleted file mode 100644
index ad85c6a..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.*;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.Ellipse2D;
-import java.awt.geom.Rectangle2D;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.clustering.AbstractCluster;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.UncommonDistributions;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class DisplayClustering extends Frame {
-  
-  private static final Logger log = LoggerFactory.getLogger(DisplayClustering.class);
-  
-  protected static final int DS = 72; // default scale = 72 pixels per inch
-  
-  protected static final int SIZE = 8; // screen size in inches
-  
-  private static final Collection<Vector> SAMPLE_PARAMS = new ArrayList<>();
-  
-  protected static final List<VectorWritable> SAMPLE_DATA = new ArrayList<>();
-  
-  protected static final List<List<Cluster>> CLUSTERS = new ArrayList<>();
-  
-  static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
-    Color.lightGray };
-  
-  protected static final double T1 = 3.0;
-  
-  protected static final double T2 = 2.8;
-  
-  static double significance = 0.05;
-  
-  protected static int res; // screen resolution
-  
-  public DisplayClustering() {
-    initialize();
-    this.setTitle("Sample Data");
-  }
-  
-  public void initialize() {
-    // Get screen resolution
-    res = Toolkit.getDefaultToolkit().getScreenResolution();
-    
-    // Set Frame size in inches
-    this.setSize(SIZE * res, SIZE * res);
-    this.setVisible(true);
-    this.setTitle("Asymmetric Sample Data");
-    
-    // Window listener to terminate program.
-    this.addWindowListener(new WindowAdapter() {
-      @Override
-      public void windowClosing(WindowEvent e) {
-        System.exit(0);
-      }
-    });
-  }
-  
-  public static void main(String[] args) throws Exception {
-    RandomUtils.useTestSeed();
-    generateSamples();
-    new DisplayClustering();
-  }
-  
-  // Override the paint() method
-  @Override
-  public void paint(Graphics g) {
-    Graphics2D g2 = (Graphics2D) g;
-    plotSampleData(g2);
-    plotSampleParameters(g2);
-    plotClusters(g2);
-  }
-  
-  protected static void plotClusters(Graphics2D g2) {
-    int cx = CLUSTERS.size() - 1;
-    for (List<Cluster> clusters : CLUSTERS) {
-      g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
-      g2.setColor(COLORS[Math.min(COLORS.length - 1, cx--)]);
-      for (Cluster cluster : clusters) {
-        plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
-      }
-    }
-  }
-  
-  protected static void plotSampleParameters(Graphics2D g2) {
-    Vector v = new DenseVector(2);
-    Vector dv = new DenseVector(2);
-    g2.setColor(Color.RED);
-    for (Vector param : SAMPLE_PARAMS) {
-      v.set(0, param.get(0));
-      v.set(1, param.get(1));
-      dv.set(0, param.get(2) * 3);
-      dv.set(1, param.get(3) * 3);
-      plotEllipse(g2, v, dv);
-    }
-  }
-  
-  protected static void plotSampleData(Graphics2D g2) {
-    double sx = (double) res / DS;
-    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-    
-    // plot the axes
-    g2.setColor(Color.BLACK);
-    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
-    plotRectangle(g2, new DenseVector(2).assign(2), dv);
-    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-    
-    // plot the sample data
-    g2.setColor(Color.DARK_GRAY);
-    dv.assign(0.03);
-    for (VectorWritable v : SAMPLE_DATA) {
-      plotRectangle(g2, v.get(), dv);
-    }
-  }
-  
-  /**
-   * This method plots points and colors them according to their cluster
-   * membership, rather than drawing ellipses.
-   * 
-   * As of commit, this method is used only by K-means spectral clustering.
-   * Since the cluster assignments are set within the eigenspace of the data, it
-   * is not inherent that the original data cluster as they would in K-means:
-   * that is, as symmetric gaussian mixtures.
-   * 
-   * Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
-   * output is not directly usable. Rather, the cluster assignments from the raw
-   * output need to be transferred back to the original data. As such, this
-   * method will read the SequenceFile cluster results of K-means and transfer
-   * the cluster assignments to the original data, coloring them appropriately.
-   * 
-   * @param g2
-   * @param data
-   */
-  protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
-    double sx = (double) res / DS;
-    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-    
-    g2.setColor(Color.BLACK);
-    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
-    plotRectangle(g2, new DenseVector(2).assign(2), dv);
-    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-    
-    // plot the sample data, colored according to the cluster they belong to
-    dv.assign(0.03);
-    
-    Path clusteredPointsPath = new Path(data, "clusteredPoints");
-    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
-    Map<Integer,Color> colors = new HashMap<>();
-    int point = 0;
-    for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
-        inputPath, new Configuration())) {
-      int clusterId = record.getFirst().get();
-      VectorWritable v = SAMPLE_DATA.get(point++);
-      Integer key = clusterId;
-      if (!colors.containsKey(key)) {
-        colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
-      }
-      plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
-    }
-  }
-  
-  /**
-   * Identical to plotRectangle(), but with the option of setting the color of
-   * the rectangle's stroke.
-   * 
-   * NOTE: This should probably be refactored with plotRectangle() since most of
-   * the code here is direct copy/paste from that method.
-   * 
-   * @param g2
-   *          A Graphics2D context.
-   * @param v
-   *          A vector for the rectangle's center.
-   * @param dv
-   *          A vector for the rectangle's dimensions.
-   * @param color
-   *          The color of the rectangle's stroke.
-   */
-  protected static void plotClusteredRectangle(Graphics2D g2, Vector v, Vector dv, Color color) {
-    double[] flip = {1, -1};
-    Vector v2 = v.times(new DenseVector(flip));
-    v2 = v2.minus(dv.divide(2));
-    int h = SIZE / 2;
-    double x = v2.get(0) + h;
-    double y = v2.get(1) + h;
-    
-    g2.setStroke(new BasicStroke(1));
-    g2.setColor(color);
-    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
-  }
-  
-  /**
-   * Draw a rectangle on the graphics context
-   * 
-   * @param g2
-   *          a Graphics2D context
-   * @param v
-   *          a Vector of rectangle center
-   * @param dv
-   *          a Vector of rectangle dimensions
-   */
-  protected static void plotRectangle(Graphics2D g2, Vector v, Vector dv) {
-    double[] flip = {1, -1};
-    Vector v2 = v.times(new DenseVector(flip));
-    v2 = v2.minus(dv.divide(2));
-    int h = SIZE / 2;
-    double x = v2.get(0) + h;
-    double y = v2.get(1) + h;
-    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
-  }
-  
-  /**
-   * Draw an ellipse on the graphics context
-   * 
-   * @param g2
-   *          a Graphics2D context
-   * @param v
-   *          a Vector of ellipse center
-   * @param dv
-   *          a Vector of ellipse dimensions
-   */
-  protected static void plotEllipse(Graphics2D g2, Vector v, Vector dv) {
-    double[] flip = {1, -1};
-    Vector v2 = v.times(new DenseVector(flip));
-    v2 = v2.minus(dv.divide(2));
-    int h = SIZE / 2;
-    double x = v2.get(0) + h;
-    double y = v2.get(1) + h;
-    g2.draw(new Ellipse2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
-  }
-  
-  protected static void generateSamples() {
-    generateSamples(500, 1, 1, 3);
-    generateSamples(300, 1, 0, 0.5);
-    generateSamples(300, 0, 2, 0.1);
-  }
-  
-  protected static void generate2dSamples() {
-    generate2dSamples(500, 1, 1, 3, 1);
-    generate2dSamples(300, 1, 0, 0.5, 1);
-    generate2dSamples(300, 0, 2, 0.1, 0.5);
-  }
-  
-  /**
-   * Generate random samples and add them to the sampleData
-   * 
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sd
-   *          double standard deviation of the samples
-   */
-  protected static void generateSamples(int num, double mx, double my, double sd) {
-    double[] params = {mx, my, sd, sd};
-    SAMPLE_PARAMS.add(new DenseVector(params));
-    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
-    for (int i = 0; i < num; i++) {
-      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
-          UncommonDistributions.rNorm(my, sd)})));
-    }
-  }
-  
-  protected static void writeSampleData(Path output) throws IOException {
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(output.toUri(), conf);
-
-    try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class)) {
-      int i = 0;
-      for (VectorWritable vw : SAMPLE_DATA) {
-        writer.append(new Text("sample_" + i++), vw);
-      }
-    }
-  }
-  
-  protected static List<Cluster> readClustersWritable(Path clustersIn) {
-    List<Cluster> clusters = new ArrayList<>();
-    Configuration conf = new Configuration();
-    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
-        PathFilters.logsCRCFilter(), conf)) {
-      Cluster cluster = value.getValue();
-      log.info(
-          "Reading Cluster:{} center:{} numPoints:{} radius:{}",
-          cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
-          cluster.getNumObservations(), AbstractCluster.formatVector(cluster.getRadius(), null));
-      clusters.add(cluster);
-    }
-    return clusters;
-  }
-  
-  protected static void loadClustersWritable(Path output) throws IOException {
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(output.toUri(), conf);
-    for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
-      List<Cluster> clusters = readClustersWritable(s.getPath());
-      CLUSTERS.add(clusters);
-    }
-  }
-  
-  /**
-   * Generate random samples and add them to the sampleData
-   * 
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sdx
-   *          double x-value standard deviation of the samples
-   * @param sdy
-   *          double y-value standard deviation of the samples
-   */
-  protected static void generate2dSamples(int num, double mx, double my, double sdx, double sdy) {
-    double[] params = {mx, my, sdx, sdy};
-    SAMPLE_PARAMS.add(new DenseVector(params));
-    log.info("Generating {} samples m=[{}, {}] sd=[{}, {}]", num, mx, my, sdx, sdy);
-    for (int i = 0; i < num; i++) {
-      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sdx),
-          UncommonDistributions.rNorm(my, sdy)})));
-    }
-  }
-  
-  protected static boolean isSignificant(Cluster cluster) {
-    return (double) cluster.getNumObservations() / SAMPLE_DATA.size() > significance;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
deleted file mode 100644
index f8ce7c7..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.ClusterClassifier;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
-import org.apache.mahout.clustering.iterator.ClusterIterator;
-import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.Vector;
-
-import com.google.common.collect.Lists;
-
-public class DisplayFuzzyKMeans extends DisplayClustering {
-  
-  DisplayFuzzyKMeans() {
-    initialize();
-    this.setTitle("Fuzzy k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-  
-  // Override the paint() method
-  @Override
-  public void paint(Graphics g) {
-    plotSampleData((Graphics2D) g);
-    plotClusters((Graphics2D) g);
-  }
-  
-  public static void main(String[] args) throws Exception {
-    DistanceMeasure measure = new ManhattanDistanceMeasure();
-    
-    Path samples = new Path("samples");
-    Path output = new Path("output");
-    Configuration conf = new Configuration();
-    HadoopUtil.delete(conf, output);
-    HadoopUtil.delete(conf, samples);
-    RandomUtils.useTestSeed();
-    DisplayClustering.generateSamples();
-    writeSampleData(samples);
-    boolean runClusterer = true;
-    int maxIterations = 10;
-    float threshold = 0.001F;
-    float m = 1.1F;
-    if (runClusterer) {
-      runSequentialFuzzyKClusterer(conf, samples, output, measure, maxIterations, m, threshold);
-    } else {
-      int numClusters = 3;
-      runSequentialFuzzyKClassifier(conf, samples, output, measure, numClusters, maxIterations, m, threshold);
-    }
-    new DisplayFuzzyKMeans();
-  }
-  
-  private static void runSequentialFuzzyKClassifier(Configuration conf, Path samples, Path output,
-      DistanceMeasure measure, int numClusters, int maxIterations, float m, double threshold) throws IOException {
-    Collection<Vector> points = Lists.newArrayList();
-    for (int i = 0; i < numClusters; i++) {
-      points.add(SAMPLE_DATA.get(i).get());
-    }
-    List<Cluster> initialClusters = Lists.newArrayList();
-    int id = 0;
-    for (Vector point : points) {
-      initialClusters.add(new SoftCluster(point, id++, measure));
-    }
-    ClusterClassifier prior = new ClusterClassifier(initialClusters, new FuzzyKMeansClusteringPolicy(m, threshold));
-    Path priorPath = new Path(output, "classifier-0");
-    prior.writeToSeqFiles(priorPath);
-    
-    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
-    loadClustersWritable(output);
-  }
-  
-  private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output,
-      DistanceMeasure measure, int maxIterations, float m, double threshold) throws IOException,
-      ClassNotFoundException, InterruptedException {
-    Path clustersIn = new Path(output, "random-seeds");
-    RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure);
-    FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold,
-        true);
-    
-    loadClustersWritable(output);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
deleted file mode 100644
index 336d69e..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.ClusterClassifier;
-import org.apache.mahout.clustering.iterator.ClusterIterator;
-import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.Vector;
-
-import com.google.common.collect.Lists;
-
-public class DisplayKMeans extends DisplayClustering {
-  
-  DisplayKMeans() {
-    initialize();
-    this.setTitle("k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-  
-  public static void main(String[] args) throws Exception {
-    DistanceMeasure measure = new ManhattanDistanceMeasure();
-    Path samples = new Path("samples");
-    Path output = new Path("output");
-    Configuration conf = new Configuration();
-    HadoopUtil.delete(conf, samples);
-    HadoopUtil.delete(conf, output);
-    
-    RandomUtils.useTestSeed();
-    generateSamples();
-    writeSampleData(samples);
-    boolean runClusterer = true;
-    double convergenceDelta = 0.001;
-    int numClusters = 3;
-    int maxIterations = 10;
-    if (runClusterer) {
-      runSequentialKMeansClusterer(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
-    } else {
-      runSequentialKMeansClassifier(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
-    }
-    new DisplayKMeans();
-  }
-  
-  private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output,
-      DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException {
-    Collection<Vector> points = Lists.newArrayList();
-    for (int i = 0; i < numClusters; i++) {
-      points.add(SAMPLE_DATA.get(i).get());
-    }
-    List<Cluster> initialClusters = Lists.newArrayList();
-    int id = 0;
-    for (Vector point : points) {
-      initialClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(point, id++, measure));
-    }
-    ClusterClassifier prior = new ClusterClassifier(initialClusters, new KMeansClusteringPolicy(convergenceDelta));
-    Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
-    prior.writeToSeqFiles(priorPath);
-    
-    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
-    loadClustersWritable(output);
-  }
-  
-  private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
-    DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
-    throws IOException, InterruptedException, ClassNotFoundException {
-    Path clustersIn = new Path(output, "random-seeds");
-    RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure);
-    KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true);
-    loadClustersWritable(output);
-  }
-  
-  // Override the paint() method
-  @Override
-  public void paint(Graphics g) {
-    plotSampleData((Graphics2D) g);
-    plotClusters((Graphics2D) g);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
deleted file mode 100644
index 2b70749..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.BufferedWriter;
-import java.io.FileWriter;
-import java.io.Writer;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-
-public class DisplaySpectralKMeans extends DisplayClustering {
-
-  protected static final String SAMPLES = "samples";
-  protected static final String OUTPUT = "output";
-  protected static final String TEMP = "tmp";
-  protected static final String AFFINITIES = "affinities";
-
-  DisplaySpectralKMeans() {
-    initialize();
-    setTitle("Spectral k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-
-  public static void main(String[] args) throws Exception {
-    DistanceMeasure measure = new ManhattanDistanceMeasure();
-    Path samples = new Path(SAMPLES);
-    Path output = new Path(OUTPUT);
-    Path tempDir = new Path(TEMP);
-    Configuration conf = new Configuration();
-    HadoopUtil.delete(conf, samples);
-    HadoopUtil.delete(conf, output);
-
-    RandomUtils.useTestSeed();
-    DisplayClustering.generateSamples();
-    writeSampleData(samples);
-    Path affinities = new Path(output, AFFINITIES);
-    FileSystem fs = FileSystem.get(output.toUri(), conf);
-    if (!fs.exists(output)) {
-      fs.mkdirs(output);
-    }
-
-    try (Writer writer = new BufferedWriter(new FileWriter(affinities.toString()))){
-      for (int i = 0; i < SAMPLE_DATA.size(); i++) {
-        for (int j = 0; j < SAMPLE_DATA.size(); j++) {
-          writer.write(i + "," + j + ',' + measure.distance(SAMPLE_DATA.get(i).get(),
-              SAMPLE_DATA.get(j).get()) + '\n');
-        }
-      }
-    }
-
-    int maxIter = 10;
-    double convergenceDelta = 0.001;
-    SpectralKMeansDriver.run(new Configuration(), affinities, output, SAMPLE_DATA.size(), 3, measure,
-        convergenceDelta, maxIter, tempDir);
-    new DisplaySpectralKMeans();
-  }
-
-  @Override
-  public void paint(Graphics g) {
-    plotClusteredSampleData((Graphics2D) g, new Path(new Path(OUTPUT), "kmeans_out"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
deleted file mode 100644
index 470c16c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-The following classes can be run without parameters to generate a sample data set and 
-run the reference clustering implementations over them:
-
-DisplayClustering - generates 1000 samples from three, symmetric distributions. This is the same 
-    data set that is used by the following clustering programs. It displays the points on a screen
-    and superimposes the model parameters that were used to generate the points. You can edit the
-    generateSamples() method to change the sample points used by these programs.
-    
-  * DisplayCanopy - uses Canopy clustering
-  * DisplayKMeans - uses k-Means clustering
-  * DisplayFuzzyKMeans - uses Fuzzy k-Means clustering
-  
-  * NOTE: some of these programs display the sample points and then superimpose all of the clusters
-    from each iteration. The last iteration's clusters are in bold red and the previous several are 
-    colored (orange, yellow, green, blue, violet) in order after which all earlier clusters are in
-    light grey. This helps to visualize how the clusters converge upon a solution over multiple
-    iterations.
-  * NOTE: by changing the parameter values (k, ALPHA_0, numIterations) and the display SIGNIFICANCE
-    you can obtain different results.
-    
-  
-    
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
deleted file mode 100644
index c29cbc4..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.streaming.tools;
-
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.List;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.ClusteringUtils;
-import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.math.Centroid;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-
-public class ClusterQualitySummarizer extends AbstractJob {
-  private String outputFile;
-
-  private PrintWriter fileOut;
-
-  private String trainFile;
-  private String testFile;
-  private String centroidFile;
-  private String centroidCompareFile;
-  private boolean mahoutKMeansFormat;
-  private boolean mahoutKMeansFormatCompare;
-
-  private DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure();
-
-  public void printSummaries(List<OnlineSummarizer> summarizers, String type) {
-    printSummaries(summarizers, type, fileOut);
-  }
-
-  public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) {
-    double maxDistance = 0;
-    for (int i = 0; i < summarizers.size(); ++i) {
-      OnlineSummarizer summarizer = summarizers.get(i);
-      if (summarizer.getCount() > 1) {
-        maxDistance = Math.max(maxDistance, summarizer.getMax());
-        System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
-        // If there is just one point in the cluster, quartiles cannot be estimated. We'll just assume all the quartiles
-        // equal the only value.
-        if (fileOut != null) {
-          fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
-              summarizer.getSD(),
-              summarizer.getQuartile(0),
-              summarizer.getQuartile(1),
-              summarizer.getQuartile(2),
-              summarizer.getQuartile(3),
-              summarizer.getQuartile(4), summarizer.getCount(), type);
-        }
-      } else {
-        System.out.printf("Cluster %d is has %d data point. Need atleast 2 data points in a cluster for" +
-            " OnlineSummarizer.\n", i, summarizer.getCount());
-      }
-    }
-    System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
-  }
-
-  public int run(String[] args) throws IOException {
-    if (!parseArgs(args)) {
-      return -1;
-    }
-
-    Configuration conf = new Configuration();
-    try {
-      fileOut = new PrintWriter(new FileOutputStream(outputFile));
-      fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
-          + "distance.q4,count,is.train\n");
-
-      // Reading in the centroids (both pairs, if they exist).
-      List<Centroid> centroids;
-      List<Centroid> centroidsCompare = null;
-      if (mahoutKMeansFormat) {
-        SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
-            new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
-        centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
-      } else {
-        SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
-            new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
-        centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
-      }
-
-      if (centroidCompareFile != null) {
-        if (mahoutKMeansFormatCompare) {
-          SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
-              new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
-          centroidsCompare = Lists.newArrayList(
-              IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
-        } else {
-          SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
-              new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
-          centroidsCompare = Lists.newArrayList(
-              IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
-        }
-      }
-
-      // Reading in the "training" set.
-      SequenceFileDirValueIterable<VectorWritable> trainIterable =
-          new SequenceFileDirValueIterable<>(new Path(trainFile), PathType.GLOB, conf);
-      Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
-      Iterable<Vector> datapoints = trainDatapoints;
-
-      printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
-          new SquaredEuclideanDistanceMeasure()), "train");
-
-      // Also adding in the "test" set.
-      if (testFile != null) {
-        SequenceFileDirValueIterable<VectorWritable> testIterable =
-            new SequenceFileDirValueIterable<>(new Path(testFile), PathType.GLOB, conf);
-        Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
-
-        printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
-            new SquaredEuclideanDistanceMeasure()), "test");
-
-        datapoints = Iterables.concat(trainDatapoints, testDatapoints);
-      }
-
-      // At this point, all train/test CSVs have been written. We now compute quality metrics.
-      List<OnlineSummarizer> summaries =
-          ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
-      List<OnlineSummarizer> compareSummaries = null;
-      if (centroidsCompare != null) {
-        compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
-      }
-      System.out.printf("[Dunn Index] First: %f", ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
-      if (compareSummaries != null) {
-        System.out.printf(" Second: %f\n", ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
-      } else {
-        System.out.printf("\n");
-      }
-      System.out.printf("[Davies-Bouldin Index] First: %f",
-          ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
-      if (compareSummaries != null) {
-        System.out.printf(" Second: %f\n",
-          ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
-      } else {
-        System.out.printf("\n");
-      }
-    } catch (IOException e) {
-      System.out.println(e.getMessage());
-    } finally {
-      Closeables.close(fileOut, false);
-    }
-    return 0;
-  }
-
-  private boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help").withDescription("print this list").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder.withLongName("input")
-        .withShortName("i")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
-        .withDescription("where to get seq files with the vectors (training set)")
-        .create();
-
-    Option testInputFileOption = builder.withLongName("testInput")
-        .withShortName("itest")
-        .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
-        .withDescription("where to get seq files with the vectors (test set)")
-        .create();
-
-    Option centroidsFileOption = builder.withLongName("centroids")
-        .withShortName("c")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
-        .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
-        .create();
-
-    Option centroidsCompareFileOption = builder.withLongName("centroidsCompare")
-        .withShortName("cc")
-        .withRequired(false)
-        .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
-        .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or "
-            + "StreamingKMeansDriver)")
-        .create();
-
-    Option outputFileOption = builder.withLongName("output")
-        .withShortName("o")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
-        .withDescription("where to dump the CSV file with the results")
-        .create();
-
-    Option mahoutKMeansFormatOption = builder.withLongName("mahoutkmeansformat")
-        .withShortName("mkm")
-        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
-        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
-        .create();
-
-    Option mahoutKMeansCompareFormatOption = builder.withLongName("mahoutkmeansformatCompare")
-        .withShortName("mkmc")
-        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
-        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
-        .create();
-
-    Group normalArgs = new GroupBuilder()
-        .withOption(help)
-        .withOption(inputFileOption)
-        .withOption(testInputFileOption)
-        .withOption(outputFileOption)
-        .withOption(centroidsFileOption)
-        .withOption(centroidsCompareFileOption)
-        .withOption(mahoutKMeansFormatOption)
-        .withOption(mahoutKMeansCompareFormatOption)
-        .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));
-
-    CommandLine cmdLine = parser.parseAndHelp(args);
-    if (cmdLine == null) {
-      return false;
-    }
-
-    trainFile = (String) cmdLine.getValue(inputFileOption);
-    if (cmdLine.hasOption(testInputFileOption)) {
-      testFile = (String) cmdLine.getValue(testInputFileOption);
-    }
-    centroidFile = (String) cmdLine.getValue(centroidsFileOption);
-    if (cmdLine.hasOption(centroidsCompareFileOption)) {
-      centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
-    }
-    outputFile = (String) cmdLine.getValue(outputFileOption);
-    if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
-      mahoutKMeansFormat = true;
-    }
-    if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
-      mahoutKMeansFormatCompare = true;
-    }
-    return true;
-  }
-
-  public static void main(String[] args) throws IOException {
-    new ClusterQualitySummarizer().run(args);
-  }
-}


[20/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java
new file mode 100644
index 0000000..3a62b08
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+public abstract class AbstractRecommender implements Recommender {
+  
+  private static final Logger log = LoggerFactory.getLogger(AbstractRecommender.class);
+  
+  private final DataModel dataModel;
+  private final CandidateItemsStrategy candidateItemsStrategy;
+  
+  protected AbstractRecommender(DataModel dataModel, CandidateItemsStrategy candidateItemsStrategy) {
+    this.dataModel = Preconditions.checkNotNull(dataModel);
+    this.candidateItemsStrategy = Preconditions.checkNotNull(candidateItemsStrategy);
+  }
+
+  protected AbstractRecommender(DataModel dataModel) {
+    this(dataModel, getDefaultCandidateItemsStrategy());
+  }
+
+  protected static CandidateItemsStrategy getDefaultCandidateItemsStrategy() {
+    return new PreferredItemsNeighborhoodCandidateItemsStrategy();
+  }
+
+
+  /**
+   * <p>
+   * Default implementation which just calls
+   * {@link Recommender#recommend(long, int, org.apache.mahout.cf.taste.recommender.IDRescorer)}, with a
+   * {@link org.apache.mahout.cf.taste.recommender.Rescorer} that does nothing.
+   * </p>
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+    return recommend(userID, howMany, null, false);
+  }
+
+  /**
+   * <p>
+   * Default implementation which just calls
+   * {@link Recommender#recommend(long, int, org.apache.mahout.cf.taste.recommender.IDRescorer)}, with a
+   * {@link org.apache.mahout.cf.taste.recommender.Rescorer} that does nothing.
+   * </p>
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+    return recommend(userID, howMany, null, includeKnownItems);
+  }
+  
+  /**
+   * <p> Delegates to {@link Recommender#recommend(long, int, IDRescorer, boolean)}
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException{
+    return recommend(userID, howMany,rescorer, false);  
+  }
+  
+  /**
+   * <p>
+   * Default implementation which just calls {@link DataModel#setPreference(long, long, float)}.
+   * </p>
+   *
+   * @throws IllegalArgumentException
+   *           if userID or itemID is {@code null}, or if value is {@link Double#NaN}
+   */
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+    log.debug("Setting preference for user {}, item {}", userID, itemID);
+    dataModel.setPreference(userID, itemID, value);
+  }
+  
+  /**
+   * <p>
+   * Default implementation which just calls {@link DataModel#removePreference(long, long)} (Object, Object)}.
+   * </p>
+   *
+   * @throws IllegalArgumentException
+   *           if userID or itemID is {@code null}
+   */
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    log.debug("Remove preference for user '{}', item '{}'", userID, itemID);
+    dataModel.removePreference(userID, itemID);
+  }
+  
+  @Override
+  public DataModel getDataModel() {
+    return dataModel;
+  }
+
+  /**
+   * @param userID
+   *          ID of user being evaluated
+   * @param preferencesFromUser
+   *          the preferences from the user
+   * @param includeKnownItems
+   *          whether to include items already known by the user in recommendations
+   * @return all items in the {@link DataModel} for which the user has not expressed a preference and could
+   *         possibly be recommended to the user
+   * @throws TasteException
+   *           if an error occurs while listing items
+   */
+  protected FastIDSet getAllOtherItems(long userID, PreferenceArray preferencesFromUser, boolean includeKnownItems)
+      throws TasteException {
+    return candidateItemsStrategy.getCandidateItems(userID, preferencesFromUser, dataModel, includeKnownItems);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java
new file mode 100644
index 0000000..37389a7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * returns the result of {@link ItemSimilarity#allSimilarItemIDs(long)} as candidate items
+ */
+public class AllSimilarItemsCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  private final ItemSimilarity similarity;
+
+  public AllSimilarItemsCandidateItemsStrategy(ItemSimilarity similarity) {
+    Preconditions.checkArgument(similarity != null, "similarity is null");
+    this.similarity = similarity;
+  }
+
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+    throws TasteException {
+    FastIDSet candidateItemIDs = new FastIDSet();
+    for (long itemID : preferredItemIDs) {
+      candidateItemIDs.addAll(similarity.allSimilarItemIDs(itemID));
+    }
+    if (!includeKnownItems) {
+      candidateItemIDs.removeAll(preferredItemIDs);
+    }
+    return candidateItemIDs;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java
new file mode 100644
index 0000000..929eddd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+public final class AllUnknownItemsCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  /** return all items the user has not yet seen */
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+    throws TasteException {
+    FastIDSet possibleItemIDs = new FastIDSet(dataModel.getNumItems());
+    LongPrimitiveIterator allItemIDs = dataModel.getItemIDs();
+    while (allItemIDs.hasNext()) {
+      possibleItemIDs.add(allItemIDs.nextLong());
+    }
+    if (!includeKnownItems) {
+      possibleItemIDs.removeAll(preferredItemIDs);
+    }
+    return possibleItemIDs;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java
new file mode 100644
index 0000000..1677ea8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+/**
+ * <p>
+ * Defines ordering on {@link RecommendedItem} by the rescored value of the recommendations' estimated
+ * preference value, from high to low.
+ * </p>
+ */
+final class ByRescoreComparator implements Comparator<RecommendedItem>, Serializable {
+  
+  private final IDRescorer rescorer;
+  
+  ByRescoreComparator(IDRescorer rescorer) {
+    this.rescorer = rescorer;
+  }
+  
+  @Override
+  public int compare(RecommendedItem o1, RecommendedItem o2) {
+    double rescored1;
+    double rescored2;
+    if (rescorer == null) {
+      rescored1 = o1.getValue();
+      rescored2 = o2.getValue();
+    } else {
+      rescored1 = rescorer.rescore(o1.getItemID(), o1.getValue());
+      rescored2 = rescorer.rescore(o2.getItemID(), o2.getValue());
+    }
+    if (rescored1 < rescored2) {
+      return 1;
+    } else if (rescored1 > rescored2) {
+      return -1;
+    } else {
+      return 0;
+    }
+  }
+  
+  @Override
+  public String toString() {
+    return "ByRescoreComparator[rescorer:" + rescorer + ']';
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java
new file mode 100644
index 0000000..57c5f3d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+/**
+ * Defines a natural ordering from most-preferred item (highest value) to least-preferred.
+ */
+public final class ByValueRecommendedItemComparator implements Comparator<RecommendedItem>, Serializable {
+
+  private static final Comparator<RecommendedItem> INSTANCE = new ByValueRecommendedItemComparator();
+
+  public static Comparator<RecommendedItem> getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public int compare(RecommendedItem o1, RecommendedItem o2) {
+    float value1 = o1.getValue();
+    float value2 = o2.getValue();
+    return value1 > value2 ? -1 : value1 < value2 ? 1 : 0;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java
new file mode 100644
index 0000000..7ed8cc3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java
@@ -0,0 +1,251 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.impl.model.PlusAnonymousUserDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.LongPair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
/**
 * <p>
 * A {@link Recommender} which caches the results from another {@link Recommender} in memory.
 * Recommendation lists and estimated preferences are cached separately and invalidated when
 * preferences change or when the rescorer / include-known-items settings change.
 *
 * TODO: Should be checked for thread safety
 * </p>
 */
public final class CachingRecommender implements Recommender {
  
  private static final Logger log = LoggerFactory.getLogger(CachingRecommender.class);
  
  private final Recommender recommender;
  // Single-element array: a mutable holder (and lock object) for the largest howMany seen so far,
  // so cached lists are always computed for the biggest request size observed.
  private final int[] maxHowMany;
  private final Retriever<Long,Recommendations> recommendationsRetriever;
  private final Cache<Long,Recommendations> recommendationCache;
  private final Cache<LongPair,Float> estimatedPrefCache;
  private final RefreshHelper refreshHelper;
  // NOTE(review): these two fields are read and written without synchronization or volatile,
  // consistent with the thread-safety TODO on the class — confirm before concurrent use.
  private IDRescorer currentRescorer;
  private boolean currentlyIncludeKnownItems;
  
  public CachingRecommender(Recommender recommender) throws TasteException {
    Preconditions.checkArgument(recommender != null, "recommender is null");
    this.recommender = recommender;
    maxHowMany = new int[]{1};
    // Use "num users" as an upper limit on cache size. Rough guess.
    int numUsers = recommender.getDataModel().getNumUsers();
    recommendationsRetriever = new RecommendationRetriever();
    recommendationCache = new Cache<>(recommendationsRetriever, numUsers);
    estimatedPrefCache = new Cache<>(new EstimatedPrefRetriever(), numUsers);
    // On refresh, drop all cached results before delegating to the underlying recommender.
    refreshHelper = new RefreshHelper(new Callable<Object>() {
      @Override
      public Object call() {
        clear();
        return null;
      }
    });
    refreshHelper.addDependency(recommender);
  }
  
  // Records the rescorer used for subsequent cache fills; any change invalidates all caches,
  // since cached lists were computed under the previous rescorer.
  private void setCurrentRescorer(IDRescorer rescorer) {
    if (rescorer == null) {
      if (currentRescorer != null) {
        currentRescorer = null;
        clear();
      }
    } else {
      if (!rescorer.equals(currentRescorer)) {
        currentRescorer = rescorer;
        clear();
      }
    }
  }

  public void setCurrentlyIncludeKnownItems(boolean currentlyIncludeKnownItems) {
    this.currentlyIncludeKnownItems = currentlyIncludeKnownItems;
  }

  @Override
  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
    return recommend(userID, howMany, null, false);
  }

  @Override
  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
    return recommend(userID, howMany, null, includeKnownItems);
  }

  @Override
  public List<RecommendedItem> recommend(long userID, int howMany,IDRescorer rescorer) throws TasteException {
      return recommend(userID, howMany, rescorer, false);
  }
 
  @Override
  public List<RecommendedItem> recommend(long userID, int howMany,IDRescorer rescorer, boolean includeKnownItems)
    throws TasteException {
    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
    // Grow the recorded maximum request size so cache fills cover the largest request seen.
    synchronized (maxHowMany) {
      if (howMany > maxHowMany[0]) {
        maxHowMany[0] = howMany;
      }
    }

    // Special case, avoid caching an anonymous user
    // NOTE(review): this path runs before setCurrentRescorer below, so for the anonymous user
    // the retriever uses whatever rescorer/include-known-items state was set previously, not
    // the arguments of this call — confirm this is intended.
    if (userID == PlusAnonymousUserDataModel.TEMP_USER_ID) {
      return recommendationsRetriever.get(PlusAnonymousUserDataModel.TEMP_USER_ID).getItems();
    }

    setCurrentRescorer(rescorer);
    setCurrentlyIncludeKnownItems(includeKnownItems);

    Recommendations recommendations = recommendationCache.get(userID);
    // If the cached list is too short but more items might exist, recompute once; if it is
    // still too short, remember that no more recommendable items exist for this user.
    if (recommendations.getItems().size() < howMany && !recommendations.isNoMoreRecommendableItems()) {
      clear(userID);
      recommendations = recommendationCache.get(userID);
      if (recommendations.getItems().size() < howMany) {
        recommendations.setNoMoreRecommendableItems(true);
      }
    }

    List<RecommendedItem> recommendedItems = recommendations.getItems();
    return recommendedItems.size() > howMany ? recommendedItems.subList(0, howMany) : recommendedItems;
  }
  
  @Override
  public float estimatePreference(long userID, long itemID) throws TasteException {
    return estimatedPrefCache.get(new LongPair(userID, itemID));
  }
  
  @Override
  public void setPreference(long userID, long itemID, float value) throws TasteException {
    recommender.setPreference(userID, itemID, value);
    // Invalidate this user's cached results, which may now be stale.
    clear(userID);
  }
  
  @Override
  public void removePreference(long userID, long itemID) throws TasteException {
    recommender.removePreference(userID, itemID);
    clear(userID);
  }
  
  @Override
  public DataModel getDataModel() {
    return recommender.getDataModel();
  }
  
  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    refreshHelper.refresh(alreadyRefreshed);
  }
  
  /**
   * <p>
   * Clears cached recommendations for the given user.
   * </p>
   * 
   * @param userID
   *          clear cached data associated with this user ID
   */
  public void clear(final long userID) {
    log.debug("Clearing recommendations for user ID '{}'", userID);
    recommendationCache.remove(userID);
    // Also drop every cached estimated preference whose key belongs to this user.
    estimatedPrefCache.removeKeysMatching(new Cache.MatchPredicate<LongPair>() {
      @Override
      public boolean matches(LongPair userItemPair) {
        return userItemPair.getFirst() == userID;
      }
    });
  }
  
  /**
   * <p>
   * Clears all cached recommendations.
   * </p>
   */
  public void clear() {
    log.debug("Clearing all recommendations...");
    recommendationCache.clear();
    estimatedPrefCache.clear();
  }
  
  @Override
  public String toString() {
    return "CachingRecommender[recommender:" + recommender + ']';
  }
  
  /** Cache-miss loader: computes a fresh recommendation list from the delegate recommender. */
  private final class RecommendationRetriever implements Retriever<Long,Recommendations> {
    @Override
    public Recommendations get(Long key) throws TasteException {
      log.debug("Retrieving new recommendations for user ID '{}'", key);
      // NOTE(review): maxHowMany[0] is read here without holding the maxHowMany lock used
      // by recommend() — consistent with the class-level thread-safety TODO.
      int howMany = maxHowMany[0];
      IDRescorer rescorer = currentRescorer;
      List<RecommendedItem> recommendations =
          rescorer == null ? recommender.recommend(key, howMany, null, currentlyIncludeKnownItems) :
              recommender.recommend(key, howMany, rescorer, currentlyIncludeKnownItems);
      return new Recommendations(Collections.unmodifiableList(recommendations));
    }
  }
  
  /** Cache-miss loader: delegates a single (user, item) preference estimate. */
  private final class EstimatedPrefRetriever implements Retriever<LongPair,Float> {
    @Override
    public Float get(LongPair key) throws TasteException {
      long userID = key.getFirst();
      long itemID = key.getSecond();
      log.debug("Retrieving estimated preference for user ID '{}' and item ID '{}'", userID, itemID);
      return recommender.estimatePreference(userID, itemID);
    }
  }
  
  /**
   * Cache value: an immutable recommendation list plus a flag marking that the underlying
   * recommender cannot produce any more items for this user.
   */
  private static final class Recommendations {
    
    private final List<RecommendedItem> items;
    private boolean noMoreRecommendableItems;
    
    private Recommendations(List<RecommendedItem> items) {
      this.items = items;
    }
    
    List<RecommendedItem> getItems() {
      return items;
    }
    
    boolean isNoMoreRecommendableItems() {
      return noMoreRecommendableItems;
    }
    
    void setNoMoreRecommendableItems(boolean noMoreRecommendableItems) {
      this.noMoreRecommendableItems = noMoreRecommendableItems;
    }
  }
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java
new file mode 100644
index 0000000..f0f389f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * Simple class which encapsulates restricting a preference value
+ * to a predefined range. The simple logic is wrapped up here for
+ * performance reasons.
+ */
+public final class EstimatedPreferenceCapper {
+
+  private final float min;
+  private final float max;
+
+  public EstimatedPreferenceCapper(DataModel model) {
+    min = model.getMinPreference();
+    max = model.getMaxPreference();
+  }
+
+  public float capEstimate(float estimate) {
+    if (estimate > max) {
+      estimate = max;
+    } else if (estimate < min) {
+      estimate = min;
+    }
+    return estimate;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java
new file mode 100644
index 0000000..40e21a3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * A variant on {@link GenericItemBasedRecommender} which is appropriate for use when no notion of preference
+ * value exists in the data.
+ *
+ * @see org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender
+ */
+public final class GenericBooleanPrefItemBasedRecommender extends GenericItemBasedRecommender {
+
+  public GenericBooleanPrefItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) {
+    super(dataModel, similarity);
+  }
+
+  public GenericBooleanPrefItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity,
+      CandidateItemsStrategy candidateItemsStrategy, MostSimilarItemsCandidateItemsStrategy
+      mostSimilarItemsCandidateItemsStrategy) {
+    super(dataModel, similarity, candidateItemsStrategy, mostSimilarItemsCandidateItemsStrategy);
+  }
+  
+  /**
+   * This computation is in a technical sense, wrong, since in the domain of "boolean preference users" where
+   * all preference values are 1, this method should only ever return 1.0 or NaN. This isn't terribly useful
+   * however since it means results can't be ranked by preference value (all are 1). So instead this returns a
+   * sum of similarities.
+   */
+  @Override
+  protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID)
+    throws TasteException {
+    double[] similarities = getSimilarity().itemSimilarities(itemID, preferencesFromUser.getIDs());
+    boolean foundAPref = false;
+    double totalSimilarity = 0.0;
+    for (double theSimilarity : similarities) {
+      if (!Double.isNaN(theSimilarity)) {
+        foundAPref = true;
+        totalSimilarity += theSimilarity;
+      }
+    }
+    return foundAPref ? (float) totalSimilarity : Float.NaN;
+  }
+  
+  @Override
+  public String toString() {
+    return "GenericBooleanPrefItemBasedRecommender";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java
new file mode 100644
index 0000000..15fcc9f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * A variant on {@link GenericUserBasedRecommender} which is appropriate for use when no notion of preference
+ * value exists in the data.
+ */
+public final class GenericBooleanPrefUserBasedRecommender extends GenericUserBasedRecommender {
+  
+  public GenericBooleanPrefUserBasedRecommender(DataModel dataModel,
+                                                UserNeighborhood neighborhood,
+                                                UserSimilarity similarity) {
+    super(dataModel, neighborhood, similarity);
+  }
+  
+  /**
+   * This computation is in a technical sense, wrong, since in the domain of "boolean preference users" where
+   * all preference values are 1, this method should only ever return 1.0 or NaN. This isn't terribly useful
+   * however since it means results can't be ranked by preference value (all are 1). So instead this returns a
+   * sum of similarities to any other user in the neighborhood who has also rated the item.
+   */
+  @Override
+  protected float doEstimatePreference(long theUserID, long[] theNeighborhood, long itemID) throws TasteException {
+    if (theNeighborhood.length == 0) {
+      return Float.NaN;
+    }
+    DataModel dataModel = getDataModel();
+    UserSimilarity similarity = getSimilarity();
+    float totalSimilarity = 0.0f;
+    boolean foundAPref = false;
+    for (long userID : theNeighborhood) {
+      // See GenericItemBasedRecommender.doEstimatePreference() too
+      if (userID != theUserID && dataModel.getPreferenceValue(userID, itemID) != null) {
+        foundAPref = true;
+        totalSimilarity += (float) similarity.userSimilarity(theUserID, userID);
+      }
+    }
+    return foundAPref ? totalSimilarity : Float.NaN;
+  }
+  
+  @Override
+  protected FastIDSet getAllOtherItems(long[] theNeighborhood, long theUserID, boolean includeKnownItems)
+      throws TasteException {
+    DataModel dataModel = getDataModel();
+    FastIDSet possibleItemIDs = new FastIDSet();
+    for (long userID : theNeighborhood) {
+      possibleItemIDs.addAll(dataModel.getItemIDsFromUser(userID));
+    }
+    if (!includeKnownItems) {
+      possibleItemIDs.removeAll(dataModel.getItemIDsFromUser(theUserID));
+    }
+    return possibleItemIDs;
+  }
+  
+  @Override
+  public String toString() {
+    return "GenericBooleanPrefUserBasedRecommender";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java
new file mode 100644
index 0000000..6dc8aa5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java
@@ -0,0 +1,378 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Rescorer;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.LongPair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple {@link org.apache.mahout.cf.taste.recommender.Recommender} which uses a given
+ * {@link org.apache.mahout.cf.taste.model.DataModel} and
+ * {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} to produce recommendations. This class
+ * represents Taste's support for item-based recommenders.
+ * </p>
+ * 
+ * <p>
+ * The {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} is the most important point to discuss
+ * here. Item-based recommenders are useful because they can take advantage of something to be very fast: they
+ * base their computations on item similarity, not user similarity, and item similarity is relatively static.
+ * It can be precomputed, instead of re-computed in real time.
+ * </p>
+ * 
+ * <p>
+ * Thus it's strongly recommended that you use
+ * {@link org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity} with pre-computed similarities if
+ * you're going to use this class. You can use
+ * {@link org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity} too, which computes
+ * similarities in real-time, but will probably find this painfully slow for large amounts of data.
+ * </p>
+ */
public class GenericItemBasedRecommender extends AbstractRecommender implements ItemBasedRecommender {
  
  private static final Logger log = LoggerFactory.getLogger(GenericItemBasedRecommender.class);
  
  // Pairwise item-item similarity used for all estimates; typically precomputed for speed.
  private final ItemSimilarity similarity;
  // Chooses which candidate items the mostSimilarItems() methods consider.
  private final MostSimilarItemsCandidateItemsStrategy mostSimilarItemsCandidateItemsStrategy;
  // Coordinates refresh() across this recommender and its dependencies.
  private final RefreshHelper refreshHelper;
  // Clamps estimates into the model's [min,max] preference range; null when the model defines no range.
  private EstimatedPreferenceCapper capper;

  private static final boolean EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT = true;

  /**
   * Builds a recommender over the given model using the given item similarity and candidate strategies.
   *
   * @throws IllegalArgumentException if similarity or mostSimilarItemsCandidateItemsStrategy is null
   */
  public GenericItemBasedRecommender(DataModel dataModel,
                                     ItemSimilarity similarity,
                                     CandidateItemsStrategy candidateItemsStrategy,
                                     MostSimilarItemsCandidateItemsStrategy mostSimilarItemsCandidateItemsStrategy) {
    super(dataModel, candidateItemsStrategy);
    Preconditions.checkArgument(similarity != null, "similarity is null");
    this.similarity = similarity;
    Preconditions.checkArgument(mostSimilarItemsCandidateItemsStrategy != null,
        "mostSimilarItemsCandidateItemsStrategy is null");
    this.mostSimilarItemsCandidateItemsStrategy = mostSimilarItemsCandidateItemsStrategy;
    // Rebuild the capper on refresh since the model's min/max preferences may change.
    this.refreshHelper = new RefreshHelper(new Callable<Void>() {
      @Override
      public Void call() {
        capper = buildCapper();
        return null;
      }
    });
    refreshHelper.addDependency(dataModel);
    refreshHelper.addDependency(similarity);
    refreshHelper.addDependency(candidateItemsStrategy);
    refreshHelper.addDependency(mostSimilarItemsCandidateItemsStrategy);
    capper = buildCapper();
  }

  /** Convenience constructor using the default candidate-items strategies. */
  public GenericItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) {
    this(dataModel,
         similarity,
         AbstractRecommender.getDefaultCandidateItemsStrategy(),
         getDefaultMostSimilarItemsCandidateItemsStrategy());
  }

  protected static MostSimilarItemsCandidateItemsStrategy getDefaultMostSimilarItemsCandidateItemsStrategy() {
    return new PreferredItemsNeighborhoodCandidateItemsStrategy();
  }

  public ItemSimilarity getSimilarity() {
    return similarity;
  }
  
  /**
   * Recommends up to {@code howMany} items for the user, ranked by estimated preference.
   * Returns an empty list when the user has expressed no preferences.
   */
  @Override
  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
    throws TasteException {
    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
    log.debug("Recommending items for user ID '{}'", userID);

    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
    if (preferencesFromUser.length() == 0) {
      return Collections.emptyList();
    }

    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);

    TopItems.Estimator<Long> estimator = new Estimator(userID, preferencesFromUser);

    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
      estimator);

    log.debug("Recommendations are: {}", topItems);
    return topItems;
  }
  
  /**
   * Returns the user's actual preference for the item when one exists; otherwise estimates it
   * from item-item similarities.
   */
  @Override
  public float estimatePreference(long userID, long itemID) throws TasteException {
    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
    Float actualPref = getPreferenceForItem(preferencesFromUser, itemID);
    if (actualPref != null) {
      return actualPref;
    }
    return doEstimatePreference(userID, preferencesFromUser, itemID);
  }

  // Linear scan for the user's stored preference for itemID; null when the user has none.
  private static Float getPreferenceForItem(PreferenceArray preferencesFromUser, long itemID) {
    int size = preferencesFromUser.length();
    for (int i = 0; i < size; i++) {
      if (preferencesFromUser.getItemID(i) == itemID) {
        return preferencesFromUser.getValue(i);
      }
    }
    return null;
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long itemID, int howMany) throws TasteException {
    return mostSimilarItems(itemID, howMany, null);
  }
  
  @Override
  public List<RecommendedItem> mostSimilarItems(long itemID, int howMany,
                                                Rescorer<LongPair> rescorer) throws TasteException {
    TopItems.Estimator<Long> estimator = new MostSimilarEstimator(itemID, similarity, rescorer);
    return doMostSimilarItems(new long[] {itemID}, howMany, estimator);
  }
  
  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, null,
        EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }
  
  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany,
                                                Rescorer<LongPair> rescorer) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, rescorer,
        EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs,
                                                int howMany,
                                                boolean excludeItemIfNotSimilarToAll) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, null,
        excludeItemIfNotSimilarToAll);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany,
                                                Rescorer<LongPair> rescorer,
                                                boolean excludeItemIfNotSimilarToAll) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, rescorer,
        excludeItemIfNotSimilarToAll);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }

  /**
   * Explains a recommendation: ranks the user's other preferred items by how strongly they
   * (similarity-weighted) contributed to recommending {@code itemID}.
   */
  @Override
  public List<RecommendedItem> recommendedBecause(long userID, long itemID, int howMany) throws TasteException {
    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");

    DataModel model = getDataModel();
    TopItems.Estimator<Long> estimator = new RecommendedBecauseEstimator(userID, itemID);

    PreferenceArray prefs = model.getPreferencesFromUser(userID);
    int size = prefs.length();
    FastIDSet allUserItems = new FastIDSet(size);
    for (int i = 0; i < size; i++) {
      allUserItems.add(prefs.getItemID(i));
    }
    // The recommended item itself cannot be its own explanation.
    allUserItems.remove(itemID);

    return TopItems.getTopItems(howMany, allUserItems.iterator(), null, estimator);
  }
  
  // Shared tail of all mostSimilarItems() overloads: pick candidates, then rank with the estimator.
  private List<RecommendedItem> doMostSimilarItems(long[] itemIDs,
                                                   int howMany,
                                                   TopItems.Estimator<Long> estimator) throws TasteException {
    FastIDSet possibleItemIDs = mostSimilarItemsCandidateItemsStrategy.getCandidateItems(itemIDs, getDataModel());
    return TopItems.getTopItems(howMany, possibleItemIDs.iterator(), null, estimator);
  }
  
  /**
   * Estimates a preference as the similarity-weighted average of the user's existing
   * preference values. Returns NaN when fewer than two similarities are defined.
   */
  protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID)
    throws TasteException {
    double preference = 0.0;
    double totalSimilarity = 0.0;
    int count = 0;
    double[] similarities = similarity.itemSimilarities(itemID, preferencesFromUser.getIDs());
    for (int i = 0; i < similarities.length; i++) {
      double theSimilarity = similarities[i];
      if (!Double.isNaN(theSimilarity)) {
        // Weights can be negative!
        preference += theSimilarity * preferencesFromUser.getValue(i);
        totalSimilarity += theSimilarity;
        count++;
      }
    }
    // Throw out the estimate if it was based on no data points, of course, but also if based on
    // just one. This is a bit of a band-aid on the 'stock' item-based algorithm for the moment.
    // The reason is that in this case the estimate is, simply, the user's rating for one item
    // that happened to have a defined similarity. The similarity score doesn't matter, and that
    // seems like a bad situation.
    if (count <= 1) {
      return Float.NaN;
    }
    float estimate = (float) (preference / totalSimilarity);
    if (capper != null) {
      estimate = capper.capEstimate(estimate);
    }
    return estimate;
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    refreshHelper.refresh(alreadyRefreshed);
  }
  
  @Override
  public String toString() {
    return "GenericItemBasedRecommender[similarity:" + similarity + ']';
  }

  // A capper only makes sense when the model declares at least one of min/max preference.
  private EstimatedPreferenceCapper buildCapper() {
    DataModel dataModel = getDataModel();
    if (Float.isNaN(dataModel.getMinPreference()) && Float.isNaN(dataModel.getMaxPreference())) {
      return null;
    } else {
      return new EstimatedPreferenceCapper(dataModel);
    }
  }
  
  /** Scores candidate items by their (optionally rescored) similarity to a single fixed item. */
  public static class MostSimilarEstimator implements TopItems.Estimator<Long> {
    
    private final long toItemID;
    private final ItemSimilarity similarity;
    private final Rescorer<LongPair> rescorer;
    
    public MostSimilarEstimator(long toItemID, ItemSimilarity similarity, Rescorer<LongPair> rescorer) {
      this.toItemID = toItemID;
      this.similarity = similarity;
      this.rescorer = rescorer;
    }
    
    @Override
    public double estimate(Long itemID) throws TasteException {
      LongPair pair = new LongPair(toItemID, itemID);
      // NaN signals "filtered out" to TopItems.
      if (rescorer != null && rescorer.isFiltered(pair)) {
        return Double.NaN;
      }
      double originalEstimate = similarity.itemSimilarity(toItemID, itemID);
      return rescorer == null ? originalEstimate : rescorer.rescore(pair, originalEstimate);
    }
  }
  
  /** Adapts doEstimatePreference() for one user to the TopItems.Estimator interface. */
  private final class Estimator implements TopItems.Estimator<Long> {
    
    private final long userID;
    private final PreferenceArray preferencesFromUser;
    
    private Estimator(long userID, PreferenceArray preferencesFromUser) {
      this.userID = userID;
      this.preferencesFromUser = preferencesFromUser;
    }
    
    @Override
    public double estimate(Long itemID) throws TasteException {
      return doEstimatePreference(userID, preferencesFromUser, itemID);
    }
  }
  
  /** Scores candidate items by their average similarity to a set of fixed items. */
  private static final class MultiMostSimilarEstimator implements TopItems.Estimator<Long> {
    
    private final long[] toItemIDs;
    private final ItemSimilarity similarity;
    private final Rescorer<LongPair> rescorer;
    private final boolean excludeItemIfNotSimilarToAll;
    
    private MultiMostSimilarEstimator(long[] toItemIDs, ItemSimilarity similarity, Rescorer<LongPair> rescorer,
        boolean excludeItemIfNotSimilarToAll) {
      this.toItemIDs = toItemIDs;
      this.similarity = similarity;
      this.rescorer = rescorer;
      this.excludeItemIfNotSimilarToAll = excludeItemIfNotSimilarToAll;
    }
    
    @Override
    public double estimate(Long itemID) throws TasteException {
      RunningAverage average = new FullRunningAverage();
      double[] similarities = similarity.itemSimilarities(itemID, toItemIDs);
      for (int i = 0; i < toItemIDs.length; i++) {
        long toItemID = toItemIDs[i];
        LongPair pair = new LongPair(toItemID, itemID);
        if (rescorer != null && rescorer.isFiltered(pair)) {
          continue;
        }
        double estimate = similarities[i];
        if (rescorer != null) {
          estimate = rescorer.rescore(pair, estimate);
        }
        // When excludeItemIfNotSimilarToAll, a NaN similarity is added anyway, making the
        // average NaN and thereby excluding the candidate entirely.
        if (excludeItemIfNotSimilarToAll || !Double.isNaN(estimate)) {
          average.addDatum(estimate);
        }
      }
      double averageEstimate = average.getAverage();
      // NOTE(review): a zero average is treated as "no data" and mapped to NaN; this also
      // discards a legitimately-zero average similarity.
      return averageEstimate == 0 ? Double.NaN : averageEstimate;
    }
  }
  
  /** Scores a user's other items by similarity to the recommended item, weighted by their preference. */
  private final class RecommendedBecauseEstimator implements TopItems.Estimator<Long> {
    
    private final long userID;
    private final long recommendedItemID;

    private RecommendedBecauseEstimator(long userID, long recommendedItemID) {
      this.userID = userID;
      this.recommendedItemID = recommendedItemID;
    }
    
    @Override
    public double estimate(Long itemID) throws TasteException {
      Float pref = getDataModel().getPreferenceValue(userID, itemID);
      if (pref == null) {
        // Float.NaN widens to a double NaN here, signalling "no score".
        return Float.NaN;
      }
      double similarityValue = similarity.itemSimilarity(recommendedItemID, itemID);
      // Shift similarity from [-1,1] to [0,2] so the product stays non-negative for positive prefs.
      return (1.0 + similarityValue) * pref;
    }
  }
  
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java
new file mode 100644
index 0000000..8c8f6ce
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple implementation of {@link RecommendedItem}.
+ * </p>
+ */
+public final class GenericRecommendedItem implements RecommendedItem, Serializable {
+  
+  private final long itemID;
+  private final float value;
+  
+  /**
+   * @throws IllegalArgumentException
+   *           if item is null or value is NaN
+   */
+  public GenericRecommendedItem(long itemID, float value) {
+    Preconditions.checkArgument(!Float.isNaN(value), "value is NaN");
+    this.itemID = itemID;
+    this.value = value;
+  }
+  
+  @Override
+  public long getItemID() {
+    return itemID;
+  }
+  
+  @Override
+  public float getValue() {
+    return value;
+  }
+
+  @Override
+  public String toString() {
+    return "RecommendedItem[item:" + itemID + ", value:" + value + ']';
+  }
+  
+  @Override
+  public int hashCode() {
+    return (int) itemID ^ RandomUtils.hashFloat(value);
+  }
+  
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof GenericRecommendedItem)) {
+      return false;
+    }
+    RecommendedItem other = (RecommendedItem) o;
+    return itemID == other.getItemID() && value == other.getValue();
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java
new file mode 100644
index 0000000..1e2ef73
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java
@@ -0,0 +1,247 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Rescorer;
+import org.apache.mahout.cf.taste.recommender.UserBasedRecommender;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.LongPair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple {@link org.apache.mahout.cf.taste.recommender.Recommender}
+ * which uses a given {@link DataModel} and {@link UserNeighborhood} to produce recommendations.
+ * </p>
+ */
+public class GenericUserBasedRecommender extends AbstractRecommender implements UserBasedRecommender {
+  
+  private static final Logger log = LoggerFactory.getLogger(GenericUserBasedRecommender.class);
+  
+  private final UserNeighborhood neighborhood;
+  private final UserSimilarity similarity;
+  private final RefreshHelper refreshHelper;
+  private EstimatedPreferenceCapper capper;
+  
+  public GenericUserBasedRecommender(DataModel dataModel,
+                                     UserNeighborhood neighborhood,
+                                     UserSimilarity similarity) {
+    super(dataModel);
+    Preconditions.checkArgument(neighborhood != null, "neighborhood is null");
+    this.neighborhood = neighborhood;
+    this.similarity = similarity;
+    this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+      @Override
+      public Void call() {
+        capper = buildCapper();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(dataModel);
+    refreshHelper.addDependency(similarity);
+    refreshHelper.addDependency(neighborhood);
+    capper = buildCapper();
+  }
+  
+  public UserSimilarity getSimilarity() {
+    return similarity;
+  }
+  
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+
+    log.debug("Recommending items for user ID '{}'", userID);
+
+    long[] theNeighborhood = neighborhood.getUserNeighborhood(userID);
+
+    if (theNeighborhood.length == 0) {
+      return Collections.emptyList();
+    }
+
+    FastIDSet allItemIDs = getAllOtherItems(theNeighborhood, userID, includeKnownItems);
+
+    TopItems.Estimator<Long> estimator = new Estimator(userID, theNeighborhood);
+
+    List<RecommendedItem> topItems = TopItems
+        .getTopItems(howMany, allItemIDs.iterator(), rescorer, estimator);
+
+    log.debug("Recommendations are: {}", topItems);
+    return topItems;
+  }
+  
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    DataModel model = getDataModel();
+    Float actualPref = model.getPreferenceValue(userID, itemID);
+    if (actualPref != null) {
+      return actualPref;
+    }
+    long[] theNeighborhood = neighborhood.getUserNeighborhood(userID);
+    return doEstimatePreference(userID, theNeighborhood, itemID);
+  }
+  
+  @Override
+  public long[] mostSimilarUserIDs(long userID, int howMany) throws TasteException {
+    return mostSimilarUserIDs(userID, howMany, null);
+  }
+  
+  @Override
+  public long[] mostSimilarUserIDs(long userID, int howMany, Rescorer<LongPair> rescorer) throws TasteException {
+    TopItems.Estimator<Long> estimator = new MostSimilarEstimator(userID, similarity, rescorer);
+    return doMostSimilarUsers(howMany, estimator);
+  }
+  
+  private long[] doMostSimilarUsers(int howMany, TopItems.Estimator<Long> estimator) throws TasteException {
+    DataModel model = getDataModel();
+    return TopItems.getTopUsers(howMany, model.getUserIDs(), null, estimator);
+  }
+  
+  protected float doEstimatePreference(long theUserID, long[] theNeighborhood, long itemID) throws TasteException {
+    if (theNeighborhood.length == 0) {
+      return Float.NaN;
+    }
+    DataModel dataModel = getDataModel();
+    double preference = 0.0;
+    double totalSimilarity = 0.0;
+    int count = 0;
+    for (long userID : theNeighborhood) {
+      if (userID != theUserID) {
+        // See GenericItemBasedRecommender.doEstimatePreference() too
+        Float pref = dataModel.getPreferenceValue(userID, itemID);
+        if (pref != null) {
+          double theSimilarity = similarity.userSimilarity(theUserID, userID);
+          if (!Double.isNaN(theSimilarity)) {
+            preference += theSimilarity * pref;
+            totalSimilarity += theSimilarity;
+            count++;
+          }
+        }
+      }
+    }
+    // Throw out the estimate if it was based on no data points, of course, but also if based on
+    // just one. This is a bit of a band-aid on the 'stock' item-based algorithm for the moment.
+    // The reason is that in this case the estimate is, simply, the user's rating for one item
+    // that happened to have a defined similarity. The similarity score doesn't matter, and that
+    // seems like a bad situation.
+    if (count <= 1) {
+      return Float.NaN;
+    }
+    float estimate = (float) (preference / totalSimilarity);
+    if (capper != null) {
+      estimate = capper.capEstimate(estimate);
+    }
+    return estimate;
+  }
+  
+  protected FastIDSet getAllOtherItems(long[] theNeighborhood, long theUserID, boolean includeKnownItems)
+    throws TasteException {
+    DataModel dataModel = getDataModel();
+    FastIDSet possibleItemIDs = new FastIDSet();
+    for (long userID : theNeighborhood) {
+      possibleItemIDs.addAll(dataModel.getItemIDsFromUser(userID));
+    }
+    if (!includeKnownItems) {
+      possibleItemIDs.removeAll(dataModel.getItemIDsFromUser(theUserID));
+    }
+    return possibleItemIDs;
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+  
+  @Override
+  public String toString() {
+    return "GenericUserBasedRecommender[neighborhood:" + neighborhood + ']';
+  }
+
+  private EstimatedPreferenceCapper buildCapper() {
+    DataModel dataModel = getDataModel();
+    if (Float.isNaN(dataModel.getMinPreference()) && Float.isNaN(dataModel.getMaxPreference())) {
+      return null;
+    } else {
+      return new EstimatedPreferenceCapper(dataModel);
+    }
+  }
+  
+  private static final class MostSimilarEstimator implements TopItems.Estimator<Long> {
+    
+    private final long toUserID;
+    private final UserSimilarity similarity;
+    private final Rescorer<LongPair> rescorer;
+    
+    private MostSimilarEstimator(long toUserID, UserSimilarity similarity, Rescorer<LongPair> rescorer) {
+      this.toUserID = toUserID;
+      this.similarity = similarity;
+      this.rescorer = rescorer;
+    }
+    
+    @Override
+    public double estimate(Long userID) throws TasteException {
+      // Don't consider the user itself as a possible most similar user
+      if (userID == toUserID) {
+        return Double.NaN;
+      }
+      if (rescorer == null) {
+        return similarity.userSimilarity(toUserID, userID);
+      } else {
+        LongPair pair = new LongPair(toUserID, userID);
+        if (rescorer.isFiltered(pair)) {
+          return Double.NaN;
+        }
+        double originalEstimate = similarity.userSimilarity(toUserID, userID);
+        return rescorer.rescore(pair, originalEstimate);
+      }
+    }
+  }
+  
+  private final class Estimator implements TopItems.Estimator<Long> {
+    
+    private final long theUserID;
+    private final long[] theNeighborhood;
+    
+    Estimator(long theUserID, long[] theNeighborhood) {
+      this.theUserID = theUserID;
+      this.theNeighborhood = theNeighborhood;
+    }
+    
+    @Override
+    public double estimate(Long itemID) throws TasteException {
+      return doEstimatePreference(theUserID, theNeighborhood, itemID);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java
new file mode 100644
index 0000000..618c65f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple recommender that always estimates preference for an item to be the average of all known preference
+ * values for that item. No information about users is taken into account. This implementation is provided for
+ * experimentation; while simple and fast, it may not produce very good recommendations.
+ * </p>
+ */
+public final class ItemAverageRecommender extends AbstractRecommender {
+  
+  private static final Logger log = LoggerFactory.getLogger(ItemAverageRecommender.class);
+  
+  // Per-item running average of all known preference values; guarded by buildAveragesLock.
+  private final FastByIDMap<RunningAverage> itemAverages;
+  // Read lock for estimation, write lock for building/updating the averages.
+  private final ReadWriteLock buildAveragesLock;
+  // Rebuilds the averages whenever the underlying DataModel is refreshed.
+  private final RefreshHelper refreshHelper;
+  
+  /**
+   * Builds the recommender and eagerly computes item averages from {@code dataModel}.
+   *
+   * @throws TasteException if reading preference data from the model fails
+   */
+  public ItemAverageRecommender(DataModel dataModel) throws TasteException {
+    super(dataModel);
+    this.itemAverages = new FastByIDMap<>();
+    this.buildAveragesLock = new ReentrantReadWriteLock();
+    this.refreshHelper = new RefreshHelper(new Callable<Object>() {
+      @Override
+      public Object call() throws TasteException {
+        buildAverageDiffs();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(dataModel);
+    buildAverageDiffs();
+  }
+  
+  /**
+   * Recommends up to {@code howMany} candidate items ranked purely by item average preference,
+   * optionally filtered/adjusted by {@code rescorer}.
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+    log.debug("Recommending items for user ID '{}'", userID);
+
+    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
+    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);
+
+    TopItems.Estimator<Long> estimator = new Estimator();
+
+    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
+      estimator);
+
+    log.debug("Recommendations are: {}", topItems);
+    return topItems;
+  }
+  
+  /**
+   * Returns the user's actual preference when one exists, otherwise the item's average
+   * (or {@link Float#NaN} if the item is unknown).
+   */
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    DataModel dataModel = getDataModel();
+    Float actualPref = dataModel.getPreferenceValue(userID, itemID);
+    if (actualPref != null) {
+      return actualPref;
+    }
+    return doEstimatePreference(itemID);
+  }
+  
+  // Reads the item's running average under the read lock; NaN when no data exists for it.
+  private float doEstimatePreference(long itemID) {
+    buildAveragesLock.readLock().lock();
+    try {
+      RunningAverage average = itemAverages.get(itemID);
+      return average == null ? Float.NaN : (float) average.getAverage();
+    } finally {
+      buildAveragesLock.readLock().unlock();
+    }
+  }
+  
+  // Full rebuild: one pass over every user's preferences, accumulating per-item averages.
+  // Note: itemAverages is never cleared here, so repeated refreshes accumulate into the
+  // existing averages rather than recomputing from scratch — TODO confirm this is intended.
+  private void buildAverageDiffs() throws TasteException {
+    try {
+      buildAveragesLock.writeLock().lock();
+      DataModel dataModel = getDataModel();
+      LongPrimitiveIterator it = dataModel.getUserIDs();
+      while (it.hasNext()) {
+        PreferenceArray prefs = dataModel.getPreferencesFromUser(it.nextLong());
+        int size = prefs.length();
+        for (int i = 0; i < size; i++) {
+          long itemID = prefs.getItemID(i);
+          RunningAverage average = itemAverages.get(itemID);
+          if (average == null) {
+            average = new FullRunningAverage();
+            itemAverages.put(itemID, average);
+          }
+          average.addDatum(prefs.getValue(i));
+        }
+      }
+    } finally {
+      buildAveragesLock.writeLock().unlock();
+    }
+  }
+  
+  /**
+   * Records the preference in the model and incrementally adjusts the item's running average
+   * by the delta between the new and any previous value.
+   */
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    DataModel dataModel = getDataModel();
+    double prefDelta;
+    try {
+      Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+      prefDelta = oldPref == null ? value : value - oldPref;
+    } catch (NoSuchUserException nsee) {
+      // Unknown user: treat the new value as a fresh datum
+      prefDelta = value;
+    }
+    super.setPreference(userID, itemID, value);
+    try {
+      buildAveragesLock.writeLock().lock();
+      RunningAverage average = itemAverages.get(itemID);
+      if (average == null) {
+        RunningAverage newAverage = new FullRunningAverage();
+        newAverage.addDatum(prefDelta);
+        itemAverages.put(itemID, newAverage);
+      } else {
+        // NOTE(review): when the user had no prior preference, changeDatum applies the delta
+        // without growing the datum count — confirm this matches RunningAverage's contract.
+        average.changeDatum(prefDelta);
+      }
+    } finally {
+      buildAveragesLock.writeLock().unlock();
+    }
+  }
+  
+  /**
+   * Removes the preference from the model and subtracts its value from the item's average.
+   *
+   * @throws IllegalStateException if the model had a value but no average exists for the item
+   */
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    DataModel dataModel = getDataModel();
+    Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+    super.removePreference(userID, itemID);
+    if (oldPref != null) {
+      try {
+        buildAveragesLock.writeLock().lock();
+        RunningAverage average = itemAverages.get(itemID);
+        if (average == null) {
+          throw new IllegalStateException("No preferences exist for item ID: " + itemID);
+        } else {
+          average.removeDatum(oldPref);
+        }
+      } finally {
+        buildAveragesLock.writeLock().unlock();
+      }
+    }
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+  
+  @Override
+  public String toString() {
+    return "ItemAverageRecommender";
+  }
+  
+  /** Adapter letting {@link TopItems} rank candidates by item average. */
+  private final class Estimator implements TopItems.Estimator<Long> {
+    
+    @Override
+    public double estimate(Long itemID) {
+      return doEstimatePreference(itemID);
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java
new file mode 100644
index 0000000..b2bcd24
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java
@@ -0,0 +1,240 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Like {@link ItemAverageRecommender}, except that estimated preferences are adjusted for the users' average
+ * preference value. For example, say user X has not rated item Y. Item Y's average preference value is 3.5.
+ * User X's average preference value is 4.2, and the average over all preference values is 4.0. User X prefers
+ * items 0.2 higher on average, so, the estimated preference for user X, item Y is 3.5 + 0.2 = 3.7.
+ * </p>
+ */
+public final class ItemUserAverageRecommender extends AbstractRecommender {
+  
+  private static final Logger log = LoggerFactory.getLogger(ItemUserAverageRecommender.class);
+  
+  // Per-item averages, per-user averages, and the global average preference value;
+  // all three are guarded by buildAveragesLock.
+  private final FastByIDMap<RunningAverage> itemAverages;
+  private final FastByIDMap<RunningAverage> userAverages;
+  private final RunningAverage overallAveragePrefValue;
+  private final ReadWriteLock buildAveragesLock;
+  // Rebuilds the averages whenever the underlying DataModel is refreshed.
+  private final RefreshHelper refreshHelper;
+  
+  /**
+   * Builds the recommender and eagerly computes item, user, and overall averages.
+   *
+   * @throws TasteException if reading preference data from the model fails
+   */
+  public ItemUserAverageRecommender(DataModel dataModel) throws TasteException {
+    super(dataModel);
+    this.itemAverages = new FastByIDMap<>();
+    this.userAverages = new FastByIDMap<>();
+    this.overallAveragePrefValue = new FullRunningAverage();
+    this.buildAveragesLock = new ReentrantReadWriteLock();
+    this.refreshHelper = new RefreshHelper(new Callable<Object>() {
+      @Override
+      public Object call() throws TasteException {
+        buildAverageDiffs();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(dataModel);
+    buildAverageDiffs();
+  }
+  
+  /**
+   * Recommends up to {@code howMany} candidate items ranked by the user-adjusted item
+   * average estimate, optionally filtered/adjusted by {@code rescorer}.
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+    log.debug("Recommending items for user ID '{}'", userID);
+
+    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
+    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);
+
+    TopItems.Estimator<Long> estimator = new Estimator(userID);
+
+    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
+      estimator);
+
+    log.debug("Recommendations are: {}", topItems);
+    return topItems;
+  }
+  
+  /**
+   * Returns the user's actual preference when one exists, otherwise the item average
+   * shifted by the user's deviation from the overall average.
+   */
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    DataModel dataModel = getDataModel();
+    Float actualPref = dataModel.getPreferenceValue(userID, itemID);
+    if (actualPref != null) {
+      return actualPref;
+    }
+    return doEstimatePreference(userID, itemID);
+  }
+  
+  // itemAverage + (userAverage - overallAverage), read under the read lock;
+  // NaN when either the item or the user is unknown.
+  private float doEstimatePreference(long userID, long itemID) {
+    buildAveragesLock.readLock().lock();
+    try {
+      RunningAverage itemAverage = itemAverages.get(itemID);
+      if (itemAverage == null) {
+        return Float.NaN;
+      }
+      RunningAverage userAverage = userAverages.get(userID);
+      if (userAverage == null) {
+        return Float.NaN;
+      }
+      double userDiff = userAverage.getAverage() - overallAveragePrefValue.getAverage();
+      return (float) (itemAverage.getAverage() + userDiff);
+    } finally {
+      buildAveragesLock.readLock().unlock();
+    }
+  }
+
+  // Full rebuild: one pass over every preference, accumulating item, user, and overall averages.
+  private void buildAverageDiffs() throws TasteException {
+    try {
+      buildAveragesLock.writeLock().lock();
+      DataModel dataModel = getDataModel();
+      LongPrimitiveIterator it = dataModel.getUserIDs();
+      while (it.hasNext()) {
+        long userID = it.nextLong();
+        PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+        int size = prefs.length();
+        for (int i = 0; i < size; i++) {
+          long itemID = prefs.getItemID(i);
+          float value = prefs.getValue(i);
+          addDatumAndCreateIfNeeded(itemID, value, itemAverages);
+          addDatumAndCreateIfNeeded(userID, value, userAverages);
+          overallAveragePrefValue.addDatum(value);
+        }
+      }
+    } finally {
+      buildAveragesLock.writeLock().unlock();
+    }
+  }
+  
+  // Adds value to the running average keyed by itemID, creating the average on first use.
+  private static void addDatumAndCreateIfNeeded(long itemID, float value, FastByIDMap<RunningAverage> averages) {
+    RunningAverage itemAverage = averages.get(itemID);
+    if (itemAverage == null) {
+      itemAverage = new FullRunningAverage();
+      averages.put(itemID, itemAverage);
+    }
+    itemAverage.addDatum(value);
+  }
+  
+  /**
+   * Records the preference in the model and incrementally adjusts the item, user, and
+   * overall averages by the delta between the new and any previous value.
+   */
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    DataModel dataModel = getDataModel();
+    double prefDelta;
+    try {
+      Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+      prefDelta = oldPref == null ? value : value - oldPref;
+    } catch (NoSuchUserException nsee) {
+      // Unknown user: treat the new value as a fresh datum
+      prefDelta = value;
+    }
+    super.setPreference(userID, itemID, value);
+    try {
+      buildAveragesLock.writeLock().lock();
+      RunningAverage itemAverage = itemAverages.get(itemID);
+      if (itemAverage == null) {
+        RunningAverage newItemAverage = new FullRunningAverage();
+        newItemAverage.addDatum(prefDelta);
+        itemAverages.put(itemID, newItemAverage);
+      } else {
+        // NOTE(review): when the user had no prior preference, changeDatum applies the delta
+        // without growing the datum count — confirm this matches RunningAverage's contract.
+        itemAverage.changeDatum(prefDelta);
+      }
+      RunningAverage userAverage = userAverages.get(userID);
+      if (userAverage == null) {
+        RunningAverage newUserAverage = new FullRunningAverage();
+        newUserAverage.addDatum(prefDelta);
+        userAverages.put(userID, newUserAverage);
+      } else {
+        userAverage.changeDatum(prefDelta);
+      }
+      overallAveragePrefValue.changeDatum(prefDelta);
+    } finally {
+      buildAveragesLock.writeLock().unlock();
+    }
+  }
+  
+  /**
+   * Removes the preference from the model and subtracts its value from all three averages.
+   *
+   * @throws IllegalStateException if the model had a value but no matching average exists
+   */
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    DataModel dataModel = getDataModel();
+    Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+    super.removePreference(userID, itemID);
+    if (oldPref != null) {
+      try {
+        buildAveragesLock.writeLock().lock();
+        RunningAverage itemAverage = itemAverages.get(itemID);
+        if (itemAverage == null) {
+          throw new IllegalStateException("No preferences exist for item ID: " + itemID);
+        }
+        itemAverage.removeDatum(oldPref);
+        RunningAverage userAverage = userAverages.get(userID);
+        if (userAverage == null) {
+          throw new IllegalStateException("No preferences exist for user ID: " + userID);
+        }
+        userAverage.removeDatum(oldPref);
+        overallAveragePrefValue.removeDatum(oldPref);
+      } finally {
+        buildAveragesLock.writeLock().unlock();
+      }
+    }
+  }
+  
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+  
+  @Override
+  public String toString() {
+    return "ItemUserAverageRecommender";
+  }
+  
+  /** Adapter letting {@link TopItems} rank candidates for a fixed user. */
+  private final class Estimator implements TopItems.Estimator<Long> {
+    
+    private final long userID;
+    
+    private Estimator(long userID) {
+      this.userID = userID;
+    }
+    
+    @Override
+    public double estimate(Long itemID) {
+      return doEstimatePreference(userID, itemID);
+    }
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java
new file mode 100644
index 0000000..e0eda7a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.Rescorer;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * <p>
+ * A simple {@link Rescorer} which always returns the original score.
+ * </p>
+ */
+public final class NullRescorer<T> implements Rescorer<T>, IDRescorer {
+  
+  // The rescorer is stateless, so a handful of shared singletons suffice; item and user
+  // ID rescorers deliberately share one instance.
+  private static final IDRescorer USER_OR_ITEM_INSTANCE = new NullRescorer<>();
+  private static final Rescorer<LongPair> ITEM_ITEM_PAIR_INSTANCE = new NullRescorer<>();
+  private static final Rescorer<LongPair> USER_USER_PAIR_INSTANCE = new NullRescorer<>();
+
+  private NullRescorer() {
+  }
+
+  /** @return shared no-op rescorer for item IDs */
+  public static IDRescorer getItemInstance() {
+    return USER_OR_ITEM_INSTANCE;
+  }
+  
+  /** @return shared no-op rescorer for user IDs */
+  public static IDRescorer getUserInstance() {
+    return USER_OR_ITEM_INSTANCE;
+  }
+  
+  /** @return shared no-op rescorer for item-item pairs */
+  public static Rescorer<LongPair> getItemItemPairInstance() {
+    return ITEM_ITEM_PAIR_INSTANCE;
+  }
+  
+  /** @return shared no-op rescorer for user-user pairs */
+  public static Rescorer<LongPair> getUserUserPairInstance() {
+    return USER_USER_PAIR_INSTANCE;
+  }
+
+  /**
+   * @param thing
+   *          to rescore
+   * @param originalScore
+   *          current score for item
+   * @return same originalScore as new score, always
+   */
+  @Override
+  public double rescore(T thing, double originalScore) {
+    return originalScore;
+  }
+  
+  /** Never filters anything. */
+  @Override
+  public boolean isFiltered(T thing) {
+    return false;
+  }
+  
+  /** @return {@code originalScore} unchanged, always */
+  @Override
+  public double rescore(long id, double originalScore) {
+    return originalScore;
+  }
+  
+  /** Never filters anything. */
+  @Override
+  public boolean isFiltered(long id) {
+    return false;
+  }
+  
+  @Override
+  public String toString() {
+    return "NullRescorer";
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java
new file mode 100644
index 0000000..6297d0b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+public final class PreferredItemsNeighborhoodCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  /**
+   * Returns all items preferred by any user who shares at least one preferred item with the
+   * current user. The user's own already-preferred items are excluded unless
+   * {@code includeKnownItems} is set.
+   *
+   * @throws TasteException if preference data cannot be read from the model
+   */
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+    throws TasteException {
+    FastIDSet possibleItemsIDs = new FastIDSet();
+    // For each item the user prefers, pull in everything preferred by each co-rating user.
+    for (long itemID : preferredItemIDs) {
+      PreferenceArray itemPreferences = dataModel.getPreferencesForItem(itemID);
+      int numUsersPreferringItem = itemPreferences.length();
+      for (int index = 0; index < numUsersPreferringItem; index++) {
+        possibleItemsIDs.addAll(dataModel.getItemIDsFromUser(itemPreferences.getUserID(index)));
+      }
+    }
+    if (!includeKnownItems) {
+      possibleItemsIDs.removeAll(preferredItemIDs);
+    }
+    return possibleItemsIDs;
+  }
+
+}


[49/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/bank-full.csv b/community/mahout-mr/examples/bin/resources/bank-full.csv
deleted file mode 100644
index d7a2ede..0000000
--- a/community/mahout-mr/examples/bin/resources/bank-full.csv
+++ /dev/null
@@ -1,45212 +0,0 @@
-"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
-58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
-44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
-33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
-35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
-28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
-58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
-45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
-57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
-54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
-58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
-36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
-44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
-32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
-24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
-38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
-40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
-46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
-41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
-46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
-57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
-39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
-27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
-59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
-29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
-56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
-57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
-43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
-31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
-55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
-55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
-32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
-28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
-53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
-34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
-57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
-43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
-26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
-39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
-48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
-52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
-54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
-54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
-50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
-44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
-35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
-51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
-31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
-35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
-36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
-40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
-51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
-50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
-61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
-35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
-39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
-42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
-59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
-40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
-47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
-53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
-46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
-53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
-57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
-49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
-42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
-22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
-51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
-50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
-59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
-39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
-42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
-40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
-56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
-37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
-39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
-38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
-54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
-58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
-40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
-56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
-42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
-51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
-36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
-54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
-37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
-33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
-46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
-51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
-40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
-48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
-32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
-55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
-40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
-58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
-45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
-51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
-43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
-44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
-46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
-59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
-44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
-33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
-46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
-43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
-23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
-25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
-40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
-58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
-32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
-58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
-37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
-27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
-42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
-29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
-58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
-46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
-34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
-49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
-32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
-43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
-58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
-24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
-51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
-50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
-40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
-33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
-36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
-57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
-36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
-44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
-39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
-40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
-54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
-50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
-37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
-46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
-32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
-48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
-41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
-44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
-38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
-48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
-42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
-34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
-56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
-39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
-46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
-38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
-56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
-37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
-37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
-48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
-30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
-48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
-31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
-37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
-49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
-43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
-32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
-55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
-31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
-35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
-34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
-32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
-33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
-52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
-55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
-38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
-31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
-28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
-45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
-35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
-60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
-49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
-38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
-40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
-36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
-44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
-40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
-30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
-57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
-24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
-33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
-43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
-43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
-35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
-56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
-40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
-44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
-28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
-47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
-56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
-31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
-30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
-38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
-55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
-59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
-33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
-30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
-42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
-55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
-51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
-32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
-29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
-46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
-56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
-29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
-47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
-56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
-45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
-31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
-37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
-30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
-58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
-36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
-40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
-42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
-35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
-44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
-31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
-36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
-47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
-37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
-26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
-52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
-55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
-32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
-37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";154

<TRUNCATED>

[16/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java
new file mode 100644
index 0000000..1490761
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+/**
+ * <p>
+ * A {@link Rescorer} simply assigns a new "score" to a thing like an ID of an item or user which a
+ * {@link Recommender} is considering returning as a top recommendation. It may be used to arbitrarily re-rank
+ * the results according to application-specific logic before returning recommendations. For example, an
+ * application may want to boost the score of items in a certain category just for one request.
+ * </p>
+ *
+ * <p>
+ * A {@link Rescorer} can also exclude a thing from consideration entirely by returning {@code true} from
+ * {@link #isFiltered(Object)}.
+ * </p>
+ */
+public interface Rescorer<T> {
+  
+  /**
+   * @param thing
+   *          thing to rescore
+   * @param originalScore
+   *          original score
+   * @return modified score, or {@link Double#NaN} to indicate that this should be excluded entirely
+   */
+  double rescore(T thing, double originalScore);
+  
+  /**
+   * Returns {@code true} to exclude the given thing.
+   *
+   * @param thing
+   *          the thing to filter
+   * @return {@code true} to exclude, {@code false} otherwise
+   */
+  boolean isFiltered(T thing);
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java
new file mode 100644
index 0000000..b48593a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * <p>
+ * Interface implemented by "user-based" recommenders.
+ * </p>
+ */
+public interface UserBasedRecommender extends Recommender {
+  
+  /**
+   * @param userID
+   *          ID of user for which to find most similar other users
+   * @param howMany
+   *          desired number of most similar users to find
+   * @return users most similar to the given user
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  long[] mostSimilarUserIDs(long userID, int howMany) throws TasteException;
+  
+  /**
+   * @param userID
+   *          ID of user for which to find most similar other users
+   * @param howMany
+   *          desired number of most similar users to find
+   * @param rescorer
+   *          {@link Rescorer} which can adjust user-user similarity estimates used to determine most similar
+   *          users
+   * @return IDs of users most similar to the given user
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+   */
+  long[] mostSimilarUserIDs(long userID, int howMany, Rescorer<LongPair> rescorer) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java
new file mode 100644
index 0000000..814610b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface define a notion of similarity between two items. Implementations should
+ * return values in the range -1.0 to 1.0, with 1.0 representing perfect similarity.
+ * </p>
+ * 
+ * @see UserSimilarity
+ */
+public interface ItemSimilarity extends Refreshable {
+  
+  /**
+   * <p>
+   * Returns the degree of similarity, of two items, based on the preferences that users have expressed for
+   * the items.
+   * </p>
+   * 
+   * @param itemID1 first item ID
+   * @param itemID2 second item ID
+   * @return similarity between the items, in [-1,1], or {@link Double#NaN} if the similarity is unknown
+   * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+   *  if either item is known to be non-existent in the data
+   * @throws TasteException if an error occurs while accessing the data
+   */
+  double itemSimilarity(long itemID1, long itemID2) throws TasteException;
+
+  /**
+   * <p>A bulk-get version of {@link #itemSimilarity(long, long)}.</p>
+   *
+   * @param itemID1 first item ID
+   * @param itemID2s second item IDs to compute similarity with
+   * @return similarity between itemID1 and other items, in the same order as {@code itemID2s}
+   * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+   *  if any item is known to be non-existent in the data
+   * @throws TasteException if an error occurs while accessing the data
+   */
+  double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException;
+
+  /**
+   * @param itemID item ID for which to find all similar items
+   * @return all IDs of similar items, in no particular order
+   * @throws TasteException if an error occurs while accessing the data
+   */
+  long[] allSimilarItemIDs(long itemID) throws TasteException;
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java
new file mode 100644
index 0000000..76bb328
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface compute an inferred preference for a user and an item that the user has
+ * not expressed any preference for. This might be an average of other preferences scores from that user, for
+ * example. This technique is sometimes called "default voting".
+ * </p>
+ */
+public interface PreferenceInferrer extends Refreshable {
+  
+  /**
+   * <p>
+   * Infers the given user's preference value for an item.
+   * </p>
+   * 
+   * @param userID
+   *          ID of user to infer preference for
+   * @param itemID
+   *          item ID to infer preference for
+   * @return inferred preference value
+   * @throws TasteException
+   *           if an error occurs while inferring
+   */
+  float inferPreference(long userID, long itemID) throws TasteException;
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java
new file mode 100644
index 0000000..bd53c51
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface define a notion of similarity between two users. Implementations should
+ * return values in the range -1.0 to 1.0, with 1.0 representing perfect similarity.
+ * </p>
+ * 
+ * @see ItemSimilarity
+ */
+public interface UserSimilarity extends Refreshable {
+  
+  /**
+   * <p>
+   * Returns the degree of similarity, of two users, based on their preferences.
+   * </p>
+   * 
+   * @param userID1 first user ID
+   * @param userID2 second user ID
+   * @return similarity between the users, in [-1,1], or {@link Double#NaN} if the similarity is unknown
+   * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+   *  if either user is known to be non-existent in the data
+   * @throws TasteException if an error occurs while accessing the data
+   */
+  double userSimilarity(long userID1, long userID2) throws TasteException;
+
+  // Should we implement userSimilarities() like ItemSimilarity.itemSimilarities()?
+  
+  /**
+   * <p>
+   * Attaches a {@link PreferenceInferrer} to the {@link UserSimilarity} implementation.
+   * </p>
+   * 
+   * @param inferrer {@link PreferenceInferrer}
+   */
+  void setPreferenceInferrer(PreferenceInferrer inferrer);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java
new file mode 100644
index 0000000..b934d0c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+
+import java.io.IOException;
+
+/**
+ * Abstract base class for precomputing a fixed number of similar items per item,
+ * using an {@link ItemBasedRecommender} as the source of the similarities.
+ */
+public abstract class BatchItemSimilarities {
+
+  private final ItemBasedRecommender recommender;
+  private final int similarItemsPerItem;
+
+  /**
+   * @param recommender recommender to use
+   * @param similarItemsPerItem number of similar items to compute per item
+   */
+  protected BatchItemSimilarities(ItemBasedRecommender recommender, int similarItemsPerItem) {
+    this.recommender = recommender;
+    this.similarItemsPerItem = similarItemsPerItem;
+  }
+
+  protected ItemBasedRecommender getRecommender() {
+    return recommender;
+  }
+
+  protected int getSimilarItemsPerItem() {
+    return similarItemsPerItem;
+  }
+
+  /**
+   * @param degreeOfParallelism number of threads to use for the computation
+   * @param maxDurationInHours  maximum duration of the computation
+   * @param writer  {@link SimilarItemsWriter} used to persist the results
+   * @return  the number of similarities precomputed
+   * @throws IOException if an error occurs while persisting the results
+   * @throws RuntimeException if the computation takes longer than maxDurationInHours
+   */
+  public abstract int computeItemSimilarities(int degreeOfParallelism, int maxDurationInHours,
+      SimilarItemsWriter writer) throws IOException;
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java
new file mode 100644
index 0000000..5d40051
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import com.google.common.primitives.Doubles;
+
+import java.util.Comparator;
+
+/**
+ * Modeling similarity towards another item
+ */
+public class SimilarItem {
+
+  /** Orders {@link SimilarItem}s by ascending similarity. */
+  public static final Comparator<SimilarItem> COMPARE_BY_SIMILARITY = new Comparator<SimilarItem>() {
+    @Override
+    public int compare(SimilarItem s1, SimilarItem s2) {
+      return Doubles.compare(s1.similarity, s2.similarity);
+    }
+  };
+
+  // Mutable so instances can be reused via set() instead of reallocated
+  private long itemID;
+  private double similarity;
+
+  public SimilarItem(long itemID, double similarity) {
+    set(itemID, similarity);
+  }
+
+  /** Resets both fields at once, allowing instance reuse. */
+  public void set(long itemID, double similarity) {
+    this.itemID = itemID;
+    this.similarity = similarity;
+  }
+
+  public long getItemID() {
+    return itemID;
+  }
+
+  public double getSimilarity() {
+    return similarity;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java
new file mode 100644
index 0000000..057e996
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import com.google.common.collect.UnmodifiableIterator;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * Compact representation of all similar items for an item
+ */
+public class SimilarItems {
+
+  private final long itemID;
+  // Parallel arrays: similarities[n] is the score for similarItemIDs[n]
+  private final long[] similarItemIDs;
+  private final double[] similarities;
+
+  /**
+   * @param itemID item for which the similar items were computed
+   * @param similarItems similar items with their scores, copied into parallel arrays
+   */
+  public SimilarItems(long itemID, List<RecommendedItem> similarItems) {
+    this.itemID = itemID;
+
+    int numSimilarItems = similarItems.size();
+    similarItemIDs = new long[numSimilarItems];
+    similarities = new double[numSimilarItems];
+
+    for (int n = 0; n < numSimilarItems; n++) {
+      similarItemIDs[n] = similarItems.get(n).getItemID();
+      similarities[n] = similarItems.get(n).getValue();
+    }
+  }
+
+  public long getItemID() {
+    return itemID;
+  }
+
+  public int numSimilarItems() {
+    return similarItemIDs.length;
+  }
+
+  /**
+   * @return lazy view over the similar items; the iterator materializes a fresh
+   *         {@link SimilarItem} on each call to {@code next()}
+   */
+  public Iterable<SimilarItem> getSimilarItems() {
+    return new Iterable<SimilarItem>() {
+      @Override
+      public Iterator<SimilarItem> iterator() {
+        return new SimilarItemsIterator();
+      }
+    };
+  }
+
+  /** Walks the parallel arrays, producing one {@link SimilarItem} per step. */
+  private class SimilarItemsIterator extends UnmodifiableIterator<SimilarItem> {
+
+    // Index of the element returned by the previous next() call; -1 before first call
+    private int index = -1;
+
+    @Override
+    public boolean hasNext() {
+      return index < (similarItemIDs.length - 1);
+    }
+
+    @Override
+    public SimilarItem next() {
+      if (!hasNext()) {
+        throw new NoSuchElementException();
+      }
+      index++;
+      return new SimilarItem(similarItemIDs[index], similarities[index]);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java
new file mode 100644
index 0000000..35d6bfe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/**
+ * Used to persist the results of a batch item similarity computation
+ * conducted with a {@link BatchItemSimilarities} implementation
+ */
+public interface SimilarItemsWriter extends Closeable {
+
+  /** Prepares the writer; call before the first {@link #add(SimilarItems)}. */
+  void open() throws IOException;
+
+  /** Persists the similar items computed for one item. */
+  void add(SimilarItems similarItems) throws IOException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
new file mode 100644
index 0000000..efd233f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Defines the interface for classifiers that take a vector as input. This is
+ * implemented as an abstract class so that it can implement a number of handy
+ * convenience methods related to classification of vectors.
+ *
+ * <p>
+ * A classifier takes an input vector and calculates the scores (usually
+ * probabilities) that the input vector belongs to one of {@code n}
+ * categories. In {@code AbstractVectorClassifier} each category is denoted
+ * by an integer {@code c} between {@code 0} and {@code n-1}
+ * (inclusive).
+ *
+ * <p>
+ * New users should start by looking at {@link #classifyFull} (not {@link #classify}).
+ *
+ */
+public abstract class AbstractVectorClassifier {
+
+  /** Minimum allowable log likelihood value. */
+  public static final double MIN_LOG_LIKELIHOOD = -100.0;
+
+  /**
+   * Returns the number of categories that a target variable can be assigned to.
+   * A vector classifier will encode its output as an integer from
+   * {@code 0} to {@code numCategories()-1} (inclusive).
+   *
+   * @return The number of categories.
+   */
+  public abstract int numCategories();
+
+  /**
+   * Compute and return a vector containing {@code n-1} scores, where
+   * {@code n} is equal to {@code numCategories()}, given an input
+   * vector {@code instance}. Higher scores indicate that the input vector
+   * is more likely to belong to that category. The categories are denoted by
+   * the integers {@code 0} through {@code n-1} (inclusive), and the
+   * scores in the returned vector correspond to categories 1 through
+   * {@code n-1} (leaving out category 0). It is assumed that the score for
+   * category 0 is one minus the sum of the scores in the returned vector.
+   *
+   * @param instance  A feature vector to be classified.
+   * @return A vector of probabilities in 1 of {@code n-1} encoding.
+   */
+  public abstract Vector classify(Vector instance);
+  
+  /**
+   * Compute and return a vector of scores before applying the inverse link
+   * function. For logistic regression and other generalized linear models, this
+   * is just the linear part of the classification.
+   * 
+   * <p>
+   * The implementation of this method provided by {@code AbstractVectorClassifier} throws an
+   * {@link UnsupportedOperationException}. Your subclass must explicitly override this method to support
+   * this operation.
+   * 
+   * @param features  A feature vector to be classified.
+   * @return A vector of scores. If transformed by the link function, these will become probabilities.
+   */
+  public Vector classifyNoLink(Vector features) {
+    throw new UnsupportedOperationException(this.getClass().getName()
+        + " doesn't support classification without a link");
+  }
+
+  /**
+   * Classifies a vector in the special case of a binary classifier where
+   * {@link #classify(Vector)} would return a vector with only one element. As
+   * such, using this method can avoid the allocation of a vector.
+   * 
+   * @param instance The feature vector to be classified.
+   * @return The score for category 1.
+   * 
+   * @see #classify(Vector)
+   */
+  public abstract double classifyScalar(Vector instance);
+
+  /**
+   * Computes and returns a vector containing {@code n} scores, where
+   * {@code n} is {@code numCategories()}, given an input vector
+   * {@code instance}. Higher scores indicate that the input vector is more
+   * likely to belong to the corresponding category. The categories are denoted
+   * by the integers {@code 0} through {@code n-1} (inclusive).
+   *
+   * <p>
+   * Using this method it is possible to classify an input vector, for example,
+   * by selecting the category with the largest score. If
+   * {@code classifier} is an instance of
+   * {@code AbstractVectorClassifier} and {@code input} is a
+   * {@code Vector} of features describing an element to be classified,
+   * then the following code could be used to classify {@code input}.<br>
+   * {@code
+   * Vector scores = classifier.classifyFull(input);<br>
+   * int assignedCategory = scores.maxValueIndex();<br>
+   * } Here {@code assignedCategory} is the index of the category
+   * with the maximum score.
+   *
+   * <p>
+   * If an {@code n-1} encoding is acceptable, and allocation performance
+   * is an issue, then the {@link #classify(Vector)} method is probably better
+   * to use.
+   *
+   * @see #classify(Vector)
+   * @see #classifyFull(Vector r, Vector instance)
+   *
+   * @param instance A vector of features to be classified.
+   * @return A vector of probabilities, one for each category.
+   */
+  public Vector classifyFull(Vector instance) {
+    return classifyFull(new DenseVector(numCategories()), instance);
+  }
+
+  /**
+   * Computes and returns a vector containing {@code n} scores, where
+   * {@code n} is {@code numCategories()}, given an input vector
+   * {@code instance}. Higher scores indicate that the input vector is more
+   * likely to belong to the corresponding category. The categories are denoted
+   * by the integers {@code 0} through {@code n-1} (inclusive). The
+   * main difference between this method and {@link #classifyFull(Vector)} is
+   * that this method allows a user to provide a previously allocated
+   * {@code Vector r} to store the returned scores.
+   *
+   * <p>
+   * Using this method it is possible to classify an input vector, for example,
+   * by selecting the category with the largest score. If
+   * {@code classifier} is an instance of
+   * {@code AbstractVectorClassifier}, {@code result} is a non-null
+   * {@code Vector}, and {@code input} is a {@code Vector} of
+   * features describing an element to be classified, then the following code
+   * could be used to classify {@code input}.<br>
+   * {@code
+   * Vector scores = classifier.classifyFull(result, input); // Notice that scores == result<br>
+   * int assignedCategory = scores.maxValueIndex();<br>
+   * } Here {@code assignedCategory} is the index of the category
+   * with the maximum score.
+   *
+   * @param r Where to put the results.
+   * @param instance  A vector of features to be classified.
+   * @return A vector of scores/probabilities, one for each category.
+   */
+  public Vector classifyFull(Vector r, Vector instance) {
+    // Scores for categories 1..n-1 come from classify(); category 0 is the remainder
+    r.viewPart(1, numCategories() - 1).assign(classify(instance));
+    r.setQuick(0, 1.0 - r.zSum());
+    return r;
+  }
+
+
+  /**
+   * Returns n-1 probabilities, one for each categories 1 through
+   * {@code n-1}, for each row of a matrix, where {@code n} is equal
+   * to {@code numCategories()}. The probability of the missing 0-th
+   * category is 1 - rowSum(this result).
+   *
+   * @param data  The matrix whose rows are the input vectors to classify
+   * @return A matrix of scores, one row per row of the input matrix, one column for each but the last category.
+   */
+  public Matrix classify(Matrix data) {
+    Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
+    for (int row = 0; row < data.numRows(); row++) {
+      r.assignRow(row, classify(data.viewRow(row)));
+    }
+    return r;
+  }
+
+  /**
+   * Returns a matrix where the rows of the matrix each contain {@code n} probabilities, one for each category.
+   *
+   * @param data  The matrix whose rows are the input vectors to classify
+   * @return A matrix of scores, one row per row of the input matrix, one column per category.
+   */
+  public Matrix classifyFull(Matrix data) {
+    Matrix r = new DenseMatrix(data.numRows(), numCategories());
+    for (int row = 0; row < data.numRows(); row++) {
+      classifyFull(r.viewRow(row), data.viewRow(row));
+    }
+    return r;
+  }
+
+  /**
+   * Returns a vector of probabilities of category 1, one for each row
+   * of a matrix. This only makes sense if there are exactly two categories, but
+   * calling this method in that case can save a number of vector allocations.
+   * 
+   * @param data  The matrix whose rows are vectors to classify
+   * @return A vector of scores, with one value per row of the input matrix.
+   */
+  public Vector classifyScalar(Matrix data) {
+    Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");
+
+    Vector r = new DenseVector(data.numRows());
+    for (int row = 0; row < data.numRows(); row++) {
+      r.set(row, classifyScalar(data.viewRow(row)));
+    }
+    return r;
+  }
+
+  /**
+   * Returns a measure of how good the classification for a particular example
+   * actually is.
+   * 
+   * @param actual  The correct category for the example.
+   * @param data  The vector to be classified.
+   * @return The log likelihood of the correct answer as estimated by the current model. This will always be <= 0
+   *  and larger (closer to 0) indicates better accuracy. In order to simplify code that maintains running averages,
+   *  we bound this value at -100.
+   */
+  public double logLikelihood(int actual, Vector data) {
+    if (numCategories() == 2) {
+      // Binary case: classifyScalar() gives p(category 1); category 0 has probability 1-p
+      double p = classifyScalar(data);
+      if (actual > 0) {
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p));
+      } else {
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p));
+      }
+    } else {
+      // Multi-class: classify() omits category 0, whose probability is 1 - zSum()
+      Vector p = classify(data);
+      if (actual > 0) {
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p.get(actual - 1)));
+      } else {
+        return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p.zSum()));
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java
new file mode 100644
index 0000000..29eaa0d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+/**
+ * Result of a document classification. The label and the associated score (usually probability)
+ */
+public class ClassifierResult {
+
+  private String label;
+  private double score;
+  // Double.MAX_VALUE marks "not set"; actual log-likelihoods are <= 0
+  private double logLikelihood = Double.MAX_VALUE;
+  
+  public ClassifierResult() { }
+  
+  public ClassifierResult(String label, double score) {
+    this.label = label;
+    this.score = score;
+  }
+  
+  public ClassifierResult(String label) {
+    this.label = label;
+  }
+
+  public ClassifierResult(String label, double score, double logLikelihood) {
+    this.label = label;
+    this.score = score;
+    this.logLikelihood = logLikelihood;
+  }
+
+  public double getLogLikelihood() {
+    return logLikelihood;
+  }
+
+  public void setLogLikelihood(double logLikelihood) {
+    this.logLikelihood = logLikelihood;
+  }
+
+  public String getLabel() {
+    return label;
+  }
+  
+  public double getScore() {
+    return score;
+  }
+  
+  public void setLabel(String label) {
+    this.label = label;
+  }
+  
+  public void setScore(double score) {
+    this.score = score;
+  }
+  
+  @Override
+  public String toString() {
+    return "ClassifierResult{" + "category='" + label + '\'' + ", score=" + score + '}';
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
new file mode 100644
index 0000000..73ba521
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
@@ -0,0 +1,444 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * The ConfusionMatrix Class stores the result of Classification of a Test Dataset.
 * 
 * The fact of whether there is a default is not stored. A row of zeros is the only indicator that there is no default.
 * 
 * See http://en.wikipedia.org/wiki/Confusion_matrix for background
 */
public class ConfusionMatrix {
  private static final Logger LOG = LoggerFactory.getLogger(ConfusionMatrix.class);
  // Label -> matrix index; LinkedHashMap preserves insertion order for printing.
  private final Map<String,Integer> labelMap = new LinkedHashMap<>();
  // confusionMatrix[correctId][classifiedId] holds the instance count.
  private final int[][] confusionMatrix;
  // Total number of instances recorded via addInstance()/putCount().
  private int samples = 0;
  private String defaultLabel = "unknown";
  
  /**
   * @param labels       all known class labels
   * @param defaultLabel label for unclassified instances; assigned the extra last row/column
   */
  public ConfusionMatrix(Collection<String> labels, String defaultLabel) {
    // One extra row/column is reserved for the default ("unclassified") label.
    confusionMatrix = new int[labels.size() + 1][labels.size() + 1];
    this.defaultLabel = defaultLabel;
    int i = 0;
    for (String label : labels) {
      labelMap.put(label, i++);
    }
    labelMap.put(defaultLabel, i);
  }
  
  /**
   * Builds a confusion matrix from an existing {@link Matrix}; labels are taken
   * from the matrix's row (or column) label bindings via {@link #setMatrix(Matrix)}.
   */
  public ConfusionMatrix(Matrix m) {
    confusionMatrix = new int[m.numRows()][m.numRows()];
    setMatrix(m);
  }
  
  public int[][] getConfusionMatrix() {
    return confusionMatrix;
  }
  
  /** @return unmodifiable view of the labels, in matrix-index order */
  public Collection<String> getLabels() {
    return Collections.unmodifiableCollection(labelMap.keySet());
  }

  private int numLabels() {
    return labelMap.size();
  }

  /**
   * Percentage of instances whose true label is {@code label} that were classified correctly.
   * NOTE(review): returns NaN when the label has no instances (division by zero) — confirm acceptable.
   */
  public double getAccuracy(String label) {
    int labelId = labelMap.get(label);
    int labelTotal = 0;
    int correct = 0;
    for (int i = 0; i < numLabels(); i++) {
      labelTotal += confusionMatrix[labelId][i];
      if (i == labelId) {
        correct += confusionMatrix[labelId][i];
      }
    }
    return 100.0 * correct / labelTotal;
  }

  // Producer accuracy: overall percentage of correctly classified instances
  // (sum of the diagonal over the sum of all cells).
  public double getAccuracy() {
    int total = 0;
    int correct = 0;
    for (int i = 0; i < numLabels(); i++) {
      for (int j = 0; j < numLabels(); j++) {
        total += confusionMatrix[i][j];
        if (i == j) {
          correct += confusionMatrix[i][j];
        }
      }
    }
    return 100.0 * correct / total;
  }

  /** Sum of true positives and false negatives */
  private int getActualNumberOfTestExamplesForClass(String label) {
    int labelId = labelMap.get(label);
    int sum = 0;
    for (int i = 0; i < numLabels(); i++) {
      sum += confusionMatrix[labelId][i];
    }
    return sum;
  }

  /** Precision for one label: TP / (TP + FP); 0 when the label was never predicted. */
  public double getPrecision(String label) {
    int labelId = labelMap.get(label);
    int truePositives = confusionMatrix[labelId][labelId];
    int falsePositives = 0;
    // False positives live in this label's COLUMN (other rows classified as labelId).
    for (int i = 0; i < numLabels(); i++) {
      if (i == labelId) {
        continue;
      }
      falsePositives += confusionMatrix[i][labelId];
    }

    if (truePositives + falsePositives == 0) {
      return 0;
    }

    return ((double) truePositives) / (truePositives + falsePositives);
  }

  /** Precision averaged over labels, weighted by each label's actual instance count. */
  public double getWeightedPrecision() {
    double[] precisions = new double[numLabels()];
    double[] weights = new double[numLabels()];

    int index = 0;
    for (String label : labelMap.keySet()) {
      precisions[index] = getPrecision(label);
      weights[index] = getActualNumberOfTestExamplesForClass(label);
      index++;
    }
    return new Mean().evaluate(precisions, weights);
  }

  /** Recall for one label: TP / (TP + FN); 0 when the label has no instances. */
  public double getRecall(String label) {
    int labelId = labelMap.get(label);
    int truePositives = confusionMatrix[labelId][labelId];
    int falseNegatives = 0;
    // False negatives live in this label's ROW (labelId classified as something else).
    for (int i = 0; i < numLabels(); i++) {
      if (i == labelId) {
        continue;
      }
      falseNegatives += confusionMatrix[labelId][i];
    }
    if (truePositives + falseNegatives == 0) {
      return 0;
    }
    return ((double) truePositives) / (truePositives + falseNegatives);
  }

  /** Recall averaged over labels, weighted by each label's actual instance count. */
  public double getWeightedRecall() {
    double[] recalls = new double[numLabels()];
    double[] weights = new double[numLabels()];

    int index = 0;
    for (String label : labelMap.keySet()) {
      recalls[index] = getRecall(label);
      weights[index] = getActualNumberOfTestExamplesForClass(label);
      index++;
    }
    return new Mean().evaluate(recalls, weights);
  }

  /** Harmonic mean of precision and recall for one label; 0 when both are 0. */
  public double getF1score(String label) {
    double precision = getPrecision(label);
    double recall = getRecall(label);
    if (precision + recall == 0) {
      return 0;
    }
    return 2 * precision * recall / (precision + recall);
  }

  /** F1 averaged over labels, weighted by each label's actual instance count. */
  public double getWeightedF1score() {
    double[] f1Scores = new double[numLabels()];
    double[] weights = new double[numLabels()];

    int index = 0;
    for (String label : labelMap.keySet()) {
      f1Scores[index] = getF1score(label);
      weights[index] = getActualNumberOfTestExamplesForClass(label);
      index++;
    }
    return new Mean().evaluate(f1Scores, weights);
  }

  // User accuracy 
  // NOTE(review): count is incremented for every label, including defaultLabel,
  // while defaultLabel's accuracy is skipped — the default label therefore drags
  // the average down. Confirm this is intended.
  public double getReliability() {
    int count = 0;
    double accuracy = 0;
    for (String label: labelMap.keySet()) {
      if (!label.equals(defaultLabel)) {
        accuracy += getAccuracy(label);
      }
      count++;
    }
    return accuracy / count;
  }
  
  /**
   * Accuracy v.s. randomly classifying all samples.
   * kappa() = (totalAccuracy() - randomAccuracy()) / (1 - randomAccuracy())
   * Cohen, Jacob. 1960. A coefficient of agreement for nominal scales. 
   * Educational And Psychological Measurement 20:37-46.
   * 
   * Formula and variable names from:
   * http://www.yale.edu/ceo/OEFS/Accuracy.pdf
   * 
   * @return double
   */
  public double getKappa() {
    double a = 0.0; // sum of the diagonal (observed agreement)
    double b = 0.0; // sum over labels of rowTotal * columnTotal (chance agreement)
    for (int i = 0; i < confusionMatrix.length; i++) {
      a += confusionMatrix[i][i];
      double br = 0;
      for (int j = 0; j < confusionMatrix.length; j++) {
        br += confusionMatrix[i][j];
      }
      double bc = 0;
      for (int[] vec : confusionMatrix) {
        bc += vec[i];
      }
      b += br * bc;
    }
    return (samples * a - b) / (samples * samples - b);
  }
  
  /**
   * Standard deviation of normalized producer accuracy
   * Not a standard score
   * @return double
   */
  public RunningAverageAndStdDev getNormalizedStats() {
    RunningAverageAndStdDev summer = new FullRunningAverageAndStdDev();
    for (int d = 0; d < confusionMatrix.length; d++) {
      double total = 0;
      for (int j = 0; j < confusionMatrix.length; j++) {
        total += confusionMatrix[d][j];
      }
      // Epsilon avoids division by zero for labels with no instances.
      summer.addDatum(confusionMatrix[d][d] / (total + 0.000001));
    }
    
    return summer;
  }
   
  /** @return number of instances of {@code label} that were classified correctly */
  public int getCorrect(String label) {
    int labelId = labelMap.get(label);
    return confusionMatrix[labelId][labelId];
  }
  
  /** @return total number of instances whose true label is {@code label} */
  public int getTotal(String label) {
    int labelId = labelMap.get(label);
    int labelTotal = 0;
    for (int i = 0; i < labelMap.size(); i++) {
      labelTotal += confusionMatrix[labelId][i];
    }
    return labelTotal;
  }
  
  public void addInstance(String correctLabel, ClassifierResult classifiedResult) {
    samples++;
    incrementCount(correctLabel, classifiedResult.getLabel());
  }
  
  public void addInstance(String correctLabel, String classifiedLabel) {
    samples++;
    incrementCount(correctLabel, classifiedLabel);
  }
  
  /**
   * Count for the (correct, classified) cell; 0 with a warning when the correct
   * label is unknown, but an unknown CLASSIFIED label is a hard failure.
   */
  public int getCount(String correctLabel, String classifiedLabel) {
    if(!labelMap.containsKey(correctLabel)) {
      LOG.warn("Label {} did not appear in the training examples", correctLabel);
      return 0;
    }
    Preconditions.checkArgument(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel);
    int correctId = labelMap.get(correctLabel);
    int classifiedId = labelMap.get(classifiedLabel);
    return confusionMatrix[correctId][classifiedId];
  }
  
  /** Overwrites one cell; bumps {@code samples} when an empty cell becomes non-empty. */
  public void putCount(String correctLabel, String classifiedLabel, int count) {
    if(!labelMap.containsKey(correctLabel)) {
      LOG.warn("Label {} did not appear in the training examples", correctLabel);
      return;
    }
    Preconditions.checkArgument(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel);
    int correctId = labelMap.get(correctLabel);
    int classifiedId = labelMap.get(classifiedLabel);
    // NOTE(review): int cell compared against the double literal 0.0 — works via
    // widening, but samples is only incremented on empty->non-empty transitions,
    // never decremented; confirm samples accounting is intended.
    if (confusionMatrix[correctId][classifiedId] == 0.0 && count != 0) {
      samples++;
    }
    confusionMatrix[correctId][classifiedId] = count;
  }
  
  public String getDefaultLabel() {
    return defaultLabel;
  }
  
  public void incrementCount(String correctLabel, String classifiedLabel, int count) {
    putCount(correctLabel, classifiedLabel, count + getCount(correctLabel, classifiedLabel));
  }
  
  public void incrementCount(String correctLabel, String classifiedLabel) {
    incrementCount(correctLabel, classifiedLabel, 1);
  }
  
  /** Adds the counts of {@code b} into this matrix (label sets must be the same size). */
  public ConfusionMatrix merge(ConfusionMatrix b) {
    Preconditions.checkArgument(labelMap.size() == b.getLabels().size(), "The label sizes do not match");
    for (String correctLabel : this.labelMap.keySet()) {
      for (String classifiedLabel : this.labelMap.keySet()) {
        incrementCount(correctLabel, classifiedLabel, b.getCount(correctLabel, classifiedLabel));
      }
    }
    return this;
  }
  
  /** Copies the counts into a {@link DenseMatrix} with row/column label bindings attached. */
  public Matrix getMatrix() {
    int length = confusionMatrix.length;
    Matrix m = new DenseMatrix(length, length);
    for (int r = 0; r < length; r++) {
      for (int c = 0; c < length; c++) {
        m.set(r, c, confusionMatrix[r][c]);
      }
    }
    Map<String,Integer> labels = new HashMap<>();
    for (Map.Entry<String, Integer> entry : labelMap.entrySet()) {
      labels.put(entry.getKey(), entry.getValue());
    }
    m.setRowLabelBindings(labels);
    m.setColumnLabelBindings(labels);
    return m;
  }
  
  /**
   * Loads counts (rounded to int) and labels from {@code m}; requires a square matrix.
   * Labels are taken from row bindings, falling back to column bindings.
   */
  public void setMatrix(Matrix m) {
    int length = confusionMatrix.length;
    if (m.numRows() != m.numCols()) {
      throw new IllegalArgumentException(
          "ConfusionMatrix: matrix(" + m.numRows() + ',' + m.numCols() + ") must be square");
    }
    for (int r = 0; r < length; r++) {
      for (int c = 0; c < length; c++) {
        confusionMatrix[r][c] = (int) Math.round(m.get(r, c));
      }
    }
    Map<String,Integer> labels = m.getRowLabelBindings();
    if (labels == null) {
      labels = m.getColumnLabelBindings();
    }
    if (labels != null) {
      String[] sorted = sortLabels(labels);
      verifyLabels(length, sorted);
      labelMap.clear();
      for (int i = 0; i < length; i++) {
        labelMap.put(sorted[i], i);
      }
    }
  }
  
  // Inverts the label->index map into an index-ordered array of labels.
  private static String[] sortLabels(Map<String,Integer> labels) {
    String[] sorted = new String[labels.size()];
    for (Map.Entry<String,Integer> entry : labels.entrySet()) {
      sorted[entry.getValue()] = entry.getKey();
    }
    return sorted;
  }
  
  // Checks there is exactly one label per row and no index gaps.
  private static void verifyLabels(int length, String[] sorted) {
    Preconditions.checkArgument(sorted.length == length, "One label, one row");
    for (int i = 0; i < length; i++) {
      if (sorted[i] == null) {
        Preconditions.checkArgument(false, "One label, one row");
      }
    }
  }
  
  /**
   * This is overloaded. toString() is not a formatted report you print for a manager :)
   * Assume that if there are no default assignments, the default feature was not used
   */
  @Override
  public String toString() {
    StringBuilder returnString = new StringBuilder(200);
    returnString.append("=======================================================").append('\n');
    returnString.append("Confusion Matrix\n");
    returnString.append("-------------------------------------------------------").append('\n');
    
    // If no instance fell into the default label, hide its row and column entirely.
    int unclassified = getTotal(defaultLabel);
    for (Map.Entry<String,Integer> entry : this.labelMap.entrySet()) {
      if (entry.getKey().equals(defaultLabel) && unclassified == 0) {
        continue;
      }
      
      returnString.append(StringUtils.rightPad(getSmallLabel(entry.getValue()), 5)).append('\t');
    }
    
    returnString.append("<--Classified as").append('\n');
    for (Map.Entry<String,Integer> entry : this.labelMap.entrySet()) {
      if (entry.getKey().equals(defaultLabel) && unclassified == 0) {
        continue;
      }
      String correctLabel = entry.getKey();
      int labelTotal = 0;
      for (String classifiedLabel : this.labelMap.keySet()) {
        if (classifiedLabel.equals(defaultLabel) && unclassified == 0) {
          continue;
        }
        returnString.append(
            StringUtils.rightPad(Integer.toString(getCount(correctLabel, classifiedLabel)), 5)).append('\t');
        labelTotal += getCount(correctLabel, classifiedLabel);
      }
      returnString.append(" |  ").append(StringUtils.rightPad(String.valueOf(labelTotal), 6)).append('\t')
      .append(StringUtils.rightPad(getSmallLabel(entry.getValue()), 5))
      .append(" = ").append(correctLabel).append('\n');
    }
    if (unclassified > 0) {
      returnString.append("Default Category: ").append(defaultLabel).append(": ").append(unclassified).append('\n');
    }
    returnString.append('\n');
    return returnString.toString();
  }
  
  // Encodes a column index as a short base-26 tag: 0 -> "a", 25 -> "z", 26 -> "ba", ...
  static String getSmallLabel(int i) {
    int val = i;
    StringBuilder returnString = new StringBuilder();
    do {
      int n = val % 26;
      returnString.insert(0, (char) ('a' + n));
      val /= 26;
    } while (val > 0);
    return returnString.toString();
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
new file mode 100644
index 0000000..af1d5e7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.Vector;
+
+import java.io.Closeable;
+
/**
 * The simplest interface for online learning algorithms.
 */
public interface OnlineLearner extends Closeable {
  /**
   * Updates the model using a particular target variable value and a feature vector.
   * <p/>
   * There may be an assumption that if multiple passes through the training data are necessary, then
   * the training examples will be presented in the same order.  This is because the order of
   * training examples may be used to assign records to different data splits for evaluation by
   * cross-validation.  Without the order invariance, records might be assigned to training and test
   * splits and error estimates could be seriously affected.
   * <p/>
   * If re-ordering is necessary, then using the alternative API which allows a tracking key to be
   * added to the training example can be used.
   *
   * @param actual   The value of the target variable.  This value should be in the half-open
   *                 interval [0..n) where n is the number of target categories.
   * @param instance The feature vector for this example.
   */
  void train(int actual, Vector instance);

  /**
   * Updates the model using a particular target variable value and a feature vector.
   * <p/>
   * There may be an assumption that if multiple passes through the training data are necessary that
   * the tracking key for a record will be the same for each pass and that there will be a
   * relatively large number of distinct tracking keys and that the low-order bits of the tracking
   * keys will not correlate with any of the input variables.  This tracking key is used to assign
   * training examples to different test/training splits.
   * <p/>
   * Examples of useful tracking keys include id-numbers for the training records derived from
   * a database id for the base table from which the record is derived, or the offset of
   * the original data record in a data file.
   *
   * @param trackingKey The tracking key for this training example.
   * @param groupKey    An optional value that allows examples to be grouped in the computation of
   *                    the update to the model.
   * @param actual   The value of the target variable.  This value should be in the half-open
   *                 interval [0..n) where n is the number of target categories.
   * @param instance The feature vector for this example.
   */
  void train(long trackingKey, String groupKey, int actual, Vector instance);

  /**
   * Updates the model using a particular target variable value and a feature vector.
   * <p/>
   * There may be an assumption that if multiple passes through the training data are necessary that
   * the tracking key for a record will be the same for each pass and that there will be a
   * relatively large number of distinct tracking keys and that the low-order bits of the tracking
   * keys will not correlate with any of the input variables.  This tracking key is used to assign
   * training examples to different test/training splits.
   * <p/>
   * Examples of useful tracking keys include id-numbers for the training records derived from
   * a database id for the base table from which the record is derived, or the offset of
   * the original data record in a data file.
   *
   * @param trackingKey The tracking key for this training example.
   * @param actual   The value of the target variable.  This value should be in the half-open
   *                 interval [0..n) where n is the number of target categories.
   * @param instance The feature vector for this example.
   */
  void train(long trackingKey, int actual, Vector instance);

  /**
   * Prepares the classifier for classification and deallocates any temporary data structures.
   *
   * An online classifier should be able to accept more training after being closed, but
   * closing the classifier may make classification more efficient.
   */
  @Override
  void close();
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java
new file mode 100644
index 0000000..35c11ee
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;
+
+/**
+ * ResultAnalyzer captures the classification statistics and displays in a tabular manner
+ */
+public class RegressionResultAnalyzer {
+
+  private static class Result {
+    private final double actual;
+    private final double result;
+    Result(double actual, double result) {
+      this.actual = actual;
+      this.result = result;
+    }
+    double getActual() {
+      return actual;
+    }
+    double getResult() {
+      return result;
+    }
+  }
+  
+  private List<Result> results;
+  
+  /**
+   * 
+   * @param actual
+   *          The actual answer
+   * @param result
+   *          The regression result
+   */
+  public void addInstance(double actual, double result) {
+    if (results == null) {
+      results = new ArrayList<>();
+    }
+    results.add(new Result(actual, result));
+  }
+
+  /**
+   * 
+   * @param results
+   *          The results table
+   */
+  public void setInstances(double[][] results) {
+    for (double[] res : results) {
+      addInstance(res[0], res[1]);
+    }
+  }
+
+  @Override
+  public String toString() {
+    double sumActual = 0.0;
+    double sumActualSquared = 0.0;
+    double sumResult = 0.0;
+    double sumResultSquared = 0.0;
+    double sumActualResult = 0.0;
+    double sumAbsolute = 0.0;
+    double sumAbsoluteSquared = 0.0;
+    int predictable = 0;
+    int unpredictable = 0;
+
+    for (Result res : results) {
+      double actual = res.getActual();
+      double result = res.getResult();
+      if (Double.isNaN(result)) {
+        unpredictable++;
+      } else {
+        sumActual += actual;
+        sumActualSquared += actual * actual;
+        sumResult += result;
+        sumResultSquared += result * result;
+        sumActualResult += actual * result;
+        double absolute = Math.abs(actual - result);
+        sumAbsolute += absolute;
+        sumAbsoluteSquared += absolute * absolute;
+        predictable++;
+      }
+    }
+
+    StringBuilder returnString = new StringBuilder();
+    
+    returnString.append("=======================================================\n");
+    returnString.append("Summary\n");
+    returnString.append("-------------------------------------------------------\n");
+    
+    if (predictable > 0) {
+      double varActual = sumActualSquared - sumActual * sumActual / predictable;
+      double varResult = sumResultSquared - sumResult * sumResult / predictable;
+      double varCo = sumActualResult - sumActual * sumResult /  predictable;
+  
+      double correlation;
+      if (varActual * varResult <= 0) {
+        correlation = 0.0;
+      } else {
+        correlation = varCo / Math.sqrt(varActual * varResult);
+      }
+
+      Locale.setDefault(Locale.US);
+      NumberFormat decimalFormatter = new DecimalFormat("0.####");
+      
+      returnString.append(StringUtils.rightPad("Correlation coefficient", 40)).append(": ").append(
+        StringUtils.leftPad(decimalFormatter.format(correlation), 10)).append('\n');
+      returnString.append(StringUtils.rightPad("Mean absolute error", 40)).append(": ").append(
+        StringUtils.leftPad(decimalFormatter.format(sumAbsolute / predictable), 10)).append('\n');
+      returnString.append(StringUtils.rightPad("Root mean squared error", 40)).append(": ").append(
+        StringUtils.leftPad(decimalFormatter.format(Math.sqrt(sumAbsoluteSquared / predictable)),
+          10)).append('\n');
+    }
+    returnString.append(StringUtils.rightPad("Predictable Instances", 40)).append(": ").append(
+      StringUtils.leftPad(Integer.toString(predictable), 10)).append('\n');
+    returnString.append(StringUtils.rightPad("Unpredictable Instances", 40)).append(": ").append(
+      StringUtils.leftPad(Integer.toString(unpredictable), 10)).append('\n');
+    returnString.append(StringUtils.rightPad("Total Regressed Instances", 40)).append(": ").append(
+      StringUtils.leftPad(Integer.toString(results.size()), 10)).append('\n');
+    returnString.append('\n');
+
+    return returnString.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
new file mode 100644
index 0000000..1711f19
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Collection;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
/** ResultAnalyzer captures the classification statistics and displays in a tabular manner */
public class ResultAnalyzer {
  
  // Accumulates per-label (correct, classified) counts.
  private final ConfusionMatrix confusionMatrix;
  // Accumulates log-likelihood statistics (mean, quartiles) when available.
  private final OnlineSummarizer summarizer;
  // Set once any result arrives with a real log-likelihood (not the MAX_VALUE sentinel).
  private boolean hasLL;

  /*
   * === Summary ===
   * 
   * Correctly Classified Instances 635 92.9722 % Incorrectly Classified Instances 48 7.0278 % Kappa statistic
   * 0.923 Mean absolute error 0.0096 Root mean squared error 0.0817 Relative absolute error 9.9344 % Root
   * relative squared error 37.2742 % Total Number of Instances 683
   */
  private int correctlyClassified;
  private int incorrectlyClassified;
  
  /**
   * @param labelSet     all known class labels
   * @param defaultLabel label used for unclassified instances
   */
  public ResultAnalyzer(Collection<String> labelSet, String defaultLabel) {
    confusionMatrix = new ConfusionMatrix(labelSet, defaultLabel);
    summarizer = new OnlineSummarizer();
  }
  
  public ConfusionMatrix getConfusionMatrix() {
    return this.confusionMatrix;
  }
  
  /**
   * 
   * @param correctLabel
   *          The correct label
   * @param classifiedResult
   *          The classified result
   * @return whether the instance was correct or not
   */
  public boolean addInstance(String correctLabel, ClassifierResult classifiedResult) {
    boolean result = correctLabel.equals(classifiedResult.getLabel());
    if (result) {
      correctlyClassified++;
    } else {
      incorrectlyClassified++;
    }
    confusionMatrix.addInstance(correctLabel, classifiedResult);
    // Double.MAX_VALUE is ClassifierResult's sentinel for "no log-likelihood recorded".
    if (classifiedResult.getLogLikelihood() != Double.MAX_VALUE) {
      summarizer.add(classifiedResult.getLogLikelihood());
      hasLL = true;
    }
    return result;
  }
  
  /** Renders the summary, the confusion matrix, and the statistics block as one report. */
  @Override
  public String toString() {
    StringBuilder returnString = new StringBuilder();
   
    returnString.append('\n'); 
    returnString.append("=======================================================\n");
    returnString.append("Summary\n");
    returnString.append("-------------------------------------------------------\n");
    int totalClassified = correctlyClassified + incorrectlyClassified;
    double percentageCorrect = (double) 100 * correctlyClassified / totalClassified;
    double percentageIncorrect = (double) 100 * incorrectlyClassified / totalClassified;
    NumberFormat decimalFormatter = new DecimalFormat("0.####");
    
    returnString.append(StringUtils.rightPad("Correctly Classified Instances", 40)).append(": ").append(
      StringUtils.leftPad(Integer.toString(correctlyClassified), 10)).append('\t').append(
      StringUtils.leftPad(decimalFormatter.format(percentageCorrect), 10)).append("%\n");
    returnString.append(StringUtils.rightPad("Incorrectly Classified Instances", 40)).append(": ").append(
      StringUtils.leftPad(Integer.toString(incorrectlyClassified), 10)).append('\t').append(
      StringUtils.leftPad(decimalFormatter.format(percentageIncorrect), 10)).append("%\n");
    returnString.append(StringUtils.rightPad("Total Classified Instances", 40)).append(": ").append(
      StringUtils.leftPad(Integer.toString(totalClassified), 10)).append('\n');
    returnString.append('\n');
    
    returnString.append(confusionMatrix);
    returnString.append("=======================================================\n");
    returnString.append("Statistics\n");
    returnString.append("-------------------------------------------------------\n");
    
    RunningAverageAndStdDev normStats = confusionMatrix.getNormalizedStats();
    returnString.append(StringUtils.rightPad("Kappa", 40)).append(
      StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getKappa()), 10)).append('\n');
    returnString.append(StringUtils.rightPad("Accuracy", 40)).append(
      StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getAccuracy()), 10)).append("%\n");
    // NOTE(review): the * 100.00000001 factor looks like an ad-hoc rounding nudge
    // rather than a plain percent conversion — confirm whether * 100.0 was intended.
    returnString.append(StringUtils.rightPad("Reliability", 40)).append(
      StringUtils.leftPad(decimalFormatter.format(normStats.getAverage() * 100.00000001), 10)).append("%\n");
    returnString.append(StringUtils.rightPad("Reliability (standard deviation)", 40)).append(
      StringUtils.leftPad(decimalFormatter.format(normStats.getStandardDeviation()), 10)).append('\n');
    returnString.append(StringUtils.rightPad("Weighted precision", 40)).append(
      StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getWeightedPrecision()), 10)).append('\n');
    returnString.append(StringUtils.rightPad("Weighted recall", 40)).append(
      StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getWeightedRecall()), 10)).append('\n');
    returnString.append(StringUtils.rightPad("Weighted F1 score", 40)).append(
      StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getWeightedF1score()), 10)).append('\n');
    
    if (hasLL) {
      returnString.append(StringUtils.rightPad("Log-likelihood", 30)).append("mean      : ").append(
        StringUtils.leftPad(decimalFormatter.format(summarizer.getMean()), 10)).append('\n');
      returnString.append(StringUtils.rightPad("", 30)).append(StringUtils.rightPad("25%-ile   : ", 10)).append(
        StringUtils.leftPad(decimalFormatter.format(summarizer.getQuartile(1)), 10)).append('\n');
      returnString.append(StringUtils.rightPad("", 30)).append(StringUtils.rightPad("75%-ile   : ", 10)).append(
        StringUtils.leftPad(decimalFormatter.format(summarizer.getQuartile(3)), 10)).append('\n');
    }

    return returnString.toString();
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java
new file mode 100644
index 0000000..f79a429
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.node.Node;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * Builds a tree using bagging
+ */
+@Deprecated
+public class Bagging {
+  
+  private static final Logger log = LoggerFactory.getLogger(Bagging.class);
+  
+  private final TreeBuilder treeBuilder;
+  
+  private final Data data;
+  
+  private final boolean[] sampled;
+  
+  public Bagging(TreeBuilder treeBuilder, Data data) {
+    this.treeBuilder = treeBuilder;
+    this.data = data;
+    sampled = new boolean[data.size()];
+  }
+  
+  /**
+   * Builds one tree
+   */
+  public Node build(Random rng) {
+    log.debug("Bagging...");
+    Arrays.fill(sampled, false);
+    Data bag = data.bagging(rng, sampled);
+    
+    log.debug("Building...");
+    return treeBuilder.build(rng, bag);
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
new file mode 100644
index 0000000..c94292c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+
/**
 * Utility class with helper methods for (de)serializing decision-forest
 * structures and for handling job output paths.
 */
@Deprecated
public final class DFUtils {

  // Utility class: not instantiable.
  private DFUtils() {
  }

  /**
   * Writes a Node[] into a DataOutput, length-prefixed so it can be read
   * back by {@link #readNodeArray(DataInput)}.
   * @throws java.io.IOException
   */
  public static void writeArray(DataOutput out, Node[] array) throws IOException {
    out.writeInt(array.length);
    for (Node w : array) {
      w.write(out);
    }
  }

  /**
   * Reads a Node[] previously written by {@link #writeArray(DataOutput, Node[])}.
   * @throws java.io.IOException
   */
  public static Node[] readNodeArray(DataInput in) throws IOException {
    int length = in.readInt();
    Node[] nodes = new Node[length];
    for (int index = 0; index < length; index++) {
      nodes[index] = Node.read(in);
    }

    return nodes;
  }

  /**
   * Writes a double[] into a DataOutput, length-prefixed.
   * @throws java.io.IOException
   */
  public static void writeArray(DataOutput out, double[] array) throws IOException {
    out.writeInt(array.length);
    for (double value : array) {
      out.writeDouble(value);
    }
  }

  /**
   * Reads a double[] previously written by {@link #writeArray(DataOutput, double[])}.
   * @throws java.io.IOException
   */
  public static double[] readDoubleArray(DataInput in) throws IOException {
    int length = in.readInt();
    double[] array = new double[length];
    for (int index = 0; index < length; index++) {
      array[index] = in.readDouble();
    }

    return array;
  }

  /**
   * Writes an int[] into a DataOutput, length-prefixed.
   * @throws java.io.IOException
   */
  public static void writeArray(DataOutput out, int[] array) throws IOException {
    out.writeInt(array.length);
    for (int value : array) {
      out.writeInt(value);
    }
  }

  /**
   * Reads an int[] previously written by {@link #writeArray(DataOutput, int[])}.
   * @throws java.io.IOException
   */
  public static int[] readIntArray(DataInput in) throws IOException {
    int length = in.readInt();
    int[] array = new int[length];
    for (int index = 0; index < length; index++) {
      array[index] = in.readInt();
    }

    return array;
  }

  /**
   * Returns all regular files in the output directory, ignoring
   * sub-directories and "_"-prefixed entries (e.g. _SUCCESS, _logs).
   * @throws IOException if no file is found
   */
  public static Path[] listOutputFiles(FileSystem fs, Path outputPath) throws IOException {
    List<Path> outputFiles = new ArrayList<>();
    // NOTE(review): FileStatus.isDir() is deprecated in newer Hadoop releases
    // in favour of isDirectory() — confirm the targeted Hadoop version before changing.
    for (FileStatus s : fs.listStatus(outputPath, PathFilters.logsCRCFilter())) {
      if (!s.isDir() && !s.getPath().getName().startsWith("_")) {
        outputFiles.add(s.getPath());
      }
    }
    if (outputFiles.isEmpty()) {
      throw new IOException("No output found !");
    }
    return outputFiles.toArray(new Path[outputFiles.size()]);
  }

  /**
   * Formats a time interval in milliseconds as {@code "Hh Mm Ss millis"}
   * (e.g. 3661001 ms becomes "1h 1m 1s 1"); note that the trailing
   * millisecond component carries no unit suffix.
   */
  public static String elapsedTime(long milli) {
    long seconds = milli / 1000;
    milli %= 1000;

    long minutes = seconds / 60;
    seconds %= 60;

    long hours = minutes / 60;
    minutes %= 60;

    return hours + "h " + minutes + "m " + seconds + "s " + milli;
  }

  /**
   * Serializes a single {@link Writable} to the given path, closing the
   * stream even if the write fails.
   */
  public static void storeWritable(Configuration conf, Path path, Writable writable) throws IOException {
    FileSystem fs = path.getFileSystem(conf);

    try (FSDataOutputStream out = fs.create(path)) {
      writable.write(out);
    }
  }

  /**
   * Write a string to a path.
   * @param conf From which the file system will be picked
   * @param path Where the string will be written
   * @param string The string to write
   * @throws IOException if things go poorly
   */
  public static void storeString(Configuration conf, Path path, String string) throws IOException {
    // NOTE(review): uses the platform default charset, so the file is not
    // portable across machines with different defaults — confirm intent.
    try (DataOutputStream out = path.getFileSystem(conf).create(path)) {
      out.write(string.getBytes(Charset.defaultCharset()));
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java
new file mode 100644
index 0000000..c11cf34
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataUtils;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.node.Node;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
/**
 * Represents a forest of decision trees and aggregates their per-tree
 * predictions into a single classification.
 */
@Deprecated
public class DecisionForest implements Writable {

  // The individual trees; order matters for the predictions matrix filled by
  // classify(Data, double[][]).
  private final List<Node> trees;

  // Used only by read()/load(), which populate the empty list via readFields().
  private DecisionForest() {
    trees = new ArrayList<>();
  }

  public DecisionForest(List<Node> trees) {
    Preconditions.checkArgument(trees != null && !trees.isEmpty(), "trees argument must not be null or empty");

    this.trees = trees;
  }

  List<Node> getTrees() {
    return trees;
  }

  /**
   * Classifies every instance of {@code data} with every tree, storing tree
   * t's prediction for instance i in {@code predictions[i][t]}. Rows of
   * {@code predictions} may be null on entry; they are allocated on demand
   * with one slot per tree.
   */
  public void classify(Data data, double[][] predictions) {
    Preconditions.checkArgument(data.size() == predictions.length, "predictions.length must be equal to data.size()");

    if (data.isEmpty()) {
      return; // nothing to classify
    }

    int treeId = 0;
    for (Node tree : trees) {
      for (int index = 0; index < data.size(); index++) {
        if (predictions[index] == null) {
          predictions[index] = new double[trees.size()];
        }
        predictions[index][treeId] = tree.classify(data.get(index));
      }
      treeId++;
    }
  }

  /**
   * predicts the label for the instance: the mean of the per-tree
   * predictions for a numerical (regression) label, majority vote for a
   * categorical label. Trees returning NaN are excluded from either
   * aggregate.
   *
   * @param rng
   *          Random number generator, used to break ties randomly
   * @return NaN if the label cannot be predicted (no tree produced a prediction)
   */
  public double classify(Dataset dataset, Random rng, Instance instance) {
    if (dataset.isNumerical(dataset.getLabelId())) {
      double sum = 0;
      int cnt = 0;
      for (Node tree : trees) {
        double prediction = tree.classify(instance);
        if (!Double.isNaN(prediction)) {
          sum += prediction;
          cnt++;
        }
      }

      if (cnt > 0) {
        return sum / cnt;
      } else {
        return Double.NaN;
      }
    } else {
      // one vote counter per label value
      int[] predictions = new int[dataset.nblabels()];
      for (Node tree : trees) {
        double prediction = tree.classify(instance);
        if (!Double.isNaN(prediction)) {
          predictions[(int) prediction]++;
        }
      }

      if (DataUtils.sum(predictions) == 0) {
        return Double.NaN; // no prediction available
      }

      return DataUtils.maxindex(rng, predictions);
    }
  }

  /**
   * @return Mean number of nodes per tree (integer division)
   */
  public long meanNbNodes() {
    long sum = 0;

    for (Node tree : trees) {
      sum += tree.nbNodes();
    }

    return sum / trees.size();
  }

  /**
   * @return Total number of nodes in all the trees
   */
  public long nbNodes() {
    long sum = 0;

    for (Node tree : trees) {
      sum += tree.nbNodes();
    }

    return sum;
  }

  /**
   * @return Mean maximum depth per tree (integer division)
   */
  public long meanMaxDepth() {
    long sum = 0;

    for (Node tree : trees) {
      sum += tree.maxDepth();
    }

    return sum / trees.size();
  }

  // Two forests are equal when they hold the same trees, regardless of order.
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof DecisionForest)) {
      return false;
    }

    DecisionForest rf = (DecisionForest) obj;

    return trees.size() == rf.getTrees().size() && trees.containsAll(rf.getTrees());
  }

  // NOTE(review): equals() is order-insensitive but List.hashCode() is
  // order-sensitive, so equal forests with differently ordered trees can
  // hash differently — confirm forests are never used as hash keys.
  @Override
  public int hashCode() {
    return trees.hashCode();
  }

  // Serializes the tree count followed by each tree; mirrored by readFields().
  @Override
  public void write(DataOutput dataOutput) throws IOException {
    dataOutput.writeInt(trees.size());
    for (Node tree : trees) {
      tree.write(dataOutput);
    }
  }

  /**
   * Reads the trees from the input and adds them to the existing trees
   * (it does not clear previously loaded trees; load() relies on this to
   * merge several part files into one forest).
   */
  @Override
  public void readFields(DataInput dataInput) throws IOException {
    int size = dataInput.readInt();
    for (int i = 0; i < size; i++) {
      trees.add(Node.read(dataInput));
    }
  }

  /**
   * Read the forest from inputStream
   * @param dataInput - input forest
   * @return {@link org.apache.mahout.classifier.df.DecisionForest}
   * @throws IOException
   */
  public static DecisionForest read(DataInput dataInput) throws IOException {
    DecisionForest forest = new DecisionForest();
    forest.readFields(dataInput);
    return forest;
  }

  /**
   * Load the forest from a single file or a directory of files; the trees
   * of all part files are merged into a single forest.
   * @throws java.io.IOException
   */
  public static DecisionForest load(Configuration conf, Path forestPath) throws IOException {
    FileSystem fs = forestPath.getFileSystem(conf);
    Path[] files;
    // NOTE(review): isDir() is deprecated in newer Hadoop in favour of
    // isDirectory() — confirm the targeted Hadoop version before changing.
    if (fs.getFileStatus(forestPath).isDir()) {
      files = DFUtils.listOutputFiles(fs, forestPath);
    } else {
      files = new Path[]{forestPath};
    }

    DecisionForest forest = null;
    for (Path path : files) {
      try (FSDataInputStream dataInput = new FSDataInputStream(fs.open(path))) {
        if (forest == null) {
          forest = read(dataInput);
        } else {
          forest.readFields(dataInput);
        }
      }
    }

    return forest;

  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java
new file mode 100644
index 0000000..13cd386
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import com.google.common.base.Preconditions;
+
/**
 * Computes the misclassification rate from the output of a random forest.
 */
@Deprecated
public final class ErrorEstimate {

  // Utility class: not instantiable.
  private ErrorEstimate() {
  }

  /**
   * Returns the fraction of classified instances whose prediction differs
   * from the true label. A prediction of -1 marks an unclassified instance
   * and is excluded from both numerator and denominator.
   *
   * @param labels true labels, one per instance
   * @param predictions predicted labels, aligned with {@code labels}; -1 means "not classified"
   * @return error rate in [0, 1], or {@code Double.NaN} when no instance was
   *         classified (previously this arose implicitly as 0.0/0.0)
   * @throws IllegalArgumentException if the arrays differ in length
   */
  public static double errorRate(double[] labels, double[] predictions) {
    // Plain check instead of Guava Preconditions: same exception type and
    // message, no external dependency for a one-line validation.
    if (labels.length != predictions.length) {
      throw new IllegalArgumentException("labels.length != predictions.length");
    }

    long nberrors = 0;   // instances that got bad predictions
    long datasize = 0;   // instances actually classified

    for (int index = 0; index < labels.length; index++) {
      if (predictions[index] == -1) {
        continue; // instance not classified
      }

      if (predictions[index] != labels[index]) {
        nberrors++;
      }

      datasize++;
    }

    if (datasize == 0) {
      return Double.NaN; // no classified instance, no estimate
    }
    return (double) nberrors / datasize;
  }

}


[09/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java
new file mode 100644
index 0000000..fbc825d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.math.Vector;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
/**
 * A record factory understands how to convert a line of data into fields and
 * then into a {@link Vector}.
 */
public interface RecordFactory {
  /** Defines the legal values of the target variable. */
  void defineTargetCategories(List<String> values);

  /** Caps the number of distinct target values; returns a factory for call chaining. */
  RecordFactory maxTargetValue(int max);

  /** Whether the first input line carries a schema rather than data. */
  boolean usesFirstLineAsSchema();

  /**
   * Parses one line into {@code featureVector} and returns an int code —
   * presumably the target-category index; confirm against implementations.
   */
  int processLine(String line, Vector featureVector);

  /** Names of the predictor variables. */
  Iterable<String> getPredictors();

  /**
   * Map from variable name to the vector slots associated with it —
   * NOTE(review): presumably a debugging/trace aid for hashed encodings.
   */
  Map<String, Set<Integer>> getTraceDictionary();

  /** Enables or disables a constant bias term; returns a factory for call chaining. */
  RecordFactory includeBiasTerm(boolean useBias);

  /** The target categories, as previously defined. */
  List<String> getTargetCategories();

  /** Consumes the first input line (e.g. when it contains a schema). */
  void firstLine(String line);
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java
new file mode 100644
index 0000000..0a7b6a7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.math3.special.Gamma;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Provides a t-distribution as a prior.
+ */
+public class TPrior implements PriorFunction {
+  private double df;
+
+  public TPrior(double df) {
+    this.df = df;
+  }
+
+  @Override
+  public double age(double oldValue, double generations, double learningRate) {
+    for (int i = 0; i < generations; i++) {
+      oldValue -= learningRate * oldValue * (df + 1.0) / (df + oldValue * oldValue);
+    }
+    return oldValue;
+  }
+
+  @Override
+  public double logP(double betaIJ) {
+    return Gamma.logGamma((df + 1.0) / 2.0)
+        - Math.log(df * Math.PI)
+        - Gamma.logGamma(df / 2.0)
+        - (df + 1.0) / 2.0 * Math.log1p(betaIJ * betaIJ);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(df);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    df = in.readDouble();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java
new file mode 100644
index 0000000..23c812f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
/**
 * A uniform prior.  This is an improper prior that corresponds to no
 * regularization at all: aging leaves coefficients untouched and every value
 * contributes the same (zero) log-probability.
 */
public class UniformPrior implements PriorFunction {
  // No decay: the coefficient is returned unchanged regardless of age.
  @Override
  public double age(double oldValue, double generations, double learningRate) {
    return oldValue;
  }

  // Constant log-density; adds nothing to the objective.
  @Override
  public double logP(double betaIJ) {
    return 0;
  }

  @Override
  public void write(DataOutput dataOutput) throws IOException {
    // nothing to write
  }

  @Override
  public void readFields(DataInput dataInput) throws IOException {
    // stateless class is trivial to read
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java
new file mode 100644
index 0000000..c2ad966
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java
@@ -0,0 +1,23 @@
+/**
+ * <p>Implements a variety of on-line logistic regression classifiers using SGD-based algorithms.
+ * SGD stands for Stochastic Gradient Descent and refers to a class of learning algorithms
+ * that make it relatively easy to build high speed on-line learning algorithms for a variety
+ * of problems, notably including supervised learning for classification.</p>
+ *
+ * <p>The primary class of interest in this package is
+ * {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} which contains a
+ * number (typically 5) of sub-learners, each of which is given a different portion of the
+ * training data.  Each of these sub-learners can then be evaluated on the data it was not
+ * trained on.  This allows fully incremental learning while still getting cross-validated
+ * performance estimates.</p>
+ *
+ * <p>The CrossFoldLearner implements {@link org.apache.mahout.classifier.OnlineLearner}
+ * and thus expects to be fed input in the form
+ * of a target variable and a feature vector.  The target variable is simply an integer in the
+ * half-open interval [0..numFeatures) where numFeatures is defined when the CrossFoldLearner
+ * is constructed.  The creation of feature vectors is facilitated by the classes that inherit
+ * from {@link org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder}.
+ * These classes currently implement a form of feature hashing with
+ * multiple probes to limit feature ambiguity.</p>
+ */
+package org.apache.mahout.classifier.sgd;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
new file mode 100644
index 0000000..be7ed2a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
@@ -0,0 +1,390 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.function.SquareRootFunction;
+import org.codehaus.jackson.map.ObjectMapper;
+
+public abstract class AbstractCluster implements Cluster {
+  
+  // cluster persistent state
+  private int id;
+  
+  private long numObservations;
+  
+  private long totalObservations;
+  
+  private Vector center;
+  
+  private Vector radius;
+  
+  // the observation statistics
+  private double s0;
+  
+  private Vector s1;
+  
+  private Vector s2;
+
+  private static final ObjectMapper jxn = new ObjectMapper();
+  
+  protected AbstractCluster() {}
+  
  /**
   * Creates a cluster centered at {@code point} with a zero radius and all
   * observation statistics (s0, s1, s2) reset.
   *
   * @param point initial center; cloned, the argument itself is not retained
   * @param id2 numeric identifier for this cluster
   */
  protected AbstractCluster(Vector point, int id2) {
    this.numObservations = (long) 0;
    this.totalObservations = (long) 0;
    this.center = point.clone();
    this.radius = center.like();
    this.s0 = (double) 0;
    this.s1 = center.like();
    this.s2 = center.like();
    this.id = id2;
  }
+  
  /**
   * Creates a cluster with an explicit center and radius; both are copied
   * into sparse vectors, and the observation statistics are reset.
   *
   * @param center2 initial center (copied)
   * @param radius2 initial radius (copied)
   * @param id2 numeric identifier for this cluster
   */
  protected AbstractCluster(Vector center2, Vector radius2, int id2) {
    this.numObservations = (long) 0;
    this.totalObservations = (long) 0;
    this.center = new RandomAccessSparseVector(center2);
    this.radius = new RandomAccessSparseVector(radius2);
    this.s0 = (double) 0;
    this.s1 = center.like();
    this.s2 = center.like();
    this.id = id2;
  }
+  
  /**
   * Serializes id, observation counts, center, radius and the running
   * statistics s0/s1/s2, in that exact order. Must stay in sync with
   * {@code readFields(DataInput)}.
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(id);
    out.writeLong(getNumObservations());
    out.writeLong(getTotalObservations());
    VectorWritable.writeVector(out, getCenter());
    VectorWritable.writeVector(out, getRadius());
    out.writeDouble(s0);
    VectorWritable.writeVector(out, s1);
    VectorWritable.writeVector(out, s2);
  }
+  
  /**
   * Restores the cluster state written by {@code write(DataOutput)};
   * field order must match exactly.
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    this.id = in.readInt();
    this.setNumObservations(in.readLong());
    this.setTotalObservations(in.readLong());
    this.setCenter(VectorWritable.readVector(in));
    this.setRadius(VectorWritable.readVector(in));
    this.setS0(in.readDouble());
    this.setS1(VectorWritable.readVector(in));
    this.setS2(VectorWritable.readVector(in));
  }
+  
+  @Override
+  public void configure(Configuration job) {
+    // nothing to do
+  }
+  
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return Collections.emptyList();
+  }
+  
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    // nothing to do
+  }
+  
+  @Override
+  public int getId() {
+    return id;
+  }
+
+  /**
+   * @param id
+   *          the id to set
+   */
+  protected void setId(int id) {
+    this.id = id;
+  }
+  
+  @Override
+  public long getNumObservations() {
+    return numObservations;
+  }
+
+  /**
+   * @param l
+   *          the numPoints to set
+   */
+  protected void setNumObservations(long l) {
+    this.numObservations = l;
+  }
+  
+  @Override
+  public long getTotalObservations() {
+    return totalObservations;
+  }
+
+  protected void setTotalObservations(long totalPoints) {
+    this.totalObservations = totalPoints;
+  }
+
+  @Override
+  public Vector getCenter() {
+    return center;
+  }
+
+  /**
+   * @param center
+   *          the center to set
+   */
+  protected void setCenter(Vector center) {
+    this.center = center;
+  }
+  
+  @Override
+  public Vector getRadius() {
+    return radius;
+  }
+
+  /**
+   * @param radius
+   *          the radius to set
+   */
+  protected void setRadius(Vector radius) {
+    this.radius = radius;
+  }
+  
+  /**
+   * @return the s0, the running sum of observation weights (zeroth moment)
+   */
+  protected double getS0() {
+    return s0;
+  }
+  
+  protected void setS0(double s0) {
+    this.s0 = s0;
+  }
+
+  /**
+   * @return the s1, the running weighted sum of observations (first moment)
+   */
+  protected Vector getS1() {
+    return s1;
+  }
+  
+  protected void setS1(Vector s1) {
+    this.s1 = s1;
+  }
+
+  /**
+   * @return the s2, the running weighted sum of squared observations (second moment)
+   */
+  protected Vector getS2() {
+    return s2;
+  }
+  
+  protected void setS2(Vector s2) {
+    this.s2 = s2;
+  }
+
+  /**
+   * Merge another model's sufficient statistics (s0, s1, s2) into this one.
+   * NOTE(review): performs an unchecked cast — assumes every observed Model
+   * is an AbstractCluster; verify against callers.
+   */
+  @Override
+  public void observe(Model<VectorWritable> x) {
+    AbstractCluster cl = (AbstractCluster) x;
+    setS0(getS0() + cl.getS0());
+    setS1(getS1().plus(cl.getS1()));
+    setS2(getS2().plus(cl.getS2()));
+  }
+  
+  /** Observe the wrapped vector with implicit weight 1. */
+  @Override
+  public void observe(VectorWritable x) {
+    observe(x.get());
+  }
+  
+  /** Observe the wrapped vector with the given weight. */
+  @Override
+  public void observe(VectorWritable x, double weight) {
+    observe(x.get(), weight);
+  }
+  
+  /**
+   * Accumulate a weighted observation into the running sums s0, s1 and s2.
+   * A weight of exactly 1.0 is delegated to {@link #observe(Vector)}.
+   *
+   * @param x the observed Vector
+   * @param weight the observation weight
+   */
+  public void observe(Vector x, double weight) {
+    if (weight == 1.0) {
+      observe(x);
+      return;
+    }
+    setS0(getS0() + weight);
+    Vector weightedX = x.times(weight);
+    if (getS1() == null) {
+      setS1(weightedX);
+    } else {
+      getS1().assign(weightedX, Functions.PLUS);
+    }
+    Vector weightedSquare = x.times(x).times(weight);
+    if (getS2() == null) {
+      setS2(weightedSquare);
+    } else {
+      getS2().assign(weightedSquare, Functions.PLUS);
+    }
+  }
+  
+  /**
+   * Accumulate a single observation with implicit weight 1 into the running
+   * sums s0, s1 and s2, lazily initializing the sum vectors on first use.
+   *
+   * @param x the observed Vector
+   */
+  public void observe(Vector x) {
+    setS0(getS0() + 1);
+    Vector sum = getS1();
+    if (sum == null) {
+      setS1(x.clone());
+    } else {
+      sum.assign(x, Functions.PLUS);
+    }
+    Vector squared = x.times(x);
+    Vector sumOfSquares = getS2();
+    if (sumOfSquares == null) {
+      setS2(squared);
+    } else {
+      sumOfSquares.assign(squared, Functions.PLUS);
+    }
+  }
+  
+  
+  /**
+   * Fold the accumulated statistics into center and radius, then reset the
+   * accumulators for the next round of observations. No-op if nothing was
+   * observed (s0 == 0). Order matters: center must be set before s1/s2 are
+   * re-created via center.like().
+   */
+  @Override
+  public void computeParameters() {
+    if (getS0() == 0) {
+      return;
+    }
+    setNumObservations((long) getS0());
+    setTotalObservations(getTotalObservations() + getNumObservations());
+    setCenter(getS1().divide(getS0()));
+    // compute the component stds
+    // NOTE(review): radius uses sqrt(n*S2 - S1^2)/n per component; only
+    // meaningful for more than one observation, hence the s0 > 1 guard.
+    if (getS0() > 1) {
+      setRadius(getS2().times(getS0()).minus(getS1().times(getS1())).assign(new SquareRootFunction()).divide(getS0()));
+    }
+    setS0(0);
+    setS1(center.like());
+    setS2(center.like());
+  }
+
+  /**
+   * Serialize {@link #asJson(String[])} with the shared Jackson mapper.
+   * Returns the empty string (and logs) if serialization fails.
+   */
+  @Override
+  public String asFormatString(String[] bindings) {
+    try {
+      return jxn.writeValueAsString(asJson(bindings));
+    } catch (IOException e) {
+      log.error("Error writing JSON as String.", e);
+      return "";
+    }
+  }
+
+  /**
+   * Build a JSON-ready Map view of this cluster: "identifier", observation
+   * count "n", and — when available — center "c" and radius "r" vectors
+   * formatted via formatVectorAsJson. Formatting failures are logged and the
+   * corresponding key is simply omitted.
+   */
+  public Map<String,Object> asJson(String[] bindings) {
+    Map<String,Object> dict = new HashMap<>();
+    dict.put("identifier", getIdentifier());
+    dict.put("n", getNumObservations());
+    if (getCenter() != null) {
+      try {
+        dict.put("c", formatVectorAsJson(getCenter(), bindings));
+      } catch (IOException e) {
+        log.error("IOException:  ", e);
+      }
+    }
+    if (getRadius() != null) {
+      try {
+        dict.put("r", formatVectorAsJson(getRadius(), bindings));
+      } catch (IOException e) {
+        log.error("IOException:  ", e);
+      }
+    }
+    return dict;
+  }
+  
+  /** @return a short, human-readable identifier naming this cluster */
+  public abstract String getIdentifier();
+  
+  /**
+   * Compute the centroid by averaging the pointTotals
+   * 
+   * @return the new centroid, or the current center if nothing has been
+   *         observed since the last computeParameters()
+   */
+  public Vector computeCentroid() {
+    return getS0() == 0 ? getCenter() : getS1().divide(getS0());
+  }
+
+  /**
+   * Return a human-readable formatted string representation of the vector, not
+   * intended to be complete nor usable as an input/output representation.
+   * Returns the empty string (and logs) if serialization fails.
+   */
+  public static String formatVector(Vector v, String[] bindings) {
+    try {
+      return jxn.writeValueAsString(formatVectorAsJson(v, bindings));
+    } catch (IOException e) {
+      log.error("Error writing JSON as String.", e);
+      return "";
+    }
+  }
+
+  /**
+   * Create a List of terms and weights for the non-zero elements of a vector.
+   * Dense unbound vectors yield a flat list of rounded weights; sparse or
+   * bound vectors yield a list of single-entry {term: weight} maps.
+   *
+   * @param v the Vector to format
+   * @param bindings optional index-to-label names; may be null
+   * @return List<Object> of weights or of {term: weight} maps
+   */
+  public static List<Object> formatVectorAsJson(Vector v, String[] bindings) throws IOException {
+
+    boolean hasBindings = bindings != null;
+    boolean isSparse = v.getNumNonZeroElements() != v.size();
+
+    // we assume sequential access in the output
+    Vector provider = v.isSequentialAccess() ? v : new SequentialAccessSparseVector(v);
+
+    List<Object> terms = new LinkedList<>();
+    String term = "";
+
+    for (Element elem : provider.nonZeroes()) {
+
+      // prefer the bound label; fall back to the numeric index when a term
+      // name is needed (bindings present or sparse output)
+      if (hasBindings && bindings.length >= elem.index() + 1 && bindings[elem.index()] != null) {
+        term = bindings[elem.index()];
+      } else if (hasBindings || isSparse) {
+        term = String.valueOf(elem.index());
+      }
+
+      Map<String, Object> term_entry = new HashMap<>();
+      // round weights to three decimal places for readability
+      double roundedWeight = (double) Math.round(elem.get() * 1000) / 1000;
+      if (hasBindings || isSparse) {
+        // term is always freshly assigned on this path (one of the branches
+        // above must have run), so no stale value leaks between elements
+        term_entry.put(term, roundedWeight);
+        terms.add(term_entry);
+      } else {
+        terms.add(roundedWeight);
+      }
+    }
+
+    return terms;
+  }
+
+  /** @return false; convergence is only meaningful in subclasses that override this */
+  @Override
+  public boolean isConverged() {
+    // Convergence has no meaning yet, perhaps in subclasses
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java
new file mode 100644
index 0000000..07d6927
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java
@@ -0,0 +1,90 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.common.parameters.Parametered;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.util.Map;
+
+/**
+ * Implementations of this interface have a printable representation and certain
+ * attributes that are common across all clustering implementations
+ * 
+ */
+public interface Cluster extends Model<VectorWritable>, Parametered {
+
+  // default directory for initial clusters to prime iterative clustering
+  // algorithms
+  String INITIAL_CLUSTERS_DIR = "clusters-0";
+  
+  // default directory for output of clusters per iteration
+  String CLUSTERS_DIR = "clusters-";
+  
+  // default suffix for output of clusters for final iteration
+  String FINAL_ITERATION_SUFFIX = "-final";
+  
+  /**
+   * Get the id of the Cluster
+   * 
+   * @return a unique integer
+   */
+  int getId();
+  
+  /**
+   * Get the "center" of the Cluster as a Vector
+   * 
+   * @return a Vector
+   */
+  Vector getCenter();
+  
+  /**
+   * Get the "radius" of the Cluster as a Vector. Usually the radius is the
+   * standard deviation expressed as a Vector of size equal to the center. Some
+   * clusters may return zero values if not appropriate.
+   * 
+   * @return a Vector
+   */
+  Vector getRadius();
+    
+  /**
+   * Produce a custom, human-friendly, printable representation of the Cluster.
+   * 
+   * @param bindings
+   *          an optional String[] containing labels used to format the primary
+   *          Vector/s of this implementation.
+   * @return a String
+   */
+  String asFormatString(String[] bindings);
+
+  /**
+   * Produce a JSON representation of the Cluster.
+   *
+   * @param bindings
+   *          an optional String[] containing labels used to format the primary
+   *          Vector/s of this implementation.
+   * @return a Map
+   */
+  Map<String,Object> asJson(String[] bindings);
+
+  /**
+   * @return if the receiver has converged, or false if that has no meaning for
+   *         the implementation
+   */
+  boolean isConverged();
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java
new file mode 100644
index 0000000..ad0f8ec
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java
@@ -0,0 +1,306 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.WeightedVector;
+import org.apache.mahout.math.neighborhood.BruteSearch;
+import org.apache.mahout.math.neighborhood.ProjectionSearch;
+import org.apache.mahout.math.neighborhood.Searcher;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.apache.mahout.math.random.WeightedThing;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+public final class ClusteringUtils {
+  // Utility class; not instantiable.
+  private ClusteringUtils() {
+  }
+
+  /**
+   * Computes the summaries for the distances in each cluster.
+   * @param datapoints iterable of datapoints.
+   * @param centroids iterable of Centroids.
+   * @param distanceMeasure distance measure used to find each point's closest centroid.
+   * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to the cluster whose
+   * index is i.
+   */
+  public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
+                                                                 Iterable<? extends Vector> centroids,
+                                                                 DistanceMeasure distanceMeasure) {
+    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
+    searcher.addAll(centroids);
+    List<OnlineSummarizer> summarizers = new ArrayList<>();
+    if (searcher.size() == 0) {
+      return summarizers;
+    }
+    for (int i = 0; i < searcher.size(); ++i) {
+      summarizers.add(new OnlineSummarizer());
+    }
+    for (Vector v : datapoints) {
+      // NOTE(review): assumes centroids are Centroid instances (unchecked cast)
+      Centroid closest = (Centroid)searcher.search(v,  1).get(0).getValue();
+      OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
+      summarizer.add(distanceMeasure.distance(v, closest));
+    }
+    return summarizers;
+  }
+
+  /**
+   * Adds up the distances from each point to its closest cluster and returns the sum.
+   * Uses Euclidean distance; see the Searcher overload for other measures.
+   * @param datapoints iterable of datapoints.
+   * @param centroids iterable of Centroids.
+   * @return the total cost described above.
+   */
+  public static double totalClusterCost(Iterable<? extends Vector> datapoints, Iterable<? extends Vector> centroids) {
+    DistanceMeasure distanceMeasure = new EuclideanDistanceMeasure();
+    UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
+    searcher.addAll(centroids);
+    return totalClusterCost(datapoints, searcher);
+  }
+
+  /**
+   * Adds up the distances from each point to its closest cluster and returns the sum.
+   * @param datapoints iterable of datapoints.
+   * @param centroids searcher of Centroids.
+   * @return the total cost described above.
+   */
+  public static double totalClusterCost(Iterable<? extends Vector> datapoints, Searcher centroids) {
+    double totalCost = 0;
+    for (Vector vector : datapoints) {
+      totalCost += centroids.searchFirst(vector, false).getWeight();
+    }
+    return totalCost;
+  }
+
+  /**
+   * Estimates the distance cutoff. In StreamingKMeans, the distance between two vectors divided
+   * by this value is used as a probability threshold when deciding whether to form a new cluster
+   * or not.
+   * Small values (comparable to the minimum distance between two points) are preferred as they
+   * guarantee with high likelihood that all but very close points are put in separate clusters
+   * initially. The clusters themselves are actually collapsed periodically when their number goes
+   * over the maximum number of clusters and the distanceCutoff is increased.
+   * So, the returned value is only an initial estimate.
+   * @param data the datapoints whose distance is to be estimated.
+   * @param distanceMeasure the distance measure used to compute the distance between two points.
+   * @return the minimum distance between the first sampleLimit points
+   * @see org.apache.mahout.clustering.streaming.cluster.StreamingKMeans#clusterInternal(Iterable, boolean)
+   */
+  public static double estimateDistanceCutoff(List<? extends Vector> data, DistanceMeasure distanceMeasure) {
+    BruteSearch searcher = new BruteSearch(distanceMeasure);
+    searcher.addAll(data);
+    double minDistance = Double.POSITIVE_INFINITY;
+    for (Vector vector : data) {
+      // searchFirst(..., true) skips vectors identical to the query
+      double closest = searcher.searchFirst(vector, true).getWeight();
+      // once minDistance reaches 0 it stops updating
+      if (minDistance > 0 && closest < minDistance) {
+        minDistance = closest;
+      }
+      // NOTE(review): data was already added in full via addAll above, so this
+      // re-adds each vector a second time; looks redundant — verify intent.
+      searcher.add(vector);
+    }
+    return minDistance;
+  }
+
+  public static <T extends Vector> double estimateDistanceCutoff(
+      Iterable<T> data, DistanceMeasure distanceMeasure, int sampleLimit) {
+    return estimateDistanceCutoff(Lists.newArrayList(Iterables.limit(data, sampleLimit)), distanceMeasure);
+  }
+
+  /**
+   * Computes the Davies-Bouldin Index for a given clustering.
+   * See http://en.wikipedia.org/wiki/Clustering_algorithm#Internal_evaluation
+   * @param centroids list of centroids
+   * @param distanceMeasure distance measure for inter-cluster distances
+   * @param clusterDistanceSummaries summaries of the clusters; See summarizeClusterDistances
+   * @return the Davies-Bouldin Index
+   */
+  public static double daviesBouldinIndex(List<? extends Vector> centroids, DistanceMeasure distanceMeasure,
+                                          List<OnlineSummarizer> clusterDistanceSummaries) {
+    Preconditions.checkArgument(centroids.size() == clusterDistanceSummaries.size(),
+        "Number of centroids and cluster summaries differ.");
+    int n = centroids.size();
+    double totalDBIndex = 0;
+    // The inner loop shouldn't be reduced for j = i + 1 to n because the computation of the Davies-Bouldin
+    // index is not really symmetric.
+    // For a given cluster i, we look for a cluster j that maximizes the ratio of the sum of average distances
+    // from points in cluster i to its center and points in cluster j to its center to the distance between
+    // cluster i and cluster j.
+    // The maximization is the key issue, as the cluster that maximizes this ratio might be j for i but is NOT
+    // NECESSARILY i for j.
+    for (int i = 0; i < n; ++i) {
+      double averageDistanceI = clusterDistanceSummaries.get(i).getMean();
+      double maxDBIndex = 0;
+      for (int j = 0; j < n; ++j) {
+        if (i != j) {
+          double dbIndex = (averageDistanceI + clusterDistanceSummaries.get(j).getMean())
+              / distanceMeasure.distance(centroids.get(i), centroids.get(j));
+          if (dbIndex > maxDBIndex) {
+            maxDBIndex = dbIndex;
+          }
+        }
+      }
+      totalDBIndex += maxDBIndex;
+    }
+    return totalDBIndex / n;
+  }
+
+  /**
+   * Computes the Dunn Index of a given clustering. See http://en.wikipedia.org/wiki/Dunn_index
+   * @param centroids list of centroids
+   * @param distanceMeasure distance measure to compute inter-centroid distance with
+   * @param clusterDistanceSummaries summaries of the clusters; See summarizeClusterDistances
+   * @return the Dunn Index
+   */
+  public static double dunnIndex(List<? extends Vector> centroids, DistanceMeasure distanceMeasure,
+                                 List<OnlineSummarizer> clusterDistanceSummaries) {
+    Preconditions.checkArgument(centroids.size() == clusterDistanceSummaries.size(),
+        "Number of centroids and cluster summaries differ.");
+    int n = centroids.size();
+    // Intra-cluster distances will come from the OnlineSummarizer, and will be the median distance (noting that
+    // the median for just one value is that value).
+    // A variety of metrics can be used for the intra-cluster distance including max distance between two points,
+    // mean distance, etc. Median distance was chosen as this is more robust to outliers and characterizes the
+    // distribution of distances (from a point to the center) better.
+    double maxIntraClusterDistance = 0;
+    for (OnlineSummarizer summarizer : clusterDistanceSummaries) {
+      if (summarizer.getCount() > 0) {
+        double intraClusterDistance;
+        if (summarizer.getCount() == 1) {
+          intraClusterDistance = summarizer.getMean();
+        } else {
+          intraClusterDistance = summarizer.getMedian();
+        }
+        if (maxIntraClusterDistance < intraClusterDistance) {
+          maxIntraClusterDistance = intraClusterDistance;
+        }
+      }
+    }
+    double minDunnIndex = Double.POSITIVE_INFINITY;
+    for (int i = 0; i < n; ++i) {
+      // Distances are symmetric, so d(i, j) = d(j, i).
+      for (int j = i + 1; j < n; ++j) {
+        double dunnIndex = distanceMeasure.distance(centroids.get(i), centroids.get(j));
+        if (minDunnIndex > dunnIndex) {
+          minDunnIndex = dunnIndex;
+        }
+      }
+    }
+    return minDunnIndex / maxIntraClusterDistance;
+  }
+
+  /** @return n choose 2, i.e. n * (n - 1) / 2 */
+  public static double choose2(double n) {
+    return n * (n - 1) / 2;
+  }
+
+  /**
+   * Creates a confusion matrix by searching for the closest cluster of both the row clustering and column clustering
+   * of a point and adding its weight to that cell of the matrix.
+   * It doesn't matter which clustering is the row clustering and which is the column clustering. If they're
+   * interchanged, the resulting matrix is the transpose of the original one.
+   * @param rowCentroids clustering one
+   * @param columnCentroids clustering two
+   * @param datapoints datapoints whose closest cluster we need to find
+   * @param distanceMeasure distance measure to use
+   * @return the confusion matrix
+   */
+  public static Matrix getConfusionMatrix(List<? extends Vector> rowCentroids, List<? extends  Vector> columnCentroids,
+                                          Iterable<? extends Vector> datapoints, DistanceMeasure distanceMeasure) {
+    Searcher rowSearcher = new BruteSearch(distanceMeasure);
+    rowSearcher.addAll(rowCentroids);
+    Searcher columnSearcher = new BruteSearch(distanceMeasure);
+    columnSearcher.addAll(columnCentroids);
+
+    int numRows = rowCentroids.size();
+    int numCols = columnCentroids.size();
+    Matrix confusionMatrix = new DenseMatrix(numRows, numCols);
+
+    for (Vector vector : datapoints) {
+      WeightedThing<Vector> closestRowCentroid = rowSearcher.search(vector, 1).get(0);
+      WeightedThing<Vector> closestColumnCentroid = columnSearcher.search(vector, 1).get(0);
+      // NOTE(review): assumes centroids are Centroid instances (unchecked cast)
+      int row = ((Centroid) closestRowCentroid.getValue()).getIndex();
+      int column = ((Centroid) closestColumnCentroid.getValue()).getIndex();
+      double vectorWeight;
+      if (vector instanceof WeightedVector) {
+        vectorWeight = ((WeightedVector) vector).getWeight();
+      } else {
+        vectorWeight = 1;
+      }
+      confusionMatrix.set(row, column, confusionMatrix.get(row, column) + vectorWeight);
+    }
+
+    return confusionMatrix;
+  }
+
+  /**
+   * Computes the Adjusted Rand Index for a given confusion matrix.
+   * @param confusionMatrix confusion matrix; not to be confused with the more restrictive ConfusionMatrix class
+   * @return the Adjusted Rand Index
+   */
+  public static double getAdjustedRandIndex(Matrix confusionMatrix) {
+    int numRows = confusionMatrix.numRows();
+    int numCols = confusionMatrix.numCols();
+    double rowChoiceSum = 0;
+    double columnChoiceSum = 0;
+    double totalChoiceSum = 0;
+    double total = 0;
+    for (int i = 0; i < numRows; ++i) {
+      double rowSum = 0;
+      for (int j = 0; j < numCols; ++j) {
+        rowSum += confusionMatrix.get(i, j);
+        totalChoiceSum += choose2(confusionMatrix.get(i, j));
+      }
+      total += rowSum;
+      rowChoiceSum += choose2(rowSum);
+    }
+    for (int j = 0; j < numCols; ++j) {
+      double columnSum = 0;
+      for (int i = 0; i < numRows; ++i) {
+        columnSum += confusionMatrix.get(i, j);
+      }
+      columnChoiceSum += choose2(columnSum);
+    }
+    double rowColumnChoiceSumDivTotal = rowChoiceSum * columnChoiceSum / choose2(total);
+    return (totalChoiceSum - rowColumnChoiceSumDivTotal)
+        / ((rowChoiceSum + columnChoiceSum) / 2 - rowColumnChoiceSumDivTotal);
+  }
+
+  /**
+   * Computes the total weight of the points in the given Vector iterable.
+   * Unweighted vectors count as weight 1.
+   * @param data iterable of points
+   * @return total weight
+   */
+  public static double totalWeight(Iterable<? extends Vector> data) {
+    double sum = 0;
+    for (Vector row : data) {
+      Preconditions.checkNotNull(row);
+      if (row instanceof WeightedVector) {
+        sum += ((WeightedVector)row).getWeight();
+      } else {
+        sum++;
+      }
+    }
+    return sum;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
new file mode 100644
index 0000000..c25e039
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * Accumulates weighted vector observations and produces Gaussian statistics
+ * (mean, variance and standard deviation) over them.
+ */
+public interface GaussianAccumulator {
+
+  /**
+   * @return the number of observations (total observed weight)
+   */
+  double getN();
+
+  /**
+   * @return the mean of the observations
+   */
+  Vector getMean();
+
+  /**
+   * @return the std of the observations
+   */
+  Vector getStd();
+  
+  /**
+   * @return the average of the vector std elements
+   */
+  double getAverageStd();
+  
+  /**
+   * @return the variance of the observations
+   */
+  Vector getVariance();
+
+  /**
+   * Observe the vector 
+   * 
+   * @param x a Vector
+   * @param weight the double observation weight (usually 1.0)
+   */
+  void observe(Vector x, double weight);
+
+  /**
+   * Compute the mean, variance and standard deviation
+   */
+  void compute();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java
new file mode 100644
index 0000000..79dab30
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * A model is a probability distribution over observed data points and allows
+ * the probability of any data point to be computed. All Models have a
+ * persistent representation and extend {@link Writable}.
+ */
+public interface Model<O> extends Writable {
+  
+  /**
+   * Return the probability that the observation is described by this model
+   * 
+   * @param x
+   *          an Observation from the posterior
+   * @return the probability that x is in the receiver
+   */
+  double pdf(O x);
+  
+  /**
+   * Observe the given observation, retaining information about it
+   * 
+   * @param x
+   *          an Observation from the posterior
+   */
+  void observe(O x);
+  
+  /**
+   * Observe the given observation, retaining information about it
+   * 
+   * @param x
+   *          an Observation from the posterior
+   * @param weight
+   *          a double weighting factor
+   */
+  void observe(O x, double weight);
+  
+  /**
+   * Observe the given model, retaining information about its observations
+   * 
+   * @param x
+   *          a Model<O>
+   */
+  void observe(Model<O> x);
+  
+  /**
+   * Compute a new set of posterior parameters based upon the Observations that
+   * have been observed since my creation
+   */
+  void computeParameters();
+  
+  /**
+   * Return the number of observations that this model has seen since its
+   * parameters were last computed
+   * 
+   * @return a long
+   */
+  long getNumObservations();
+  
+  /**
+   * Return the number of observations that this model has seen over its
+   * lifetime
+   * 
+   * @return a long
+   */
+  long getTotalObservations();
+  
+  /**
+   * @return a sample of my posterior model
+   */
+  Model<VectorWritable> sampleFromPosterior();
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java
new file mode 100644
index 0000000..d77bf40
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+/** A model distribution allows us to sample a model from its prior distribution. */
+public interface ModelDistribution<O> {
+  
+  /**
+   * Return a list of models sampled from the prior
+   * 
+   * @param howMany
+   *          the int number of models to return
+   * @return a Model<Observation>[] representing what is known a priori
+   */
+  Model<O>[] sampleFromPrior(int howMany);
+  
+  /**
+   * Return a list of models sampled from the posterior
+   * 
+   * @param posterior
+   *          the Model<Observation>[] after observations
+   * @return a Model<Observation>[] representing what is known after the observations
+   */
+  Model<O>[] sampleFromPosterior(Model<O>[] posterior);
+  
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
new file mode 100644
index 0000000..b76e00f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+/**
+ * An online Gaussian statistics accumulator based upon Knuth (who cites Welford) which is declared to be
+ * numerically-stable. See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ */
+public class OnlineGaussianAccumulator implements GaussianAccumulator {
+
+  // Running sum of observation weights; plays the role of N in the statistics.
+  private double sumWeight;
+  // Incrementally-updated weighted mean (Welford's "mean"); null until first observe().
+  private Vector mean;
+  // Welford's running S: weighted sum of squared deviations; null until first observe().
+  private Vector s;
+  // s / (sumWeight - 1), refreshed on every observe(); null until first observe().
+  private Vector variance;
+
+  @Override
+  public double getN() {
+    return sumWeight;
+  }
+
+  @Override
+  public Vector getMean() {
+    return mean;
+  }
+
+  /** Per-dimension standard deviation: element-wise square root of the current variance. */
+  @Override
+  public Vector getStd() {
+    return variance.clone().assign(new SquareRootFunction());
+  }
+
+  /* from Wikipedia: http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+   * 
+   * Weighted incremental algorithm
+   * 
+   * def weighted_incremental_variance(dataWeightPairs):
+   * mean = 0
+   * S = 0
+   * sumweight = 0
+   * for x, weight in dataWeightPairs: # Alternately "for x in zip(data, weight):"
+   *     temp = weight + sumweight
+   *     Q = x - mean
+   *      R = Q * weight / temp
+   *      S = S + sumweight * Q * R
+   *      mean = mean + R
+   *      sumweight = temp
+   *  Variance = S / (sumweight-1)  # if sample is the population, omit -1
+   *  return Variance
+   */
+  /**
+   * Fold one weighted observation into the running mean/variance using the
+   * numerically-stable weighted incremental (Welford-style) update above.
+   * The statement order is significant: r must be computed from the OLD mean
+   * and s from the OLD sumWeight before either is updated.
+   */
+  @Override
+  public void observe(Vector x, double weight) {
+    double temp = weight + sumWeight;
+    Vector q;
+    if (mean == null) {
+      // First observation: mean starts at zero, so the deviation q is x itself.
+      mean = x.like();
+      q = x.clone();
+    } else {
+      q = x.minus(mean);
+    }
+    Vector r = q.times(weight).divide(temp);
+    if (s == null) {
+      s = q.times(sumWeight).times(r);
+    } else {
+      s = s.plus(q.times(sumWeight).times(r));
+    }
+    mean = mean.plus(r);
+    sumWeight = temp;
+    // NOTE(review): when sumWeight == 1 (e.g. a single unit-weight observation) this
+    // divides by zero, yielding Infinity/NaN elements until more weight accumulates —
+    // confirm callers only read variance after sufficient observations.
+    variance = s.divide(sumWeight - 1); //  # if sample is the population, omit -1
+  }
+
+  @Override
+  public void compute() {
+    // nothing to do here! All state is maintained incrementally in observe().
+  }
+
+  @Override
+  public double getAverageStd() {
+    if (sumWeight == 0.0) {
+      return 0.0;
+    } else {
+      Vector std = getStd();
+      return std.zSum() / std.size();
+    }
+  }
+
+  @Override
+  public Vector getVariance() {
+    return variance;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
new file mode 100644
index 0000000..138e830
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+/**
+ * An online Gaussian accumulator that uses a running power sums approach as reported 
+ * on http://en.wikipedia.org/wiki/Standard_deviation
+ * Suffers from overflow, underflow and roundoff error but has minimal observe-time overhead
+ */
+public class RunningSumsGaussianAccumulator implements GaussianAccumulator {
+
+  // s0: running sum of weights (power sum of order 0); the effective N.
+  private double s0;
+  // s1: running weighted sum of observations (power sum of order 1).
+  private Vector s1;
+  // s2: running weighted sum of squared observations (power sum of order 2).
+  private Vector s2;
+  // Derived in compute(); null until compute() has run with s0 != 0.
+  private Vector mean;
+  private Vector std;
+
+  @Override
+  public double getN() {
+    return s0;
+  }
+
+  /** Returns the mean computed by the last call to {@link #compute()}, or null before it. */
+  @Override
+  public Vector getMean() {
+    return mean;
+  }
+
+  /** Returns the std computed by the last call to {@link #compute()}, or null before it. */
+  @Override
+  public Vector getStd() {
+    return std;
+  }
+
+  @Override
+  public double getAverageStd() {
+    if (s0 == 0.0) {
+      return 0.0;
+    } else {
+      // NOTE(review): assumes compute() has been called after observations;
+      // otherwise std is null here — confirm call order at the call sites.
+      return std.zSum() / std.size();
+    }
+  }
+
+  /** Element-wise variance, reconstructed as std squared. */
+  @Override
+  public Vector getVariance() {
+    return std.times(std);
+  }
+
+  /** Accumulate one weighted observation into the three running power sums. */
+  @Override
+  public void observe(Vector x, double weight) {
+    s0 += weight;
+    Vector weightedX = x.times(weight);
+    if (s1 == null) {
+      s1 = weightedX;
+    } else {
+      s1.assign(weightedX, Functions.PLUS);
+    }
+    Vector x2 = x.times(x).times(weight);
+    if (s2 == null) {
+      s2 = x2;
+    } else {
+      s2.assign(x2, Functions.PLUS);
+    }
+  }
+
+  /**
+   * Derive mean = s1/s0 and std = sqrt(s0*s2 - s1^2)/s0 from the power sums
+   * (textbook running-sums formulas; subject to the roundoff caveats in the
+   * class comment). No-op when nothing has been observed (s0 == 0).
+   */
+  @Override
+  public void compute() {
+    if (s0 != 0.0) {
+      mean = s1.divide(s0);
+      std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java
new file mode 100644
index 0000000..ef43e1b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.RealDistribution;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
+public final class UncommonDistributions {
+
+  // Shared RNG; RandomUtils.getRandom() — determinism across runs depends on how
+  // RandomUtils seeds it (not visible here).
+  private static final RandomWrapper RANDOM = RandomUtils.getRandom();
+  
+  private UncommonDistributions() {}
+  
+  // =============== start of BSD licensed code. See LICENSE.txt
+  /**
+   * Returns a double sampled according to this distribution. Uniformly fast for all k > 0. (Reference:
+   * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Uses
+   * Cheng's rejection algorithm (GB) for k>=1, rejection from Weibull distribution for 0 < k < 1.
+   *
+   * @param k shape parameter; assumed > 0 (k <= 0 would make the rejection loops
+   *          ill-defined — TODO confirm callers never pass it)
+   * @param lambda rate parameter; the accepted variate is scaled by 1/lambda
+   * @return a gamma-distributed sample
+   */
+  public static double rGamma(double k, double lambda) {
+    boolean accept = false;
+    if (k >= 1.0) {
+      // Cheng's algorithm
+      double b = k - Math.log(4.0);
+      double c = k + Math.sqrt(2.0 * k - 1.0);
+      double lam = Math.sqrt(2.0 * k - 1.0);
+      double cheng = 1.0 + Math.log(4.5);
+      double x;
+      do {
+        double u = RANDOM.nextDouble();
+        double v = RANDOM.nextDouble();
+        double y = 1.0 / lam * Math.log(v / (1.0 - v));
+        x = k * Math.exp(y);
+        double z = u * v * v;
+        double r = b + c * y - x;
+        // Cheng's squeeze test, then the exact acceptance test.
+        if (r >= 4.5 * z - cheng || r >= Math.log(z)) {
+          accept = true;
+        }
+      } while (!accept);
+      return x / lambda;
+    } else {
+      // Weibull algorithm
+      double c = 1.0 / k;
+      double d = (1.0 - k) * Math.pow(k, k / (1.0 - k));
+      double x;
+      do {
+        double u = RANDOM.nextDouble();
+        double v = RANDOM.nextDouble();
+        double z = -Math.log(u);
+        double e = -Math.log(v);
+        x = Math.pow(z, c);
+        if (z + e >= d + x) {
+          accept = true;
+        }
+      } while (!accept);
+      return x / lambda;
+    }
+  }
+  
+  // ============= end of BSD licensed code
+  
+  /**
+   * Returns a random sample from a beta distribution with the given shapes,
+   * using the standard gamma-ratio construction: X/(X+Y) with X~Gamma(shape1),
+   * Y~Gamma(shape2).
+   * 
+   * @param shape1
+   *          a double representing shape1
+   * @param shape2
+   *          a double representing shape2
+   * @return a double sample in (0, 1)
+   */
+  public static double rBeta(double shape1, double shape2) {
+    double gam1 = rGamma(shape1, 1.0);
+    double gam2 = rGamma(shape2, 1.0);
+    return gam1 / (gam1 + gam2);
+    
+  }
+  
+  /**
+   * Return a random value from a normal distribution with the given mean and standard deviation
+   * 
+   * @param mean
+   *          a double mean value
+   * @param sd
+   *          a double standard deviation; NormalDistribution requires sd > 0
+   * @return a double sample
+   */
+  public static double rNorm(double mean, double sd) {
+    RealDistribution dist = new NormalDistribution(RANDOM.getRandomGenerator(),
+                                                   mean,
+                                                   sd,
+                                                   NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);
+    return dist.sample();
+  }
+  
+  /**
+   * Returns an integer sampled according to this distribution. Takes time proportional to np + 1. (Reference:
+   * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Second
+   * time-waiting algorithm.
+   */
+  public static int rBinomial(int n, double p) {
+    if (p >= 1.0) {
+      return n; // needed to avoid infinite loops and negative results
+    }
+    double q = -Math.log1p(-p);
+    double sum = 0.0;
+    int x = 0;
+    while (sum <= q) {
+      double u = RANDOM.nextDouble();
+      double e = -Math.log(u);
+      // NOTE(review): when x reaches n this divides by zero; the resulting
+      // Infinity terminates the loop via IEEE arithmetic so x-1 caps at n.
+      sum += e / (n - x);
+      x++;
+    }
+    if (x == 0) {
+      return 0;
+    }
+    return x - 1;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
new file mode 100644
index 0000000..930fd44
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+/**
+ * This class models a canopy as a center point, the number of points that are contained within it according
+ * to the application of some distance metric, and a point total which is the sum of all the points and is
+ * used to compute the centroid when needed.
+ */
+@Deprecated
+public class Canopy extends DistanceMeasureCluster {
+  
+  /** Used for deserialization as a writable */
+  public Canopy() { }
+  
+  /**
+   * Create a new Canopy containing the given point and canopyId
+   * 
+   * @param center a point in vector space
+   * @param canopyId an int identifying the canopy local to this process only
+   * @param measure a DistanceMeasure to use
+   */
+  public Canopy(Vector center, int canopyId, DistanceMeasure measure) {
+    super(center, canopyId, measure);
+    observe(center);
+  }
+
+  public String asFormatString() {
+    return "C" + this.getId() + ": " + this.computeCentroid().asFormatString();
+  }
+
+  @Override
+  public String toString() {
+    return getIdentifier() + ": " + getCenter().asFormatString();
+  }
+  
+  @Override
+  public String getIdentifier() {
+    return "C-" + getId();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
new file mode 100644
index 0000000..3ce4757
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
@@ -0,0 +1,220 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+
+@Deprecated
+public class CanopyClusterer {
+
+  private static final Logger log = LoggerFactory.getLogger(CanopyClusterer.class);
+
+  // Monotonically increasing id handed to each new canopy created by this instance.
+  private int nextCanopyId;
+
+  // the T1 distance threshold (loose bound: points within T1 are observed by a canopy)
+  private double t1;
+
+  // the T2 distance threshold (tight bound: points within T2 are "strongly bound")
+  private double t2;
+
+  // the T3 distance threshold (reducer-side replacement for T1; defaults to T1)
+  private double t3;
+
+  // the T4 distance threshold (reducer-side replacement for T2; defaults to T2)
+  private double t4;
+
+  // the distance measure
+  private DistanceMeasure measure;
+
+  public CanopyClusterer(DistanceMeasure measure, double t1, double t2) {
+    this.t1 = t1;
+    this.t2 = t2;
+    this.t3 = t1;
+    this.t4 = t2;
+    this.measure = measure;
+  }
+
+  public double getT1() {
+    return t1;
+  }
+
+  public double getT2() {
+    return t2;
+  }
+
+  public double getT3() {
+    return t3;
+  }
+
+  public double getT4() {
+    return t4;
+  }
+
+  /**
+   * Used by CanopyReducer to set t1=t3 and t2=t4 configuration values
+   */
+  public void useT3T4() {
+    t1 = t3;
+    t2 = t4;
+  }
+
+  /**
+   * This is the same algorithm as the reference but inverted to iterate over
+   * existing canopies instead of the points. Because of this it does not need
+   * to actually store the points, instead storing a total points vector and
+   * the number of points. From this a centroid can be computed.
+   * <p/>
+   * This method is used by the CanopyMapper, CanopyReducer and CanopyDriver.
+   * 
+   * @param point
+   *            the point to be added
+   * @param canopies
+   *            the List<Canopy> to be appended
+   */
+  public void addPointToCanopies(Vector point, Collection<Canopy> canopies) {
+    boolean pointStronglyBound = false;
+    for (Canopy canopy : canopies) {
+      // Uses the lengthSquared-accepting distance overload so the canopy's cached
+      // center norm can be reused by measures that support it.
+      double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
+      if (dist < t1) {
+        if (log.isDebugEnabled()) {
+          log.debug("Added point: {} to canopy: {}", AbstractCluster.formatVector(point, null), canopy.getIdentifier());
+        }
+        canopy.observe(point);
+      }
+      pointStronglyBound = pointStronglyBound || dist < t2;
+    }
+    // A point not within T2 of any existing canopy seeds a new one.
+    if (!pointStronglyBound) {
+      if (log.isDebugEnabled()) {
+        log.debug("Created new Canopy:{} at center:{}", nextCanopyId, AbstractCluster.formatVector(point, null));
+      }
+      canopies.add(new Canopy(point, nextCanopyId++, measure));
+    }
+  }
+
+  /**
+   * Return if the point is covered by the canopy
+   * (i.e. lies strictly within the T1 distance of its center)
+   * 
+   * @param point
+   *            a point
+   * @return if the point is covered
+   */
+  public boolean canopyCovers(Canopy canopy, Vector point) {
+    return measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point) < t1;
+  }
+
+  /**
+   * Iterate through the points, adding new canopies. Return the canopies.
+   * Note: this MUTATES the supplied points list (covered points are removed).
+   * 
+   * @param points
+   *            a list<Vector> defining the points to be clustered
+   * @param measure
+   *            a DistanceMeasure to use
+   * @param t1
+   *            the T1 distance threshold
+   * @param t2
+   *            the T2 distance threshold
+   * @return the List<Canopy> created
+   */
+  public static List<Canopy> createCanopies(List<Vector> points,
+                                            DistanceMeasure measure,
+                                            double t1,
+                                            double t2) {
+    List<Canopy> canopies = Lists.newArrayList();
+    /**
+     * Reference Implementation: Given a distance metric, one can create
+     * canopies as follows: Start with a list of the data points in any
+     * order, and with two distance thresholds, T1 and T2, where T1 > T2.
+     * (These thresholds can be set by the user, or selected by
+     * cross-validation.) Pick a point on the list and measure its distance
+     * to all other points. Put all points that are within distance
+     * threshold T1 into a canopy. Remove from the list all points that are
+     * within distance threshold T2. Repeat until the list is empty.
+     */
+    int nextCanopyId = 0;
+    while (!points.isEmpty()) {
+      Iterator<Vector> ptIter = points.iterator();
+      Vector p1 = ptIter.next();
+      ptIter.remove();
+      Canopy canopy = new Canopy(p1, nextCanopyId++, measure);
+      canopies.add(canopy);
+      while (ptIter.hasNext()) {
+        Vector p2 = ptIter.next();
+        double dist = measure.distance(p1, p2);
+        // Put all points that are within distance threshold T1 into the
+        // canopy
+        if (dist < t1) {
+          canopy.observe(p2);
+        }
+        // Remove from the list all points that are within distance
+        // threshold T2
+        if (dist < t2) {
+          ptIter.remove();
+        }
+      }
+      // NOTE(review): this re-runs computeParameters() over ALL canopies on every
+      // outer iteration (O(points * canopies) finalizations), not just the new one.
+      // Looks like it was meant to run once after the while loop — confirm whether
+      // repeated calls are idempotent for Canopy before changing.
+      for (Canopy c : canopies) {
+        c.computeParameters();
+      }
+    }
+    return canopies;
+  }
+
+  /**
+   * Iterate through the canopies, adding their centroids to a list
+   * 
+   * @param canopies
+   *            a List<Canopy>
+   * @return the List<Vector>
+   */
+  public static List<Vector> getCenters(Iterable<Canopy> canopies) {
+    List<Vector> result = Lists.newArrayList();
+    for (Canopy canopy : canopies) {
+      result.add(canopy.getCenter());
+    }
+    return result;
+  }
+
+  /**
+   * Iterate through the canopies, resetting their center to their centroids
+   * 
+   * @param canopies
+   *            a List<Canopy>
+   */
+  public static void updateCentroids(Iterable<Canopy> canopies) {
+    for (Canopy canopy : canopies) {
+      canopy.computeParameters();
+    }
+  }
+
+  public void setT3(double t3) {
+    this.t3 = t3;
+  }
+
+  public void setT4(double t4) {
+    this.t4 = t4;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
new file mode 100644
index 0000000..2f24026
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+
+@Deprecated
+public final class CanopyConfigKeys {
+
+  private CanopyConfigKeys() {}
+
+  public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
+
+  public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
+
+  public static final String T3_KEY = "org.apache.mahout.clustering.canopy.t3";
+
+  public static final String T4_KEY = "org.apache.mahout.clustering.canopy.t4";
+
+  // keys used by Driver, Mapper, Combiner & Reducer
+  public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
+
+  public static final String CF_KEY = "org.apache.mahout.clustering.canopy.canopyFilter";
+
+  /**
+   * Create a {@link CanopyClusterer} from the Hadoop configuration.
+   * T1/T2 and the distance measure class are required entries; T3/T4 are
+   * optional overrides applied only when present.
+   *
+   * @param configuration Hadoop configuration
+   *
+   * @return CanopyClusterer
+   */
+  public static CanopyClusterer configureCanopyClusterer(Configuration configuration) {
+    // Required thresholds — parsed first so a missing key fails fast.
+    double t1 = Double.parseDouble(configuration.get(T1_KEY));
+    double t2 = Double.parseDouble(configuration.get(T2_KEY));
+
+    // Reflectively instantiate and configure the distance measure.
+    DistanceMeasure measure = ClassUtils.instantiateAs(configuration.get(DISTANCE_MEASURE_KEY), DistanceMeasure.class);
+    measure.configure(configuration);
+
+    CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2);
+
+    // Optional reducer-side thresholds.
+    String t3Value = configuration.get(T3_KEY);
+    if (t3Value != null) {
+      clusterer.setT3(Double.parseDouble(t3Value));
+    }
+    String t4Value = configuration.get(T4_KEY);
+    if (t4Value != null) {
+      clusterer.setT4(Double.parseDouble(t4Value));
+    }
+    return clusterer;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
new file mode 100644
index 0000000..06dc947
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
@@ -0,0 +1,379 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.CanopyClusteringPolicy;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+
+@Deprecated
+public class CanopyDriver extends AbstractJob {
+
+  public static final String DEFAULT_CLUSTERED_POINTS_DIRECTORY = "clusteredPoints";
+
+  private static final Logger log = LoggerFactory.getLogger(CanopyDriver.class);
+
+  /**
+   * CLI entry point; delegates argument handling to {@link ToolRunner}.
+   * NOTE(review): the int returned by ToolRunner.run is dropped, so the process
+   * exit status does not reflect failures — confirm whether System.exit is wanted.
+   */
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new CanopyDriver(), args);
+  }
+
+  /**
+   * Parse the command-line options and dispatch to the static run(). Returns -1
+   * when argument parsing fails, 0 otherwise. Defaults: t3=t1 and t4=t2 when not
+   * supplied, clusterFilter=0, outlier threshold=0.0.
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.t1Option().create());
+    addOption(DefaultOptionCreator.t2Option().create());
+    addOption(DefaultOptionCreator.t3Option().create());
+    addOption(DefaultOptionCreator.t4Option().create());
+    addOption(DefaultOptionCreator.clusterFilterOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption(DefaultOptionCreator.clusteringOption().create());
+    addOption(DefaultOptionCreator.methodOption().create());
+    addOption(DefaultOptionCreator.outlierThresholdOption().create());
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    Configuration conf = getConf();
+    // --overwrite deletes any existing output before running.
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(conf, output);
+    }
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+    // Reducer-side thresholds fall back to the mapper-side values.
+    double t3 = t1;
+    if (hasOption(DefaultOptionCreator.T3_OPTION)) {
+      t3 = Double.parseDouble(getOption(DefaultOptionCreator.T3_OPTION));
+    }
+    double t4 = t2;
+    if (hasOption(DefaultOptionCreator.T4_OPTION)) {
+      t4 = Double.parseDouble(getOption(DefaultOptionCreator.T4_OPTION));
+    }
+    int clusterFilter = 0;
+    if (hasOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION)) {
+      clusterFilter = Integer
+          .parseInt(getOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION));
+    }
+    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
+    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
+        .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+    double clusterClassificationThreshold = 0.0;
+    if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+      clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+    }
+    run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter,
+        runClustering, clusterClassificationThreshold, runSequential);
+    return 0;
+  }
+
+  /**
+   * Build a directory of Canopy clusters from the input arguments and, if
+   * requested, cluster the input vectors using these clusters
+   * 
+   * @param conf
+   *          the Configuration
+   * @param input
+   *          the Path to the directory containing input vectors
+   * @param output
+   *          the Path for all output directories
+   * @param measure
+   *          the DistanceMeasure
+   * @param t1
+   *          the double T1 distance metric
+   * @param t2
+   *          the double T2 distance metric
+   * @param t3
+   *          the reducer's double T1 distance metric
+   * @param t4
+   *          the reducer's double T2 distance metric
+   * @param clusterFilter
+   *          the minimum canopy size output by the mappers
+   * @param runClustering
+   *          cluster the input vectors if true
+   * @param clusterClassificationThreshold 
+   *          vectors having pdf below this value will not be clustered. Its value should be between 0 and 1.
+   * @param runSequential
+   *          execute sequentially if true
+   * @throws IOException if reading the input or writing the output fails
+   * @throws InterruptedException if the MapReduce job is interrupted
+   * @throws ClassNotFoundException if a job class cannot be loaded
+   */
+  public static void run(Configuration conf, Path input, Path output,
+      DistanceMeasure measure, double t1, double t2, double t3, double t4,
+      int clusterFilter, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    // Phase 1: always build the canopy clusters from the input vectors.
+    Path clustersOut = buildClusters(conf, input, output, measure, t1, t2, t3,
+        t4, clusterFilter, runSequential);
+    // Phase 2 (optional): classify the input vectors against those canopies.
+    if (runClustering) {
+      clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential);
+    }
+  }
+
+  /**
+   * Convenience method to provide backward compatibility
+   * <p>
+   * Delegates to the full {@code run} overload, reusing t1/t2 as the
+   * reducer's t3/t4 distance metrics and using a cluster filter of 0
+   * (no minimum canopy size).
+   */
+  public static void run(Configuration conf, Path input, Path output,
+      DistanceMeasure measure, double t1, double t2, boolean runClustering,
+      double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
+      ClassNotFoundException {
+    run(conf, input, output, measure, t1, t2, t1, t2, 0, runClustering,
+        clusterClassificationThreshold, runSequential);
+  }
+
+  /**
+   * Convenience method creates new Configuration() Build a directory of Canopy
+   * clusters from the input arguments and, if requested, cluster the input
+   * vectors using these clusters
+   * 
+   * @param input
+   *          the Path to the directory containing input vectors
+   * @param output
+   *          the Path for all output directories
+   * @param measure
+   *          the DistanceMeasure to use when comparing vectors
+   * @param t1
+   *          the double T1 distance metric
+   * @param t2
+   *          the double T2 distance metric
+   * @param runClustering
+   *          cluster the input vectors if true
+   * @param clusterClassificationThreshold
+   *          vectors having pdf below this value will not be clustered. Its value should be between 0 and 1. 
+   * @param runSequential
+   *          execute sequentially if true
+   */
+  public static void run(Path input, Path output, DistanceMeasure measure,
+      double t1, double t2, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    // Same as the Configuration-taking overload, with a default Configuration.
+    run(new Configuration(), input, output, measure, t1, t2, runClustering,
+        clusterClassificationThreshold, runSequential);
+  }
+
+  /**
+   * Convenience method for backwards compatibility
+   * <p>
+   * Delegates to the full {@code buildClusters} overload, reusing t1/t2 as
+   * the reducer's t3/t4 distance metrics.
+   * 
+   */
+  public static Path buildClusters(Configuration conf, Path input, Path output,
+      DistanceMeasure measure, double t1, double t2, int clusterFilter,
+      boolean runSequential) throws IOException, InterruptedException,
+      ClassNotFoundException {
+    return buildClusters(conf, input, output, measure, t1, t2, t1, t2,
+        clusterFilter, runSequential);
+  }
+
+  /**
+   * Build a directory of Canopy clusters from the input vectors and other
+   * arguments. Run sequential or mapreduce execution as requested
+   * 
+   * @param conf
+   *          the Configuration to use
+   * @param input
+   *          the Path to the directory containing input vectors
+   * @param output
+   *          the Path for all output directories
+   * @param measure
+   *          the DistanceMeasure
+   * @param t1
+   *          the double T1 distance metric
+   * @param t2
+   *          the double T2 distance metric
+   * @param t3
+   *          the reducer's double T1 distance metric
+   * @param t4
+   *          the reducer's double T2 distance metric
+   * @param clusterFilter
+   *          the int minimum size of canopies produced
+   * @param runSequential
+   *          a boolean indicates to run the sequential (reference) algorithm
+   * @return the canopy output directory Path
+   */
+  public static Path buildClusters(Configuration conf, Path input, Path output,
+      DistanceMeasure measure, double t1, double t2, double t3, double t4,
+      int clusterFilter, boolean runSequential) throws IOException,
+      InterruptedException, ClassNotFoundException {
+    log.info("Build Clusters Input: {} Out: {} Measure: {} t1: {} t2: {}",
+             input, output, measure, t1, t2);
+    // Dispatch to the in-process reference implementation or the MapReduce
+    // implementation. Note: the sequential path ignores t3/t4 (no reducer).
+    if (runSequential) {
+      return buildClustersSeq(input, output, measure, t1, t2, clusterFilter);
+    } else {
+      return buildClustersMR(conf, input, output, measure, t1, t2, t3, t4,
+          clusterFilter);
+    }
+  }
+
+  /**
+   * Build a directory of Canopy clusters from the input vectors and other
+   * arguments. Run sequential execution
+   * <p>
+   * All canopies are accumulated in memory, so this reference implementation
+   * is only suitable for inputs whose canopy set fits on one machine.
+   * 
+   * @param input
+   *          the Path to the directory containing input vectors
+   * @param output
+   *          the Path for all output directories
+   * @param measure
+   *          the DistanceMeasure
+   * @param t1
+   *          the double T1 distance metric
+   * @param t2
+   *          the double T2 distance metric
+   * @param clusterFilter
+   *          the int minimum size of canopies produced
+   * @return the canopy output directory Path
+   */
+  private static Path buildClustersSeq(Path input, Path output,
+      DistanceMeasure measure, double t1, double t2, int clusterFilter)
+    throws IOException {
+    CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2);
+    Collection<Canopy> canopies = Lists.newArrayList();
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(input.toUri(), conf);
+
+    // Single pass over every input vector, assigning each to canopies.
+    for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
+        input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
+      clusterer.addPointToCanopies(vw.get(), canopies);
+    }
+
+    // Write results under clusters-0-final/part-r-00000 so downstream
+    // consumers see the same layout as the MapReduce path produces.
+    Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
+    Path path = new Path(canopyOutputDir, "part-r-00000");
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
+        Text.class, ClusterWritable.class);
+    try {
+      ClusterWritable clusterWritable = new ClusterWritable();
+      for (Canopy canopy : canopies) {
+        canopy.computeParameters();
+        if (log.isDebugEnabled()) {
+          log.debug("Writing Canopy:{} center:{} numPoints:{} radius:{}",
+                    canopy.getIdentifier(),
+                    AbstractCluster.formatVector(canopy.getCenter(), null),
+                    canopy.getNumObservations(),
+                    AbstractCluster.formatVector(canopy.getRadius(), null));
+        }
+        // Strictly-greater-than comparison: canopies with exactly
+        // clusterFilter observations are dropped.
+        if (canopy.getNumObservations() > clusterFilter) {
+          clusterWritable.setValue(canopy);
+          writer.append(new Text(canopy.getIdentifier()), clusterWritable);
+        }
+      }
+    } finally {
+      Closeables.close(writer, false);
+    }
+    return canopyOutputDir;
+  }
+
+  /**
+   * Build a directory of Canopy clusters from the input vectors and other
+   * arguments. Run mapreduce execution
+   * 
+   * @param conf
+   *          the Configuration
+   * @param input
+   *          the Path to the directory containing input vectors
+   * @param output
+   *          the Path for all output directories
+   * @param measure
+   *          the DistanceMeasure
+   * @param t1
+   *          the double T1 distance metric
+   * @param t2
+   *          the double T2 distance metric
+   * @param t3
+   *          the reducer's double T1 distance metric
+   * @param t4
+   *          the reducer's double T2 distance metric
+   * @param clusterFilter
+   *          the int minimum size of canopies produced
+   * @return the canopy output directory Path
+   * @throws InterruptedException if the job is interrupted or fails
+   */
+  private static Path buildClustersMR(Configuration conf, Path input,
+      Path output, DistanceMeasure measure, double t1, double t2, double t3,
+      double t4, int clusterFilter) throws IOException, InterruptedException,
+      ClassNotFoundException {
+    // Pass all algorithm parameters to the mapper/reducer through the
+    // job Configuration, serialized as strings.
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass()
+        .getName());
+    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
+    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
+    conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(t3));
+    conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(t4));
+    conf.set(CanopyConfigKeys.CF_KEY, String.valueOf(clusterFilter));
+
+    Job job = new Job(conf, "Canopy Driver running buildClusters over input: "
+        + input);
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+    job.setMapperClass(CanopyMapper.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(VectorWritable.class);
+    job.setReducerClass(CanopyReducer.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(ClusterWritable.class);
+    // Exactly one reducer so the per-mapper canopies are all merged in a
+    // single place; presumably required for a globally consistent canopy
+    // set -- NOTE(review): confirm against CanopyReducer's contract.
+    job.setNumReduceTasks(1);
+    job.setJarByClass(CanopyDriver.class);
+
+    FileInputFormat.addInputPath(job, input);
+    // Same clusters-0-final directory layout as the sequential path.
+    Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
+    FileOutputFormat.setOutputPath(job, canopyOutputDir);
+    if (!job.waitForCompletion(true)) {
+      throw new InterruptedException("Canopy Job failed processing " + input);
+    }
+    return canopyOutputDir;
+  }
+
+  /**
+   * Classify the input vectors against the previously built canopies.
+   * Writes a CanopyClusteringPolicy into the canopies directory, then
+   * delegates the actual classification to ClusterClassificationDriver,
+   * emitting clustered points under the standard clusteredPoints directory.
+   *
+   * @param conf the Configuration to use
+   * @param points the Path to the input vectors to classify
+   * @param canopies the Path to the directory holding the built canopies
+   * @param output the Path for all output directories
+   * @param clusterClassificationThreshold vectors having pdf below this value
+   *          will not be clustered; should be between 0 and 1
+   * @param runSequential execute sequentially if true
+   */
+  private static void clusterData(Configuration conf,
+                                  Path points,
+                                  Path canopies,
+                                  Path output,
+                                  double clusterClassificationThreshold,
+                                  boolean runSequential)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies);
+    ClusterClassificationDriver.run(conf, points, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
+                                    clusterClassificationThreshold, true, runSequential);
+  }
+
+}


[31/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor

Posted by ra...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv b/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv
new file mode 100644
index 0000000..d7a2ede
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv
@@ -0,0 +1,45212 @@
+"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
+58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
+44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
+33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
+35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
+28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
+58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
+45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
+57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
+54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
+58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
+36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
+44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
+32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
+24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
+38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
+40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
+46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
+41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
+46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
+57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
+39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
+27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
+59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
+29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
+56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
+57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
+43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
+31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
+55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
+55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
+32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
+28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
+53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
+34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
+57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
+43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
+26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
+39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
+48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
+52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
+54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
+54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
+50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
+44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
+35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
+51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
+31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
+35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
+36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
+40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
+51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
+50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
+61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
+35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
+39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
+42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
+59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
+40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
+47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
+53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
+46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
+53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
+57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
+49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
+42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
+22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
+51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
+50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
+59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
+39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
+42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
+40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
+56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
+37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
+39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
+38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
+54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
+58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
+40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
+56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
+42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
+51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
+36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
+54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
+37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
+33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
+46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
+51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
+40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
+48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
+32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
+55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
+40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
+58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
+45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
+51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
+43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
+44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
+46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
+59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
+44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
+33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
+46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
+43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
+23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
+25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
+40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
+58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
+32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
+58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
+37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
+27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
+42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
+29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
+58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
+46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
+34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
+49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
+32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
+43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
+58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
+24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
+51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
+50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
+40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
+33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
+36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
+57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
+36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
+44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
+39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
+40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
+54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
+50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
+37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
+46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
+32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
+48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
+41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
+44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
+38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
+48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
+42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
+34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
+56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
+39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
+46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
+38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
+56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
+37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
+37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
+48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
+30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
+48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
+31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
+37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
+49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
+43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
+32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
+55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
+31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
+35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
+34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
+32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
+33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
+52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
+55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
+38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
+31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
+28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
+45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
+35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
+60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
+49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
+38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
+40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
+36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
+44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
+40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
+30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
+57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
+24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
+33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
+43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
+43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
+35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
+56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
+40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
+44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
+28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
+47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
+56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
+31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
+30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
+38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
+55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
+59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
+33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
+30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
+42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
+55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
+51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
+32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
+29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
+46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
+56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
+29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
+47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
+56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
+45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
+31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
+37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
+30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
+58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
+36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
+40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
+42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
+35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
+44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
+31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
+36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
+47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
+37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
+26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
+52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
+55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
+32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
+37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
+45;"blue-collar";"m

<TRUNCATED>