You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/12/05 07:44:59 UTC

madlib git commit: K-NN: Fix minor issues in documentation

Repository: madlib
Updated Branches:
  refs/heads/master 775afd05d -> c73cf8507


K-NN: Fix minor issues in documentation

Closes #208


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/c73cf850
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/c73cf850
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/c73cf850

Branch: refs/heads/master
Commit: c73cf850791210c47ffea66c87da546c2049265a
Parents: 775afd0
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Mon Dec 4 12:24:46 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Dec 4 23:44:02 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/knn/knn.sql_in | 43 +++++++++++++-------------
 1 file changed, 22 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/c73cf850/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index 17d81ad..cdc9704 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -133,21 +133,18 @@ neighbors that were used in the voting/averaging, sorted
 from closest to furthest.</dd>
 
 <dt>fn_dist (optional)</dt>
-<dd>TEXT, default: squared_dist_norm2'. The name of the function to use to calculate the distance from a data point to a centroid.
+<dd>TEXT, default: 'squared_dist_norm2'. The name of the function
+used to calculate the distance between data points.
 
-The following distance functions can be used (computation of barycenter/mean in parentheses):
+The following distance functions can be used:
 <ul>
-<li><b>\ref dist_norm1</b>:  1-norm/Manhattan (element-wise median
-[Note that MADlib does not provide a median aggregate function for support and
-performance reasons.])</li>
-<li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li>
-<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li>
-<li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li>
-<li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points <a href="#kmeans-lit-5">[5]</a>)</li>
+<li><b>\ref dist_norm1</b>:  1-norm/Manhattan</li>
+<li><b>\ref dist_norm2</b>: 2-norm/Euclidean</li>
+<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance</li>
+<li><b>\ref dist_angle</b>: angle</li>
+<li><b>\ref dist_tanimoto</b>: tanimoto</li>
 <li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, DOUBLE PRECISION[] y -> DOUBLE PRECISION</tt></li></ul></dd>
 
-
-
 </dl>
 
 
@@ -168,6 +165,11 @@ The output of the KNN module is a table with the following columns:
         <th>prediction</th>
         <td>INTEGER. Label in case of classification, average value in case of regression.</td>
     </tr>
+    <tr>
+        <th>k_nearest_neighbours</th>
+        <td>INTEGER[]. List of nearest neighbors, sorted from closest to furthest
+        from the corresponding test point.</td>
+    </tr>
 </table>
 
 
@@ -236,14 +238,14 @@ DROP TABLE IF EXISTS knn_result_classification;
 SELECT * FROM madlib.knn(
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
-                'id',                  -- Col Name of id in train data
+                'id',                  -- Col name of id in train data
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data
                 'knn_result_classification',  -- Output table
                  3,                    -- Number of nearest neighbors
-                 True                  -- True if you want to show Nearest-Neighbors by id, False otherwise
+                 True,                 -- True to list nearest-neighbors by id
                  'madlib.squared_dist_norm2' -- Distance function
                 );
 SELECT * from knn_result_classification ORDER BY id;
@@ -260,7 +262,8 @@ Result:
   6 | {50,45} |          0 | {6,7,8}
 (6 rows)
 </pre>
-Note that the nearest neighbors are sorted from closest to furthest from the corresponding test point.
+Note that the nearest neighbors are sorted from closest
+to furthest from the corresponding test point.
 
 -#  Run KNN for regression:
 <pre class="example">
@@ -275,8 +278,8 @@ SELECT * FROM madlib.knn(
                 'id',                  -- Col name of id in test data
                 'knn_result_regression',  -- Output table
                  3,                    -- Number of nearest neighbors
-                True                   -- True if you want to show Nearest-Neighbors, False otherwise
-                'madlib.squared_dist_norm2' -- Distance function
+                True,                  -- True to list nearest-neighbors by id
+                'madlib.dist_norm2'    -- Distance function
                 );
 SELECT * FROM knn_result_regression ORDER BY id;
 </pre>
@@ -293,7 +296,8 @@ Result:
 (6 rows)
 </pre>
 
--#  List nearest neighbors only, without doing classification or regression:
+-#  List nearest neighbors only, without doing classification
+or regression:
 <pre class="example">
 DROP TABLE IF EXISTS knn_result_list_neighbors;
 SELECT * FROM madlib.knn(
@@ -334,9 +338,6 @@ vector (a test point) is classified by assigning the label which is most
 frequent among the k training samples nearest to that test point.
 In case of regression, average of the values of these k training samples
 is assigned to the test point.
-The only distance metric supported in this version is MADlib's squared_dist_norm2.
-Other distance metrics will be added in a future release of this module.
-
 
 @anchor literature
 @literature
@@ -535,4 +536,4 @@ BEGIN
     RETURN returnstring;
 END;
 $$ LANGUAGE plpgsql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); 
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');