You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/02/11 11:22:31 UTC

svn commit: r1243022 [26/38] - in /mahout/site/new_website: ./ MAHOUT/ MAHOUT/2010/ MAHOUT/2010/09/ MAHOUT/2010/09/14/ MAHOUT/2011/ MAHOUT/2011/10/ MAHOUT/2011/10/21/ MAHOUT/books-tutorials-and-talks.data/ MAHOUT/books-tutorials-talks.data/ MAHOUT/book...

Added: mahout/site/new_website/MAHOUT/restricted-boltzmann-machines.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/restricted-boltzmann-machines.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/restricted-boltzmann-machines.html (added)
+++ mahout/site/new_website/MAHOUT/restricted-boltzmann-machines.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,155 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('https://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;
+      var show = null;
+      var children = null;
+
+      function init() {
+        /* Search form initialization */
+        var form = document.forms['search'];
+        if (form != null) {
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children initialization */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ?
+                   document.all['children'] :
+                   document.getElementById('children');
+        if (children != null) {
+          children.style.display = 'none';
+          show.style.display = 'inline';
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() {
+        children.style.display = 'block';
+        show.style.display = 'none';
+        hide.style.display = 'inline';
+      }
+
+      function hideChildren() {
+        children.style.display = 'none';
+        show.style.display = 'inline';
+        hide.style.display = 'none';
+      }
+    </SCRIPT>
+    <TITLE>Restricted Boltzmann Machines</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" nowrap="">
+          &nbsp;<A href="mahout-wiki.html" title="Apache Mahout">Apache Mahout</A>&nbsp;&gt;&nbsp;<A href="mahout-wiki.html" title="Mahout Wiki">Mahout Wiki</A>&nbsp;&gt;&nbsp;<A href="algorithms.html" title="Algorithms">Algorithms</A>&nbsp;&gt;&nbsp;<A href="" title="Restricted Boltzmann Machines">Restricted Boltzmann Machines</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">Restricted Boltzmann Machines</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=20644288">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=20644288">Edit Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=20644288">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=20644288">Add Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=20644288">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=20644288">Add News</A>
+        </DIV>
+      </DIV>
+      <DIV class="pagesubheading" style="margin: 0px 10px 0px 10px;">
+        #editReport()
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <P>NOTE: This implementation is a Work-In-Progress, at least till September, 2010. </P>
+
+<P>The JIRA issue is <A href="https://issues.apache.org/jira/browse/MAHOUT-375" class="external-link" rel="nofollow">here</A>. </P>
+
+<H3><A name="RestrictedBoltzmannMachines-BoltzmannMachines"></A>Boltzmann Machines</H3>
+<P>Boltzmann Machines are a type of stochastic neural network that closely resembles physical processes. They define a network of units with an overall energy that is evolved over a period of time, until it reaches thermal equilibrium. </P>
+
+<P>However, the convergence speed of Boltzmann machines that have unconstrained connectivity is low.</P>
+
+<H3><A name="RestrictedBoltzmannMachines-RestrictedBoltzmannMachines"></A>Restricted Boltzmann Machines</H3>
+<P>Restricted Boltzmann Machines are a variant that is 'restricted' in the sense that connections between hidden units of a single layer are <EM>not</EM> allowed. In addition, stacking multiple RBM's is also feasible, with the activities of the hidden units forming the base for a higher-level RBM. The combination of these two features renders RBM's highly suitable for parallelization. </P>
+
+<P>In the Netflix Prize, RBM's offered distinctly orthogonal predictions to SVD and k-NN approaches, and contributed immensely to the final solution.</P>
+
+<H3><A name="RestrictedBoltzmannMachines-RBM%27sinApacheMahout"></A>RBM's in Apache Mahout</H3>
+<P>An implementation of Restricted Boltzmann Machines is being developed for Apache Mahout as a Google Summer of Code 2010 project. A recommender interface will also be provided. The key aims of the implementation are:</P>
+<OL>
+	<LI>Accurate - should replicate known results, including those of the Netflix Prize</LI>
+	<LI>Fast - The implementation uses Map-Reduce, hence, it should be fast</LI>
+	<LI>Scale - Should scale to large datasets, with a design whose critical parts don't need a dependency between the amount of memory on your cluster systems and the size of your dataset</LI>
+</OL>
+
+
+<P>You can view the patch as it develops <A href="http://github.com/sisirkoppaka/mahout-rbm/compare/trunk...rbm" class="external-link" rel="nofollow">here</A>.</P>
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.2 Build: 1810 Mar 16, 2010)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+<SCRIPT type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-17359171-1']);
+  _gaq.push(['_setDomainName', 'none']);
+  _gaq.push(['_setAllowLinker', true]);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</SCRIPT>
+  </BODY>
+</HTML>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/rowsimilarityjob.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/rowsimilarityjob.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/rowsimilarityjob.html (added)
+++ mahout/site/new_website/MAHOUT/rowsimilarityjob.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,176 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/space.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/wiki-content.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/abs.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/tables.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/panels.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/renderer-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/content-types.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/login.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/information-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/layout-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/default-theme.css">
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('https://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;
+      var show = null;
+      var children = null;
+
+      function init() {
+        /* Search form initialization */
+        var form = document.forms['search'];
+        if (form != null) {
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children initialization */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ?
+                   document.all['children'] :
+                   document.getElementById('children');
+        if (children != null) {
+          children.style.display = 'none';
+          show.style.display = 'inline';
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() {
+        children.style.display = 'block';
+        show.style.display = 'none';
+        hide.style.display = 'inline';
+      }
+
+      function hideChildren() {
+        children.style.display = 'none';
+        show.style.display = 'inline';
+        hide.style.display = 'none';
+      }
+    </SCRIPT>
+    <TITLE>RowSimilarityJob</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" nowrap="">
+          &nbsp;<A href="mahout-wiki.html" title="Apache Mahout">Apache Mahout</A>&nbsp;&gt;&nbsp;<A href="mahout-wiki.html" title="Mahout Wiki">Mahout Wiki</A>&nbsp;&gt;&nbsp;<A href="algorithms.html" title="Algorithms">Algorithms</A>&nbsp;&gt;&nbsp;<A href="" title="RowSimilarityJob">RowSimilarityJob</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">RowSimilarityJob</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=27837166">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=27837166">Edit Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=27837166">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=27837166">Add Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=27837166">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=27837166">Add News</A>
+        </DIV>
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <P>A brief description of RowSimilarityJob:</P>
+
+<P>(originally from <A href="http://mail-archives.apache.org/mod_mbox/mahout-user/201202.mbox/browser" class="external-link" rel="nofollow">mailing list</A>)</P>
+
+<P>The goal is to compute all pairwise similarities between the rows of a<BR>
+sparse matrix A.</P>
+
+<P>The computation should be executed in a way that only rows that have at<BR>
+least one non-zero value in the same dimension (column) are compared. We<BR>
+need this to avoid a quadratic number of pairwise comparisons.<BR>
+Furthermore we should be able to 'embed' arbitrary similarity measures<BR>
+and we should always be able to use a combiner in all MapReduce steps.</P>
+
+<P>The computation is executed using three MapReduce passes:</P>
+
+<P>In the first step, the rows of A are preprocessed via<BR>
+VectorSimilarityMeasure.normalize() (they could e.g. be binarized or<BR>
+scaled to unit-length), a single number for each row of A is computed<BR>
+via VectorSimilarityMeasure.norm() (e.g. L1 norm) and A' is formed.</P>
+
+<P>The second step operates on the rows of A' (the columns of A). The<BR>
+mapper sums up all pairwise cooccurrences using<BR>
+VectorSimilarityMeasure.aggregate() (as vectors, thereby using the so<BR>
+called 'stripes' pattern). The reducer sums up all cooccurrence vectors<BR>
+for one row and uses the similarity measure and the precomputed numbers<BR>
+from step one to compute all similarities via<BR>
+VectorSimilarityMeasure.similarity().</P>
+
+<P>The third step ensures that only the top k similar rows per row are kept.</P>
+
+<P>It's hard to see from the code but actually the job performs the matrix<BR>
+multiplication AA' via outer products with a modified (similarity<BR>
+measure specific) dot product.</P>
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.4.9 Build: 2042 Feb 14, 2011)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+<SCRIPT type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-17359171-1']);
+  _gaq.push(['_setDomainName', 'none']);
+  _gaq.push(['_setAllowLinker', true]);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</SCRIPT>
+  </BODY>
+</HTML>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Canopy.png
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Canopy.png?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Canopy.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Canopy.png.jpeg
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Canopy.png.jpeg?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Canopy.png.jpeg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Clustering.png
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Clustering.png?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Clustering.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Clustering.png.jpeg
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Clustering.png.jpeg?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/Clustering.png.jpeg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/animation.gif
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/sample-clusters-animation.data/animation.gif?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/animation.gif
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/animation.gif.jpeg
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/sample-clusters-animation.data/animation.gif.jpeg?rev=1243022&view=auto
==============================================================================
Binary file - no diff available.

Propchange: mahout/site/new_website/MAHOUT/sample-clusters-animation.data/animation.gif.jpeg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: mahout/site/new_website/MAHOUT/sample-clusters-animation.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/sample-clusters-animation.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/sample-clusters-animation.html (added)
+++ mahout/site/new_website/MAHOUT/sample-clusters-animation.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,181 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/space.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/wiki-content.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/abs.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/tables.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/panels.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/renderer-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/content-types.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/login.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/information-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/layout-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/default-theme.css">
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('https://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;
+      var show = null;
+      var children = null;
+
+      function init() {
+        /* Search form initialization */
+        var form = document.forms['search'];
+        if (form != null) {
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children initialization */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ?
+                   document.all['children'] :
+                   document.getElementById('children');
+        if (children != null) {
+          children.style.display = 'none';
+          show.style.display = 'inline';
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() {
+        children.style.display = 'block';
+        show.style.display = 'none';
+        hide.style.display = 'inline';
+      }
+
+      function hideChildren() {
+        children.style.display = 'none';
+        show.style.display = 'inline';
+        hide.style.display = 'none';
+      }
+    </SCRIPT>
+    <TITLE>Sample Clusters Animation</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" nowrap="">
+          &nbsp;<A href="mahout-wiki.html" title="Apache Mahout">Apache Mahout</A>&nbsp;&gt;&nbsp;<A href="mahout-wiki.html" title="Mahout Wiki">Mahout Wiki</A>&nbsp;&gt;&nbsp;<A href="viewing-result.html" title="Viewing Result">Viewing Result</A>&nbsp;&gt;&nbsp;<A href="visualizing-sample-clusters.html" title="Visualizing Sample Clusters">Visualizing Sample Clusters</A>&nbsp;&gt;&nbsp;<A href="" title="Sample Clusters Animation">Sample Clusters Animation</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">Sample Clusters Animation</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=27825557">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=27825557">Edit Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=27825557">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=27825557">Add Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=27825557">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=27825557">Add News</A>
+        </DIV>
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <H1><A name="SampleClustersAnimation-DemoAnimation"></A>Demo Animation</H1>
+
+<P>This is an animation made from screen caps of all the  o.a.m.clustering.display.Display*.java demo apps.</P>
+
+<P><SPAN class="image-wrap" style=""><IMG src="sample-clusters-animation.data/animation.gif" width="400" style="border: 1px solid black"></SPAN></P>
+
+<P>All of the programs used the same set of random samples. The ellipses show the different clustering algorithms. For more details, see the programs.</P>
+
+<P><EM>This animation was made with <A href="http://www.onyxbits.de/giftedmotion" class="external-link" rel="nofollow">giftedmotion.jar</A>. What a border!</EM></P>
+<H1><A name="SampleClustersAnimation-ScreenCaptures"></A>Screen Captures</H1>
+<H2><A name="SampleClustersAnimation-DisplayClustering"></A>DisplayClustering</H2>
+<P>The original random dataset with semi-illustrative ellipses.</P>
+
+<P><SPAN class="image-wrap" style=""><IMG src="sample-clusters-animation.data/Clustering.png" width="400" style="border: 1px solid black"></SPAN></P>
+
+<H2><A name="SampleClustersAnimation-DisplayCanopy.java"></A>DisplayCanopy.java</H2>
+<P><SPAN class="image-wrap" style=""><IMG src="sample-clusters-animation.data/Canopy.png" width="400" style="border: 1px solid black"></SPAN></P>
+<H2><A name="SampleClustersAnimation-DisplayKMeans.java"></A>DisplayKMeans.java</H2>
+<P>KMeans algorithm with <EM>significance</EM> set to 5% or better.</P>
+
+<P><SPAN class="image-wrap" style=""><IMG src="/confluence/download/attachments/27825557/KMeans_5%25min.png?version=1&modificationDate=1314602016698" width="400" style="border: 1px solid black"></SPAN></P>
+<H2><A name="SampleClustersAnimation-DisplayDirichlet.java"></A>DisplayDirichlet.java</H2>
+<P>Dirichlet Process algorithm, based on a normal distribution, with <EM>significance</EM> set to 5% or better.</P>
+
+<P><SPAN class="image-wrap" style=""><IMG src="/confluence/download/attachments/27825557/DirichletProcess_NormalDistribution_5%25min.png?version=1&modificationDate=1314601998143" width="400" style="border: 1px solid black"></SPAN></P>
+
+<H2><A name="SampleClustersAnimation-DisplayFuzzyKMeans.java"></A>DisplayFuzzyKMeans.java</H2>
+
+<P><SPAN class="image-wrap" style=""><IMG src="/confluence/download/attachments/27825557/FuzzyKMeans_5%25min.png?version=1&modificationDate=1314602005336" width="400" style="border: 1px solid black"></SPAN></P>
+<H2><A name="SampleClustersAnimation-DisplayMeanShift.java"></A>DisplayMeanShift.java</H2>
+<P>MeanShift variant of Canopy algorithm, with significance set to 2%. (In the code it seems like this should be 5%?)<BR>
+<SPAN class="image-wrap" style=""><IMG src="/confluence/download/attachments/27825557/MeanShiftCanopy_2%25min.png?version=1&modificationDate=1314602023506" width="400" style="border: 1px solid black"></SPAN></P>
+
+<H2><A name="SampleClustersAnimation-DisplaySpectralKMeans.java"></A>DisplaySpectralKMeans.java</H2>
+<P>To be added when this works and when someone wants to redo the animation.</P>
+
+
+
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.4.9 Build: 2042 Feb 14, 2011)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+<SCRIPT type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-17359171-1']);
+  _gaq.push(['_setDomainName', 'none']);
+  _gaq.push(['_setAllowLinker', true]);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</SCRIPT>
+  </BODY>
+</HTML>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/spectral-clustering.html
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/spectral-clustering.html?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/spectral-clustering.html (added)
+++ mahout/site/new_website/MAHOUT/spectral-clustering.html Sat Feb 11 10:22:15 2012
@@ -0,0 +1,317 @@
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<HTML>
+  <HEAD>
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/space.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/wiki-content.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/abs.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/menu-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/tables.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/panels.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/master-ie.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/renderer-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/content-types.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/login.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/information-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/layout-macros.css">
+<LINK type="text/css" rel="stylesheet" href="https://cwiki.apache.org/confluence/display/MAHOUT/$stylebase/default-theme.css">
+    <LINK type="text/css" rel="stylesheet" href="resources/space.css">
+    <STYLE type="text/css">
+      .footer {
+        background-image:      url('https://cwiki.apache.org/confluence/images/border/border_bottom.gif');
+        background-repeat:     repeat-x;
+        background-position:   left top;
+        padding-top:           4px;
+        color:                 #666;
+      }
+    </STYLE>
+    <SCRIPT type="text/javascript" language="javascript">
+      var hide = null;
+      var show = null;
+      var children = null;
+
+      function init() {
+        /* Search form initialization */
+        var form = document.forms['search'];
+        if (form != null) {
+          form.elements['domains'].value = location.hostname;
+          form.elements['sitesearch'].value = location.hostname;
+        }
+
+        /* Children initialization */
+        hide = document.getElementById('hide');
+        show = document.getElementById('show');
+        children = document.all != null ?
+                   document.all['children'] :
+                   document.getElementById('children');
+        if (children != null) {
+          children.style.display = 'none';
+          show.style.display = 'inline';
+          hide.style.display = 'none';
+        }
+      }
+
+      function showChildren() {
+        children.style.display = 'block';
+        show.style.display = 'none';
+        hide.style.display = 'inline';
+      }
+
+      function hideChildren() {
+        children.style.display = 'none';
+        show.style.display = 'inline';
+        hide.style.display = 'none';
+      }
+    </SCRIPT>
+    <TITLE>Spectral Clustering</TITLE>
+  <META http-equiv="Content-Type" content="text/html;charset=UTF-8"></HEAD>
+  <BODY onload="init()">
+    <TABLE border="0" cellpadding="2" cellspacing="0" width="100%">
+      <TR class="topBar">
+        <TD align="left" valign="middle" class="topBarDiv" nowrap="">
+          &nbsp;<A href="mahout-wiki.html" title="Apache Mahout">Apache Mahout</A>&nbsp;&gt;&nbsp;<A href="mahout-wiki.html" title="Mahout Wiki">Mahout Wiki</A>&nbsp;&gt;&nbsp;<A href="algorithms.html" title="Algorithms">Algorithms</A>&nbsp;&gt;&nbsp;<A href="" title="Spectral Clustering">Spectral Clustering</A>
+        </TD>
+        <TD align="right" valign="middle" nowrap="">
+          <FORM name="search" action="http://www.google.com/search" method="get">
+            <INPUT type="hidden" name="ie" value="UTF-8">
+            <INPUT type="hidden" name="oe" value="UTF-8">
+            <INPUT type="hidden" name="domains" value="">
+            <INPUT type="hidden" name="sitesearch" value="">
+            <INPUT type="text" name="q" maxlength="255" value="">        
+            <INPUT type="submit" name="btnG" value="Google Search">
+          </FORM>
+        </TD>
+      </TR> 
+    </TABLE>
+
+    <DIV id="PageContent">
+      <DIV class="pageheader" style="padding: 6px 0px 0px 0px;">
+        <!-- We'll enable this once we figure out how to access (and save) the logo resource -->
+        <!--img src="/wiki/images/confluence_logo.gif" style="float: left; margin: 4px 4px 4px 10px;" border="0"-->
+        <DIV style="margin: 0px 10px 0px 10px" class="smalltext">Apache Mahout</DIV>
+        <DIV style="margin: 0px 10px 8px 10px" class="pagetitle">Spectral Clustering</DIV>
+
+        <DIV class="greynavbar" align="right" style="padding: 2px 10px; margin: 0px;">
+          <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=23334397">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/notep_16.gif" height="16" width="16" border="0" align="absmiddle" title="Edit Page"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/editpage.action?pageId=23334397">Edit Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/browse_space.gif" height="16" width="16" border="0" align="absmiddle" title="Browse Space"></A>
+            <A href="https://cwiki.apache.org/confluence/pages/listpages.action?key=MAHOUT">Browse Space</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=23334397">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_page_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add Page"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createpage.action?spaceKey=MAHOUT&fromPageId=23334397">Add Page</A>
+          &nbsp;
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=23334397">
+            <IMG src="https://cwiki.apache.org/confluence/images/icons/add_blogentry_16.gif" height="16" width="16" border="0" align="absmiddle" title="Add News"></A>
+          <A href="https://cwiki.apache.org/confluence/pages/createblogpost.action?spaceKey=MAHOUT&fromPageId=23334397">Add News</A>
+        </DIV>
+      </DIV>
+
+      <DIV class="pagecontent">
+        <DIV class="wiki-content">
+          <P>Spectral clustering, a more powerful and specialized algorithm (compared to K-means), derives its name from spectral analysis of a graph, which is how the data are represented. Each object to be clustered can initially be represented as an <EM>n</EM>&#45;dimensional numeric vector, but the difference with this algorithm is that there must also be some method for performing a comparison between each object and expressing this comparison as a scalar.</P>
+
+<P>This <EM>n</EM> by <EM>n</EM> comparison of all objects with all others forms the <EM>affinity</EM> matrix, which can be intuitively thought of as a rough representation of an underlying undirected, weighted, and fully-connected graph whose edges express the relative relationships, or affinities, between each pair of objects in the original data. This affinity matrix forms the basis from which the two spectral clustering algorithms operate.</P>
+
+<P>The equation by which the affinities are calculated can vary depending on the user's circumstances; typically, the equation takes the form of:</P>
+
+<P>exp( &minus;<EM>d</EM><SUP>2</SUP> / <EM>c</EM> )</P>
+
+<P>where <EM>d</EM> is the Euclidean distance between a pair of points, and <EM>c</EM> is a scaling factor. <EM>c</EM> is often calculated relative to a <EM>k</EM>&#45;neighborhood of closest points to the current point; all other affinities are set to 0 outside of the neighborhood. Again, this formula can vary depending on the situation (e.g. a fully-connected graph would ignore the <EM>k</EM>&#45;neighborhood and calculate affinities for all pairs of points).</P>
+
+<P><A href="http://spectrallyclustered.wordpress.com/2010/05/27/intro-and-spectral-clustering-101/" class="external-link" rel="nofollow">Full overview on spectral clustering</A></P>
+
+<H2><A name="SpectralClustering-KMeansSpectralClustering"></A>K-Means Spectral Clustering</H2>
+
+<H3><A name="SpectralClustering-Overview"></A>Overview</H3>
+
+<P>This consists of a few basic steps of generalized spectral clustering, followed by standard k-means clustering over the intermediate results. Again, this process begins with an affinity matrix <B>A</B> - whether or not it is fully-connected depends on the user's need.</P>
+
+<P><B>A</B> is then transformed into a pseudo-Laplacian matrix via a multiplication with a diagonal matrix whose entries consist of the sums of the rows of <B>A</B>. The sums are modified to be the inverse square root of their original values. The final operation looks something like:</P>
+
+<P>L = D^{-1/2} A D^{-1/2}</P>
+
+<P><B>L</B> has some properties that are of interest to us; most importantly, while it is symmetric like <B>A</B>, it has a more stable eigen-decomposition. <B>L</B> is decomposed into its constituent eigenvectors and corresponding eigenvalues (though the latter will not be needed for future calculations); the matrix of eigenvectors, <B>U</B>, is what we are now interested in.</P>
+
+<P>Assuming <B>U</B> is a column matrix (the eigenvectors comprise the columns), then we will now use the <EM>rows</EM> of <B>U</B> as proxy data for the original data points. We will run each row through standard K-means clustering, and the label that each proxy point receives will be transparently assigned to the corresponding original data point, resulting in the final clustering assignments.</P>
+
+<P><A href="http://spectrallyclustered.wordpress.com/2010/06/05/sprint-1-k-means-spectral-clustering/" class="external-link" rel="nofollow">Full overview on k-means spectral clustering</A></P>
+
+<H3><A name="SpectralClustering-Implementation"></A>Implementation</H3>
+
+<P>The Mahout implementation consists of a single driver - SpectralKMeansDriver - calling upon several common utilities. The driver performs the operations in sequential fashion: reading in and constructing the affinity matrix, building the diagonal matrix, building the pseudo-Laplacian and decomposing it, and clustering the components.</P>
+
+<P>The affinity matrix input is the most important part of this algorithm. It consists of text files which follow a specific format: that of a weighted, undirected graph. In order to represent a graph in text files, each line of a text file represents a single directional edge between two nodes. There are three comma-separated values on the line. The first number indicates the source node, the second is the destination node, and the third is the weight. For example:</P>
+
+<P>0, 1, 2.5</P>
+
+<P>would indicate the directional edge from node 0 to node 1 has a weight of 2.5. <B>Please note: as of 8/16/2010, Eigencuts assumes the affinity matrix is symmetric, hence there should be a corresponding line in the text file of: 1, 0, 2.5.</B> Also, each node should be an integer value.</P>
+
+<P>M/R jobs written for SpectralKMeans:</P>
+<UL>
+	<LI>AffinityMatrixInputJob (reads the raw input into a DistributedRowMatrix)</LI>
+	<LI>MatrixDiagonalizeJob (constructs the diagonal matrix)</LI>
+	<LI>UnitVectorizerJob (converts the eigenvector matrix <B>U</B> to unit rows)</LI>
+	<LI>VectorMatrixMultiplicationJob (multiplies <B>D</B> with <B>A</B>)</LI>
+</UL>
+
+
+<P>M/R jobs already in Mahout that were used:</P>
+<UL>
+	<LI>DistributedRowMatrix.transpose()</LI>
+	<LI>DistributedLanczosSolver</LI>
+	<LI>EigenVerifierJob</LI>
+	<LI>KMeansDriver</LI>
+</UL>
+
+
+<H2><A name="SpectralClustering-EigencutsSpectralClustering"></A>Eigencuts Spectral Clustering</H2>
+
+<H3><A name="SpectralClustering-Overview"></A>Overview</H3>
+
+<P>Intuitively, Eigencuts can be thought of as part 2 of the K-means algorithm, in that it performs the same initial steps up until the k-means clustering. The algorithm uses the same affinity matrix <B>A</B>, constructs the same diagonal matrix <B>D</B>, performs the same multiplication to create the pseudo-Laplacian <B>L</B>, and conducts an eigen-decomposition on <B>L</B> to obtain the eigenvectors and eigenvalues. But here is where the two algorithms diverge.</P>
+
+<P>For each eigenvector, we wish to determine how stable its flow of probability is within the underlying graph of the original data. Intuitively, this is intrinsically related to the min-cut, max-flow problem of finding bottlenecks: if we perturb the flow rate on a specific edge, and overall the flow is stable, then we can conclude that this edge was not a bottleneck. If, however, perturbing an edge significantly alters the overall flow, we know this edge's eigenflow is very unstable and is a bottleneck.</P>
+
+<P>We have an <A href="http://spectrallyclustered.files.wordpress.com/2010/07/sensitivityequation.png" class="external-link" rel="nofollow">explicit form</A> of this &quot;sensitivity&quot; calculation (<A href="http://spectrallyclustered.wordpress.com/2010/07/15/sprint-3-three-last-mr-tasks/" class="external-link" rel="nofollow">full post here, under &quot;computing sensitivities&quot;</A>). The next step is called &quot;non-maximal suppression&quot;, which effectively means we will ignore any of the calculated sensitivities for which there exists another more negative sensitivity in the same neighborhood as the current one, effectively &quot;suppressing&quot; it.</P>
+
+<P>This non-maximal suppression then plays a role in the final affinity-cutting step, where we &quot;cut&quot; the affinity (set to 0) between any two nodes (effectively destroying the edge between them) for which the sensitivity calculated at that edge passes some threshold, <EM>and</EM> for which the sensitivity was <EM>not</EM> suppressed in the previous step.</P>
+
+<P>Once the cutting has been completed, the process loops upon itself (starting with the recalculation of <B>D</B> using the modified <B>A</B>) until no new cuts in <B>A</B> are made in the final step.</P>
+
+<P><A href="http://spectrallyclustered.wordpress.com/2010/07/06/sprint-3-introduction-to-eigencuts/" class="external-link" rel="nofollow">Full overview on Eigencuts spectral clustering</A></P>
+
+<H3><A name="SpectralClustering-Implementation"></A>Implementation</H3>
+
+<P>Since the first half of Eigencuts uses the same calculations as Spectral K-means, it uses the same common M/R tasks, both those specific to spectral clustering, as well as those general to Mahout. Unlike SpectralKMeans, however, there are no DistributedRowMatrix-specific operations performed, and hence there is no need for the data type at all; Mahout Vectors are used heavily instead.</P>
+
+<P>Once the initial affinity matrix is constructed, there is a loop within the EigencutsDriver over the calculation of <B>D</B>, the creation of <B>L</B> and its eigen-decomposition, the calculation of the sensitivities, and the actual affinity cuts, such that the loop terminates only when no cuts are made to the affinity matrix <B>A</B>. The final matrix <B>A</B> will then be representative of a graph structure whose data points exist in intra-connected clusters.</P>
+
+<P>M/R tasks specific to Eigencuts:</P>
+<UL>
+	<LI>EigencutsSensitivityJob (calculates the perturbation effects on edge weights)</LI>
+	<LI>EigencutsAffinityCutsJob (sets edge weights to 0)</LI>
+</UL>
+
+
+<P>M/R tasks within spectral clustering:</P>
+<UL>
+	<LI>AffinityMatrixInputJob (reads the raw input into a DistributedRowMatrix)</LI>
+	<LI>MatrixDiagonalizeJob (constructs the diagonal matrix)</LI>
+	<LI>VectorMatrixMultiplicationJob (multiplies <B>D</B> with <B>A</B>)</LI>
+</UL>
+
+
+<P>M/R tasks general to Mahout:</P>
+<UL>
+	<LI>DistributedLanczosSolver</LI>
+	<LI>EigenVerifierJob</LI>
+</UL>
+
+
+<H2><A name="SpectralClustering-Quickstart"></A>Quickstart</H2>
+
+<P>As noted before, the data for both these algorithms - the affinity matrix - is required to be symmetric. As of the first release, there is no built-in mechanism in Mahout for generating the affinities from raw data, as the formula this follows varies depending on the user's need, so it is left as an exercise to the user to generate the affinities prior to using these algorithms.</P>
+
+<P>The affinity input should follow the standard format for textual representation of a graph, namely:</P>
+
+<P>node_i, node_j, value_ij</P>
+
+<P>For example, the following 3x3 affinity matrix:</P>
+
+<DIV class="table-wrap">
+<TABLE class="confluenceTable"><TBODY>
+<TR>
+<TD class="confluenceTd">0.0</TD>
+<TD class="confluenceTd">0.8</TD>
+<TD class="confluenceTd">0.5</TD>
+</TR>
+<TR>
+<TD class="confluenceTd">0.8</TD>
+<TD class="confluenceTd">0.0</TD>
+<TD class="confluenceTd">0.9</TD>
+</TR>
+<TR>
+<TD class="confluenceTd">0.5</TD>
+<TD class="confluenceTd">0.9</TD>
+<TD class="confluenceTd">0.0</TD>
+</TR>
+</TBODY></TABLE>
+</DIV>
+
+
+<P>would have the following textual input format:</P>
+
+<P>0, 0, 0<BR>
+0, 1, 0.8<BR>
+0, 2, 0.5<BR>
+1, 0, 0.8<BR>
+1, 1, 0<BR>
+1, 2, 0.9<BR>
+2, 0, 0.5<BR>
+2, 1, 0.9<BR>
+2, 2, 0</P>
+
+<P>Then simply invoke Spectral K-means or Eigencuts, using the input directory and setting the other parameters as necessary (e.g. &quot;k&quot; for K-means, &quot;beta&quot; for Eigencuts, etc).</P>
+
+<H2><A name="SpectralClustering-Examples"></A>Examples</H2>
+
+<P><B>NOTE: Am still waiting for Carnegie Mellon/Univ. of Pittsburgh approval for official data set</B></P>
+
+<P>For these algorithms, it is useful to have a viable example, so I have created a small but effective synthetic data set to show how these algorithms operate. The raw data was generated synthetically and <A href="http://dl.dropbox.com/u/1377610/rawdata.csv" class="external-link" rel="nofollow">can be viewed here</A>. It consists of 450 two-dimensional points drawn from 3 separate Gaussian distributions, each with its own mean and standard deviation (<A href="http://spectrallyclustered.files.wordpress.com/2010/07/clusters.png" class="external-link" rel="nofollow">here is an image of the plotted points</A>). This same data in affinity matrix form looks like this: <A href="http://dl.dropbox.com/u/1377610/affinity.csv" class="external-link" rel="nofollow">view the affinity data set</A></P>
+
+<P>In order to run the program, then, for spectral k-means:</P>
+
+<P>bin/mahout spectralkmeans -i /path/to/directory/with/affinity/matrix -o /output/path -k 3 -d 450</P>
+
+<P>and for eigencuts:</P>
+
+<P>bin/mahout eigencuts -i /path/to/directory/with/affinity/matrix -o /output/path -b 2 -d 450</P>
+
+<P>In both cases, the &quot;-d&quot; flag refers to the dimensionality of the affinity matrix; since there are 450 points, this value will be 450. In spectral k-means, the &quot;-k&quot; is the number of clusters to estimate. In Eigencuts, the &quot;-b&quot; refers to an initial estimate on the half-life of the flow of probability in the graph; higher estimates correspond to fewer clusters.</P>
+
+<P>Spectral k-means will eventually yield the clustering results, and there should only be a single mistake: point 54. This is because that particular point is in between the two left-hand clusters in the image, and so has high affinities with both clusters.</P>
+
+<P><A href="http://spectrallyclustered.wordpress.com/2010/07/14/sprint-3-quick-update/" class="external-link" rel="nofollow">Full overview of example here</A></P>
+
+<H3><A name="SpectralClustering-Resources"></A>Resources</H3>
+
+<UL>
+	<LI><A href="http://spectrallyclustered.wordpress.com/2010/05/27/intro-and-spectral-clustering-101/" class="external-link" rel="nofollow">http://spectrallyclustered.wordpress.com/2010/05/27/intro-and-spectral-clustering-101/</A></LI>
+	<LI><A href="http://www.stanford.edu/class/ee378B/papers/luxburg-spectral.pdf" class="external-link" rel="nofollow">http://www.stanford.edu/class/ee378B/papers/luxburg-spectral.pdf</A></LI>
+	<LI><A href="http://en.wikipedia.org/wiki/Laplacian_matrix" class="external-link" rel="nofollow">http://en.wikipedia.org/wiki/Laplacian_matrix</A></LI>
+	<LI><A href="http://en.wikipedia.org/wiki/Cluster_analysis#Spectral_clustering" class="external-link" rel="nofollow">http://en.wikipedia.org/wiki/Cluster_analysis#Spectral_clustering</A></LI>
+</UL>
+
+        </DIV>
+
+        
+      </DIV>
+    </DIV>
+    <DIV class="footer">
+      Generated by
+      <A href="http://www.atlassian.com/confluence/">Atlassian Confluence</A> (Version: 3.4.9 Build: 2042 Feb 14, 2011)
+      <A href="http://could.it/autoexport/">Auto Export Plugin</A> (Version: 1.0.0-dkulp)
+    </DIV>
+<SCRIPT type="text/javascript">
+
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-17359171-1']);
+  _gaq.push(['_setDomainName', 'none']);
+  _gaq.push(['_setAllowLinker', true]);
+  _gaq.push(['_trackPageview']);
+
+  (function() {
+    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+  })();
+
+</SCRIPT>
+  </BODY>
+</HTML>
\ No newline at end of file

Added: mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.lyx
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.lyx?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.lyx (added)
+++ mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.lyx Sat Feb 11 10:22:15 2012
@@ -0,0 +1,917 @@
+#LyX 2.0 created this file. For more info see http://www.lyx.org/
+\lyxformat 413
+\begin_document
+\begin_header
+\textclass article
+\use_default_options true
+\maintain_unincluded_children false
+\language english
+\language_package default
+\inputencoding auto
+\fontencoding global
+\font_roman lmodern
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\use_non_tex_fonts false
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\default_output_format default
+\output_sync 0
+\bibtex_command default
+\index_command default
+\paperfontsize default
+\spacing single
+\use_hyperref true
+\pdf_bookmarks true
+\pdf_bookmarksnumbered false
+\pdf_bookmarksopen false
+\pdf_bookmarksopenlevel 1
+\pdf_breaklinks false
+\pdf_pdfborder false
+\pdf_colorlinks false
+\pdf_backref false
+\pdf_pdfusetitle true
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\use_mhchem 1
+\use_mathdots 1
+\cite_engine basic
+\use_bibtopic false
+\use_indices false
+\paperorientation portrait
+\suppress_date true
+\use_refstyle 0
+\index Index
+\shortcut idx
+\color #008000
+\end_index
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip bigskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes false
+\output_changes false
+\html_math_output 0
+\html_css_as_file 0
+\html_be_strict false
+\end_header
+
+\begin_body
+
+\begin_layout Title
+Command Line Interface, 
+\begin_inset Newline newline
+\end_inset
+
+Stochastic SVD
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Dmitriy Lyubimov, dlyubimov at apache dot org
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Overview.
+\end_layout
+
+\begin_layout Standard
+Stochastic SVD method in Mahout produces reduced rank Singular Value Decompositi
+on output in its strict mathematical definition:
+\begin_inset Formula 
+\[
+\mathbf{A}=\mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^{\top},
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+i.
+ e.
+ it creates outputs for matrices 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+, 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ and 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+, each of which may be requested individually.
+ The desired rank of decomposition, henceforth denoted as 
+\begin_inset Formula $k\in\mathbb{N}_{1}$
+\end_inset
+
+, is a parameter of the algorithm.
+ The singular values inside diagonal matrix 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ satisfy 
+\begin_inset Formula $\sigma_{i+1}\leq\sigma_{i}$
+\end_inset
+
+ 
+\begin_inset Formula $\forall i\in\left[1,k-1\right]$
+\end_inset
+
+, i.e.
+ sorted from biggest to smallest.
+ Cases of rank deficiency 
+\begin_inset Formula $\mbox{rank}\left(\mathbf{A}\right)<k$
+\end_inset
+
+ are handled by producing 0s in singular value positions once deficiency
+ takes place.
+ 
+\end_layout
+
+\begin_layout Paragraph
+Single space for comparing row-items and column-items.
+\end_layout
+
+\begin_layout Standard
+On top of it, there's an option to present decomposition output in a form
+ of 
+\begin_inset Formula 
+\begin{equation}
+\mathbf{A}=\mathbf{U}_{\sigma}\mathbf{V}_{\sigma}^{\top},\label{eq:baked-sigma}
+\end{equation}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+where one can request 
+\begin_inset Formula $\mathbf{U}_{\sigma}=\mathbf{U}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+ (but not both), 
+\begin_inset Formula $\mathbf{V}_{\sigma}=\mathbf{V}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ (but not both).
+ Here, notation 
+\begin_inset Formula $\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ implies diagonal matrix containing square roots of the singular values:
+ 
+\begin_inset Formula 
+\[
+\boldsymbol{\Sigma}^{0.5}=\left(\begin{matrix}\sqrt{\sigma_{1}} & \cdots & 0\\
+\vdots & \ddots & \vdots\\
+0 & \cdots & \sqrt{\sigma_{k}}
+\end{matrix}\right).
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Original singular values 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ are still produced and saved regardless.
+\end_layout
+
+\begin_layout Standard
+This option is a nod to a common need of comparing actors represented by
+ both input rows and input columns in a common space.
+ E.g.
+ if LSI is performed such that rows are documents and columns are terms
+ then it is possible to compare documents and terms (either existing or fold
+ in new ones) in one common space and perform similarity measurement between
+ a document and a term, rather than computing just term2term or document2documen
+t similarities.
+\end_layout
+
+\begin_layout Paragraph
+Folding in new observations.
+\end_layout
+
+\begin_layout Standard
+It is probably worth mentioning the operation of 
+\begin_inset Quotes eld
+\end_inset
+
+folding in
+\begin_inset Quotes erd
+\end_inset
+
+ new observations in context of this method, since it is often a basis for
+ incremental methods.
+\end_layout
+
+\begin_layout Standard
+If 
+\begin_inset Formula $\tilde{\mathbf{c}}_{r}\,\,\left(\tilde{\mathbf{c}}_{c}\right)$
+\end_inset
+
+ is a new row (column) observation in addition to original input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, then correspondent 
+\begin_inset Quotes eld
+\end_inset
+
+new
+\begin_inset Quotes erd
+\end_inset
+
+ row vectors of 
+\begin_inset Formula $\tilde{\mathbf{U}}$
+\end_inset
+
+ 
+\begin_inset Formula $\left(\tilde{\mathbf{V}}\right)$
+\end_inset
+
+ can be obtained as 
+\begin_inset Formula 
+\begin{eqnarray*}
+\mathbf{\tilde{u}} & = & \boldsymbol{\Sigma}^{-1}\mathbf{V}^{\top}\tilde{\mathbf{c}}_{r},\\
+\tilde{\mathbf{v}} & = & \boldsymbol{\Sigma}^{-1}\mathbf{U}^{\top}\tilde{\mathbf{c}}_{c}.
+\end{eqnarray*}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Similarly, for the case 
+\begin_inset CommandInset ref
+LatexCommand formatted
+reference "eq:baked-sigma"
+
+\end_inset
+
+ folding in new observations into rows of 
+\begin_inset Formula $\tilde{\mathbf{U}}_{\sigma}\,\,\left(\tilde{\mathbf{V}}_{\sigma}\right)$
+\end_inset
+
+ would look like
+\begin_inset Formula 
+\begin{eqnarray*}
+\mathbf{\tilde{u}}_{\sigma} & = & \mathbf{V}_{\sigma}^{\top}\tilde{\mathbf{c}}_{r},\\
+\tilde{\mathbf{v}}_{\sigma} & = & \mathbf{U}_{\sigma}^{\top}\tilde{\mathbf{c}}_{c}.
+\end{eqnarray*}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Thus, new rows can be added to matrices denoted as 
+\begin_inset Formula $\tilde{\mathbf{U}}\,\,\left(\tilde{\mathbf{V}}\right)$
+\end_inset
+
+ corresponding to new observations as new observations become available,
+ i.e.
+ incrementally.
+ Given that new observations are usually moderately sparse vectors, it might
+ be feasible to do fold-in in real time or almost real time, assuming proper
+ fast row-wise indexing of 
+\begin_inset Formula $\mathbf{U}\,\left(\mathbf{V}\right)$
+\end_inset
+
+ exists (e.g.
+ using a batch request to an HBase table containing rows of 
+\begin_inset Formula $\mathbf{U}\,\left(\mathbf{V}\right)$
+\end_inset
+
+).
+ However, since operation of folding in new observations doesn't change
+ original decomposition and its spaces, such new observations cannot be
+ considered 'training' examples.
+ Typically, from time to time accumulated new observations can be added
+ to original input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ and the whole decomposition can be recomputed again.
+\end_layout
+
+\begin_layout Standard
+Common applications for SVD include Latent Semantic Analysis (LSA), Principal
+ Component Analysis (PCA), dimensionality reduction and others.
+\end_layout
+
+\begin_layout Section
+File formats
+\end_layout
+
+\begin_layout Standard
+Input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, as well as outputs 
+\begin_inset Formula $\mathbf{U}\left(\mathbf{U}_{\sigma}\right),\,\mathbf{V}\left(\mathbf{V}_{\sigma}\right)$
+\end_inset
+
+, are in Mahout's Distributed Row Matrix format, i.e.
+ set of sequence files where value is of 
+\family typewriter
+VectorWritable
+\family default
+ type.
+ As far as keys are concerned, rows of 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ may be keyed (identified) by any 
+\family typewriter
+Writable 
+\family default
+(for as long as it is instantiable thru a default constructor).
+ That, among other things, means that this method can be applied directly
+ on the output of 
+\family typewriter
+seq2sparse 
+\family default
+where keys are of 
+\family typewriter
+Text
+\family default
+ type
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+(TODO: re-verify)
+\end_layout
+
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Definition of output 
+\begin_inset Formula $\mathbf{U}\,\,\left(\mathbf{U}_{\sigma}\right)$
+\end_inset
+
+ is identical to definition of the input matrix 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, and the keys of corresponding rows in 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ are copied to corresponding rows of output 
+\begin_inset Formula $\mathbf{U}\,\,\left(\mathbf{U}_{\sigma}\right)$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Definition of output 
+\begin_inset Formula $\mathbf{V}\,\,\left(\mathbf{V}_{\sigma}\right)$
+\end_inset
+
+ is always sequence file(s) of 
+\family typewriter
+(IntWritable,
+\begin_inset Newline newline
+\end_inset
+
+
+\family default
+ 
+\family typewriter
+VectorWritable)
+\family default
+ where key corresponds to a column index of the input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Output of 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ is encoded by a single output file with a single vector value 
+\family typewriter
+(VectorWritable)
+\family default
+ with main diagonal entries of 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ aka singular values 
+\begin_inset Formula $\left(\begin{matrix}\sigma_{1} & \cdots & \sigma_{k}\end{matrix}\right)$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+Usage
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+As of Mahout 0.6 trunk
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout LyX-Code
+mahout ssvd <options>
+\end_layout
+
+\begin_layout Paragraph
+Options.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-k,
+\begin_inset space ~
+\end_inset
+
+--rank
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (required): the requested SVD rank (minimum number of singular values and
+ dimensions in U, V matrices) 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-p,
+\begin_inset space ~
+\end_inset
+
+--oversampling
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (required): stochastic SVD oversampling.
+ The value of 
+\begin_inset Formula $k+p$
+\end_inset
+
+ directly impacts running time and memory requirements.
+ 
+\series bold
+\emph on
+k+p=500 is probably more than reasonable
+\series default
+\emph default
+.
+ Typically 
+\begin_inset Formula $k+p$
+\end_inset
+
+ is taken within range 50...200.
+ 
+\begin_inset Formula $p$
+\end_inset
+
+ doesn't seem to have to be very significant (perhaps 5..10).
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-q,
+\begin_inset space ~
+\end_inset
+
+--powerIter
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 0): number of power iterations to perform.
+ This helps fight data noise and improves precision significantly more
+ than just increasing 
+\begin_inset Formula $p$
+\end_inset
+
+.
+ Each additional power iteration adds 2 more steps (map/reduce + map-only).
+ Experimental data suggests using 
+\begin_inset Formula $q=1$
+\end_inset
+
+ already produces quite good results which are hard to improve upon by
+ much.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-r,
+\begin_inset space ~
+\end_inset
+
+--blockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default 10,000): the number of rows of source matrix for block
+ computations.
+ Taller blocking causes more memory use but produces fewer blocks and therefore
+ somewhat better running times.
+ The most optimal mode from the running time point of view should be 1 block
+ per 1 mapper.
+ 
+\emph on
+This cannot be less than k+p.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-oh,
+\begin_inset space ~
+\end_inset
+
+--outerProdBlockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 10,000): the block height in multiplication operations.
+ With extremely sparse matrices, increasing that parameter will lead to better
+ performance by reducing computational pressure on the shuffle and sort
+ and grouping sparse records together.
+ However, setting it too high may cause larger block values to be formed and
+ written and may cause OOM.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Matrix multiplications are the biggest bottleneck of this method as of the
+ time of this writing.
+ Tweaking this parameter for bigger blocks will help tremendously but
+ may cause OOM and/or GC thrashing which will again either decrease performance
+ dramatically or even derail the whole job.
+ So a balance must be struck here.
+ Default is good for dense inputs and safe for sparse inputs.
+\end_layout
+
+\end_inset
+
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-s,
+\begin_inset space ~
+\end_inset
+
+--minSplitSize
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default: use Hadoop's default): minimum split size to use in
+ mappers reading 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ input.
+ 
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\emph on
+As of this day, I haven't heard of a case where somebody would actually
+ have to use this option and actually increase split size and how it has
+ played out.
+ So this option is experimental.
+ 
+\end_layout
+
+\begin_layout Plain Layout
+Since in this version projection block formation happens in mappers, for
+ a sufficiently wide input matrix the algorithm may not be able to read
+ minimum 
+\begin_inset Formula $k+p$
+\end_inset
+
+ rows and form a block of minimum height required, so in that case the job
+ would bail out at the very first mapping step.
+ If this happens, one of the recourses available is to force increase in
+ the MapReduce split size using SequenceFileInputFormat.setMinSplitSize()
+ property.
+ Increasing this significantly over HDFS block size may result in network
+ IO to mappers.
+ Another caveat is that one sometimes does not want too many mappers because
+ it may in fact increase time of the computation.
+ Consequently, this option should probably be left alone unless one has
+ significant amount of mappers (as in thousands of map tasks) at which point
+ reducing amount of mappers may actually improve the throughput (just a
+ guesstimate at this point).
+ 
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--computeU
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default true).
+ Request computation of the U matrix 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--computeV
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default true).
+ Request computation of the V matrix 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--vHalfSigma
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default: false): compute 
+\begin_inset Formula $\mathbf{V}_{\sigma}=\mathbf{V}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ (see overview for explanation).
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--uHalfSigma
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default: false): compute 
+\begin_inset Formula $\mathbf{U}_{\sigma}=\mathbf{U}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--reduceTasks
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ optional.
+ The number of reducers to use (where applicable): depends on the size of
+ the hadoop cluster.
+ At this point it could also be overridden by a standard hadoop property
+ using -D option
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+TODO: reverify
+\end_layout
+
+\end_inset
+
+.
+
+\series bold
+\emph on
+ Probably always needs to be specified as by default Hadoop would set it
+ to 1, which is certainly far below the cluster capacity.
+ 
+\series default
+\emph default
+Recommended value for this option ~ 95% or ~190% of available reducer capacity
+ to allow for opportunistic executions.
+\end_layout
+
+\begin_layout Paragraph
+Standard Mahout options.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--input
+\begin_inset space ~
+\end_inset
+
+<glob>
+\family default
+\size default
+ HDFS glob specification where the DistributedRowMatrix input is to be found.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--output
+\begin_inset space ~
+\end_inset
+
+<hdfs-dir>
+\family default
+\size default
+ non-existent hdfs directory where to output 
+\begin_inset Formula $\mathbf{U},\mathbf{V}$
+\end_inset
+
+ and 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ (singular values) files.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--tempDir
+\begin_inset space ~
+\end_inset
+
+<temp-dir>
+\family default
+\size default
+ temporary dir where to store intermediate files (cleaned up upon normal
+ completion).
+ This is a standard Mahout optional parameter.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-ow,
+\begin_inset space ~
+\end_inset
+
+--overwrite
+\size default
+ 
+\family default
+overwrite output if exists.
+\end_layout
+
+\begin_layout Section
+Embedded use
+\end_layout
+
+\begin_layout Standard
+It is possible to instantiate and use 
+\family typewriter
+SSVDSolver
+\family default
+ class in embedded fashion in Hadoop-enabled applications.
+ This class would have getter and setter methods for each option available
+ via command line.
+ See javadoc for details.
+\end_layout
+
+\end_body
+\end_document

Added: mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.pdf
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.pdf?rev=1243022&view=auto
==============================================================================
Files mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.pdf (added) and mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/SSVD-CLI.pdf Sat Feb 11 10:22:15 2012 differ

Added: mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/ssvd.R
URL: http://svn.apache.org/viewvc/mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/ssvd.R?rev=1243022&view=auto
==============================================================================
--- mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/ssvd.R (added)
+++ mahout/site/new_website/MAHOUT/stochastic-singular-value-decomposition.data/ssvd.R Sat Feb 11 10:22:15 2012
@@ -0,0 +1,135 @@
+
+# Standard stochastic SVD (SSVD) of a matrix via random projection.
+#
+# Arguments:
+#   x      input, coerced with as.matrix(); m rows by n columns.
+#   k      requested rank; must satisfy 1 <= k <= min(m, n).
+#   p      oversampling; capped so that k + p never exceeds min(m, n).
+#   qiter  number of power iterations to run (0 means none).
+#
+# Returns a list with components
+#   svalues  the k leading singular value estimates,
+#   u        m x k matrix of left singular vector estimates,
+#   v        n x k matrix of right singular vector estimates.
+# u and v are always matrices, including when k = 1.
+ssvd.svd <- function(x, k, p = 25, qiter = 0) {
+  a <- as.matrix(x)
+  m <- nrow(a)
+  n <- ncol(a)
+  stopifnot(k >= 1, k <= min(m, n), p >= 0, qiter >= 0)
+
+  p <- min(min(m, n) - k, p)
+  r <- k + p
+
+  # random projection of the input onto r dimensions
+  omega <- matrix(rnorm(r * n), nrow = n, ncol = r)
+  y <- a %*% omega
+
+  q <- qr.Q(qr(y))
+  b <- t(q) %*% a
+
+  # power iterations
+  # seq_len() rather than 1:qiter: 1:0 is c(1, 0), which would run the
+  # loop body twice when no power iterations were requested.
+  for (i in seq_len(qiter)) {
+    y <- a %*% t(b)
+    q <- qr.Q(qr(y))
+    b <- t(q) %*% a
+  }
+
+  bbt <- b %*% t(b)
+  e <- eigen(bbt, symmetric = TRUE)
+
+  # Eigenvalues of B B' are the squared singular values of B, so the
+  # singular values are their square roots. Only the k leading pairs are
+  # kept; trailing eigenvalues are not touched.
+  top <- seq_len(k)
+  sigma <- sqrt(e$values[top])
+  uhat <- e$vectors[, top, drop = FALSE]
+
+  res <- list()
+  res$svalues <- sigma
+  res$u <- q %*% uhat
+  # V = B' Uhat Sigma^-1: scale by 1/sigma, not by 1/sigma^2.
+  # nrow = k keeps diag() from building an identity matrix when k = 1.
+  res$v <- t(b) %*% uhat %*% diag(1 / sigma, nrow = k)
+
+  res
+}
+
+
+
+#############
+## SSVD with PCA options: the decomposition is corrected for the column
+## means of the input (xi) without forming the mean-subtracted matrix.
+#
+# Arguments:
+#   x      input, coerced with as.matrix(); m rows by n columns.
+#   k      requested rank; must satisfy 1 <= k <= min(m, n).
+#   p      oversampling; capped so that k + p never exceeds min(m, n).
+#   qiter  number of power iterations to run (0 means none).
+#   fixY   if TRUE, also apply the mean correction to the projection Y.
+#
+# Returns a list with components
+#   svalues  the k leading singular value estimates,
+#   u        m x k matrix of left singular vector estimates,
+#   v        n x k matrix of right singular vector estimates.
+# u and v are always matrices, including when k = 1.
+ssvd.cpca <- function(x, k, p = 25, qiter = 0, fixY = TRUE) {
+  a <- as.matrix(x)
+  m <- nrow(a)
+  n <- ncol(a)
+  stopifnot(k >= 1, k <= min(m, n), p >= 0, qiter >= 0)
+
+  p <- min(min(m, n) - k, p)
+  r <- k + p
+
+  # compute column means xi
+  xi <- colMeans(a)
+
+  omega <- matrix(rnorm(r * n), nrow = n, ncol = r)
+  y <- a %*% omega
+
+  # fix y: subtract s_o[i] from every entry of column i
+  if (fixY) {
+    #debug
+    cat("fixing Y...\n")
+
+    s_o <- t(omega) %*% cbind(xi)
+    y <- sweep(y, 2, as.vector(s_o))
+  }
+
+  q <- qr.Q(qr(y))
+  b <- t(q) %*% a
+
+  # compute sum of q rows
+  s_q <- cbind(colSums(q))
+
+  # compute B*xi
+  # of course in MR implementation
+  # it will be collected as sums of ( B[,i] * xi[i] ) and reduced after.
+  s_b <- b %*% cbind(xi)
+
+  # power iterations
+  # seq_len() rather than 1:qiter: 1:0 is c(1, 0), which would run the
+  # loop body twice when no power iterations were requested.
+  for (iter in seq_len(qiter)) {
+    # fix b
+    b <- b - s_q %*% rbind(xi)
+
+    y <- a %*% t(b)
+
+    # fix y: subtract s_b[i] (computed before b was fixed) from column i
+    if (fixY) {
+      y <- sweep(y, 2, as.vector(s_b))
+    }
+
+    q <- qr.Q(qr(y))
+    b <- t(q) %*% a
+
+    # recompute s_{q}
+    s_q <- cbind(colSums(q))
+
+    # recompute s_{b}
+    s_b <- b %*% cbind(xi)
+  }
+
+  # C is the outer product of s_q and s_b
+  C <- s_q %*% t(s_b)
+
+  # fixing BB': algebraically this is (b - s_q xi') (b - s_q xi')'
+  bbt <- b %*% t(b) - C - t(C) + sum(xi * xi) * (s_q %*% t(s_q))
+
+  e <- eigen(bbt, symmetric = TRUE)
+
+  # Eigenvalues of the fixed BB' are squared singular values, so the
+  # singular values are their square roots. Only the k leading pairs are
+  # kept; trailing eigenvalues are not touched.
+  top <- seq_len(k)
+  sigma <- sqrt(e$values[top])
+  uhat <- e$vectors[, top, drop = FALSE]
+
+  res <- list()
+  res$svalues <- sigma
+  res$u <- q %*% uhat
+  # V = (fixed B)' Uhat Sigma^-1: scale by 1/sigma, not by 1/sigma^2.
+  # nrow = k keeps diag() from building an identity matrix when k = 1.
+  res$v <- t(b - s_q %*% rbind(xi)) %*% uhat %*% diag(1 / sigma, nrow = k)
+
+  res
+}
+
+
+
+
+
+