You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by bu...@apache.org on 2015/03/19 22:21:47 UTC

svn commit: r944380 [17/24] - in /websites/staging/mahout/trunk/content: ./ developers/ general/ users/basics/ users/classification/ users/clustering/ users/dim-reduction/ users/mapreduce/ users/mapreduce/classification/ users/mapreduce/clustering/ use...

Added: websites/staging/mahout/trunk/content/users/mapreduce/clustering/lda-commandline.html
==============================================================================
--- websites/staging/mahout/trunk/content/users/mapreduce/clustering/lda-commandline.html (added)
+++ websites/staging/mahout/trunk/content/users/mapreduce/clustering/lda-commandline.html Thu Mar 19 21:21:45 2015
@@ -0,0 +1,351 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data framework, data integration,
+        data matching, data mining, data mining algorithms, data mining analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning methods,
+        learning techniques, lucene, machine learning, machine translation, mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data mining">
+  <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico">
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : 
+        'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+	
+	  var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/overview.html"></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org" name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/images/mahout-lupe.png" alt="Search" />
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development Project</a> -->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+              <li><a href="/">Home</a></li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">General<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                  <li><a href="/general/release-notes.html">Release Notes</a> 
+                  <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a>
+                  <li><a href="/general/professional-support.html">Professional Support</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference Reading</a>
+                  <li><a href="/general/faq.html">FAQ</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Legal</li>
+                  <li><a href="http://www.apache.org/licenses/">License</a></li>
+                  <li><a href="http://www.apache.org/security/">Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer resources</a></li>
+                  <li><a href="/developers/version-control.html">Version control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue tracker</a></li>
+                  <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check list</a></li>
+                  <li><a href="/developers/github.html">Handling Github PRs</a></li>
+                  <li><a href="/developers/how-to-release.html">How to release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Basics<b class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Quickstart</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Working with text</li>
+                  <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a>
+                  <li><a href="/users/basics/collocations.html">Collocations</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Dimensionality reduction</li>
+                  <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li>
+                  <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Topic Models</li>      
+                  <li><a href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li>
+                </ul>
+                 </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Spark<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark Bindings Overview</a></li>
+                  <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li>
+			      <li class="divider"></li>
+                  <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                </ul>
+               </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Classification<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li>
+                  <li><a href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov Models</a></li>
+                  <li><a href="/users/mapreduce/classification/logistic-regression.html">Logistic Regression</a></li>
+                  <li><a href="/users/mapreduce/classification/partial-implementation.html">Random Forest</a></li>
+
+                  <li class="divider"></li>
+                  <li class="nav-header">Examples</li>
+                  <li><a href="/users/mapreduce/classification/breiman-example.html">Breiman example</a></li>
+                  <li><a href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups example</a></li>
+                </ul></li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Clustering<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                <li><a href="/users/mapreduce/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a href="/users/mapreduce/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                <li><a href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Post processing</li>
+                <li><a href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper tool</a></li>
+                <li><a href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li>
+                </ul></li>
+                <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li>
+                <li><a href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li>
+                <li><a href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li>
+		<li><a href="/users/mapreduce/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li>
+                <li><a href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Hadoop</li>
+                <li><a href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li>
+                <li><a href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li>
+                <li class="nav-header">Spark</li>
+                <li><a href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li>
+              </ul>
+            </li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+	<ul class="sidemenu">
+		<li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+	</ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Hadoop</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <p><a name="lda-commandline-RunningLatentDirichletAllocation(algorithm)fromtheCommandLine"></a></p>
+<h1 id="running-latent-dirichlet-allocation-algorithm-from-the-command-line">Running Latent Dirichlet Allocation (algorithm) from the Command Line</h1>
+<p><a href="https://issues.apache.org/jira/browse/MAHOUT-897">Since Mahout v0.6</a>
+ LDA has been implemented as Collapsed Variational Bayes (cvb). </p>
+<p>Mahout's LDA can be launched from the same command line invocation whether
+you are running on a single machine in stand-alone mode or on a larger
+Hadoop cluster. The difference is determined by the $HADOOP_HOME and
+$HADOOP_CONF_DIR environment variables. If both are set to an operating
+Hadoop cluster on the target machine then the invocation will run the LDA
+algorithm on that cluster. If either of the environment variables is
+missing then the stand-alone Hadoop configuration will be invoked instead.</p>
+<div class="codehilite"><pre><span class="o">./</span><span class="n">bin</span><span class="o">/</span><span class="n">mahout</span> <span class="n">cvb</span> <span class="o">&lt;</span><span class="n">OPTIONS</span><span class="o">&gt;</span>
+</pre></div>
+
+
+<ul>
+<li>In $MAHOUT_HOME/, build the jar containing the job (mvn install). The job
+will be generated in $MAHOUT_HOME/core/target/ and its name will contain
+the Mahout version number. For example, when using the Mahout 0.3 release, the
+job will be mahout-core-0.3.job</li>
+</ul>
+<p><a name="lda-commandline-Testingitononesinglemachinew/ocluster"></a></p>
+<h2 id="testing-it-on-one-single-machine-wo-cluster">Testing it on one single machine w/o cluster</h2>
+<ul>
+<li>Put the data: cp &lt;PATH TO DATA&gt; testdata</li>
+<li>
+<p>Run the Job: </p>
+<p>./bin/mahout cvb -i testdata &lt;OTHER OPTIONS&gt;</p>
+</li>
+</ul>
+<p><a name="lda-commandline-Runningitonthecluster"></a></p>
+<h2 id="running-it-on-the-cluster">Running it on the cluster</h2>
+<ul>
+<li>(As needed) Start up Hadoop: $HADOOP_HOME/bin/start-all.sh</li>
+<li>Put the data: $HADOOP_HOME/bin/hadoop fs -put &lt;PATH TO DATA&gt; testdata</li>
+<li>
+<p>Run the Job: </p>
+<pre>export HADOOP_HOME=&lt;Hadoop Home Directory&gt;
+export HADOOP_CONF_DIR=$HADOOP_HOME/conf
+./bin/mahout cvb -i testdata &lt;OTHER OPTIONS&gt;</pre>
+</li>
+<li>
+<p>Get the data out of HDFS and have a look. Use bin/hadoop fs -lsr output
+to view all outputs.</p>
+</li>
+</ul>
+<p><a name="lda-commandline-CommandlineoptionsfromMahoutcvbversion0.8"></a></p>
+<h1 id="command-line-options-from-mahout-cvb-version-08">Command line options from Mahout cvb version 0.8</h1>
+<div class="codehilite"><pre><span class="n">mahout</span> <span class="n">cvb</span> <span class="o">-</span><span class="n">h</span> 
+  <span class="o">--</span><span class="n">input</span> <span class="p">(</span><span class="o">-</span><span class="nb">i</span><span class="p">)</span> <span class="n">input</span>                      <span class="n">Path</span> <span class="n">to</span> <span class="n">job</span> <span class="n">input</span> <span class="n">directory</span><span class="p">.</span>        
+  <span class="o">--</span><span class="n">output</span> <span class="p">(</span><span class="o">-</span><span class="n">o</span><span class="p">)</span> <span class="n">output</span>                    <span class="n">The</span> <span class="n">directory</span> <span class="n">pathname</span> <span class="k">for</span> <span class="n">output</span><span class="p">.</span>  
+  <span class="o">--</span><span class="n">maxIter</span> <span class="p">(</span><span class="o">-</span><span class="n">x</span><span class="p">)</span> <span class="n">maxIter</span>                  <span class="n">The</span> <span class="n">maximum</span> <span class="n">number</span> <span class="n">of</span> <span class="n">iterations</span><span class="p">.</span>     
+  <span class="o">--</span><span class="n">convergenceDelta</span> <span class="p">(</span><span class="o">-</span><span class="n">cd</span><span class="p">)</span> <span class="n">convergenceDelta</span>       <span class="n">The</span> <span class="n">convergence</span> <span class="n">delta</span> <span class="n">value</span>           
+  <span class="o">--</span><span class="n">overwrite</span> <span class="p">(</span><span class="o">-</span><span class="n">ow</span><span class="p">)</span>                   <span class="n">If</span> <span class="n">present</span><span class="p">,</span> <span class="n">overwrite</span> <span class="n">the</span> <span class="n">output</span> <span class="n">directory</span> <span class="n">before</span> <span class="n">running</span> <span class="n">job</span>    
+  <span class="o">--</span><span class="n">num_topics</span> <span class="p">(</span><span class="o">-</span><span class="n">k</span><span class="p">)</span> <span class="n">num_topics</span>                <span class="n">Number</span> <span class="n">of</span> <span class="n">topics</span> <span class="n">to</span> <span class="n">learn</span>      
+  <span class="o">--</span><span class="n">num_terms</span> <span class="p">(</span><span class="o">-</span><span class="n">nt</span><span class="p">)</span> <span class="n">num_terms</span>                 <span class="n">Vocabulary</span> <span class="nb">size</span>   
+  <span class="o">--</span><span class="n">doc_topic_smoothing</span> <span class="p">(</span><span class="o">-</span><span class="n">a</span><span class="p">)</span> <span class="n">doc_topic_smoothing</span>      <span class="n">Smoothing</span> <span class="k">for</span> <span class="n">document</span><span class="o">/</span><span class="n">topic</span> <span class="n">distribution</span>      
+  <span class="o">--</span><span class="n">term_topic_smoothing</span> <span class="p">(</span><span class="o">-</span><span class="n">e</span><span class="p">)</span> <span class="n">term_topic_smoothing</span>    <span class="n">Smoothing</span> <span class="k">for</span> <span class="n">topic</span><span class="o">/</span><span class="n">term</span> <span class="n">distribution</span>      
+  <span class="o">--</span><span class="n">dictionary</span> <span class="p">(</span><span class="o">-</span><span class="n">dict</span><span class="p">)</span> <span class="n">dictionary</span>             <span class="n">Path</span> <span class="n">to</span> <span class="n">term</span><span class="o">-</span><span class="n">dictionary</span> <span class="n">file</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> <span class="p">(</span><span class="n">glob</span> <span class="n">expression</span> <span class="n">supported</span><span class="p">)</span> 
+  <span class="o">--</span><span class="n">doc_topic_output</span> <span class="p">(</span><span class="o">-</span><span class="n">dt</span><span class="p">)</span> <span class="n">doc_topic_output</span>       <span class="n">Output</span> <span class="n">path</span> <span class="k">for</span> <span class="n">the</span> <span class="n">training</span> <span class="n">doc</span><span class="o">/</span><span class="n">topic</span> <span class="n">distribution</span>        
+  <span class="o">--</span><span class="n">topic_model_temp_dir</span> <span class="p">(</span><span class="o">-</span><span class="n">mt</span><span class="p">)</span> <span class="n">topic_model_temp_dir</span>   <span class="n">Path</span> <span class="n">to</span> <span class="n">intermediate</span> <span class="n">model</span> <span class="n">path</span> <span class="p">(</span><span class="n">useful</span> <span class="k">for</span> <span class="n">restarting</span><span class="p">)</span>       
+  <span class="o">--</span><span class="n">iteration_block_size</span> <span class="p">(</span><span class="o">-</span><span class="n">block</span><span class="p">)</span> <span class="n">iteration_block_size</span>    <span class="n">Number</span> <span class="n">of</span> <span class="n">iterations</span> <span class="n">per</span> <span class="n">perplexity</span> <span class="n">check</span>  
+  <span class="o">--</span><span class="n">random_seed</span> <span class="p">(</span><span class="o">-</span><span class="n">seed</span><span class="p">)</span> <span class="n">random_seed</span>           <span class="n">Random</span> <span class="n">seed</span>       
+  <span class="o">--</span><span class="n">test_set_fraction</span> <span class="p">(</span><span class="o">-</span><span class="n">tf</span><span class="p">)</span> <span class="n">test_set_fraction</span>         <span class="n">Fraction</span> <span class="n">of</span> <span class="n">data</span> <span class="n">to</span> <span class="n">hold</span> <span class="n">out</span> <span class="k">for</span> <span class="n">testing</span>  
+  <span class="o">--</span><span class="n">num_train_threads</span> <span class="p">(</span><span class="o">-</span><span class="n">ntt</span><span class="p">)</span> <span class="n">num_train_threads</span>        <span class="n">number</span> <span class="n">of</span> <span class="n">threads</span> <span class="n">per</span> <span class="n">mapper</span> <span class="n">to</span> <span class="n">train</span> <span class="n">with</span>  
+  <span class="o">--</span><span class="n">num_update_threads</span> <span class="p">(</span><span class="o">-</span><span class="n">nut</span><span class="p">)</span> <span class="n">num_update_threads</span>      <span class="n">number</span> <span class="n">of</span> <span class="n">threads</span> <span class="n">per</span> <span class="n">mapper</span> <span class="n">to</span> <span class="n">update</span> <span class="n">the</span> <span class="n">model</span> <span class="n">with</span>        
+  <span class="o">--</span><span class="n">max_doc_topic_iters</span> <span class="p">(</span><span class="o">-</span><span class="n">mipd</span><span class="p">)</span> <span class="n">max_doc_topic_iters</span>   <span class="n">max</span> <span class="n">number</span> <span class="n">of</span> <span class="n">iterations</span> <span class="n">per</span> <span class="n">doc</span> <span class="k">for</span> <span class="n">p</span><span class="p">(</span><span class="n">topic</span><span class="o">|</span><span class="n">doc</span><span class="p">)</span> <span class="n">learning</span>          
+  <span class="o">--</span><span class="n">num_reduce_tasks</span> <span class="n">num_reduce_tasks</span>             <span class="n">number</span> <span class="n">of</span> <span class="n">reducers</span> <span class="n">to</span> <span class="n">use</span> <span class="n">during</span> <span class="n">model</span> <span class="n">estimation</span>        
+  <span class="o">--</span><span class="n">backfill_perplexity</span>                   <span class="n">enable</span> <span class="n">backfilling</span> <span class="n">of</span> <span class="n">missing</span> <span class="n">perplexity</span> <span class="n">values</span>       
+  <span class="o">--</span><span class="n">help</span> <span class="p">(</span><span class="o">-</span><span class="n">h</span><span class="p">)</span>                         <span class="n">Print</span> <span class="n">out</span> <span class="n">help</span>    
+  <span class="o">--</span><span class="n">tempDir</span> <span class="n">tempDir</span>                   <span class="n">Intermediate</span> <span class="n">output</span> <span class="n">directory</span>      
+  <span class="o">--</span><span class="n">startPhase</span> <span class="n">startPhase</span>                 <span class="n">First</span> <span class="n">phase</span> <span class="n">to</span> <span class="n">run</span>    
+  <span class="o">--</span><span class="n">endPhase</span> <span class="n">endPhase</span>                     <span class="n">Last</span> <span class="n">phase</span> <span class="n">to</span> <span class="n">run</span>
+</pre></div>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') +
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>

Added: websites/staging/mahout/trunk/content/users/mapreduce/clustering/llr---log-likelihood-ratio.html
==============================================================================
--- websites/staging/mahout/trunk/content/users/mapreduce/clustering/llr---log-likelihood-ratio.html (added)
+++ websites/staging/mahout/trunk/content/users/mapreduce/clustering/llr---log-likelihood-ratio.html Thu Mar 19 21:21:45 2015
@@ -0,0 +1,311 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data framework, data integration,
+        data matching, data mining, data mining algorithms, data mining analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning methods,
+        learning techniques, lucene, machine learning, machine translation, mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data mining">
+  <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico">
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+
+  <!-- MathJax setup: use `\(...\)` for inline-style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({ // MathJax configuration: sets the tex2jax.skipTags option
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] // tag names passed as tex2jax.skipTags
+    }
+  });
+  MathJax.Hub.Queue(function() { // queued callback: mark the parent of each entry from getAllJax()
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax'; // append the has-jax class to the source element's parent
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); // script element used to load MathJax asynchronously
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ? // choose the CDN URL by the page's scheme
+        'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : 
+        'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; // NOTE(review): confirm both CDN hosts are still served
+	
+	  var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s); // insert ahead of the first <script> in the document
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/overview.html"></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org" name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/images/mahout-lupe.png" alt="Search" />
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development Project</a> -->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+              <li><a href="/">Home</a></li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">General<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                  <li><a href="/general/release-notes.html">Release Notes</a> 
+                  <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a>
+                  <li><a href="/general/professional-support.html">Professional Support</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference Reading</a>
+                  <li><a href="/general/faq.html">FAQ</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Legal</li>
+                  <li><a href="http://www.apache.org/licenses/">License</a></li>
+                  <li><a href="http://www.apache.org/security/">Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer resources</a></li>
+                  <li><a href="/developers/version-control.html">Version control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue tracker</a></li>
+                  <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check list</a></li>
+                  <li><a href="/developers/github.html">Handling Github PRs</a></li>
+                  <li><a href="/developers/how-to-release.html">How to release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Basics<b class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Quickstart</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Working with text</li>
+                  <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a>
+                  <li><a href="/users/basics/collocations.html">Collocations</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Dimensionality reduction</li>
+                  <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li>
+                  <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Topic Models</li>      
+                  <li><a href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li>
+                </ul>
+                 </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Spark<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark Bindings Overview</a></li>
+                  <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li>
+			      <li class="divider"></li>
+                  <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                </ul>
+               </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Classification<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li>
+                  <li><a href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov Models</a></li>
+                  <li><a href="/users/mapreduce/classification/logistic-regression.html">Logistic Regression</a></li>
+                  <li><a href="/users/mapreduce/classification/partial-implementation.html">Random Forest</a></li>
+
+                  <li class="divider"></li>
+                  <li class="nav-header">Examples</li>
+                  <li><a href="/users/mapreduce/classification/breiman-example.html">Breiman example</a></li>
+                  <li><a href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups example</a></li>
+                </ul></li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Clustering<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                <li><a href="/users/mapreduce/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a href="/users/mapreduce/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                <li><a href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Post processing</li>
+                <li><a href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper tool</a></li>
+                <li><a href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li>
+                </ul></li>
+                <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li>
+                <li><a href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li>
+                <li><a href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li>
+		<li><a href="/users/mapreduce/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li>
+                <li><a href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Hadoop</li>
+                <li><a href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li>
+                <li><a href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li>
+                <li class="nav-header">Spark</li>
+                <li><a href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li>
+              </ul>
+            </li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+	<ul class="sidemenu">
+		<li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script><!-- Twitter widgets loader: injects platform.twitter.com/widgets.js (http or https to match the page) unless an element with id "twitter-wjs" already exists. -->
+</li>
+	</ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Hadoop</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <h1 id="likelihood-ratio-test">Likelihood ratio test</h1>
+<p><em>The likelihood ratio test is used to compare the fit of two models, one
+of which is nested within the other.</em></p>
+<p>In the context of machine learning and the Mahout project in particular,
+the term LLR is usually meant to refer to a test of significance for two
+binomial distributions, also known as the G squared statistic.  This is a
+special case of the multinomial test and is closely related to mutual
+information.  The value of this statistic is not normally used in this
+context as a true frequentist test of significance since there would be
+obvious and dreadful problems to do with multiple comparisons, but rather
+as a heuristic score to order pairs of items with the most interestingly
+connected items having higher scores.  In this usage, the LLR has proven
+very useful for discriminating between pairs of features that have interesting
+degrees of cooccurrence and those that do not, with usefully small false
+positive and false negative rates.  The LLR is typically far more suitable
+in the case of small counts than many other measures such as Pearson's
+correlation, Pearson's chi squared statistic or z statistics.  The LLR as
+stated does not, however, make any use of rating data, which can limit its
+applicability in problems such as the Netflix competition. </p>
+<p>The actual value of the LLR is not usually very helpful other than as a way
+of ordering pairs of items.  As such, it is often used to determine a
+sparse set of coefficients to be estimated by other means such as TF-IDF. 
+Since the actual estimation of these coefficients can be done in a way that
+is independent of the training data such as by general corpus statistics,
+and since the ordering imposed by the LLR is relatively robust to counting
+fluctuation, this technique can provide very strong results in very sparse
+problems where the potential number of features vastly outnumbers the
+number of training examples and where features are highly interdependent.</p>
+<p>See Also: </p>
+<ul>
+<li><a href="http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html">Blog post "surprise and coincidence"</a></li>
+<li><a href="http://en.wikipedia.org/wiki/G-test">G-Test</a></li>
+<li><a href="http://en.wikipedia.org/wiki/Likelihood-ratio_test">Likelihood Ratio Test</a></li>
+</ul>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() { // Asynchronously load the Google Custom Search script (cse.js).
+      var cx = '012254517474945470291:vhsfv7eokdc'; // value sent as the cx query parameter of cse.js
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') + // https on https pages; any other scheme falls back to http
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0]; // first <script> element in the document
+      s.parentNode.insertBefore(gcse, s); // insert the loader ahead of that first script
+    })();
+  </script>
+</body>
+</html>

Added: websites/staging/mahout/trunk/content/users/mapreduce/clustering/spectral-clustering.html
==============================================================================
--- websites/staging/mahout/trunk/content/users/mapreduce/clustering/spectral-clustering.html (added)
+++ websites/staging/mahout/trunk/content/users/mapreduce/clustering/spectral-clustering.html Thu Mar 19 21:21:45 2015
@@ -0,0 +1,344 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data framework, data integration,
+        data matching, data mining, data mining algorithms, data mining analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning methods,
+        learning techniques, lucene, machine learning, machine translation, mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data mining">
+  <link rel="shortcut icon" type="image/x-icon" href="http://mahout.apache.org/images/favicon.ico">
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+
+  <!-- MathJax setup: use `\(...\)` for inline-style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({ // MathJax configuration: sets the tex2jax.skipTags option
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] // tag names passed as tex2jax.skipTags
+    }
+  });
+  MathJax.Hub.Queue(function() { // queued callback: mark the parent of each entry from getAllJax()
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax'; // append the has-jax class to the source element's parent
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); // script element used to load MathJax asynchronously
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ? // choose the CDN URL by the page's scheme
+        'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' : 
+        'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; // NOTE(review): confirm both CDN hosts are still served
+	
+	  var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s); // insert ahead of the first <script> in the document
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/overview.html"></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search" method="get" class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org" name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/images/mahout-lupe.png" alt="Search" />
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development Project</a> -->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+              <li><a href="/">Home</a></li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">General<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                  <li><a href="/general/release-notes.html">Release Notes</a> 
+                  <li><a href="/general/books-tutorials-and-talks.html">Books, Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By Mahout</a>
+                  <li><a href="/general/professional-support.html">Professional Support</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference Reading</a>
+                  <li><a href="/general/faq.html">FAQ</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Legal</li>
+                  <li><a href="http://www.apache.org/licenses/">License</a></li>
+                  <li><a href="http://www.apache.org/security/">Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Developers<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer resources</a></li>
+                  <li><a href="/developers/version-control.html">Version control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue tracker</a></li>
+                  <li><a href="https://builds.apache.org/job/Mahout-Quality/" target="_blank">Code quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check list</a></li>
+                  <li><a href="/developers/github.html">Handling Github PRs</a></li>
+                  <li><a href="/developers/how-to-release.html">How to release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Basics<b class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Quickstart</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Working with text</li>
+                  <li><a href="/users/basics/creating-vectors-from-text.html">Creating vectors from text</a>
+                  <li><a href="/users/basics/collocations.html">Collocations</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Dimensionality reduction</li>
+                  <li><a href="/users/dim-reduction/dimensional-reduction.html">Singular Value Decomposition</a></li>
+                  <li><a href="/users/dim-reduction/ssvd.html">Stochastic SVD</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Topic Models</li>      
+                  <li><a href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet Allocation</a></li>
+                </ul>
+                 </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Spark<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/sparkbindings/home.html">Scala &amp; Spark Bindings Overview</a></li>
+                  <li><a href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark Shell</a></li>
+			      <li class="divider"></li>
+                  <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                </ul>
+               </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Classification<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li>
+                  <li><a href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov Models</a></li>
+                  <li><a href="/users/mapreduce/classification/logistic-regression.html">Logistic Regression</a></li>
+                  <li><a href="/users/mapreduce/classification/partial-implementation.html">Random Forest</a></li>
+
+                  <li class="divider"></li>
+                  <li class="nav-header">Examples</li>
+                  <li><a href="/users/mapreduce/classification/breiman-example.html">Breiman example</a></li>
+                  <li><a href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups example</a></li>
+                </ul></li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Clustering<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/streaming-k-means.html">Streaming KMeans</a></li>
+                <li><a href="/users/mapreduce/clustering/spectral-clustering.html">Spectral Clustering</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a href="/users/mapreduce/clustering/k-means-commandline.html">Options for k-Means</a></li>
+                <li><a href="/users/mapreduce/clustering/canopy-commandline.html">Options for Canopy</a></li>
+                <li><a href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for Fuzzy k-Means</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic data</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Post processing</li>
+                <li><a href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper tool</a></li>
+                <li><a href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster visualisation</a></li>
+                </ul></li>
+                <li class="dropdown"> <a href="#" class="dropdown-toggle" data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li>
+                <li><a href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First Timer FAQ</a></li>
+                <li><a href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based recommender <br/>in 5 minutes</a></li>
+		<li><a href="/users/mapreduce/recommender/matrix-factorization.html">Matrix factorization-based<br/> recommenders</a></li>
+                <li><a href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Hadoop</li>
+                <li><a href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to item-based recommendations<br/> with Hadoop</a></li>
+                <li><a href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS recommendations<br/> with Hadoop</a></li>
+                <li class="nav-header">Spark</li>
+                <li><a href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to cooccurrence-based<br/> recommendations with Spark</a></li>
+              </ul>
+            </li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+	<ul class="sidemenu">
+		<li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>
+  // Load Twitter's widgets.js once so the "twitter-timeline" anchor above is
+  // rendered as an embedded timeline. The script is always requested over
+  // HTTPS, even when this page is served over plain HTTP, so the third-party
+  // code cannot be tampered with in transit.
+  (function (d, s, id) {
+    // Skip injection when a script element with this id already exists.
+    if (d.getElementById(id)) {
+      return;
+    }
+    var firstScript = d.getElementsByTagName(s)[0];
+    var js = d.createElement(s);
+    js.id = id;
+    js.src = "https://platform.twitter.com/widgets.js";
+    // Insert ahead of the first script element found in the document.
+    firstScript.parentNode.insertBefore(js, firstScript);
+  })(document, "script", "twitter-wjs");
+</script>
+</li>
+	</ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Hadoop</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <h1 id="spectral-clustering-overview">Spectral Clustering Overview</h1>
+<p>Spectral clustering, as its name implies, makes use of the spectrum (or eigenvalues) of the similarity matrix of the data. It examines the <em>connectedness</em> of the data, whereas other clustering algorithms such as k-means use the <em>compactness</em> to assign clusters. Consequently, in situations where k-means performs well, spectral clustering will also perform well. Additionally, there are situations in which k-means will underperform (e.g. concentric circles), but spectral clustering will be able to segment the underlying clusters. Spectral clustering is also very useful for image segmentation.</p>
+<p>At its simplest, spectral clustering relies on the following four steps:</p>
+<ol>
+<li>
+<p>Computing a similarity (or <em>affinity</em>) matrix <code>\(\mathbf{A}\)</code> from the data. This involves determining a pairwise distance function <code>\(f\)</code> that takes a pair of data points and returns a scalar.</p>
+</li>
+<li>
+<p>Computing a graph Laplacian <code>\(\mathbf{L}\)</code> from the affinity matrix. There are several types of graph Laplacians; which one is used will often depend on the situation.</p>
+</li>
+<li>
+<p>Computing the eigenvectors and eigenvalues of <code>\(\mathbf{L}\)</code>. The degree of this decomposition is often modulated by <code>\(k\)</code>, or the number of clusters. Put another way, <code>\(k\)</code> eigenvectors and eigenvalues are computed.</p>
+</li>
+<li>
+<p>The <code>\(k\)</code> eigenvectors are used as "proxy" data for the original dataset, and fed into k-means clustering. The resulting cluster assignments are transparently passed back to the original data.</p>
+</li>
+</ol>
+<p>For more theoretical background on spectral clustering, such as how affinity matrices are computed, the different types of graph Laplacians, and whether the top or bottom eigenvectors and eigenvalues are computed, please read <a href="http://link.springer.com/article/10.1007/s11222-007-9033-z">Ulrike von Luxburg's article in <em>Statistics and Computing</em> from December 2007</a>. It provides an excellent description of the linear algebra operations behind spectral clustering, and imparts a thorough understanding of the types of situations in which it can be used.</p>
+<h1 id="mahout-spectral-clustering">Mahout Spectral Clustering</h1>
+<p>As of Mahout 0.3, spectral clustering has been implemented to take advantage of the MapReduce framework. It uses <a href="http://mahout.apache.org/users/dim-reduction/ssvd.html">SSVD</a> for dimensionality reduction of the input data set, and <a href="http://mahout.apache.org/users/clustering/k-means-clustering.html">k-means</a> to perform the final clustering.</p>
+<p><strong>(<a href="https://issues.apache.org/jira/browse/MAHOUT-1538">MAHOUT-1538</a> will port the existing Hadoop MapReduce implementation to Mahout DSL, allowing for one of several distinct distributed back-ends to conduct the computation)</strong></p>
+<h2 id="input">Input</h2>
+<p>The algorithm currently takes as input a Hadoop-backed affinity matrix stored as text files. Each line of a text file specifies a single element of the affinity matrix: the row index <code>\(i\)</code>, the column index <code>\(j\)</code>, and the value:</p>
+<p><code>i, j, value</code></p>
+<p>The affinity matrix is symmetric, and any unspecified <code>\(i, j\)</code> pairs are assumed to be 0 for sparsity. The row and column indices are 0-indexed. Thus, only the non-zero entries of either the upper or lower triangle need be specified.</p>
+<p>The matrix elements specified in the text files are collected into a Mahout <code>DistributedRowMatrix</code>.</p>
+<p><strong>(<a href="https://issues.apache.org/jira/browse/MAHOUT-1539">MAHOUT-1539</a> will allow for the creation of the affinity matrix to occur as part of the core spectral clustering algorithm, as opposed to the current requirement that the user create this matrix themselves and provide it, rather than the original data, to the algorithm)</strong></p>
+<h2 id="running-spectral-clustering">Running spectral clustering</h2>
+<p><strong>(<a href="https://issues.apache.org/jira/browse/MAHOUT-1540">MAHOUT-1540</a> will provide a running example of this algorithm and this section will be updated to show how to run the example and what the expected output should be; until then, this section provides a how-to for simply running the algorithm on arbitrary input)</strong></p>
+<p>Spectral clustering can be invoked with the following arguments.</p>
+<div class="codehilite"><pre><span class="n">bin</span><span class="o">/</span><span class="n">mahout</span> <span class="n">spectralkmeans</span> <span class="o">\</span>
+    <span class="o">-</span><span class="nb">i</span> <span class="o">&lt;</span><span class="n">affinity</span> <span class="n">matrix</span> <span class="n">directory</span><span class="o">&gt;</span> <span class="o">\</span>
+    <span class="o">-</span><span class="n">o</span> <span class="o">&lt;</span><span class="n">output</span> <span class="n">working</span> <span class="n">directory</span><span class="o">&gt;</span> <span class="o">\</span>
+    <span class="o">-</span><span class="n">d</span> <span class="o">&lt;</span><span class="n">number</span> <span class="n">of</span> <span class="n">data</span> <span class="n">points</span><span class="o">&gt;</span> <span class="o">\</span>
+    <span class="o">-</span><span class="n">k</span> <span class="o">&lt;</span><span class="n">number</span> <span class="n">of</span> <span class="n">clusters</span> <span class="n">AND</span> <span class="n">number</span> <span class="n">of</span> <span class="n">top</span> <span class="n">eigenvectors</span> <span class="n">to</span> <span class="n">use</span><span class="o">&gt;</span> <span class="o">\</span>
+    <span class="o">-</span><span class="n">x</span> <span class="o">&lt;</span><span class="n">maximum</span> <span class="n">number</span> <span class="n">of</span> <span class="n">k</span><span class="o">-</span><span class="n">means</span> <span class="n">iterations</span><span class="o">&gt;</span>
+</pre></div>
+
+
+<p>The affinity matrix can be contained in a single text file (using the aforementioned one-line-per-entry format) or span many text files (per <a href="https://issues.apache.org/jira/browse/MAHOUT-978">MAHOUT-978</a>, do not prefix text files with a leading underscore '_' or period '.'). The <code>-d</code> flag is required for the algorithm to know the dimensions of the affinity matrix. <code>-k</code> is the number of top eigenvectors from the normalized graph Laplacian in the SSVD step, and also the number of clusters given to k-means after the SSVD step.</p>
+<h2 id="example">Example</h2>
+<p>To provide a simple example, take the following affinity matrix, contained in a text file called <code>affinity.txt</code>:</p>
+<div class="codehilite"><pre>0<span class="p">,</span> 0<span class="p">,</span> 0
+0<span class="p">,</span> 1<span class="p">,</span> 0<span class="p">.</span>8
+0<span class="p">,</span> 2<span class="p">,</span> 0<span class="p">.</span>5
+1<span class="p">,</span> 0<span class="p">,</span> 0<span class="p">.</span>8
+1<span class="p">,</span> 1<span class="p">,</span> 0
+1<span class="p">,</span> 2<span class="p">,</span> 0<span class="p">.</span>9
+2<span class="p">,</span> 0<span class="p">,</span> 0<span class="p">.</span>5
+2<span class="p">,</span> 1<span class="p">,</span> 0<span class="p">.</span>9
+2<span class="p">,</span> 2<span class="p">,</span> 0
+</pre></div>
+
+
+<p>With this 3-by-3 matrix, <code>-d</code> would be <code>3</code>. Furthermore, since all affinity matrices are assumed to be symmetric, the entries specifying both <code>1, 2, 0.9</code> and <code>2, 1, 0.9</code> are redundant; only one of these is needed. Additionally, any entries that are 0, such as those along the diagonal, also need not be specified at all. They are provided here for completeness.</p>
+<p>In general, larger values indicate a stronger "connectedness", whereas smaller values indicate a weaker connectedness. This will vary somewhat depending on the distance function used, though a common one is the <a href="http://en.wikipedia.org/wiki/RBF_kernel">RBF kernel</a> (used in the above example) which returns values in the range [0, 1], where 0 indicates completely disconnected (or completely dissimilar) and 1 is fully connected (or identical).</p>
+<p>The call signature with this matrix could be as follows:</p>
+<div class="codehilite"><pre><span class="n">bin</span><span class="o">/</span><span class="n">mahout</span> <span class="n">spectralkmeans</span> <span class="o">\</span>
+    <span class="o">-</span><span class="nb">i</span> <span class="n">s3</span><span class="p">:</span><span class="o">//</span><span class="n">mahout</span><span class="o">-</span><span class="n">example</span><span class="o">/</span><span class="n">input</span><span class="o">/</span> <span class="o">\</span>
+    <span class="o">-</span><span class="n">o</span> <span class="n">s3</span><span class="p">:</span><span class="o">//</span><span class="n">mahout</span><span class="o">-</span><span class="n">example</span><span class="o">/</span><span class="n">output</span><span class="o">/</span> <span class="o">\</span>
+    <span class="o">-</span><span class="n">d</span> 3 <span class="o">\</span>
+    <span class="o">-</span><span class="n">k</span> 2 <span class="o">\</span>
+    <span class="o">-</span><span class="n">x</span> 10
+</pre></div>
+
+
+<p>There are many other optional arguments, in particular for tweaking the SSVD process (block size, number of power iterations, etc) and the k-means clustering step (distance measure, convergence delta, etc).</p>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') +
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>