You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/07/12 11:26:03 UTC

svn commit: r1360593 [5/17] - in /mahout/site/trunk: ./ cgi-bin/ content/ content/attachments/ content/attachments/101992/ content/attachments/116559/ content/attachments/22872433/ content/attachments/22872443/ content/attachments/23335706/ content/att...

Added: mahout/site/trunk/content/attachments/27832158/28016857.lyx
URL: http://svn.apache.org/viewvc/mahout/site/trunk/content/attachments/27832158/28016857.lyx?rev=1360593&view=auto
==============================================================================
--- mahout/site/trunk/content/attachments/27832158/28016857.lyx (added)
+++ mahout/site/trunk/content/attachments/27832158/28016857.lyx Thu Jul 12 09:25:54 2012
@@ -0,0 +1,1615 @@
+#LyX 2.0 created this file. For more info see http://www.lyx.org/
+\lyxformat 413
+\begin_document
+\begin_header
+\textclass article
+\begin_preamble
+
+\end_preamble
+\use_default_options true
+\maintain_unincluded_children false
+\language english
+\language_package default
+\inputencoding auto
+\fontencoding global
+\font_roman lmodern
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\use_non_tex_fonts false
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\default_output_format default
+\output_sync 0
+\bibtex_command default
+\index_command default
+\paperfontsize default
+\spacing single
+\use_hyperref true
+\pdf_bookmarks true
+\pdf_bookmarksnumbered false
+\pdf_bookmarksopen false
+\pdf_bookmarksopenlevel 1
+\pdf_breaklinks false
+\pdf_pdfborder false
+\pdf_colorlinks false
+\pdf_backref false
+\pdf_pdfusetitle true
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\use_mhchem 1
+\use_mathdots 1
+\cite_engine basic
+\use_bibtopic false
+\use_indices false
+\paperorientation portrait
+\suppress_date true
+\use_refstyle 0
+\index Index
+\shortcut idx
+\color #008000
+\end_index
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip bigskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes false
+\output_changes false
+\html_math_output 0
+\html_css_as_file 0
+\html_be_strict false
+\end_header
+
+\begin_body
+
+\begin_layout Title
+Command Line Interface, 
+\begin_inset Newline newline
+\end_inset
+
+Stochastic SVD
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Dmitriy Lyubimov, dlyubimov at apache dot org
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset CommandInset toc
+LatexCommand tableofcontents
+
+\end_inset
+
+
+\begin_inset Newpage pagebreak
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Background information
+\end_layout
+
+\begin_layout Subsection
+Stochastic SVD (SSVD)
+\end_layout
+
+\begin_layout Standard
+Stochasitc SVD method in Mahout produces reduced rank Singular Value Decompositi
+on output in its strict mathematical definition:
+\begin_inset Formula 
+\[
+\mathbf{A}\approx\mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^{\top},
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+i.
+ e.
+ it creates outputs for matrices 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+, 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ and 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+, each of which may be requested individually.
+ The desired rank of decomposition, henceforth denoted as 
+\begin_inset Formula $k\in\mathbb{N}_{1}$
+\end_inset
+
+, is a parameter of the algorithm.
+ The singular values inside diagonal matrix 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ satisfy 
+\begin_inset Formula $\sigma_{i+1}\leq\sigma_{i}$
+\end_inset
+
+ 
+\begin_inset Formula $\forall i\in\left[1,k-1\right]$
+\end_inset
+
+, i.e.
+ sorted from biggest to smallest.
+ Cases of rank deficiency 
+\begin_inset Formula $\mbox{rank}\left(\mathbf{A}\right)<k$
+\end_inset
+
+ are handled by producing 0s in singular value positions once deficiency
+ takes place.
+ 
+\end_layout
+
+\begin_layout Paragraph
+Single space for comparing row-items and column-items.
+\end_layout
+
+\begin_layout Standard
+On top of it, there's an option to present decomposition output in a form
+ of 
+\begin_inset Formula 
+\begin{equation}
+\mathbf{A}\approx\mathbf{U}_{\sigma}\mathbf{V}_{\sigma}^{\top},\label{eq:baked-sigma}
+\end{equation}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+where one can request 
+\begin_inset Formula $\mathbf{U}_{\sigma}=\mathbf{U}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+ (but not both), 
+\begin_inset Formula $\mathbf{V}_{\sigma}=\mathbf{V}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ (but not both).
+ Here, notation 
+\begin_inset Formula $\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ implies diagonal matrix containing square roots of the singular values:
+ 
+\begin_inset Formula 
+\[
+\boldsymbol{\Sigma}^{0.5}=\left(\begin{matrix}\sqrt{\sigma_{1}} & \cdots & 0\\
+\vdots & \ddots & \vdots\\
+0 & \cdots & \sqrt{\sigma_{k}}
+\end{matrix}\right).
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Original singular values 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ are still produced and saved regardless.
+\end_layout
+
+\begin_layout Standard
+This option is a nod to a common need of comparing actors represented by
+ both input rows and input columns in a common space.
+ E.g.
+ if LSI is performed such that rows are documents and columns are terms
+ then it is possible to compare documents and terms (ether existing or fold
+ in new ones) in one common space and perform similarity measurement between
+ a document and a term, rather than computing just term2term or document2documen
+t similarities.
+\end_layout
+
+\begin_layout Paragraph
+Folding in new observations.
+\end_layout
+
+\begin_layout Standard
+It is probably worth mentioning the operation of 
+\begin_inset Quotes eld
+\end_inset
+
+folding in
+\begin_inset Quotes erd
+\end_inset
+
+ new observations in context of this method, since it is often a basis for
+ incremental methods.
+\end_layout
+
+\begin_layout Standard
+If 
+\begin_inset Formula $\tilde{\mathbf{c}}_{r}\,\,\left(\tilde{\mathbf{c}}_{c}\right)$
+\end_inset
+
+ is a new row (column) observation in addition to original input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, then correspondent 
+\begin_inset Quotes eld
+\end_inset
+
+new
+\begin_inset Quotes erd
+\end_inset
+
+ row vectors of 
+\begin_inset Formula $\tilde{\mathbf{U}}$
+\end_inset
+
+ 
+\begin_inset Formula $\left(\tilde{\mathbf{V}}\right)$
+\end_inset
+
+ can be obtained as 
+\begin_inset Formula 
+\begin{eqnarray}
+\mathbf{\tilde{u}} & = & \boldsymbol{\Sigma}^{-1}\mathbf{V}^{\top}\tilde{\mathbf{c}}_{r},\label{eq:fold-in-u}\\
+\tilde{\mathbf{v}} & = & \boldsymbol{\Sigma}^{-1}\mathbf{U}^{\top}\tilde{\mathbf{c}}_{c}.
+\end{eqnarray}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Similarly, for the case 
+\begin_inset CommandInset ref
+LatexCommand formatted
+reference "eq:baked-sigma"
+
+\end_inset
+
+ folding in new observations into rows of 
+\begin_inset Formula $\tilde{\mathbf{U}}_{\sigma}\,\,\left(\tilde{\mathbf{V}}_{\sigma}\right)$
+\end_inset
+
+ would look like
+\begin_inset Formula 
+\begin{eqnarray}
+\mathbf{\tilde{u}}_{\sigma} & = & \mathbf{V}_{\sigma}^{\top}\tilde{\mathbf{c}}_{r},\label{eq:fold-in-u-sigma}\\
+\tilde{\mathbf{v}}_{\sigma} & = & \mathbf{U}_{\sigma}^{\top}\tilde{\mathbf{c}}_{c}.
+\end{eqnarray}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Thus, new rows can be added to matrices denoted as 
+\begin_inset Formula $\tilde{\mathbf{U}}\,\,\left(\tilde{\mathbf{V}}\right)$
+\end_inset
+
+ corresponding to new observations as new observations become available,
+ i.e.
+ incrementally.
+ Given that new observations are usually moderately sparse vectors, it might
+ be feasible to do fold-in in real time or almost real time, assuming proper
+ fast row-wise indexing of 
+\begin_inset Formula $\mathbf{U}\,\left(\mathbf{V}\right)$
+\end_inset
+
+ exists (e.g.
+ using a batch request to an HBase table containing rows of 
+\begin_inset Formula $\mathbf{U}\,\left(\mathbf{V}\right)$
+\end_inset
+
+).
+ However, since operation of folding in new observations doesn't change
+ original decomposition and its spaces, such new observations cannot be
+ considered 'training' examples.
+ Typically, from time to time accumulated new observations can be added
+ to original input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ and the whole decomposition can be recomputed again.
+\end_layout
+
+\begin_layout Standard
+Common applications for SVD include Latent Semantic Analysis (LSA), Principal
+ Component Analysis (PCA), dimensionality reduction and others.
+\end_layout
+
+\begin_layout Subsection
+PCA options in SSVD
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:PCA-options-in"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Some of interesting applications of SVD is dimensionality reduction and
+ Principal Component Analysis.
+ As of MAHOUT-817, SSVD method is equipped with options helping to produce
+ both PCA and dimensionality reduction transformations.
+\end_layout
+
+\begin_layout Paragraph
+Outline: data points are row vectors.
+\end_layout
+
+\begin_layout Standard
+We approach general PCA and dimensionality reduction problem with respect
+ to input expressed in Mahout's distributed row matrix format.
+ We also assume data points are row vectors in such matrix
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Note that in wikipedia article input data points are considered to be column
+ vectors, so our input is transpose w.r.t.
+ to case outlined in the wikipedia PCA article.
+\end_layout
+
+\end_inset
+
+.
+ We denote such 
+\begin_inset Formula $m\times n$
+\end_inset
+
+ input matrix as 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+: 
+\begin_inset Formula 
+\[
+\mathbf{A}^{\left(pca\right)}=\left(\begin{matrix}\mathbf{a}_{1}^{\top}\\
+\mathbf{a}_{2}^{\top}\\
+\vdots\\
+\mathbf{a}_{m}^{\top}
+\end{matrix}\right)
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Column mean is n-vector
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In many cases in literature one would find PCA mean vector denoted as 
+\begin_inset Formula $\boldsymbol{\mu}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\end_inset
+
+ 
+\begin_inset Formula 
+\begin{eqnarray*}
+\mathbf{\boldsymbol{\xi}} & = & \left(\begin{matrix}\xi_{1}\\
+\xi_{2}\\
+\vdots\\
+\xi_{n}
+\end{matrix}\right)\\
+ & = & \frac{1}{m}\sum_{i}^{m}\mathbf{a}_{i}.
+\end{eqnarray*}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+We denote 
+\begin_inset Formula $m\times n$
+\end_inset
+
+ mean matrix as 
+\begin_inset Formula 
+\[
+\boldsymbol{\Xi}=\left(\begin{matrix}\mathbf{\boldsymbol{\xi}}^{\top}\\
+\mathbf{\boldsymbol{\xi}}^{\top}\\
+\vdots\\
+\mathbf{\boldsymbol{\xi}}^{\top}
+\end{matrix}\right)\in\mathbb{R}^{m\times n}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Traditional approach under these settings starts with finding a column mean
+ 
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+ and subtract it from all data points (row vectors) of the input
+\begin_inset Formula 
+\[
+\mathbf{A}=\mathbf{A}^{\left(pca\right)}-\boldsymbol{\Xi}
+\]
+
+\end_inset
+
+ and then proceeds with finding a reduced 
+\begin_inset Formula $k$
+\end_inset
+
+-rank SVD: 
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula 
+\[
+\mathbf{A}\approx\mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^{\top}.
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+At this point rows of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+ correspond to original data points (rows of 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+) converted to PCA space and 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ represents approximate eigenvectors of the covariance matrix of our input
+ (and we are done with PCA part at this point).
+ Note that since our datapoints in this case are row vectors in the input
+ (and not column vectors), covariance matrix is 
+\begin_inset Formula $\mathbf{C}=\frac{1}{m}\mathbf{A}^{\top}\mathbf{A}$
+\end_inset
+
+.
+ 
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Some courses (see Andrew Ng's ML online lectures) also mention a metric
+ aka 
+\begin_inset Quotes eld
+\end_inset
+
+percent of variance retained
+\begin_inset Quotes erd
+\end_inset
+
+ 
+\begin_inset Formula $\sum_{i=1}^{k}\sigma_{i}/\sum_{i=1}^{n}\sigma_{i}\cdot100\%$
+\end_inset
+
+ that seems to suggest that it would be a metric to compare 
+\begin_inset Quotes eld
+\end_inset
+
+goodness
+\begin_inset Quotes erd
+\end_inset
+
+ of choice for 
+\begin_inset Formula $k$
+\end_inset
+
+ .
+ We of course cannot know all of the singular values of the full SVD in
+ context of SSVD, although we can estimate that 
+\family roman
+\series medium
+\shape up
+\size normal
+\emph off
+\bar no
+\strikeout off
+\uuline off
+\uwave off
+\noun off
+\color none
+
+\begin_inset Formula 
+\[
+\frac{\sum_{i=1}^{k}\sigma_{i}}{\sum_{i=1}^{n}\sigma_{i}}\geq\frac{\sum_{i=1}^{k}\sigma_{i}}{\left(n-k\right)\sigma_{k}+\sum_{i=1}^{k}\sigma_{i}},
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Plain Layout
+i.e.
+ we can make a statement in a sense that 
+\begin_inset Quotes eld
+\end_inset
+
+we have 
+\emph on
+at least 
+\emph default
+that much variance retained
+\begin_inset Quotes erd
+\end_inset
+
+ instead.
+ Whether this estimate is useful or not in practice, is not clear to me
+ as I have no error estimate (and, more importantly, no error estimate 
+\emph on
+with
+\emph default
+ stochastic error baked in) between expression on left and right of the
+ inequality at the moment.
+ But at least it puts some metric threshold on how big 
+\begin_inset Formula $k$
+\end_inset
+
+ we may need and identify some cases where smaller values of k will suffice
+ given particular input and minimum variance retained prerequisite thus
+ reducing need for flops.
+ Another possible way to estimate it is perhaps to try and fit existing
+ singular values into asymptotically decaying curve in order to produce
+ a better guess for 
+\begin_inset Formula $\sum_{i=k+1}^{n}\sigma_{i}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Paragraph
+Transformations to/from for new observation data points.
+\end_layout
+
+\begin_layout Standard
+Dimensionality reduction transformations are directly following from SVD
+ fold-in operation 
+\begin_inset CommandInset ref
+LatexCommand eqref
+reference "eq:fold-in-u"
+
+\end_inset
+
+ described above.
+\end_layout
+
+\begin_layout Standard
+Transformation of any new data point observation of an 
+\begin_inset Formula $n$
+\end_inset
+
+-vector 
+\begin_inset Formula $\tilde{\mathbf{c}_{r}}$
+\end_inset
+
+ 
+\emph on
+into
+\emph default
+ reduced dimensionality PCA space 
+\begin_inset Formula $k$
+\end_inset
+
+-vector 
+\begin_inset Formula $\tilde{\mathbf{u}}$
+\end_inset
+
+ will look like 
+\begin_inset Formula 
+\begin{equation}
+\mathbf{\tilde{u}}=\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\left(\tilde{\mathbf{c}}_{r}-\boldsymbol{\xi}\right)\label{eq:toPca}
+\end{equation}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+(this operation is essentially an SVD fold-in operation corrected for the
+ mean subtraction).
+ Note that, again, if input vectors tend to be quite sparse, then 
+\begin_inset CommandInset ref
+LatexCommand eqref
+reference "eq:toPca"
+
+\end_inset
+
+ could be decomposed as 
+\begin_inset Formula 
+\[
+\tilde{\mathbf{u}}=\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\tilde{\mathbf{c}}_{r}-\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\boldsymbol{\xi},
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+and online conversion can be sped up by precomputing term 
+\begin_inset Formula $\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\boldsymbol{\xi}$
+\end_inset
+
+ which ends up to be a small dense 
+\begin_inset Formula $k$
+\end_inset
+
+-vector, and big matrix 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ could be row-indexed for fast online matrix-vector multiplication.
+\end_layout
+
+\begin_layout Standard
+Inverse transformation (from reduced PCA space into original space) looks
+ like 
+\begin_inset Formula 
+\begin{equation}
+\tilde{\mathbf{c}}_{r}=\boldsymbol{\xi}+\mathbf{V}\boldsymbol{\Sigma}\tilde{\mathbf{u}}.\label{eq:frompca}
+\end{equation}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Paragraph
+MAHOUT-817 goals: why brute force approach is hard in context of big data
+ computations.
+\end_layout
+
+\begin_layout Standard
+In context of massive computations, input 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+ is often rather sparse.
+ Sparse matrices are packed and their subsequent operations within Mahout
+ framework are optimized to account for degenerate nature of zero elements
+ computation.
+ Sometimes such reduction of need for flops and space may approach several
+ orders of magnitude.
+ However, mean subtraction step would turn such sparse inputs into a dense
+ matrix 
+\begin_inset Formula $\left(\mathbf{A}^{\left(pca\right)}-\boldsymbol{\Xi}\right)$
+\end_inset
+
+.
+ Such intermediate input will take a lot of space and subsequent SVD will
+ require a lot of flops.
+ Fortunately, this can be addressed in context of SSVD method in a way that
+ will be (almost) cost-equivalent to a regular SSVD computation on original
+ sparse input 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+MAHOUT-817 addresses two goals:
+\end_layout
+
+\begin_layout Itemize
+Provide column-wise mean computation step in the whole pipeline (or use
+ outside mean vector if already available)
+\end_layout
+
+\begin_layout Itemize
+Lift the dense matrix data concerns per above.
+\end_layout
+
+\begin_layout Standard
+The sparser the original input is, the more efficiency gain is to be had
+ by using SSVD PCA options compared to brute-force approach.
+ 
+\end_layout
+
+\begin_layout Standard
+If original input is 100% dense, SSVD PCA options will have roughly the
+ same cost as brute-force approach.
+\end_layout
+
+\begin_layout Standard
+MAHOUT-817 introduces two additional pca options: 
+\family typewriter
+--pca
+\family default
+ and 
+\family typewriter
+--pcaOffset
+\family default
+ that request to treat incoming data as a PCA input.
+ SVD rank opton 
+\family typewriter
+-k
+\family default
+ will correspond to the reduced dimensionality of PCA space.
+
+\family typewriter
+ 
+\family default
+See 
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+S
+\end_layout
+
+\end_inset
+
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Usage"
+
+\end_inset
+
+ for details.
+\end_layout
+
+\begin_layout Section
+Input/output file formats and layout
+\end_layout
+
+\begin_layout Standard
+Input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, as well as outputs 
+\begin_inset Formula $\mathbf{U}\left(\mathbf{U}_{\sigma}\right),\,\mathbf{V}\left(\mathbf{V}_{\sigma}\right)$
+\end_inset
+
+, are in Mahout's Distributed Row Matrix format, i.e.
+ set of sequence files where value is of 
+\family typewriter
+VectorWritable
+\family default
+ type.
+ As far as keys are concerned, rows of 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ may be keyed (identified) by any 
+\family typewriter
+Writable 
+\family default
+(for as long as it is instantiable thru a default constructor).
+ That, among other thnigs, means that this method can be applied directly
+ on the output of 
+\family typewriter
+seq2sparse 
+\family default
+where keys are of 
+\family typewriter
+Text
+\family default
+ type
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+\begin_inset CommandInset label
+LatexCommand label
+name "fn:(TODO:-re-verify)"
+
+\end_inset
+
+(TODO: re-verify)
+\end_layout
+
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Definition of output 
+\begin_inset Formula $\mathbf{U}\,\,\left(\mathbf{U}_{\sigma}\right)$
+\end_inset
+
+ is identical to definition of the input matrix 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, and the keys of corresponding rows in 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ are copied to corresponding rows of output 
+\begin_inset Formula $\mathbf{U}\,\,\left(\mathbf{U}_{\sigma}\right)$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Definition of output 
+\begin_inset Formula $\mathbf{V}\,\,\left(\mathbf{V}_{\sigma}\right)$
+\end_inset
+
+ is always sequence file(s) of 
+\family typewriter
+(IntWritable,
+\begin_inset Newline newline
+\end_inset
+
+
+\family default
+ 
+\family typewriter
+VectorWritable)
+\family default
+ where key corresponds to a column index of the input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Output of 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ is encoded by a single output file with a single vector value 
+\family typewriter
+(VectorWritable)
+\family default
+ with main diagonal entries of 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ aka singular values 
+\begin_inset Formula $\left(\begin{matrix}\sigma_{1} & \cdots & \sigma_{k}\end{matrix}\right)$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+CLI usage
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Usage"
+
+\end_inset
+
+
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+As of Mahout 0.7 trunk.
+ 
+\begin_inset Newline newline
+\end_inset
+
+12/13/2011 adjusted for MAHOUT-922 changes.
+\begin_inset Newline newline
+\end_inset
+
+02/22/2012 MAHOUT-817.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout LyX-Code
+mahout ssvd <options>
+\end_layout
+
+\begin_layout Paragraph
+Options.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-k,
+\begin_inset space ~
+\end_inset
+
+--rank
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (required): the requested SVD rank (minimum number of singular values and
+ dimensions in U, V matrices).
+ The value of 
+\begin_inset Formula $k+p$
+\end_inset
+
+ directly impacts running time and memory requirements.
+ 
+\series bold
+\emph on
+k+p=500 is probably more than reasonable
+\series default
+\emph default
+.
+ Typically 
+\begin_inset Formula $k+p$
+\end_inset
+
+ is taken within range 20...200.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-p,
+\begin_inset space ~
+\end_inset
+
+--oversampling
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default 15): stochastic SVD oversampling.
+ 
+\begin_inset Formula $p$
+\end_inset
+
+ doesn't seem to have to be very significant.
+ If power iterations (
+\begin_inset Formula $q>0$
+\end_inset
+
+) are used then p perhaps could be kept quite low, not to exceed 10% of
+ 
+\begin_inset Formula $k$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-q,
+\begin_inset space ~
+\end_inset
+
+--powerIter
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 0): number of power iterations to perform.
+ This helps fighting data noise and improve precision significantly more
+ than just increasing 
+\begin_inset Formula $p$
+\end_inset
+
+.
+ Each additional power iteration adds 2 more steps (map/reduce + map-only).
+ Experimental data suggests using 
+\begin_inset Formula $q=1$
+\end_inset
+
+ is already producing quite good results which are hard to much improve
+ upon.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-r,
+\begin_inset space ~
+\end_inset
+
+--blockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default 10,000): the number of rows of source matrix for block
+ computations during 
+\begin_inset Formula $\mathbf{Y}=\mathbf{QR}$
+\end_inset
+
+ decomposition.
+ Taller blocking causes more memory use but produces less blocks and therefore
+ somewhat better running times.
+ The most optimal mode from the running time point of view should be 1 block
+ per 1 mapper.
+ 
+\emph on
+This cannot be less than k+p.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-oh,
+\begin_inset space ~
+\end_inset
+
+--outerProdBlockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 30,000): the block height during 
+\begin_inset Formula $\mathbf{B}=\mathbf{Q}^{\top}\mathbf{A}$
+\end_inset
+
+ operation
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+\begin_inset CommandInset label
+LatexCommand label
+name "fn:With-extreme-sparse"
+
+\end_inset
+
+With extreme sparse matrices increasing this parameter may lead to better
+ performance by reducing computational pressure on the shuffle and sort
+ and grouping sparse records together.
+\end_layout
+
+\end_inset
+
+
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+\begin_inset CommandInset label
+LatexCommand label
+name "fn:GC thrashing warning"
+
+\end_inset
+
+ Watch for GC thrashing and swap.
+ Values too high may cause GC thrashing and/or swapping, both of which are
+ capable of bringing job performance down to a halt.
+ Don't starve jobs for memory.
+ Defaults are believed to work well for -Xmx800mb or above per child task.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-abth,
+\begin_inset space ~
+\end_inset
+
+--abtBlockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 200,000): the block height during 
+\begin_inset Formula $\mathbf{Y}_{i}=\mathbf{A}\mathbf{B}_{i}^{\top}$
+\end_inset
+
+ multiplication
+\begin_inset script superscript
+
+\begin_layout Plain Layout
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "fn:With-extreme-sparse"
+
+\end_inset
+
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "fn:GC thrashing warning"
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-s,
+\begin_inset space ~
+\end_inset
+
+--minSplitSize
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default: use Hadoop's default): minimum split size to use in
+ mappers reading 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ input.
+ 
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+
+\emph on
+As of this day, I haven't heard of a case where somebody would actually
+ have to use this option and actually increase split size and how it has
+ played out.
+ So this option is experimental.
+ 
+\end_layout
+
+\begin_layout Plain Layout
+Since in this version projection block formation happens in mappers, for
+ a sufficiently wide input matrix the algorithm may not be able to read
+ minimum 
+\begin_inset Formula $k+p$
+\end_inset
+
+ rows and form a block of minimum height required, so in that case the job
+ would bail out at the very first mapping step.
+ If this happens, one of the recourses available is to force increase in
+ the MapReduce split size using SequenceFileInputFormat.setMinSplitSize()
+ property.
+ Increasing this significantly over HDFS block size may result in network
+ IO to mappers.
+ Another caveat is that one sometimes does not want too many mappers because
+ it may in fact increase time of the computation.
+ Consequently, this option should probably be left alone unless one has
+ significant amount of mappers (as in thousands of map tasks) at which point
+ reducing amount of mappers may actually improve the throughput (just a
+ guesstimate at this point).
+ 
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--computeU
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default true).
+ Request computation of the U matrix 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--computeV
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default true).
+ Request computation of the V matrix 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--vHalfSigma
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default: false): compute 
+\begin_inset Formula $\mathbf{V}_{\sigma}=\mathbf{V}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ (see overview for explanation).
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--uHalfSigma
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default: false): compute 
+\begin_inset Formula $\mathbf{U}_{\sigma}=\mathbf{U}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--reduceTasks
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ optional.
+ The number of reducers to use (where applicable): depends on the size of
+ the hadoop cluster.
+ At this point it could also be overwritten by a standard hadoop property
+ using -D option
+\begin_inset script superscript
+
+\begin_layout Plain Layout
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "fn:(TODO:-re-verify)"
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+.
+
+\series bold
+\emph on
+ Probably always needs to be specified as by default Hadoop would set it
+ to 1, which is certainly far below the cluster capacity.
+ 
+\series default
+\emph default
+Recommended value for this option ~ 95% or ~190% of available reducer capacity
+ to allow for opportunistic executions.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--pca 
+\family default
+\size default
+run in pca mode: treat the input as 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+ and also compute column-wise mean 
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+ over the input and use it to compute PCA space (unless external column
+ mean is already provided by 
+\family typewriter
+--pcaOffset
+\family default
+).
+ (see 
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+S
+\end_layout
+
+\end_inset
+
+ 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:PCA-options-in"
+
+\end_inset
+
+)
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--pcaOffset
+\begin_inset space ~
+\end_inset
+
+<
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+-path>
+\family default
+ 
+\size default
+(optional, default: none).
+ Path(glob) of external column mean.
+ The glob parameter must point to a sequence file containing single 
+\family typewriter
+VectorWritable
+\family default
+ row as the 
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+ mean (see 
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+S
+\end_layout
+
+\end_inset
+
+ 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:PCA-options-in"
+
+\end_inset
+
+).
+ This option can be used if column-wise mean is already efficiently obtained
+ as byproduct from another pipeline, or if one wants to use custom centering
+ offset for the data.
+ This will save one MR pass over input since the mean will not have to be
+ computed.
+\end_layout
+
+\begin_layout Paragraph
+Standard Mahout options.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--input
+\begin_inset space ~
+\end_inset
+
+<glob>
+\family default
+\size default
+ HDFS glob specification where the DistributedRowMatrix input to be found.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--output
+\begin_inset space ~
+\end_inset
+
+<hdfs-dir>
+\family default
+\size default
+ non-existent hdfs directory where to output 
+\begin_inset Formula $\mathbf{U},\mathbf{V}$
+\end_inset
+
+ and 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ (singular values) files.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--tempDir
+\begin_inset space ~
+\end_inset
+
+<temp-dir>
+\family default
+\size default
+ temporary dir where to store intermediate files (cleaned up upon normal
+ completion).
+ This is a standard Mahout optional parameter.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-ow,
+\begin_inset space ~
+\end_inset
+
+--overwrite
+\size default
+ 
+\family default
+overwrite output if exists.
+\end_layout
+
+\begin_layout Section
+Embedded use
+\end_layout
+
+\begin_layout Standard
+It is possible to instantiate and use 
+\family typewriter
+SSVDSolver
+\family default
+ class in embedded fashion in Hadoop-enabled applications.
+ This class would have getter and setter methods for each option available
+ via command line.
+ See javadoc for details.
+\end_layout
+
+\begin_layout Section
+Mixed environments: R+Mahout
+\end_layout
+
+\begin_layout Standard
+wip, todo, check back later.
+\end_layout
+
+\end_body
+\end_document

Added: mahout/site/trunk/content/attachments/27832158/28016860.lyx
URL: http://svn.apache.org/viewvc/mahout/site/trunk/content/attachments/27832158/28016860.lyx?rev=1360593&view=auto
==============================================================================
--- mahout/site/trunk/content/attachments/27832158/28016860.lyx (added)
+++ mahout/site/trunk/content/attachments/27832158/28016860.lyx Thu Jul 12 09:25:54 2012
@@ -0,0 +1,1615 @@
+#LyX 2.0 created this file. For more info see http://www.lyx.org/
+\lyxformat 413
+\begin_document
+\begin_header
+\textclass article
+\begin_preamble
+
+\end_preamble
+\use_default_options true
+\maintain_unincluded_children false
+\language english
+\language_package default
+\inputencoding auto
+\fontencoding global
+\font_roman lmodern
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\use_non_tex_fonts false
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\default_output_format default
+\output_sync 0
+\bibtex_command default
+\index_command default
+\paperfontsize default
+\spacing single
+\use_hyperref true
+\pdf_bookmarks true
+\pdf_bookmarksnumbered false
+\pdf_bookmarksopen false
+\pdf_bookmarksopenlevel 1
+\pdf_breaklinks false
+\pdf_pdfborder false
+\pdf_colorlinks false
+\pdf_backref false
+\pdf_pdfusetitle true
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\use_mhchem 1
+\use_mathdots 1
+\cite_engine basic
+\use_bibtopic false
+\use_indices false
+\paperorientation portrait
+\suppress_date true
+\use_refstyle 0
+\index Index
+\shortcut idx
+\color #008000
+\end_index
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip bigskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes false
+\output_changes false
+\html_math_output 0
+\html_css_as_file 0
+\html_be_strict false
+\end_header
+
+\begin_body
+
+\begin_layout Title
+Command Line Interface, 
+\begin_inset Newline newline
+\end_inset
+
+Stochastic SVD
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Dmitriy Lyubimov, dlyubimov at apache dot org
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset CommandInset toc
+LatexCommand tableofcontents
+
+\end_inset
+
+
+\begin_inset Newpage pagebreak
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Background information
+\end_layout
+
+\begin_layout Subsection
+Stochastic SVD (SSVD)
+\end_layout
+
+\begin_layout Standard
+Stochasitc SVD method in Mahout produces reduced rank Singular Value Decompositi
+on output in its strict mathematical definition:
+\begin_inset Formula 
+\[
+\mathbf{A}\approx\mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^{\top},
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+i.
+ e.
+ it creates outputs for matrices 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+, 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ and 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+, each of which may be requested individually.
+ The desired rank of decomposition, henceforth denoted as 
+\begin_inset Formula $k\in\mathbb{N}_{1}$
+\end_inset
+
+, is a parameter of the algorithm.
+ The singular values inside diagonal matrix 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ satisfy 
+\begin_inset Formula $\sigma_{i+1}\leq\sigma_{i}$
+\end_inset
+
+ 
+\begin_inset Formula $\forall i\in\left[1,k-1\right]$
+\end_inset
+
+, i.e.
+ sorted from biggest to smallest.
+ Cases of rank deficiency 
+\begin_inset Formula $\mbox{rank}\left(\mathbf{A}\right)<k$
+\end_inset
+
+ are handled by producing 0s in singular value positions once deficiency
+ takes place.
+ 
+\end_layout
+
+\begin_layout Paragraph
+Single space for comparing row-items and column-items.
+\end_layout
+
+\begin_layout Standard
+On top of it, there's an option to present decomposition output in a form
+ of 
+\begin_inset Formula 
+\begin{equation}
+\mathbf{A}\approx\mathbf{U}_{\sigma}\mathbf{V}_{\sigma}^{\top},\label{eq:baked-sigma}
+\end{equation}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+where one can request 
+\begin_inset Formula $\mathbf{U}_{\sigma}=\mathbf{U}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+ (but not both), 
+\begin_inset Formula $\mathbf{V}_{\sigma}=\mathbf{V}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ (but not both).
+ Here, notation 
+\begin_inset Formula $\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ implies diagonal matrix containing square roots of the singular values:
+ 
+\begin_inset Formula 
+\[
+\boldsymbol{\Sigma}^{0.5}=\left(\begin{matrix}\sqrt{\sigma_{1}} & \cdots & 0\\
+\vdots & \ddots & \vdots\\
+0 & \cdots & \sqrt{\sigma_{k}}
+\end{matrix}\right).
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Original singular values 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ are still produced and saved regardless.
+\end_layout
+
+\begin_layout Standard
+This option is a nod to a common need of comparing actors represented by
+ both input rows and input columns in a common space.
+ E.g.
+ if LSI is performed such that rows are documents and columns are terms
+ then it is possible to compare documents and terms (ether existing or fold
+ in new ones) in one common space and perform similarity measurement between
+ a document and a term, rather than computing just term2term or document2documen
+t similarities.
+\end_layout
+
+\begin_layout Paragraph
+Folding in new observations.
+\end_layout
+
+\begin_layout Standard
+It is probably worth mentioning the operation of 
+\begin_inset Quotes eld
+\end_inset
+
+folding in
+\begin_inset Quotes erd
+\end_inset
+
+ new observations in context of this method, since it is often a basis for
+ incremental methods.
+\end_layout
+
+\begin_layout Standard
+If 
+\begin_inset Formula $\tilde{\mathbf{c}}_{r}\,\,\left(\tilde{\mathbf{c}}_{c}\right)$
+\end_inset
+
+ is a new row (column) observation in addition to original input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, then correspondent 
+\begin_inset Quotes eld
+\end_inset
+
+new
+\begin_inset Quotes erd
+\end_inset
+
+ row vectors of 
+\begin_inset Formula $\tilde{\mathbf{U}}$
+\end_inset
+
+ 
+\begin_inset Formula $\left(\tilde{\mathbf{V}}\right)$
+\end_inset
+
+ can be obtained as 
+\begin_inset Formula 
+\begin{eqnarray}
+\mathbf{\tilde{u}} & = & \boldsymbol{\Sigma}^{-1}\mathbf{V}^{\top}\tilde{\mathbf{c}}_{r},\label{eq:fold-in-u}\\
+\tilde{\mathbf{v}} & = & \boldsymbol{\Sigma}^{-1}\mathbf{U}^{\top}\tilde{\mathbf{c}}_{c}.
+\end{eqnarray}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Similarly, for the case 
+\begin_inset CommandInset ref
+LatexCommand formatted
+reference "eq:baked-sigma"
+
+\end_inset
+
+ folding in new observations into rows of 
+\begin_inset Formula $\tilde{\mathbf{U}}_{\sigma}\,\,\left(\tilde{\mathbf{V}}_{\sigma}\right)$
+\end_inset
+
+ would look like
+\begin_inset Formula 
+\begin{eqnarray}
+\mathbf{\tilde{u}}_{\sigma} & = & \mathbf{V}_{\sigma}^{\top}\tilde{\mathbf{c}}_{r},\label{eq:fold-in-u-sigma}\\
+\tilde{\mathbf{v}}_{\sigma} & = & \mathbf{U}_{\sigma}^{\top}\tilde{\mathbf{c}}_{c}.
+\end{eqnarray}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Thus, new rows can be added to matrices denoted as 
+\begin_inset Formula $\tilde{\mathbf{U}}\,\,\left(\tilde{\mathbf{V}}\right)$
+\end_inset
+
+ corresponding to new observations as new observations become available,
+ i.e.
+ incrementally.
+ Given that new observations are usually moderately sparse vectors, it might
+ be feasible to do fold-in in real time or almost real time, assuming proper
+ fast row-wise indexing of 
+\begin_inset Formula $\mathbf{U}\,\left(\mathbf{V}\right)$
+\end_inset
+
+ exists (e.g.
+ using a batch request to an HBase table containing rows of 
+\begin_inset Formula $\mathbf{U}\,\left(\mathbf{V}\right)$
+\end_inset
+
+).
+ However, since operation of folding in new observations doesn't change
+ original decomposition and its spaces, such new observations cannot be
+ considered 'training' examples.
+ Typically, from time to time accumulated new observations can be added
+ to original input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ and the whole decomposition can be recomputed again.
+\end_layout
+
+\begin_layout Standard
+Common applications for SVD include Latent Semantic Analysis (LSA), Principal
+ Component Analysis (PCA), dimensionality reduction and others.
+\end_layout
+
+\begin_layout Subsection
+PCA options in SSVD
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:PCA-options-in"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Some of interesting applications of SVD is dimensionality reduction and
+ Principal Component Analysis.
+ As of MAHOUT-817, SSVD method is equipped with options helping to produce
+ both PCA and dimensionality reduction transformations.
+\end_layout
+
+\begin_layout Paragraph
+Outline: data points are row vectors.
+\end_layout
+
+\begin_layout Standard
+We approach general PCA and dimensionality reduction problem with respect
+ to input expressed in Mahout's distributed row matrix format.
+ We also assume data points are row vectors in such matrix
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Note that in wikipedia article input data points are considered to be column
+ vectors, so our input is transpose w.r.t.
+ to case outlined in the wikipedia PCA article.
+\end_layout
+
+\end_inset
+
+.
+ We denote such 
+\begin_inset Formula $m\times n$
+\end_inset
+
+ input matrix as 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+: 
+\begin_inset Formula 
+\[
+\mathbf{A}^{\left(pca\right)}=\left(\begin{matrix}\mathbf{a}_{1}^{\top}\\
+\mathbf{a}_{2}^{\top}\\
+\vdots\\
+\mathbf{a}_{m}^{\top}
+\end{matrix}\right)
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Column mean is n-vector
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In many cases in literature one would find PCA mean vector denoted as 
+\begin_inset Formula $\boldsymbol{\mu}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\end_inset
+
+ 
+\begin_inset Formula 
+\begin{eqnarray*}
+\mathbf{\boldsymbol{\xi}} & = & \left(\begin{matrix}\xi_{1}\\
+\xi_{2}\\
+\vdots\\
+\xi_{n}
+\end{matrix}\right)\\
+ & = & \frac{1}{m}\sum_{i}^{m}\mathbf{a}_{i}.
+\end{eqnarray*}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+We denote 
+\begin_inset Formula $m\times n$
+\end_inset
+
+ mean matrix as 
+\begin_inset Formula 
+\[
+\boldsymbol{\Xi}=\left(\begin{matrix}\mathbf{\boldsymbol{\xi}}^{\top}\\
+\mathbf{\boldsymbol{\xi}}^{\top}\\
+\vdots\\
+\mathbf{\boldsymbol{\xi}}^{\top}
+\end{matrix}\right)\in\mathbb{R}^{m\times n}
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Traditional approach under these settings starts with finding a column mean
+ 
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+ and subtract it from all data points (row vectors) of the input
+\begin_inset Formula 
+\[
+\mathbf{A}=\mathbf{A}^{\left(pca\right)}-\boldsymbol{\Xi}
+\]
+
+\end_inset
+
+ and then proceeds with finding a reduced 
+\begin_inset Formula $k$
+\end_inset
+
+-rank SVD: 
+\end_layout
+
+\begin_layout Standard
+\begin_inset Formula 
+\[
+\mathbf{A}\approx\mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^{\top}.
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+At this point rows of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+ correspond to original data points (rows of 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+) converted to PCA space and 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ represents approximate eigenvectors of the covariance matrix of our input
+ (and we are done with PCA part at this point).
+ Note that since our datapoints in this case are row vectors in the input
+ (and not column vectors), covariance matrix is 
+\begin_inset Formula $\mathbf{C}=\frac{1}{m}\mathbf{A}^{\top}\mathbf{A}$
+\end_inset
+
+.
+ 
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Some courses (see Andrew Ng's ML online lectures) also mention a metric
+ aka 
+\begin_inset Quotes eld
+\end_inset
+
+percent of variance retained
+\begin_inset Quotes erd
+\end_inset
+
+ 
+\begin_inset Formula $\sum_{i=1}^{k}\sigma_{i}/\sum_{i=1}^{n}\sigma_{i}\cdot100\%$
+\end_inset
+
+ that seems to suggest that it would be a metric to compare 
+\begin_inset Quotes eld
+\end_inset
+
+goodness
+\begin_inset Quotes erd
+\end_inset
+
+ of choice for 
+\begin_inset Formula $k$
+\end_inset
+
+ .
+ We of course cannot know all of the singular values of the full SVD in
+ context of SSVD, although we can estimate that 
+\family roman
+\series medium
+\shape up
+\size normal
+\emph off
+\bar no
+\strikeout off
+\uuline off
+\uwave off
+\noun off
+\color none
+
+\begin_inset Formula 
+\[
+\frac{\sum_{i=1}^{k}\sigma_{i}}{\sum_{i=1}^{n}\sigma_{i}}\geq\frac{\sum_{i=1}^{k}\sigma_{i}}{\left(n-k\right)\sigma_{k}+\sum_{i=1}^{k}\sigma_{i}},
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Plain Layout
+i.e.
+ we can make a statement in a sense that 
+\begin_inset Quotes eld
+\end_inset
+
+we have 
+\emph on
+at least 
+\emph default
+that much variance retained
+\begin_inset Quotes erd
+\end_inset
+
+ instead.
+ Whether this estimate is useful or not in practice, is not clear to me
+ as I have no error estimate (and, more importantly, no error estimate 
+\emph on
+with
+\emph default
+ stochastic error baked in) between expression on left and right of the
+ inequality at the moment.
+ But at least it puts some metric threshold on how big 
+\begin_inset Formula $k$
+\end_inset
+
+ we may need and identify some cases where smaller values of k will suffice
+ given particular input and minimum variance retained prerequisite thus
+ reducing need for flops.
+ Another possible way to estimate it is perhaps to try and fit existing
+ singular values into asymptotically decaying curve in order to produce
+ a better guess for 
+\begin_inset Formula $\sum_{i=k+1}^{n}\sigma_{i}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Paragraph
+Transformations to/from for new observation data points.
+\end_layout
+
+\begin_layout Standard
+Dimensionality reduction transformations are directly following from SVD
+ fold-in operation 
+\begin_inset CommandInset ref
+LatexCommand eqref
+reference "eq:fold-in-u"
+
+\end_inset
+
+ described above.
+\end_layout
+
+\begin_layout Standard
+Transformation of any new data point observation of an 
+\begin_inset Formula $n$
+\end_inset
+
+-vector 
+\begin_inset Formula $\tilde{\mathbf{c}_{r}}$
+\end_inset
+
+ 
+\emph on
+into
+\emph default
+ reduced dimensionality PCA space 
+\begin_inset Formula $k$
+\end_inset
+
+-vector 
+\begin_inset Formula $\tilde{\mathbf{u}}$
+\end_inset
+
+ will look like 
+\begin_inset Formula 
+\begin{equation}
+\mathbf{\tilde{u}}=\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\left(\tilde{\mathbf{c}}_{r}-\boldsymbol{\xi}\right)\label{eq:toPca}
+\end{equation}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+(this operation is essentially an SVD fold-in operation corrected for the
+ mean subtraction).
+ Note that, again, if input vectors tend to be quite sparse, then 
+\begin_inset CommandInset ref
+LatexCommand eqref
+reference "eq:toPca"
+
+\end_inset
+
+ could be decomposed as 
+\begin_inset Formula 
+\[
+\tilde{\mathbf{u}}=\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\tilde{\mathbf{c}}_{r}-\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\boldsymbol{\xi},
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+and online conversion can be sped up by precomputing term 
+\begin_inset Formula $\boldsymbol{\Sigma^{-1}}\mathbf{V}^{\top}\boldsymbol{\xi}$
+\end_inset
+
+ which ends up to be a small dense 
+\begin_inset Formula $k$
+\end_inset
+
+-vector, and big matrix 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ could be row-indexed for fast online matrix-vector multiplication.
+\end_layout
+
+\begin_layout Standard
+Inverse transformation (from reduced PCA space into original space) looks
+ like 
+\begin_inset Formula 
+\begin{equation}
+\tilde{\mathbf{c}}_{r}=\boldsymbol{\xi}+\mathbf{V}\boldsymbol{\Sigma}\tilde{\mathbf{u}}.\label{eq:frompca}
+\end{equation}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Paragraph
+MAHOUT-817 goals: why brute force approach is hard in context of big data
+ computations.
+\end_layout
+
+\begin_layout Standard
+In context of massive computations, input 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+ is often rather sparse.
+ Sparse matrices are packed and their subsequent operations within Mahout
+ framework are optimized to account for degenerate nature of zero elements
+ computation.
+ Sometimes such reduction of need for flops and space may approach several
+ orders of magnitude.
+ However, mean subtraction step would turn such sparse inputs into a dense
+ matrix 
+\begin_inset Formula $\left(\mathbf{A}^{\left(pca\right)}-\boldsymbol{\Xi}\right)$
+\end_inset
+
+.
+ Such intermediate input will take a lot of space and subsequent SVD will
+ require a lot of flops.
+ Fortunately, this can be addressed in context of SSVD method in a way that
+ will be (almost) cost-equivalent to a regular SSVD computation on original
+ sparse input 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+MAHOUT-817 addresses two goals:
+\end_layout
+
+\begin_layout Itemize
+Provide column-wise mean computation step in the whole pipeline (or use
+ outside mean vector if already available)
+\end_layout
+
+\begin_layout Itemize
+Lift the dense matrix data concerns per above.
+\end_layout
+
+\begin_layout Standard
+The sparser the original input is, the more efficiency gain is to be had
+ by using SSVD PCA options compared to brute-force approach.
+ 
+\end_layout
+
+\begin_layout Standard
+If original input is 100% dense, SSVD PCA options will have roughly the
+ same cost as brute-force approach.
+\end_layout
+
+\begin_layout Standard
+MAHOUT-817 introduces two additional pca options: 
+\family typewriter
+--pca
+\family default
+ and 
+\family typewriter
+--pcaOffset
+\family default
+ that request to treat incoming data as a PCA input.
+ SVD rank opton 
+\family typewriter
+-k
+\family default
+ will correspond to the reduced dimensionality of PCA space.
+
+\family typewriter
+ 
+\family default
+See 
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+S
+\end_layout
+
+\end_inset
+
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:Usage"
+
+\end_inset
+
+ for details.
+\end_layout
+
+\begin_layout Section
+Input/output file formats and layout
+\end_layout
+
+\begin_layout Standard
+Input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, as well as outputs 
+\begin_inset Formula $\mathbf{U}\left(\mathbf{U}_{\sigma}\right),\,\mathbf{V}\left(\mathbf{V}_{\sigma}\right)$
+\end_inset
+
+, are in Mahout's Distributed Row Matrix format, i.e.
+ set of sequence files where value is of 
+\family typewriter
+VectorWritable
+\family default
+ type.
+ As far as keys are concerned, rows of 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ may be keyed (identified) by any 
+\family typewriter
+Writable 
+\family default
+(for as long as it is instantiable thru a default constructor).
+ That, among other thnigs, means that this method can be applied directly
+ on the output of 
+\family typewriter
+seq2sparse 
+\family default
+where keys are of 
+\family typewriter
+Text
+\family default
+ type
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+\begin_inset CommandInset label
+LatexCommand label
+name "fn:(TODO:-re-verify)"
+
+\end_inset
+
+(TODO: re-verify)
+\end_layout
+
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Definition of output 
+\begin_inset Formula $\mathbf{U}\,\,\left(\mathbf{U}_{\sigma}\right)$
+\end_inset
+
+ is identical to definition of the input matrix 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+, and the keys of corresponding rows in 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ are copied to corresponding rows of output 
+\begin_inset Formula $\mathbf{U}\,\,\left(\mathbf{U}_{\sigma}\right)$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Definition of output 
+\begin_inset Formula $\mathbf{V}\,\,\left(\mathbf{V}_{\sigma}\right)$
+\end_inset
+
+ is always sequence file(s) of 
+\family typewriter
+(IntWritable,
+\begin_inset Newline newline
+\end_inset
+
+
+\family default
+ 
+\family typewriter
+VectorWritable)
+\family default
+ where key corresponds to a column index of the input 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Standard
+Output of 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ is encoded by a single output file with a single vector value 
+\family typewriter
+(VectorWritable)
+\family default
+ with main diagonal entries of 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ aka singular values 
+\begin_inset Formula $\left(\begin{matrix}\sigma_{1} & \cdots & \sigma_{k}\end{matrix}\right)$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Section
+CLI usage
+\begin_inset CommandInset label
+LatexCommand label
+name "sec:Usage"
+
+\end_inset
+
+
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+As of Mahout 0.7 trunk.
+ 
+\begin_inset Newline newline
+\end_inset
+
+12/13/2011 adjusted for MAHOUT-922 changes.
+\begin_inset Newline newline
+\end_inset
+
+02/22/2012 MAHOUT-817.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout LyX-Code
+mahout ssvd <options>
+\end_layout
+
+\begin_layout Paragraph
+Options.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-k,
+\begin_inset space ~
+\end_inset
+
+--rank
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (required): the requested SVD rank (minimum number of singular values and
+ dimensions in U, V matrices).
+ The value of 
+\begin_inset Formula $k+p$
+\end_inset
+
+ directly impacts running time and memory requirements.
+ 
+\series bold
+\emph on
+k+p=500 is probably more than reasonable
+\series default
+\emph default
+.
+ Typically 
+\begin_inset Formula $k+p$
+\end_inset
+
+ is taken within range 20...200.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-p,
+\begin_inset space ~
+\end_inset
+
+--oversampling
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default 15): stochastic SVD oversampling.
+ 
+\begin_inset Formula $p$
+\end_inset
+
+ doesn't seem to have to be very significant.
+ If power iterations (
+\begin_inset Formula $q>0$
+\end_inset
+
+) are used then p perhaps could be kept quite low, not to exceed 10% of
+ 
+\begin_inset Formula $k$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-q,
+\begin_inset space ~
+\end_inset
+
+--powerIter
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 0): number of power iterations to perform.
+ This helps fighting data noise and improve precision significantly more
+ than just increasing 
+\begin_inset Formula $p$
+\end_inset
+
+.
+ Each additional power iteration adds 2 more steps (map/reduce + map-only).
+ Experimental data suggests using 
+\begin_inset Formula $q=1$
+\end_inset
+
+ is already producing quite good results which are hard to much improve
+ upon.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-r,
+\begin_inset space ~
+\end_inset
+
+--blockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default 10,000): the number of rows of source matrix for block
+ computations during 
+\begin_inset Formula $\mathbf{Y}=\mathbf{QR}$
+\end_inset
+
+ decomposition.
+ Taller blocking causes more memory use but produces less blocks and therefore
+ somewhat better running times.
+ The most optimal mode from the running time point of view should be 1 block
+ per 1 mapper.
+ 
+\emph on
+This cannot be less than k+p.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-oh,
+\begin_inset space ~
+\end_inset
+
+--outerProdBlockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 30,000): the block height during 
+\begin_inset Formula $\mathbf{B}=\mathbf{Q}^{\top}\mathbf{A}$
+\end_inset
+
+ operation
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+\begin_inset CommandInset label
+LatexCommand label
+name "fn:With-extreme-sparse"
+
+\end_inset
+
+With extreme sparse matrices increasing this parameter may lead to better
+ performance by reducing computational pressure on the shuffle and sort
+ and grouping sparse records together.
+\end_layout
+
+\end_inset
+
+
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+\begin_inset CommandInset label
+LatexCommand label
+name "fn:GC thrashing warning"
+
+\end_inset
+
+ Watch for GC thrashing and swap.
+ Values too high may cause GC thrashing and/or swapping, both of which are
+ capable of bringing job performance down to a halt.
+ Don't starve jobs for memory.
+ Defaults are believed to work well for -Xmx800mb or above per child task.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-abth,
+\begin_inset space ~
+\end_inset
+
+--abtBlockHeight
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\size default
+ 
+\family default
+(optional, default 200,000): the block height during 
+\begin_inset Formula $\mathbf{Y}_{i}=\mathbf{A}\mathbf{B}_{i}^{\top}$
+\end_inset
+
+ multiplication
+\begin_inset script superscript
+
+\begin_layout Plain Layout
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "fn:With-extreme-sparse"
+
+\end_inset
+
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "fn:GC thrashing warning"
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-s,
+\begin_inset space ~
+\end_inset
+
+--minSplitSize
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ (optional, default: use Hadoop's default): minimum split size to use in
+ mappers reading 
+\begin_inset Formula $\mathbf{A}$
+\end_inset
+
+ input.
+ 
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+
+\emph on
+As of this day, I haven't heard of a case where somebody would actually
+ have to use this option and actually increase split size and how it has
+ played out.
+ So this option is experimental.
+ 
+\end_layout
+
+\begin_layout Plain Layout
+Since in this version projection block formation happens in mappers, for
+ a sufficiently wide input matrix the algorithm may not be able to read
+ minimum 
+\begin_inset Formula $k+p$
+\end_inset
+
+ rows and form a block of minimum height required, so in that case the job
+ would bail out at the very first mapping step.
+ If this happens, one of the recourses available is to force increase in
+ the MapReduce split size using SequenceFileInputFormat.setMinSplitSize()
+ property.
+ Increasing this significantly over HDFS block size may result in network
+ IO to mappers.
+ Another caveat is that one sometimes does not want too many mappers because
+ it may in fact increase time of the computation.
+ Consequently, this option should probably be left alone unless one has
+ significant amount of mappers (as in thousands of map tasks) at which point
+ reducing amount of mappers may actually improve the throughput (just a
+ guesstimate at this point).
+ 
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--computeU
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default true).
+ Request computation of the U matrix 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--computeV
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default true).
+ Request computation of the V matrix 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--vHalfSigma
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default: false): compute 
+\begin_inset Formula $\mathbf{V}_{\sigma}=\mathbf{V}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{V}$
+\end_inset
+
+ (see overview for explanation).
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--uHalfSigma
+\begin_inset space ~
+\end_inset
+
+<true|false>
+\family default
+\size default
+ (optional, default: false): compute 
+\begin_inset Formula $\mathbf{U}_{\sigma}=\mathbf{U}\boldsymbol{\Sigma}^{0.5}$
+\end_inset
+
+ instead of 
+\begin_inset Formula $\mathbf{U}$
+\end_inset
+
+.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--reduceTasks
+\begin_inset space ~
+\end_inset
+
+<int-value>
+\family default
+\size default
+ optional.
+ The number of reducers to use (where applicable): depends on the size of
+ the hadoop cluster.
+ At this point it could also be overwritten by a standard hadoop property
+ using -D option
+\begin_inset script superscript
+
+\begin_layout Plain Layout
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "fn:(TODO:-re-verify)"
+
+\end_inset
+
+
+\end_layout
+
+\end_inset
+
+.
+
+\series bold
+\emph on
+ Probably always needs to be specified as by default Hadoop would set it
+ to 1, which is certainly far below the cluster capacity.
+ 
+\series default
+\emph default
+Recommended value for this option ~ 95% or ~190% of available reducer capacity
+ to allow for opportunistic executions.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--pca 
+\family default
+\size default
+run in pca mode: treat the input as 
+\begin_inset Formula $\mathbf{A}^{\left(pca\right)}$
+\end_inset
+
+ and also compute column-wise mean 
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+ over the input and use it to compute PCA space (unless external column
+ mean is already provided by 
+\family typewriter
+--pcaOffset
+\family default
+).
+ (see 
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+S
+\end_layout
+
+\end_inset
+
+ 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:PCA-options-in"
+
+\end_inset
+
+)
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--pcaOffset
+\begin_inset space ~
+\end_inset
+
+<
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+-path>
+\family default
+ 
+\size default
+(optional, default: none).
+ Path(glob) of external column mean.
+ The glob parameter must point to a sequence file containing single 
+\family typewriter
+VectorWritable
+\family default
+ row as the 
+\begin_inset Formula $\boldsymbol{\xi}$
+\end_inset
+
+ mean (see 
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+S
+\end_layout
+
+\end_inset
+
+ 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sec:PCA-options-in"
+
+\end_inset
+
+).
+ This option can be used if column-wise mean is already efficiently obtained
+ as byproduct from another pipeline, or if one wants to use custom centering
+ offset for the data.
+ This will save one MR pass over input since the mean will not have to be
+ computed.
+\end_layout
+
+\begin_layout Paragraph
+Standard Mahout options.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--input
+\begin_inset space ~
+\end_inset
+
+<glob>
+\family default
+\size default
+ HDFS glob specification where the DistributedRowMatrix input to be found.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--output
+\begin_inset space ~
+\end_inset
+
+<hdfs-dir>
+\family default
+\size default
+ non-existent hdfs directory where to output 
+\begin_inset Formula $\mathbf{U},\mathbf{V}$
+\end_inset
+
+ and 
+\begin_inset Formula $\boldsymbol{\Sigma}$
+\end_inset
+
+ (singular values) files.
+ 
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+--tempDir
+\begin_inset space ~
+\end_inset
+
+<temp-dir>
+\family default
+\size default
+ temporary dir where to store intermediate files (cleaned up upon normal
+ completion).
+ This is a standard Mahout optional parameter.
+\end_layout
+
+\begin_layout Labeling
+\labelwidthstring 00.00.0000
+
+\family typewriter
+\size footnotesize
+-ow,
+\begin_inset space ~
+\end_inset
+
+--overwrite
+\size default
+ 
+\family default
+overwrite output if exists.
+\end_layout
+
+\begin_layout Section
+Embedded use
+\end_layout
+
+\begin_layout Standard
+It is possible to instantiate and use 
+\family typewriter
+SSVDSolver
+\family default
+ class in embedded fashion in Hadoop-enabled applications.
+ This class would have getter and setter methods for each option available
+ via command line.
+ See javadoc for details.
+\end_layout
+
+\begin_layout Section
+Mixed environments: R+Mahout
+\end_layout
+
+\begin_layout Standard
+wip, todo, check back later.
+\end_layout
+
+\end_body
+\end_document

Added: mahout/site/trunk/content/attachments/27832158/28016861.pdf
URL: http://svn.apache.org/viewvc/mahout/site/trunk/content/attachments/27832158/28016861.pdf?rev=1360593&view=auto
==============================================================================
Files mahout/site/trunk/content/attachments/27832158/28016861.pdf (added) and mahout/site/trunk/content/attachments/27832158/28016861.pdf Thu Jul 12 09:25:54 2012 differ

Added: mahout/site/trunk/content/attachments/27832158/28016891.r
URL: http://svn.apache.org/viewvc/mahout/site/trunk/content/attachments/27832158/28016891.r?rev=1360593&view=auto
==============================================================================
--- mahout/site/trunk/content/attachments/27832158/28016891.r (added)
+++ mahout/site/trunk/content/attachments/27832158/28016891.r Thu Jul 12 09:25:54 2012
@@ -0,0 +1,135 @@
+
+# standard SSVD
+ssvd.svd <- function(x, k, p=25, qiter=0 ) { 
+
+a <- as.matrix(x)
+m <- nrow(a)
+n <- ncol(a)
+p <- min( min(m,n)-k,p)
+r <- k+p
+
+omega <- matrix ( rnorm(r*n), nrow=n, ncol=r)
+
+y <- a %*% omega
+
+q <- qr.Q(qr(y))
+
+b<- t(q) %*% a
+
+#power iterations
+for ( i in 1:qiter ) { 
+  y <- a %*% t(b)
+  q <- qr.Q(qr(y))
+  b <- t(q) %*% a
+}
+
+bbt <- b %*% t(b)
+
+e <- eigen(bbt, symmetric=T)
+
+res <- list()
+
+res$svalues <- sqrt(e$values)[1:k]
+uhat=e$vectors[1:k,1:k]
+
+res$u <- (q %*% e$vectors)[,1:k]
+res$v <- (t(b) %*% e$vectors %*% diag(1/e$values))[,1:k]
+
+return(res)
+}
+
+
+
+#############
+## ssvd with pci options
+ssvd.cpca <- function ( x, k, p=25, qiter=0, fixY=T ) { 
+
+a <- as.matrix(x)
+m <- nrow(a)
+n <- ncol(a)
+p <- min( min(m,n)-k,p)
+r <- k+p
+
+
+# compute median xi
+xi<-colMeans(a)
+
+omega <- matrix ( rnorm(r*n), nrow=n, ncol=r)
+
+y <- a %*% omega
+
+#fix y
+if ( fixY ) { 
+  #debug
+  cat ("fixing Y...\n");
+
+  s_o = t(omega) %*% cbind(xi)
+  for (i in 1:r ) y[,i]<- y[,i]-s_o[i]
+}
+
+
+q <- qr.Q(qr(y))
+
+b<- t(q) %*% a
+
+# compute sum of q rows 
+s_q <- cbind(colSums(q))
+
+# compute B*xi
+# of course in MR implementation 
+# it will be collected as sums of ( B[,i] * xi[i] ) and reduced after.
+s_b <- b %*% cbind(xi)
+
+
+#power iterations
+for ( i in 1:qiter ) { 
+
+  # fix b 
+  b <- b - s_q %*% rbind(xi) 
+
+  y <- a %*% t(b)
+
+  # fix y 
+  if ( fixY )  
+    for (i in 1:r ) y[,i]<- y[,i]-s_b[i]
+  
+
+  q <- qr.Q(qr(y))
+  b <- t(q) %*% a
+
+  # recompute s_{q}
+  s_q <- cbind(colSums(q))
+
+  #recompute s_{b}
+  s_b <- b %*% cbind(xi)
+
+}
+
+
+
+#C is the outer product of S_q and S_b per doc
+C <- s_q %*% t(s_b)
+
+# fixing BB'
+bbt <- b %*% t(b) -C -t(C) + sum(xi * xi)* (s_q %*% t(s_q))
+
+e <- eigen(bbt, symmetric=T)
+
+res <- list()
+
+res$svalues <- sqrt(e$values)[1:k]
+uhat=e$vectors[1:k,1:k]
+
+res$u <- (q %*% e$vectors)[,1:k]
+
+res$v <- (t(b- s_q %*% rbind(xi) ) %*% e$vectors %*% diag(1/e$values))[,1:k]
+
+return(res)
+
+}
+
+
+
+
+
+