Posted to commits@systemml.apache.org by du...@apache.org on 2015/12/02 02:05:13 UTC

[35/47] incubator-systemml git commit: [SYSML-327] Add additional algorithm tex files to Algorithms Reference

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/53e814f8/Algorithms Reference/StepGLM.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/StepGLM.tex b/Algorithms Reference/StepGLM.tex
new file mode 100644
index 0000000..2afcf59
--- /dev/null
+++ b/Algorithms Reference/StepGLM.tex	
@@ -0,0 +1,111 @@
+\subsection{Stepwise Generalized Linear Regression}
+
+\noindent{\bf Description}
+\smallskip
+
+Our stepwise generalized linear regression script selects a model based on the Akaike information criterion (AIC): the model that gives rise to the lowest AIC is provided. Note that currently only the Bernoulli distribution family is supported (see below for details). \\
+
+\smallskip
+\noindent{\bf Usage}
+\smallskip
+
+{\hangindent=\parindent\noindent\it%
+{\tt{}-f }path/\/{\tt{}StepGLM.dml}
+{\tt{} -nvargs}
+{\tt{} X=}path/file
+{\tt{} Y=}path/file
+{\tt{} B=}path/file
+{\tt{} S=}path/file
+{\tt{} O=}path/file
+{\tt{} link=}int
+{\tt{} yneg=}double
+{\tt{} icpt=}int
+{\tt{} tol=}double
+{\tt{} disp=}double
+{\tt{} moi=}int
+{\tt{} mii=}int
+{\tt{} thr=}double
+{\tt{} fmt=}format
+
+}
+
+
+\smallskip
+\noindent{\bf Arguments}
+\begin{Description}
+	\item[{\tt X}:]
+	Location (on HDFS) to read the matrix of feature vectors; each row is
+	an example.
+	\item[{\tt Y}:]
+	Location (on HDFS) to read the response matrix, which may have 1 or 2 columns
+	\item[{\tt B}:]
+	Location (on HDFS) to store the estimated regression parameters (the $\beta_j$'s), with the
+	intercept parameter~$\beta_0$ at position {\tt B[}$m\,{+}\,1$, {\tt 1]} if available
+	\item[{\tt S}:] (default:\mbox{ }{\tt " "})
+	Location (on HDFS) to store the selected feature-ids in the order in which they were selected by the algorithm;
+	by default the selected feature-ids are written to standard output.
+	\item[{\tt O}:] (default:\mbox{ }{\tt " "})
+	Location (on HDFS) to write the summary statistics described in Table~\ref{table:GLM:stats};
+	by default the statistics are written to standard output.
+	\item[{\tt link}:] (default:\mbox{ }{\tt 2})
+	Link function code to determine the link function~$\eta = g(\mu)$, see Table~\ref{table:commonGLMs}; currently the following link functions are supported: \\
+	{\tt 1} = log,
+	{\tt 2} = logit,
+	{\tt 3} = probit,
+	{\tt 4} = cloglog.
+	\item[{\tt yneg}:] (default:\mbox{ }{\tt 0.0})
+	Response value for Bernoulli ``No'' label, usually 0.0 or -1.0
+	\item[{\tt icpt}:] (default:\mbox{ }{\tt 0})
+	Intercept and shifting/rescaling of the features in~$X$:\\
+	{\tt 0} = no intercept (hence no~$\beta_0$), no shifting/rescaling of the features;\\
+	{\tt 1} = add intercept, but do not shift/rescale the features in~$X$;\\
+	{\tt 2} = add intercept, shift/rescale the features in~$X$ to mean~0, variance~1
+	\item[{\tt tol}:] (default:\mbox{ }{\tt 0.000001})
+	Tolerance (epsilon) used in the convergence criterion: we terminate the outer iterations
+	when the deviance changes by less than this factor; see below for details.
+	\item[{\tt disp}:] (default:\mbox{ }{\tt 0.0})
+	Dispersion parameter, or {\tt 0.0} to estimate it from data
+	\item[{\tt moi}:] (default:\mbox{ }{\tt 200})
+	Maximum number of outer (Fisher scoring) iterations
+	\item[{\tt mii}:] (default:\mbox{ }{\tt 0})
+	Maximum number of inner (conjugate gradient) iterations, or~0 if no maximum
+	limit provided
+	\item[{\tt thr}:] (default:\mbox{ }{\tt 0.01})
+	Threshold to stop the algorithm: if the decrease in the AIC falls below {\tt thr},
+	no further features are checked and the algorithm stops.
+	\item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+	Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv};
+	see read/write functions in SystemML Language Reference for details.
+\end{Description}
+
+
+\noindent{\bf Details}
+\smallskip
+
+Similar to {\tt StepLinearRegDS.dml}, our stepwise GLM script builds a model by iteratively selecting predictive variables 
+using a forward selection strategy based on the AIC~(\ref{eq:AIC}).
+Note that currently only the Bernoulli distribution family ({\tt fam=2} in Table~\ref{table:commonGLMs}) is supported, in combination with the following link functions: log, logit, probit, and cloglog ({\tt link $\in\{1,2,3,4\}$} in Table~\ref{table:commonGLMs}).
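+
+For reference, and assuming the standard textbook definitions of these links, the four supported link functions relate the mean~$\mu$ to the linear predictor~$\eta$ as follows:
+\begin{gather*}
+\eta = \log \mu \;\;\textrm{(log)}, \qquad
+\eta = \log \frac{\mu}{1-\mu} \;\;\textrm{(logit)}, \\
+\eta = \Phi^{-1}(\mu) \;\;\textrm{(probit)}, \qquad
+\eta = \log \bigl( -\log (1-\mu) \bigr) \;\;\textrm{(cloglog)},
+\end{gather*}
+where $\Phi$ denotes the cumulative distribution function of the standard normal distribution. These standard forms are stated here only for reference.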
+
+
+\smallskip
+\noindent{\bf Returns}
+\smallskip
+
+Similar to the outputs from {\tt GLM.dml}, the stepwise GLM script computes the estimated regression coefficients and stores them in matrix $B$ on HDFS; matrix $B$ follows the same format as the one produced by {\tt GLM.dml} (see Section~\ref{sec:GLM}).
+Additionally, {\tt StepGLM.dml} outputs the variable indices (stored in the 1-column matrix $S$) in the order in which they were selected by the algorithm, i.e., the $i$th entry of matrix $S$ stores the index of the variable which improved the AIC the most in the $i$th iteration.
+If the model with the lowest AIC includes no variables, matrix $S$ will be empty.
+Moreover, the estimated summary statistics as defined in Table~\ref{table:GLM:stats}
+are printed out or stored in a file on HDFS (if requested);
+these statistics will be provided only if the selected model is nonempty, i.e., contains at least one variable.
+
+
+\smallskip
+\noindent{\bf Examples}
+\smallskip
+
+{\hangindent=\parindent\noindent\tt
+	\hml -f StepGLM.dml -nvargs X=/user/biadmin/X.mtx Y=/user/biadmin/Y.mtx
+	B=/user/biadmin/B.mtx S=/user/biadmin/selected.csv O=/user/biadmin/stats.csv
+	link=2 yneg=-1.0 icpt=2 tol=0.000001 moi=100 mii=10 thr=0.05 fmt=csv
+	
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/53e814f8/Algorithms Reference/StepLinRegDS.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/StepLinRegDS.tex b/Algorithms Reference/StepLinRegDS.tex
new file mode 100644
index 0000000..8adf486
--- /dev/null
+++ b/Algorithms Reference/StepLinRegDS.tex	
@@ -0,0 +1,101 @@
+\subsection{Stepwise Linear Regression}
+
+\noindent{\bf Description}
+\smallskip
+
+Our stepwise linear regression script selects a linear model based on the Akaike information criterion (AIC): 
+the model that gives rise to the lowest AIC is computed. \\
+
+\smallskip
+\noindent{\bf Usage}
+\smallskip
+
+{\hangindent=\parindent\noindent\it%
+{\tt{}-f }path/\/{\tt{}StepLinearRegDS.dml}
+{\tt{} -nvargs}
+{\tt{} X=}path/file
+{\tt{} Y=}path/file
+{\tt{} B=}path/file
+{\tt{} S=}path/file
+{\tt{} O=}path/file
+{\tt{} icpt=}int
+{\tt{} thr=}double
+{\tt{} fmt=}format
+
+}
+
+\smallskip
+\noindent{\bf Arguments}
+\begin{Description}
+\item[{\tt X}:]
+Location (on HDFS) to read the matrix of feature vectors; each row contains
+one feature vector.
+\item[{\tt Y}:]
+Location (on HDFS) to read the 1-column matrix of response values
+\item[{\tt B}:]
+Location (on HDFS) to store the estimated regression parameters (the $\beta_j$'s), with the
+intercept parameter~$\beta_0$ at position {\tt B[}$m\,{+}\,1$, {\tt 1]} if available
+\item[{\tt S}:] (default:\mbox{ }{\tt " "})
+Location (on HDFS) to store the selected feature-ids in the order in which they were selected by the algorithm;
+by default the selected feature-ids are written to standard output.
+\item[{\tt O}:] (default:\mbox{ }{\tt " "})
+Location (on HDFS) to store the CSV-file of summary statistics defined in
+Table~\ref{table:linreg:stats}; by default the summary statistics are written to standard output.
+\item[{\tt icpt}:] (default:\mbox{ }{\tt 0})
+Intercept presence and shifting/rescaling the features in~$X$:\\
+{\tt 0} = no intercept (hence no~$\beta_0$), no shifting or rescaling of the features;\\
+{\tt 1} = add intercept, but do not shift/rescale the features in~$X$;\\
+{\tt 2} = add intercept, shift/rescale the features in~$X$ to mean~0, variance~1
+\item[{\tt thr}:] (default:\mbox{ }{\tt 0.01})
+Threshold to stop the algorithm: if the decrease in the AIC falls below {\tt thr},
+no further features are checked and the algorithm stops.
+\item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv};
+see read/write functions in SystemML Language Reference for details.
+\end{Description}
+
+
+\noindent{\bf Details}
+\smallskip
+
+Stepwise linear regression iteratively selects predictive variables in an automated procedure.
+Currently, our implementation supports forward selection: starting from an empty model (without any variable), 
+the algorithm examines the addition of each variable using the AIC as the model comparison criterion. The AIC is defined as
+\begin{equation}
+AIC = -2 \log{L} + 2 edf,\label{eq:AIC}
+\end{equation}    
+where $L$ denotes the likelihood of the fitted model and $edf$ is the equivalent degrees of freedom, i.e., the number of estimated parameters. 
+This procedure is repeated until no additional variable improves the model by at least the threshold 
+specified in the input parameter {\tt thr}. 
+
+To fit a model in each iteration, we use the ``direct solve'' method as in the script {\tt LinearRegDS.dml} discussed in Section~\ref{sec:LinReg}.  
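+
+As a point of reference only (and not necessarily the exact expression evaluated by the script), note that the ``direct solve'' method fits each candidate model by minimizing the residual sum of squares ($RSS$); under the standard Gaussian error model, the AIC in~(\ref{eq:AIC}) then equals, up to an additive constant that is the same for all candidate models,
+\begin{equation*}
+n \log \left( \frac{RSS}{n} \right) + 2\, edf,
+\end{equation*}
+where $n$ denotes the number of training examples.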
+
+
+\smallskip
+\noindent{\bf Returns}
+\smallskip
+
+Similar to the outputs from {\tt LinearRegDS.dml}, the stepwise linear regression script computes 
+the estimated regression coefficients and stores them in matrix $B$ on HDFS. 
+The format of matrix $B$ is identical to the one produced by the scripts for linear regression (see Section~\ref{sec:LinReg}).
+Additionally, {\tt StepLinearRegDS.dml} outputs the variable indices (stored in the 1-column matrix $S$) 
+in the order in which they were selected by the algorithm, i.e., the $i$th entry of matrix $S$ corresponds to 
+the variable which improved the AIC the most in the $i$th iteration.
+If the model with the lowest AIC includes no variables, matrix $S$ will be empty (it contains a single~0).
+Moreover, the estimated summary statistics as defined in Table~\ref{table:linreg:stats}
+are printed out or stored in a file (if requested). 
+If an empty model achieves the best AIC, these statistics are not produced. 
+
+
+\smallskip
+\noindent{\bf Examples}
+\smallskip
+
+{\hangindent=\parindent\noindent\tt
+	\hml -f StepLinearRegDS.dml -nvargs X=/user/biadmin/X.mtx Y=/user/biadmin/Y.mtx
+	B=/user/biadmin/B.mtx S=/user/biadmin/selected.csv O=/user/biadmin/stats.csv
+	icpt=2 thr=0.05 fmt=csv
+	
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/53e814f8/Algorithms Reference/SystemML_Algorithms_Reference.bib
----------------------------------------------------------------------
diff --git a/Algorithms Reference/SystemML_Algorithms_Reference.bib b/Algorithms Reference/SystemML_Algorithms_Reference.bib
index 8dce564..878e1dc 100644
--- a/Algorithms Reference/SystemML_Algorithms_Reference.bib	
+++ b/Algorithms Reference/SystemML_Algorithms_Reference.bib	
@@ -136,3 +136,80 @@
    pages        = {677--680}
 }
 
+@book{collett2003:kaplanmeier,
+  title={Modelling Survival Data in Medical Research, Second Edition},
+  author={Collett, D.},
+  isbn={9781584883258},
+  lccn={2003040945},
+  series={Chapman \& Hall/CRC Texts in Statistical Science},
+  year={2003},
+  publisher={Taylor \& Francis}
+}
+
+@article{PetoPABCHMMPS1979:kaplanmeier,
+    title = {{Design and analysis of randomized clinical trials requiring prolonged observation of each patient. II. Analysis and examples}},
+    author = {Peto, R. and Pike, M. C. and Armitage, P. and Breslow, N. E. and Cox, D. R. and Howard, S. V. and Mantel, N. and McPherson, K. and Peto, J. and Smith, P. G.},
+    journal = {British Journal of Cancer},
+    number = {1},
+    pages = {1--39},
+    volume = {35},
+    year = {1977}
+}
+
+@inproceedings{ZhouWSP08:als,
+  author    = {Yunhong Zhou and
+               Dennis M. Wilkinson and
+               Robert Schreiber and
+               Rong Pan},
+  title     = {Large-Scale Parallel Collaborative Filtering for the Netflix Prize},
+  booktitle = {Algorithmic Aspects in Information and Management, 4th International
+               Conference, {AAIM} 2008, Shanghai, China, June 23-25, 2008. Proceedings},
+  pages     = {337--348},
+  year      = {2008}
+}
+
+@book{BreimanFOS84:dtree,
+  author    = {Leo Breiman and
+               J. H. Friedman and
+               R. A. Olshen and
+               C. J. Stone},
+  title     = {Classification and Regression Trees},
+  publisher = {Wadsworth},
+  year      = {1984},
+  isbn      = {0-534-98053-8},
+  timestamp = {Thu, 03 Jan 2002 11:51:52 +0100},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/books/wa/BreimanFOS84},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@article{PandaHBB09:dtree,
+  author    = {Biswanath Panda and
+               Joshua Herbach and
+               Sugato Basu and
+               Roberto J. Bayardo},
+  title     = {{PLANET:} Massively Parallel Learning of Tree Ensembles with MapReduce},
+  journal   = {{PVLDB}},
+  volume    = {2},
+  number    = {2},
+  pages     = {1426--1437},
+  year      = {2009},
+  url       = {http://www.vldb.org/pvldb/2/vldb09-537.pdf},
+  timestamp = {Wed, 02 Sep 2009 09:21:18 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/pvldb/PandaHBB09},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
+
+@article{Breiman01:rforest,
+  author    = {Leo Breiman},
+  title     = {Random Forests},
+  journal   = {Machine Learning},
+  volume    = {45},
+  number    = {1},
+  pages     = {5--32},
+  year      = {2001},
+  url       = {http://dx.doi.org/10.1023/A:1010933404324},
+  doi       = {10.1023/A:1010933404324},
+  timestamp = {Thu, 26 May 2011 15:25:18 +0200},
+  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/ml/Breiman01},
+  bibsource = {dblp computer science bibliography, http://dblp.org}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/53e814f8/Algorithms Reference/SystemML_Algorithms_Reference.pdf
----------------------------------------------------------------------
diff --git a/Algorithms Reference/SystemML_Algorithms_Reference.pdf b/Algorithms Reference/SystemML_Algorithms_Reference.pdf
index 4d4ea6a..4087ba5 100644
Binary files a/Algorithms Reference/SystemML_Algorithms_Reference.pdf and b/Algorithms Reference/SystemML_Algorithms_Reference.pdf differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/53e814f8/Algorithms Reference/SystemML_Algorithms_Reference.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/SystemML_Algorithms_Reference.tex b/Algorithms Reference/SystemML_Algorithms_Reference.tex
index 7dccad0..19988fe 100644
--- a/Algorithms Reference/SystemML_Algorithms_Reference.tex	
+++ b/Algorithms Reference/SystemML_Algorithms_Reference.tex	
@@ -1,10 +1,11 @@
 \documentclass[letter]{article}
 \usepackage{graphicx,amsmath,amssymb,amsthm,subfigure,color,url,multirow,rotating,comment}
-%\usepackage{tikz}
+\usepackage{tikz}
 \usepackage[normalem]{ulem}
 \usepackage[np,autolanguage]{numprint}
+\usepackage{tabularx}
 
-\usepackage[]{hyperref}
+\usepackage[pdftex]{hyperref}
 \hypersetup{
     unicode=false,          % non-Latin characters in Acrobat’s bookmarks
     pdftoolbar=true,        % show Acrobat’s toolbar?
@@ -92,7 +93,9 @@
 
 \input{NaiveBayes}
 
-%\input{DecisionTrees}
+\input{DecisionTrees}
+
+\input{RandomForest}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Clustering}
@@ -106,28 +109,36 @@
 
 \input{LinReg}
 
-\newpage
+\input{StepLinRegDS}
 
 \input{GLM}
 
-\newpage
+\input{StepGLM}
 
 \input{GLMpredict.tex}
 
-\newpage
-
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Matrix Factorization}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 \input{pca}
 
+\input{ALS.tex}
+
+%%{\color{red}\subsection{GNMF}}
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%{\color{red}\section{Sequence Mining}}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%\section{Matrix Factorization}
 
-%%{\color{red}\subsection{GNMF}}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Survival Analysis}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\input{KaplanMeier}
+
+\input{Cox}
 
 \bibliographystyle{abbrv}