Posted to commits@systemml.apache.org by du...@apache.org on 2015/12/02 02:04:42 UTC

[04/47] incubator-systemml git commit: Add Algorithms and Language References

Add Algorithms and Language References


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/a891c0a1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/a891c0a1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/a891c0a1

Branch: refs/heads/gh-pages
Commit: a891c0a1b36d62834ef4fd3da4b77c4a93ebacba
Parents: 
Author: Luciano Resende <lr...@apache.org>
Authored: Sun Aug 16 23:48:13 2015 -0700
Committer: Luciano Resende <lr...@apache.org>
Committed: Sun Aug 16 23:48:13 2015 -0700

----------------------------------------------------------------------
 Algorithms Reference/.jazzignore                |   35 +
 Algorithms Reference/BinarySVM.tex              |  154 +
 Algorithms Reference/DecisionTrees.tex          |  241 +
 Algorithms Reference/DescriptiveBivarStats.tex  |  417 +
 Algorithms Reference/DescriptiveStats.tex       |   94 +
 Algorithms Reference/DescriptiveStratStats.tex  |  285 +
 Algorithms Reference/DescriptiveUnivarStats.tex |  582 ++
 Algorithms Reference/GLM.tex                    |  409 +
 Algorithms Reference/GLMpredict.tex             |  453 +
 Algorithms Reference/Kmeans.tex                 |  350 +
 Algorithms Reference/LinReg.tex                 |  306 +
 Algorithms Reference/LogReg.tex                 |  266 +
 Algorithms Reference/MultiSVM.tex               |  153 +
 Algorithms Reference/NaiveBayes.tex             |  134 +
 Algorithms Reference/PCA.tex                    |  121 +
 .../SystemML_Algorithms_Reference.bib           |  138 +
 .../SystemML_Algorithms_Reference.pdf           |  Bin 0 -> 620293 bytes
 .../SystemML_Algorithms_Reference.tex           |  142 +
 Language Reference/PyDML Language Reference.doc |  Bin 0 -> 209408 bytes
 Language Reference/Python syntax for DML.doc    |  Bin 0 -> 207360 bytes
 Language Reference/README.txt                   |   87 +
 .../SystemML_Language_Reference.html            | 9801 ++++++++++++++++++
 22 files changed, 14168 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/a891c0a1/Algorithms Reference/.jazzignore
----------------------------------------------------------------------
diff --git a/Algorithms Reference/.jazzignore b/Algorithms Reference/.jazzignore
new file mode 100644
index 0000000..d782c85
--- /dev/null
+++ b/Algorithms Reference/.jazzignore	
@@ -0,0 +1,35 @@
+### Jazz Ignore 0
+# Ignored files and folders will not be committed, but may be modified during 
+# accept or update.  
+# - Ignore properties should contain a space separated list of filename patterns.  
+# - Each pattern is case sensitive and surrounded by braces ('{' and '}').  
+# - "*" matches zero or more characters.  
+# - "?" matches a single character.  
+# - The pattern list may be split across lines by ending the line with a 
+#     backslash and starting the next line with a tab.  
+# - Patterns in core.ignore prevent matching resources in the same 
+#     directory from being committed.  
+# - Patterns in core.ignore.recursive prevent matching resources in the current 
+#     directory and all subdirectories from being committed.  
+# - The default value of core.ignore.recursive is *.class 
+# - The default value for core.ignore is bin 
+# 
+# To ignore shell scripts and hidden files in this subtree: 
+#     e.g: core.ignore.recursive = {*.sh} {\.*} 
+# 
+# To ignore resources named 'bin' in the current directory (but allow 
+#  them in any sub directory below): 
+#     e.g: core.ignore.recursive = {*.sh} {\.*} 
+# 
+# NOTE: modifying ignore files will not change the ignore status of 
+#     Eclipse derived resources.
+
+core.ignore.recursive= 
+
+core.ignore= \
+	{SystemML_Algorithms_Reference.aux} \
+	{SystemML_Algorithms_Reference.bbl} \
+	{SystemML_Algorithms_Reference.blg} \
+	{SystemML_Algorithms_Reference.dvi} \
+	{SystemML_Algorithms_Reference.out} \
+	{SystemML_Algorithms_Reference.synctex.gz} 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/a891c0a1/Algorithms Reference/BinarySVM.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/BinarySVM.tex b/Algorithms Reference/BinarySVM.tex
new file mode 100644
index 0000000..5d59adc
--- /dev/null
+++ b/Algorithms Reference/BinarySVM.tex	
@@ -0,0 +1,154 @@
+\subsubsection{Binary-class Support Vector Machines}
+\label{l2svm}
+
+\noindent{\bf Description}
+
+Support Vector Machines are used to model the relationship between a categorical 
+dependent variable y and one or more explanatory variables denoted X. This 
+implementation learns (and predicts with) a binary class support vector machine 
+(y with domain size 2).
+\\
+
+\noindent{\bf Usage}
+
+\begin{tabbing}
+\texttt{-f} \textit{path}/\texttt{l2-svm.dml -nvargs} 
+\=\texttt{X=}\textit{path}/\textit{file} 
+  \texttt{Y=}\textit{path}/\textit{file} 
+  \texttt{icpt=}\textit{int} 
+  \texttt{tol=}\textit{double}\\
+\>\texttt{reg=}\textit{double} 
+  \texttt{maxiter=}\textit{int} 
+  \texttt{model=}\textit{path}/\textit{file}\\
+\>\texttt{Log=}\textit{path}/\textit{file}
+  \texttt{fmt=}\textit{csv}$\vert$\textit{text}
+\end{tabbing}
+
+\begin{tabbing}
+\texttt{-f} \textit{path}/\texttt{l2-svm-predict.dml -nvargs} 
+\=\texttt{X=}\textit{path}/\textit{file} 
+  \texttt{Y=}\textit{path}/\textit{file} 
+  \texttt{icpt=}\textit{int} 
+  \texttt{model=}\textit{path}/\textit{file}\\
+\>\texttt{scores=}\textit{path}/\textit{file}
+  \texttt{accuracy=}\textit{path}/\textit{file}\\
+\>\texttt{confusion=}\textit{path}/\textit{file}
+  \texttt{fmt=}\textit{csv}$\vert$\textit{text}
+\end{tabbing}
+
+%%\begin{verbatim}
+%%-f path/l2-svm.dml -nvargs X=path/file Y=path/file icpt=int tol=double
+%%                      reg=double maxiter=int model=path/file
+%%\end{verbatim}
+
+\noindent{\bf Arguments}
+
+\begin{itemize}
+\item X: Location (on HDFS) to read the matrix of feature vectors; 
+each row constitutes one feature vector.
+\item Y: Location to read the one-column matrix of (categorical) 
+labels that correspond to feature vectors in X. Binary-class labels 
+can be expressed in one of two ways: $\pm 1$ or $1/2$. Note that 
+this argument is optional for prediction.
+\item icpt (default: {\tt 0}): If set to 1 then a constant bias column is 
+added to X. 
+\item tol (default: {\tt 0.001}): Procedure terminates early if the reduction
+in objective function value is less than tolerance times the initial objective
+function value.
+\item reg (default: {\tt 1}): Regularization constant. See details to find 
+out where lambda appears in the objective function. If one were interested 
+in drawing an analogy with the C parameter in C-SVM, then C = 2/lambda. 
+Usually, cross validation is employed to determine the optimum value of 
+lambda.
+\item maxiter (default: {\tt 100}): The maximum number of iterations.
+\item model: Location (on HDFS) that contains the learnt weights.
+\item Log: Location (on HDFS) to collect various metrics (e.g., objective 
+function value etc.) that depict progress across iterations while training.
+\item fmt (default: {\tt text}): Specifies the output format: either 
+comma-separated values ({\tt csv}) or a sparse-matrix text format ({\tt text}).
+\item scores: Location (on HDFS) to store scores for a held-out test set.
+Note that this is an optional argument.
+\item accuracy: Location (on HDFS) to store the accuracy computed on a
+held-out test set. Note that this is an optional argument.
+\item confusion: Location (on HDFS) to store the confusion matrix
+computed using a held-out test set. Note that this is an optional 
+argument.
+\end{itemize}
+
+\noindent{\bf Details}
+
+Support vector machines learn a classification function by solving the
+following optimization problem ($L_2$-SVM):
+\begin{eqnarray*}
+&\textrm{argmin}_w& \frac{\lambda}{2} ||w||_2^2 + \sum_i \xi_i^2\\
+&\textrm{subject to:}& y_i w^{\top} x_i \geq 1 - \xi_i ~ \forall i
+\end{eqnarray*}
+where $x_i$ is an example from the training set with its label given by $y_i$, 
+$w$ is the vector of parameters and $\lambda$ is the regularization constant 
+specified by the user.
+
+To account for the missing bias term, one may augment the data with a column
+of constants, which is achieved by setting the intercept ({\tt icpt}) argument 
+to 1 (Hsieh et al, 2008).
+
+This implementation optimizes the primal directly (Chapelle, 2007). It uses 
+nonlinear conjugate gradient descent to minimize the objective function 
+coupled with choosing step-sizes by performing one-dimensional Newton 
+minimization in the direction of the gradient.
+\\
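+
+As an illustration only (a minimal NumPy sketch under the notation above, not
+the DML code in \texttt{l2-svm.dml}), the squared-hinge objective and its
+gradient can be written as follows, assuming a feature matrix \texttt{X} and
+labels \texttt{y} coded as $\pm 1$:
+
+\begin{verbatim}
+import numpy as np
+
+def l2svm_objective(w, X, y, lam):
+    # squared hinge loss plus L2 regularization (primal L2-SVM)
+    xi = np.maximum(0.0, 1.0 - y * (X @ w))
+    return 0.5 * lam * (w @ w) + np.sum(xi ** 2)
+
+def l2svm_gradient(w, X, y, lam):
+    # gradient of the objective; only margin violators (xi > 0) contribute
+    xi = np.maximum(0.0, 1.0 - y * (X @ w))
+    return lam * w - 2.0 * X.T @ (y * xi)
+\end{verbatim}
+
+A nonlinear conjugate gradient solver repeatedly evaluates these two functions
+and picks a step size along each search direction, as described above.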
+
+\noindent{\bf Returns}
+
+The learnt weights produced by l2-svm.dml are populated into a single-column matrix 
+and written to a file on HDFS (see model in the Arguments section). The number of rows in 
+this matrix is ncol(X) if intercept was set to 0 during invocation and ncol(X) + 1 
+otherwise. The bias term, if used, is placed in the last row. Depending on what arguments
+are provided during invocation, l2-svm-predict.dml may compute one or more of scores, 
+accuracy and confusion matrix in the output format specified. 
+\\
+
+%%\noindent{\bf See Also}
+%%
+%%In case of multi-class classification problems (y with domain size greater than 2), 
+%%please consider using a multi-class classifier learning algorithm, e.g., multi-class
+%%support vector machines (see Section \ref{msvm}). To model the relationship between 
+%%a scalar dependent variable y and one or more explanatory variables X, consider 
+%%Linear Regression instead (see Section \ref{linreg-solver} or Section 
+%%\ref{linreg-iterative}).
+%%\\
+%%
+\noindent{\bf Examples}
+
+\begin{verbatim}
+hadoop jar SystemML.jar -f l2-svm.dml -nvargs X=/user/biadmin/X.mtx 
+                                              Y=/user/biadmin/y.mtx 
+                                              icpt=0 tol=0.001 fmt=csv
+                                              reg=1.0 maxiter=100 
+                                              model=/user/biadmin/weights.csv
+                                              Log=/user/biadmin/Log.csv
+\end{verbatim}
+
+\begin{verbatim}
+hadoop jar SystemML.jar -f l2-svm-predict.dml -nvargs X=/user/biadmin/X.mtx 
+                                                      Y=/user/biadmin/y.mtx 
+                                                      icpt=0 fmt=csv
+                                                      model=/user/biadmin/weights.csv
+                                                      scores=/user/biadmin/scores.csv
+                                                      accuracy=/user/biadmin/accuracy.csv
+                                                      confusion=/user/biadmin/confusion.csv
+\end{verbatim}
+
+\noindent{\bf References}
+
+\begin{itemize}
+\item W. T. Vetterling and B. P. Flannery. \newblock{\em Conjugate Gradient Methods in Multidimensions in 
+Numerical Recipes in C - The Art of Scientific Computing}. \newblock W. H. Press and S. A. Teukolsky
+(eds.), Cambridge University Press, 1992.
+\item J. Nocedal and  S. J. Wright. Numerical Optimization, Springer-Verlag, 1999.
+\item C-J Hsieh, K-W Chang, C-J Lin, S. S. Keerthi and S. Sundararajan. \newblock{\em A Dual Coordinate 
+Descent Method for Large-scale Linear SVM.} \newblock International Conference on Machine Learning
+(ICML), 2008.
+\item Olivier Chapelle. \newblock{\em Training a Support Vector Machine in the Primal}. \newblock Neural 
+Computation, 2007.
+\end{itemize}
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/a891c0a1/Algorithms Reference/DecisionTrees.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/DecisionTrees.tex b/Algorithms Reference/DecisionTrees.tex
new file mode 100644
index 0000000..f404dfc
--- /dev/null
+++ b/Algorithms Reference/DecisionTrees.tex	
@@ -0,0 +1,241 @@
+\subsection{Decision Trees}
+\label{decision_trees}
+
+\noindent{\bf Description}
+Decision trees (for classification) are classifiers that are considered
+more interpretable than other statistical classifiers. This implementation
+is well-suited to handle large-scale data and builds a (binary) decision 
+tree in parallel.\\
+
+\noindent{\bf Usage}
+\begin{tabbing}
+\texttt{-f} \textit{path}/\texttt{decision-tree.dml -nvargs} 
+\=\texttt{X=}\textit{path}/\textit{file} 
+  \texttt{Y=}\textit{path}/\textit{file} 
+  \texttt{types=}\textit{path}/\textit{file}\\
+\>\texttt{model=}\textit{path}/\textit{file}
+  \texttt{bins=}\textit{int}
+  \texttt{depth=}\textit{int}\\
+\>\texttt{num\_leaf=}\textit{int}
+  \texttt{num\_samples=}\textit{int}\\
+\>\texttt{Log=}\textit{path}/\textit{file}
+  \texttt{fmt=}\textit{csv}$\vert$\textit{text}
+\end{tabbing}
+
+\begin{tabbing}
+\texttt{-f} \textit{path}/\texttt{decision-tree-predict.dml -nvargs} 
+\=\texttt{X=}\textit{path}/\textit{file} 
+  \texttt{Y=}\textit{path}/\textit{file} 
+  \texttt{model=}\textit{path}/\textit{file}\\
+\>\texttt{fmt=}\textit{csv}$\vert$\textit{text}
+  \texttt{accuracy=}\textit{path}/\textit{file}\\
+\>\texttt{confusion=}\textit{path}/\textit{file}
+  \texttt{predictions=}\textit{path}/\textit{file}
+\end{tabbing}
+
+\noindent{\bf Arguments}
+
+\begin{itemize}
+\item X: Location (on HDFS) to read the matrix of feature vectors; 
+each row constitutes one feature vector.
+\item Y: Location (on HDFS) to read the one-column matrix of (categorical) 
+labels that correspond to feature vectors in X. Classes are assumed to be
+contiguously labeled beginning from 1. Note that this argument is optional
+for prediction.
+\item model: Location (on HDFS) that contains the learnt decision tree.
+\item types: Location (on HDFS) that contains each feature's type. 1 denotes
+continuous-valued (scale) and 2 denotes categorical.
+\item bins (default: {\tt 50}): Number of thresholds to choose for each 
+continuous-valued feature (determined by equi-height binning). 
+\item depth (default: {\tt 10}): Maximum depth of the learnt decision tree.
+\item num\_leaf (default: {\tt 1}): Parameter that controls pruning. The tree
+is not expanded if a node receives fewer than num\_leaf training examples.
+\item num\_samples (default: {\tt 10}): Parameter that decides when to switch
+to in-memory building of subtrees. If a node $v$ receives fewer than num\_samples
+training examples, then this implementation switches to an in-memory subtree
+building procedure to build the subtree under $v$ in its entirety.
+\item Log: Location (on HDFS) that collects various useful metrics that indicate
+training progress.
+\item predictions: Location (on HDFS) to store predictions for a held-out test set.
+Note that this is an optional argument.
+\item fmt (default: {\tt text}): Specifies the output format: either 
+comma-separated values ({\tt csv}) or a sparse-matrix text format ({\tt text}).
+\item accuracy: Location (on HDFS) to store the testing accuracy from a 
+held-out test set during prediction. Note that this is an optional argument.
+\item confusion: Location (on HDFS) to store the confusion matrix
+computed using a held-out test set. Note that this is an optional 
+argument.
+\end{itemize}
+
+\noindent{\bf Details}
+ 
+Decision trees (Breiman et al, 1984) are simple models of
+classification that,  due to their structure,  are easy to
+interpret. Given an example feature vector, each node in the learnt
+tree runs a simple test on it. Based on the result of the test, the
+example is either diverted to the left subtree or to the right
+subtree. Once the example reaches a leaf, the label stored at the
+leaf is returned as the prediction for the example.
+
+\par
+
+Building a decision tree from a fully labeled training set entails
+choosing appropriate tests for each internal node in the tree and this
+is usually performed in a top-down manner. Choosing a test requires
+first choosing a feature $j$ and, depending on the type of $j$, either
+a threshold $\sigma$, in case $j$ is continuous-valued, or a subset of
+values $S \subseteq \text{Dom}(j)$, where $\text{Dom}(j)$ denotes the
+domain of $j$, in case it is categorical. For continuous-valued
+features the test is thus of the form $x_j < \sigma$ and for categorical
+features it is of the form $x_j \in S$, where $x_j$ denotes the $j^{th}$
+feature value of feature vector $x$. One way to determine which test
+to include is to compare the impurities of the subtrees induced by the
+test; this implementation uses {\it Gini impurity}.
+
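+For illustration only (a NumPy sketch, not the DML implementation in
+\texttt{decision-tree.dml}), the Gini impurity of a candidate split can be
+computed as follows:
+
+\begin{verbatim}
+import numpy as np
+
+def gini(labels):
+    # Gini impurity of a set of class labels
+    _, counts = np.unique(labels, return_counts=True)
+    p = counts / counts.sum()
+    return 1.0 - np.sum(p ** 2)
+
+def split_impurity(y_left, y_right):
+    # size-weighted impurity of the two subtrees induced by a candidate test
+    n = len(y_left) + len(y_right)
+    return (len(y_left) * gini(y_left) + len(y_right) * gini(y_right)) / n
+\end{verbatim}
+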
+\par
+
+The current implementation allows the user to specify the maximum
+depth of the learnt tree using the argument {\it depth}. It also
+allows for some automated pruning via the argument {\it num\_leaf}. If
+a node receives $\leq$ {\it num\_leaf} training examples, then a leaf
+is built in its place. Furthermore, for a continuous-valued feature
+$j$ the number of candidate thresholds $\sigma$ to choose from is of
+the order of the number of examples present in the training set. Since
+for large-scale data this can result in a large number of candidate
+thresholds, the user can limit this number via the argument {\it
+  bins}, which controls the number of candidate thresholds considered
+for each continuous-valued feature. For each continuous-valued
+feature, the implementation computes an equi-height histogram to
+generate one candidate threshold per equi-height bin. To determine the
+best value subset to split on in the case of categorical features,
+this implementation greedily includes values from the feature's domain
+until the sum of the impurities of the subtrees induced stops
+improving.
+
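+For example, the one-candidate-threshold-per-bin scheme can be sketched in a
+few lines of NumPy (illustrative only; the variable names are hypothetical):
+
+\begin{verbatim}
+import numpy as np
+
+x = np.random.rand(100000)       # one continuous-valued feature column
+bins = 50
+# equi-height (equal-frequency) binning: one candidate threshold per bin boundary
+thresholds = np.quantile(x, np.arange(1, bins) / bins)
+\end{verbatim}
+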
+\par
+
+Learning a decision tree on large-scale data has received  some
+attention in the literature. The current implementation includes logic
+for choosing tests for multiple nodes that belong to the same level in
+the decision tree in parallel (breadth-first expansion) and for
+building entire subtrees under multiple nodes in parallel (depth-first
+subtree building). Empirically it has been demonstrated that it is
+advantageous to perform breadth-first expansion for the nodes
+belonging to the top levels of the tree and to perform depth-first
+subtree building for nodes belonging to the lower levels of the tree
+(Panda et al, 2009). The parameter {\it num\_samples} controls when we
+switch to depth-first subtree building. For any node in the decision tree
+that receives $\leq$ {\it num\_samples} training examples, the subtree
+under it is built in its entirety in one shot.
+
+\par
+
+{\it Description of the model}: The learnt decision tree is represented using a matrix that
+contains at least 3 rows. Each column in the matrix contains the parameters relevant
+to a single node in the tree. The $i^{th}$ column denotes the parameters for the $i^{th}$ node
+whose left child is stored in the $2i^{th}$ column and whose right child is stored in the $(2i+1)^{th}$
+column. Here is a brief description of what each row in the matrix contains:
+\begin{itemize}
+\item $1^{st}$ row: Contains the feature index of the feature that this node looks at if 
+the node is an internal node, otherwise -1.
+\item $2^{nd}$ row: Contains the type of the feature that this node looks at if the node is
+an internal node, otherwise the label this leaf node is supposed to predict. 
+1 denotes continuous-valued feature and 2 denotes categorical.
+\item $3^{rd}$ row: Only applicable for internal nodes. Contains the threshold the example's 
+feature value is compared to if the feature chosen for this node is a continuous-valued feature. 
+If on the other hand, the feature chosen for this node is categorical then the size of the 
+subset of values is stored here.
+\item $4^{th}$ row onwards: Only applicable in the case of internal nodes where the feature
+chosen is a categorical feature. Rows $4, 5 \ldots$ depict the value subset 
+chosen for this node.
+\end{itemize}
+As an example, Figure \ref{dtree} shows a decision tree with $5$ nodes and its matrix
+representation.
+
+\begin{figure}
+\begin{minipage}{0.3\linewidth}
+\begin{center}
+\begin{tikzpicture}
+\node (labelleft) [draw,shape=circle,minimum size=16pt] at (0,0) {$2$};
+\node (labelright) [draw,shape=circle,minimum size=16pt] at (1,0) {$1$};
+\node (rootleft) [draw,shape=rectangle,minimum size=16pt] at (0.5,1) {$x_5 \in \{2,3\}$};
+\node (rootlabel) [draw,shape=circle,minimum size=16pt] at (2.5,1) {$1$};
+\node (root) [draw,shape=rectangle,minimum size=16pt] at (1.75,2) {$x_3 < 0.45$};
+
+\draw[-latex] (root) -- (rootleft);
+\draw[-latex] (root) -- (rootlabel);
+\draw[-latex] (rootleft) -- (labelleft);
+\draw[-latex] (rootleft) -- (labelright);
+
+\end{tikzpicture}
+\end{center}
+\begin{center}
+(a)
+\end{center}
+\end{minipage}
+\hfill
+\begin{minipage}{0.65\linewidth}
+\begin{center}
+\begin{tabular}{c|c|c|c|c|c|}
+& Col 1 & Col 2 & Col 3 & Col 4 & Col 5\\
+\hline
+Row 1 & 3 & 5 & -1 & -1 & -1\\
+\hline
+Row 2 & 1 & 2 & 1 & 2 & 1\\
+\hline
+Row 3 & 0.45 & 2 &  &  & \\
+\hline
+Row 4 &  & 2 &  &  & \\
+\hline
+Row 5 &  & 3 &  &  & \\
+\hline
+\end{tabular}
+\end{center}
+\begin{center}
+(b)
+\end{center}
+\end{minipage}
+\caption{(a) An example tree and its (b) matrix representation. $x$ denotes an example and $x_j$ the $j^{th}$ feature's value in it.}
+\label{dtree}
+\end{figure}
+
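+Continuing this example, the following Python sketch (an illustration only, not
+part of the provided scripts) walks such a model matrix \texttt{M} for a single
+example \texttt{x}, assuming that a passed test routes the example to the left
+child:
+
+\begin{verbatim}
+def predict_one(M, x):
+    # M: model matrix as described above; x: one feature vector
+    # (1-based feature indices, as stored in row 1 of M)
+    i = 1                                    # column index of the root node
+    while True:
+        col = M[:, i - 1]
+        feat = int(col[0])
+        if feat == -1:                       # leaf: row 2 holds the label
+            return col[1]
+        if int(col[1]) == 1:                 # continuous-valued: threshold test
+            go_left = x[feat - 1] < col[2]
+        else:                                # categorical: rows 4.. hold the subset
+            k = int(col[2])
+            go_left = x[feat - 1] in col[3:3 + k]
+        i = 2 * i if go_left else 2 * i + 1  # children at columns 2i and 2i+1
+\end{verbatim}
+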
+\vspace{16pt}
+
+\noindent{\bf Returns}
+
+The matrix corresponding to the learnt model is written to a file in the requested format. See
+the Details section above, where the structure of the model matrix is
+described. Depending on what arguments are provided during
+invocation, decision-tree-predict.dml may compute one or more of
+predictions, accuracy, and the confusion matrix in the requested output format.
+\\
+
+\noindent{\bf Examples}
+\begin{verbatim}
+hadoop jar SystemML.jar -f decision-tree.dml -nvargs 
+                           X=/user/biadmin/X.mtx 
+                           Y=/user/biadmin/y.mtx 
+                           types=/user/biadmin/types.mtx
+                           model=/user/biadmin/model.mtx
+                           bins=50 depth=10 num_leaf=1
+                           num_samples=250 fmt=csv
+                           Log=/user/biadmin/accuracy.csv
+\end{verbatim}
+
+\begin{verbatim}
+hadoop jar SystemML.jar -f decision-tree-predict.dml -nvargs 
+                           X=/user/biadmin/X.mtx 
+                           Y=/user/biadmin/y.mtx 
+                           model=/user/biadmin/model.mtx
+                           fmt=csv
+                           predictions=/user/biadmin/probabilities.csv
+                           accuracy=/user/biadmin/accuracy.csv
+                           confusion=/user/biadmin/confusion.csv
+\end{verbatim}
+
+\noindent{\bf References}
+
+\begin{itemize}
+\item B. Panda, J. Herbach, S. Basu, and R. Bayardo. \newblock{PLANET: massively parallel learning of tree ensembles with MapReduce}. In Proceedings of the VLDB Endowment, 2009.
+\item L. Breiman, J. Friedman, R. Olshen, and C. Stone. \newblock{Classification and Regression Trees}. Wadsworth and Brooks, 1984.
+\end{itemize}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/a891c0a1/Algorithms Reference/DescriptiveBivarStats.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/DescriptiveBivarStats.tex b/Algorithms Reference/DescriptiveBivarStats.tex
new file mode 100644
index 0000000..2fe872d
--- /dev/null
+++ b/Algorithms Reference/DescriptiveBivarStats.tex	
@@ -0,0 +1,417 @@
+\subsection{Bivariate Statistics}
+
+\noindent{\bf Description}
+\smallskip
+
+Bivariate statistics are used to quantitatively describe the association between
+two features in a sample, for example to test their statistical (in-)dependence
+or to measure how accurately one data feature predicts the other.
+The \BivarScriptName{} script computes common bivariate statistics,
+such as \NameStatR{} and \NameStatChi{}, in parallel for many pairs
+of data features.  For a given dataset matrix, script \BivarScriptName{} computes
+certain bivariate statistics for the given feature (column) pairs in the
+matrix.  The feature types govern the exact set of statistics computed for that pair.
+For example, \NameStatR{} can only be computed on two quantitative (scale)
+features like `Height' and `Temperature'. 
+It does not make sense to compute the linear correlation of two categorical attributes
+like `Hair Color'. 
+
+
+\smallskip
+\noindent{\bf Usage}
+\smallskip
+
+{\hangindent=\parindent\noindent\it%\tolerance=0
+{\tt{}-f }path/\/\BivarScriptName{}
+{\tt{} -nvargs}
+{\tt{} X=}path/file
+{\tt{} index1=}path/file
+{\tt{} index2=}path/file
+{\tt{} types1=}path/file
+{\tt{} types2=}path/file
+{\tt{} OUTDIR=}path
+% {\tt{} fmt=}format
+
+}
+
+
+\smallskip
+\noindent{\bf Arguments}
+\begin{Description}
+\item[{\tt X}:]
+Location (on HDFS) to read the data matrix $X$ whose columns are the features
+that we want to compare and correlate with bivariate statistics.
+\item[{\tt index1}:] % (default:\mbox{ }{\tt " "})
+Location (on HDFS) to read the single-row matrix that lists the column indices
+of the \emph{first-argument} features in pairwise statistics.
+Its $i^{\textrm{th}}$ entry (i.e.\ $i^{\textrm{th}}$ column-cell) contains the
+index $k$ of column \texttt{X[,$\,k$]} in the data matrix
+whose bivariate statistics need to be computed.
+% The default value means ``use all $X$-columns from the first to the last.''
+\item[{\tt index2}:] % (default:\mbox{ }{\tt " "})
+Location (on HDFS) to read the single-row matrix that lists the column indices
+of the \emph{second-argument} features in pairwise statistics.
+Its $j^{\textrm{th}}$ entry (i.e.\ $j^{\textrm{th}}$ column-cell) contains the
+index $l$ of column \texttt{X[,$\,l$]} in the data matrix
+whose bivariate statistics need to be computed.
+% The default value means ``use all $X$-columns from the first to the last.''
+\item[{\tt types1}:] % (default:\mbox{ }{\tt " "})
+Location (on HDFS) to read the single-row matrix that lists the \emph{types}
+of the \emph{first-argument} features in pairwise statistics.
+Its $i^{\textrm{th}}$ entry (i.e.\ $i^{\textrm{th}}$ column-cell) contains the type
+of column \texttt{X[,$\,k$]} in the data matrix, where $k$ is the $i^{\textrm{th}}$
+entry in the {\tt index1} matrix.  Feature types must be encoded by
+integer numbers: $1 = {}$scale, $2 = {}$nominal, $3 = {}$ordinal.
+% The default value means ``treat all referenced $X$-columns as scale.''
+\item[{\tt types2}:] % (default:\mbox{ }{\tt " "})
+Location (on HDFS) to read the single-row matrix that lists the \emph{types}
+of the \emph{second-argument} features in pairwise statistics.
+Its $j^{\textrm{th}}$ entry (i.e.\ $j^{\textrm{th}}$ column-cell) contains the type
+of column \texttt{X[,$\,l$]} in the data matrix, where $l$ is the $j^{\textrm{th}}$
+entry in the {\tt index2} matrix.  Feature types must be encoded by
+integer numbers: $1 = {}$scale, $2 = {}$nominal, $3 = {}$ordinal.
+% The default value means ``treat all referenced $X$-columns as scale.''
+\item[{\tt OUTDIR}:]
+Location path (on HDFS) where the output matrices with computed bivariate
+statistics will be stored.  The matrices' file names and format are defined
+in Table~\ref{table:bivars}.
+% \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+% Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv};
+% see read/write functions in SystemML Language Reference for details.
+\end{Description}
+
+\begin{table}[t]\hfil
+\begin{tabular}{|lll|}
+\hline\rule{0pt}{12pt}%
+Output File / Matrix        & Row$\,$\# & Name of Statistic   \\[2pt]
+\hline\hline\rule{0pt}{12pt}%
+\emph{All Files}            &     1     & 1-st feature column \\
+\rule{1em}{0pt}"            &     2     & 2-nd feature column \\[2pt]
+\hline\rule{0pt}{12pt}%
+bivar.scale.scale.stats     &     3     & \NameStatR          \\[2pt]
+\hline\rule{0pt}{12pt}%
+bivar.nominal.nominal.stats &     3     & \NameStatChi        \\
+\rule{1em}{0pt}"            &     4     & Degrees of freedom  \\
+\rule{1em}{0pt}"            &     5     & \NameStatPChi       \\
+\rule{1em}{0pt}"            &     6     & \NameStatV          \\[2pt]
+\hline\rule{0pt}{12pt}%
+bivar.nominal.scale.stats   &     3     & \NameStatEta        \\
+\rule{1em}{0pt}"            &     4     & \NameStatF          \\[2pt]
+\hline\rule{0pt}{12pt}%
+bivar.ordinal.ordinal.stats &     3     & \NameStatRho        \\[2pt]
+\hline
+\end{tabular}\hfil
+\caption{%
+The output matrices of \BivarScriptName{} have one row per one bivariate
+statistic and one column per one pair of input features.  This table lists
+the meaning of each matrix and each row.%
+% Signs ``+'' show applicability to scale or/and to categorical features.
+}
+\label{table:bivars}
+\end{table}
+
+
+
+\pagebreak[2]
+
+\noindent{\bf Details}
+\smallskip
+
+Script \BivarScriptName{} takes an input matrix \texttt{X} whose columns represent
+the features and whose rows represent the records of a data sample.
+Given \texttt{X}, the script computes certain relevant bivariate statistics
+for specified pairs of feature columns \texttt{X[,$\,i$]} and \texttt{X[,$\,j$]}.
+Command-line parameters \texttt{index1} and \texttt{index2} specify the files with
+column pairs of interest to the user.  Namely, the file given by \texttt{index1}
+contains the vector of the 1st-attribute column indices and the file given
+by \texttt{index2} has the vector of the 2nd-attribute column indices, with
+``1st'' and ``2nd'' referring to their places in bivariate statistics.
+Note that both \texttt{index1} and \texttt{index2} files should contain a 1-row matrix
+of positive integers.
+
+The bivariate statistics to be computed depend on the \emph{types}, or
+\emph{measurement levels}, of the two columns.
+The types for each pair are provided in the files whose locations are specified by
+\texttt{types1} and \texttt{types2} command-line parameters.
+These files are also 1-row matrices, i.e.\ vectors, that list the 1st-attribute and
+the 2nd-attribute column types in the same order as their indices in the
+\texttt{index1} and \texttt{index2} files.  The types must be provided as per
+the following convention: $1 = {}$scale, $2 = {}$nominal, $3 = {}$ordinal.
+
+The script organizes its results into (potentially) four output matrices, one for
+each type combination.  The types of bivariate statistics are defined using the types
+of the columns that were used for their arguments, with ``ordinal'' sometimes
+downgraded to ``nominal.''  Table~\ref{table:bivars} describes what each row
+in each output matrix contains.  In particular, the script includes the following
+statistics:
+\begin{Itemize}
+\item For a pair of scale (quantitative) columns, \NameStatR;
+\item For a pair of nominal columns (with finite-sized, fixed, unordered domains), 
+the \NameStatChi{} and its p-value;
+\item For a pair of one scale column and one nominal column, \NameStatF{};
+\item For a pair of ordinal columns (ordered domains depicting ranks), \NameStatRho.
+\end{Itemize}
+Note that, as shown in Table~\ref{table:bivars}, the output matrices contain the
+column indices of the features involved in each statistic.
+Moreover, if the output matrix does not contain
+a value in a certain cell then it should be interpreted as a~$0$
+(sparse matrix representation).
+
+Below we list all bivariate statistics computed by script \BivarScriptName.
+The statistics are collected into several groups by the type of their input
+features.  We refer to the two input features as $v_1$ and $v_2$ unless
+specified otherwise; the value pairs are $(v_{1,i}, v_{2,i})$ for $i=1,\ldots,n$,
+where $n$ is the number of rows in \texttt{X}, i.e.\ the sample size.
+
+
+\paragraph{Scale-vs-scale statistics.}
+Sample statistics that describe association between two quantitative (scale) features.
+A scale feature has numerical values, with the natural ordering relation.
+\begin{Description}
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it\NameStatR]:
+A measure of linear dependence between two numerical features:
+\begin{equation*}
+r \,\,=\,\, \frac{\Cov(v_1, v_2)}{\sqrt{\Var v_1 \Var v_2}}
+\,\,=\,\, \frac{\sum_{i=1}^n (v_{1,i} - \bar{v}_1) (v_{2,i} - \bar{v}_2)}%
+{\sqrt{\sum_{i=1}^n (v_{1,i} - \bar{v}_1)^{2\mathstrut} \cdot \sum_{i=1}^n (v_{2,i} - \bar{v}_2)^{2\mathstrut}}}
+\end{equation*}
+Commonly denoted by~$r$, correlation ranges between $-1$ and $+1$, reaching ${\pm}1$ when all value
+pairs $(v_{1,i}, v_{2,i})$ lie on the same line.  Correlation near~0 means that a line is not a good
+way to represent the dependence between the two features; however, this does not imply independence.
+The sign indicates direction of the linear association: $r > 0$ ($r < 0$) if one feature tends to
+linearly increase (decrease) when the other feature increases.  Nonlinear association, if present,
+may disobey this sign.
+\NameStatR{} is symmetric: $r(v_1, v_2) = r(v_2, v_1)$; it does not change if we transform $v_1$ and $v_2$
+to $a + b v_1$ and $c + d v_2$ where $a, b, c, d$ are constants and $b, d > 0$.
+
+Suppose that we use simple linear regression to represent one feature given the other, say
+represent $v_{2,i} \approx \alpha + \beta v_{1,i}$ by selecting $\alpha$ and $\beta$
+to minimize the least-squares error $\sum_{i=1}^n (v_{2,i} - \alpha - \beta v_{1,i})^2$.
+Then the best error equals
+\begin{equation*}
+\min_{\alpha, \beta} \,\,\sum_{i=1}^n \big(v_{2,i} - \alpha - \beta v_{1,i}\big)^2 \,\,=\,\,
+(1 - r^2) \,\sum_{i=1}^n \big(v_{2,i} - \bar{v}_2\big)^2
+\end{equation*}
+In other words, $1\,{-}\,r^2$ is the ratio of the residual sum of squares to
+the total sum of squares.  Hence, $r^2$ is an accuracy measure of the linear regression.
+\end{Description}
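+
+For instance (a NumPy sketch for illustration, not part of \BivarScriptName), both
+\NameStatR{} and the identity relating $1-r^2$ to the ratio of the residual to the
+total sum of squares can be checked numerically:
+
+\begin{verbatim}
+import numpy as np
+
+v1 = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+v2 = np.array([2.0, 1.0, 4.0, 6.0, 5.0])
+r = np.corrcoef(v1, v2)[0, 1]             # Pearson's correlation coefficient
+
+slope, intercept = np.polyfit(v1, v2, 1)  # least-squares simple linear regression
+rss = np.sum((v2 - (intercept + slope * v1)) ** 2)
+tss = np.sum((v2 - v2.mean()) ** 2)
+# rss / tss equals 1 - r**2 (up to floating-point round-off)
+\end{verbatim}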
+
+
+\paragraph{Nominal-vs-nominal statistics.}
+Sample statistics that describe association between two nominal categorical features.
+Both features' value domains are encoded with positive integers in arbitrary order:
+nominal features do not order their value domains.
+\begin{Description}
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it\NameStatChi]:
+A measure of how much the frequencies of value pairs of two categorical features deviate from
+statistical independence.  Under independence, the probability of every value pair must equal
+the product of probabilities of each value in the pair:
+$\Prob[a, b] - \Prob[a]\,\Prob[b] = 0$.  But we do not know these (hypothesized) probabilities;
+we only know the sample frequency counts.  Let $n_{a,b}$ be the frequency count of pair
+$(a, b)$, let $n_a$ and $n_b$ be the frequency counts of $a$~alone and of $b$~alone.  Under
+independence, difference $n_{a,b}{/}n - (n_a{/}n)(n_b{/}n)$ is unlikely to be exactly~0 due
+to sample randomness, yet it is unlikely to be too far from~0.  For some pairs $(a,b)$ it may
+deviate from~0 farther than for other pairs.  \NameStatChi{}~is an aggregate measure that
+combines squares of these differences across all value pairs:
+\begin{equation*}
+\chi^2 \,\,=\,\, \sum_{a,\,b} \Big(\frac{n_a n_b}{n}\Big)^{-1} \Big(n_{a,b} - \frac{n_a n_b}{n}\Big)^2
+\,=\,\, \sum_{a,\,b} \frac{(O_{a,b} - E_{a,b})^2}{E_{a,b}}
+\end{equation*}
+where $O_{a,b} = n_{a,b}$ are the \emph{observed} frequencies and $E_{a,b} = (n_a n_b){/}n$ are
+the \emph{expected} frequencies for all pairs~$(a,b)$.  Under independence (plus other standard
+assumptions) the sample~$\chi^2$ closely follows a well-known distribution, making it a basis for
+statistical tests for independence, see~\emph{\NameStatPChi} for details.  Note that \NameStatChi{}
+does \emph{not} measure the strength of dependence: even very weak dependence may result in a
+significant deviation from independence if the counts are large enough.  Use~\NameStatV{} instead
+to measure the strength of dependence.
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it Degrees of freedom]:
+An integer parameter required for the interpretation of~\NameStatChi{} measure.  Under independence
+(plus other standard assumptions) the sample~$\chi^2$ statistic is approximately distributed as the
+sum of $d$~squares of independent normal random variables with mean~0 and variance~1, where $d$ is
+this integer parameter.  For a pair of categorical features such that the $1^{\textrm{st}}$~feature
+has $k_1$ categories and the $2^{\textrm{nd}}$~feature has $k_2$ categories, the number of degrees
+of freedom is $d = (k_1 - 1)(k_2 - 1)$.
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it\NameStatPChi]:
+A measure of how likely we would observe the current frequencies of value pairs of two categorical
+features assuming their statistical independence.  More precisely, it computes the probability that
+the sum of $d$~squares of independent normal random variables with mean~0 and variance~1
+(called the $\chi^2$~distribution with $d$ degrees of freedom) generates a value at least as large
+as the current sample \NameStatChi.  The $d$ parameter is \emph{degrees of freedom}, see above.
+Under independence (plus other standard assumptions) the sample \NameStatChi{} closely follows the
+$\chi^2$~distribution and is unlikely to land very far into its tail.  On the other hand, if the
+two features are dependent, their sample \NameStatChi{} becomes arbitrarily large as $n\to\infty$
+and lands extremely far into the tail of the $\chi^2$~distribution given a large enough data sample.
+\NameStatPChi{} returns the tail ``weight'' on the right-hand side of \NameStatChi:
+\begin{equation*}
+P\,\,=\,\, \Prob\big[r \geq \textrm{\NameStatChi} \,\,\big|\,\, r \sim \textrm{the $\chi^2$ distribution}\big]
+\end{equation*}
+As any probability, $P$ ranges between 0 and~1.  If $P\leq 0.05$, the dependence between the two
+features may be considered statistically significant (i.e.\ their independence is considered
+statistically ruled out).  For highly dependent features, it is not unusual to have $P\leq 10^{-20}$
+or less, in which case our script will simply return $P = 0$.  Independent features should have
+their $P\geq 0.05$ in about 95\% cases.
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it\NameStatV]:
+A measure for the strength of association, i.e.\ of statistical dependence, between two categorical
+features, conceptually similar to \NameStatR.  It divides the observed~\NameStatChi{} by the maximum
+possible~$\chi^2_{\textrm{max}}$ given $n$ and the number $k_1, k_2$~of categories in each feature,
+then takes the square root.  Thus, \NameStatV{} ranges from 0 to~1,
+where 0 implies no association and 1 implies the maximum possible association (one-to-one
+correspondence) between the two features.  See \emph{\NameStatChi} for the computation of~$\chi^2$;
+its maximum${} = {}$%
+$n\cdot\min\{k_1\,{-}\,1, k_2\,{-}\,1\}$ where the $1^{\textrm{st}}$~feature
+has $k_1$ categories and the $2^{\textrm{nd}}$~feature has $k_2$ categories~\cite{AcockStavig1979:CramersV},
+so
+\begin{equation*}
+\textrm{\NameStatV} \,\,=\,\, \sqrt{\frac{\textrm{\NameStatChi}}{n\cdot\min\{k_1\,{-}\,1, k_2\,{-}\,1\}}}
+\end{equation*}
+As opposed to \NameStatPChi, which goes to~0 (rapidly) as the features' dependence increases,
+\NameStatV{} goes towards~1 (slowly) as the dependence increases.  Both \NameStatChi{} and
+\NameStatPChi{} are very sensitive to~$n$, but in \NameStatV{} this is mitigated by taking the
+ratio.
+\end{Description}
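+
+The four statistics above can be illustrated with a small NumPy/SciPy sketch
+(for exposition only; \BivarScriptName{} computes them in DML):
+
+\begin{verbatim}
+import numpy as np
+from scipy.stats import chi2
+
+O = np.array([[30., 10.],                        # observed pair counts n_{a,b}
+              [20., 40.]])
+n = O.sum()
+E = np.outer(O.sum(axis=1), O.sum(axis=0)) / n   # expected counts under independence
+chi2_stat = np.sum((O - E) ** 2 / E)             # Pearson's chi^2
+dof = (O.shape[0] - 1) * (O.shape[1] - 1)        # degrees of freedom
+p_value = chi2.sf(chi2_stat, dof)                # right-tail probability (P-value)
+cramers_v = np.sqrt(chi2_stat / (n * (min(O.shape) - 1)))
+\end{verbatim}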
+
+
+\paragraph{Nominal-vs-scale statistics.}
+Sample statistics that describe association between a categorical feature
+(order ignored) and a quantitative (scale) feature.
+The values of the categorical feature must be coded as positive integers.
+\begin{Description}
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it\NameStatEta]:
+A measure for the strength of association (statistical dependence) between a nominal feature
+and a scale feature, conceptually similar to \NameStatR.  Ranges from 0 to~1, approaching 0
+when there is no association and approaching 1 when there is a strong association.  
+The nominal feature, treated as the independent variable, is assumed to have relatively few
+possible values, all with large frequency counts.  The scale feature is treated as the dependent
+variable.  Denoting the nominal feature by~$x$ and the scale feature by~$y$, we have:
+\begin{equation*}
+\eta^2 \,=\, 1 - \frac{\sum_{i=1}^{n} \big(y_i - \hat{y}[x_i]\big)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2},
+\,\,\,\,\textrm{where}\,\,\,\,
+\hat{y}[x] = \frac{1}{\mathop{\mathrm{freq}}(x)}\sum_{i=1}^n  
+\,\left\{\!\!\begin{array}{rl} y_i & \textrm{if $x_i = x$}\\ 0 & \textrm{otherwise}\end{array}\right.\!\!\!
+\end{equation*}
+and $\bar{y} = (1{/}n)\sum_{i=1}^n y_i$ is the mean.  Value $\hat{y}[x]$ is the average 
+of~$y_i$ among all records where $x_i = x$; it can also be viewed as the ``predictor'' 
+of $y$ given~$x$.  Then $\sum_{i=1}^{n} (y_i - \hat{y}[x_i])^2$ is the residual error
+sum-of-squares and $\sum_{i=1}^{n} (y_i - \bar{y})^2$ is the total sum-of-squares for~$y$. 
+Hence, $\eta^2$ measures the accuracy of predicting $y$ with~$x$, just like the
+``R-squared'' statistic measures the accuracy of linear regression.  Our output $\eta$
+is the square root of~$\eta^2$.
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it\NameStatF]:
+A measure of how much the values of the scale feature, denoted here by~$y$,
+deviate from statistical independence on the nominal feature, denoted by~$x$.
+The same measure appears in the one-way analysis of vari\-ance (ANOVA).
+Like \NameStatChi, \NameStatF{} is used to test the hypothesis that
+$y$~is independent from~$x$, given the following assumptions:
+\begin{Itemize}
+\item The scale feature $y$ has an approximately normal distribution whose mean
+may depend only on~$x$ and whose variance is the same for all~$x$;
+\item The nominal feature $x$ has a relatively small value domain with large
+frequency counts; the $x_i$-values are treated as fixed (non-random);
+\item All records are sampled independently of each other.
+\end{Itemize}
+To compute \NameStatF{}, we first compute $\hat{y}[x]$ as the average of~$y_i$
+among all records where $x_i = x$.  These $\hat{y}[x]$ can be viewed as
+``predictors'' of $y$ given~$x$; if $y$ is independent of~$x$, they should
+``predict'' only the global mean~$\bar{y}$.  Then we form two sums-of-squares:
+\begin{Itemize}
+\item \emph{Residual} sum-of-squares of the ``predictor'' accuracy: $y_i - \hat{y}[x_i]$;
+\item \emph{Explained} sum-of-squares of the ``predictor'' variability: $\hat{y}[x_i] - \bar{y}$.
+\end{Itemize}
+\NameStatF{} is the ratio of the explained sum-of-squares to
+the residual sum-of-squares, each divided by their corresponding degrees
+of freedom:
+\begin{equation*}
+F \,\,=\,\, 
+\frac{\sum_{x}\, \mathop{\mathrm{freq}}(x) \, \big(\hat{y}[x] - \bar{y}\big)^2 \,\big/\,\, (k\,{-}\,1)}%
+{\sum_{i=1}^{n} \big(y_i - \hat{y}[x_i]\big)^2 \,\big/\,\, (n\,{-}\,k)} \,\,=\,\,
+\frac{n\,{-}\,k}{k\,{-}\,1} \cdot \frac{\eta^2}{1 - \eta^2}
+\end{equation*}
+Here $k$ is the domain size of the nominal feature~$x$.  The $k$ ``predictors''
+lose 1~freedom due to their linear dependence with~$\bar{y}$; similarly,
+the $n$~$y_i$-s lose $k$~freedoms due to the ``predictors''.
+
+The statistic can test if the independence hypothesis of $y$ from $x$ is reasonable;
+more generally (with relaxed normality assumptions) it can test the hypothesis that
+\emph{the mean} of $y$ among records with a given~$x$ is the same for all~$x$.
+Under this hypothesis \NameStatF{} has, or approximates, the $F(k\,{-}\,1, n\,{-}\,k)$-distribution.
+But if the mean of $y$ given $x$ depends on~$x$, \NameStatF{}
+becomes arbitrarily large as $n\to\infty$ (with $k$~fixed) and lands extremely far
+into the tail of the $F(k\,{-}\,1, n\,{-}\,k)$-distribution given a large enough data sample.
+\end{Description}
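+
+A small NumPy sketch (an illustration with made-up data, not the DML
+implementation) that mirrors these definitions of $\eta$ and \NameStatF{}:
+
+\begin{verbatim}
+import numpy as np
+
+x = np.array([1, 1, 2, 2, 3, 3])              # nominal feature (positive integers)
+y = np.array([1.0, 1.2, 2.0, 2.2, 3.1, 2.9])  # scale feature
+
+groups = np.unique(x)
+group_means = np.array([y[x == g].mean() for g in groups])
+y_hat = group_means[np.searchsorted(groups, x)]   # "predictor" of y given x
+
+ss_res = np.sum((y - y_hat) ** 2)             # residual sum-of-squares
+ss_tot = np.sum((y - y.mean()) ** 2)          # total sum-of-squares
+eta = np.sqrt(1.0 - ss_res / ss_tot)
+
+k, n = len(groups), len(y)
+F = (n - k) / (k - 1) * eta ** 2 / (1.0 - eta ** 2)
+\end{verbatim}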
+
+
+\paragraph{Ordinal-vs-ordinal statistics.}
+Sample statistics that describe association between two ordinal categorical features.
+Both features' value domains are encoded with positive integers, so that the natural
+order of the integers coincides with the order in each value domain.
+\begin{Description}
+%%%%%%%%%%%%%%%%%%%% DESCRIPTIVE STATISTIC %%%%%%%%%%%%%%%%%%%%
+\item[\it\NameStatRho]:
+A measure for the strength of association (statistical dependence) between
+two ordinal features, conceptually similar to \NameStatR.  Specifically, it is \NameStatR{}
+applied to the feature vectors in which all values are replaced by their ranks, i.e.\ 
+their positions if the vector is sorted.  The ranks of identical (duplicate) values
+are replaced with their average rank.  For example, in vector
+$(15, 11, 26, 15, 8)$ the value ``15'' occurs twice with ranks 3 and~4 per the sorted
+order $(8_1, 11_2, 15_3, 15_4, 26_5)$; so, both values are assigned their average
+rank of $3.5 = (3\,{+}\,4)\,{/}\,2$ and the vector is replaced by~$(3.5,\, 2,\, 5,\, 3.5,\, 1)$.
+
+Our implementation of \NameStatRho{} is geared towards features having small value domains
+and large counts for the values.  Given the two input vectors, we form a contingency table $T$
+of pairwise frequency counts, as well as a vector of frequency counts for each feature: $f_1$
+and~$f_2$.  Here in $T_{i,j}$, $f_{1,i}$, $f_{2,j}$ indices $i$ and~$j$ refer to the
+order-preserving integer encoding of the feature values.
+We use prefix sums over $f_1$ and~$f_2$ to compute the values' average ranks:
+$r_{1,i} = \sum_{j=1}^{i-1} f_{1,j} + (f_{1,i}\,{+}\,1){/}2$, and analogously for~$r_2$.
+Finally, we compute rank variances for $r_1, r_2$ weighted by counts $f_1, f_2$ and their
+covariance weighted by~$T$, before applying the standard formula for \NameStatR:
+\begin{equation*}
+\rho \,\,=\,\, \frac{\Cov_T(r_1, r_2)}{\sqrt{\Var_{f_1}(r_1)\Var_{f_2}(r_2)}}
+\,\,=\,\, \frac{\sum_{i,j} T_{i,j} (r_{1,i} - \bar{r}_1) (r_{2,j} - \bar{r}_2)}%
+{\sqrt{\sum_i f_{1,i} (r_{1,i} - \bar{r}_1)^{2\mathstrut} \cdot \sum_j f_{2,j} (r_{2,j} - \bar{r}_2)^{2\mathstrut}}}
+\end{equation*}
+where $\bar{r}_1 = \sum_i r_{1,i} f_{1,i}{/}n$, analogously for~$\bar{r}_2$.
+The value of $\rho$ lies between $-1$ and $+1$, with sign indicating the prevalent direction
+of the association: $\rho > 0$ ($\rho < 0$) means that one feature tends to increase (decrease)
+when the other feature increases.  The correlation becomes~1 when the two features are
+monotonically related.
+\end{Description}
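+
+For example (a SciPy/NumPy sketch, not the DML implementation), the tie-averaged
+ranks from the example above and the resulting \NameStatRho{} can be reproduced as:
+
+\begin{verbatim}
+import numpy as np
+from scipy.stats import rankdata
+
+v1 = np.array([15, 11, 26, 15, 8])
+v2 = np.array([ 2,  1,  3,  2, 1])
+
+r1 = rankdata(v1)     # [3.5, 2.0, 5.0, 3.5, 1.0]; ties get their average rank
+r2 = rankdata(v2)
+# Spearman's rho is Pearson's correlation applied to the two rank vectors
+rho = np.corrcoef(r1, r2)[0, 1]
+\end{verbatim}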
+
+
+\smallskip
+\noindent{\bf Returns}
+\smallskip
+
+A collection of (potentially) 4 matrices.  Each matrix contains bivariate statistics that
+resulted from a different combination of feature types.  There is one matrix for scale-scale
+statistics (which includes \NameStatR), one for nominal-nominal statistics (includes \NameStatChi{}),
+one for nominal-scale statistics (includes \NameStatF) and one for ordinal-ordinal statistics
+(includes \NameStatRho).  If any of these matrices is not produced, then no pair of columns required
+the corresponding type combination.  See Table~\ref{table:bivars} for the matrix naming and
+format details.
+
+
+\smallskip
+\pagebreak[2]
+
+\noindent{\bf Examples}
+\smallskip
+
+{\hangindent=\parindent\noindent\tt
+\hml -f \BivarScriptName{} -nvargs
+X=/user/biadmin/X.mtx 
+index1=/user/biadmin/S1.mtx 
+index2=/user/biadmin/S2.mtx 
+types1=/user/biadmin/K1.mtx 
+types2=/user/biadmin/K2.mtx 
+OUTDIR=/user/biadmin/stats.mtx
+
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/a891c0a1/Algorithms Reference/DescriptiveStats.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/DescriptiveStats.tex b/Algorithms Reference/DescriptiveStats.tex
new file mode 100644
index 0000000..d471b50
--- /dev/null
+++ b/Algorithms Reference/DescriptiveStats.tex	
@@ -0,0 +1,94 @@
+\newcommand{\UnivarScriptName}{\texttt{Univar-Stats.dml}}
+\newcommand{\BivarScriptName}{\texttt{bivar-stats.dml}}
+
+\newcommand{\OutputRowIDMinimum}{1}
+\newcommand{\OutputRowIDMaximum}{2}
+\newcommand{\OutputRowIDRange}{3}
+\newcommand{\OutputRowIDMean}{4}
+\newcommand{\OutputRowIDVariance}{5}
+\newcommand{\OutputRowIDStDeviation}{6}
+\newcommand{\OutputRowIDStErrorMean}{7}
+\newcommand{\OutputRowIDCoeffVar}{8}
+\newcommand{\OutputRowIDQuartiles}{?, 13, ?}
+\newcommand{\OutputRowIDMedian}{13}
+\newcommand{\OutputRowIDIQMean}{14}
+\newcommand{\OutputRowIDSkewness}{9}
+\newcommand{\OutputRowIDKurtosis}{10}
+\newcommand{\OutputRowIDStErrorSkewness}{11}
+\newcommand{\OutputRowIDStErrorCurtosis}{12}
+\newcommand{\OutputRowIDNumCategories}{15}
+\newcommand{\OutputRowIDMode}{16}
+\newcommand{\OutputRowIDNumModes}{17}
+\newcommand{\OutputRowText}[1]{\mbox{(output row~{#1})\hspace{0.5pt}:}}
+
+\newcommand{\NameStatR}{Pearson's correlation coefficient}
+\newcommand{\NameStatChi}{Pearson's~$\chi^2$}
+\newcommand{\NameStatPChi}{$P\textrm{-}$value of Pearson's~$\chi^2$}
+\newcommand{\NameStatV}{Cram\'er's~$V$}
+\newcommand{\NameStatEta}{Eta statistic}
+\newcommand{\NameStatF}{$F$~statistic}
+\newcommand{\NameStatRho}{Spearman's rank correlation coefficient}
+
+Descriptive statistics are used to quantitatively describe the main characteristics of the data.
+They provide meaningful summaries computed over different observations or data records
+collected in a study.  These summaries typically form the basis of the initial data exploration
+as part of a more extensive statistical analysis.  Such a quantitative analysis assumes that
+every variable (also known as an attribute, feature, or column) in the data has a specific
+\emph{level of measurement}~\cite{Stevens1946:scales}.
+
+The measurement level of a variable, often referred to as its {\bf variable type}, can be either
+\emph{scale} or \emph{categorical}.  A \emph{scale} variable represents the data measured on
+an interval scale or ratio scale.  Examples of scale variables include `Height', `Weight',
+`Salary', and `Temperature'.  Scale variables are also referred to as \emph{quantitative}
+or \emph{continuous} variables.  In contrast, a \emph{categorical} variable has a fixed
+limited number of distinct values or categories.  Examples of categorical variables
+include `Gender', `Region', `Hair color', `Zipcode', and `Level of Satisfaction'.
+Categorical variables can further be classified into two types, \emph{nominal} and
+\emph{ordinal}, depending on whether the categories in the variable can be ordered via an
+intrinsic ranking.  For example, there is no meaningful ranking among distinct values in
+`Hair color' variable, while the categories in `Level of Satisfaction' can be ranked from
+highly dissatisfied to highly satisfied.
+
+The input dataset for descriptive statistics is provided in the form of a matrix, whose
+rows are the records (data points) and whose columns are the features (i.e.~variables).
+Some scripts allow this matrix to be vertically split into two or three matrices.  Descriptive
+statistics are computed over the specified features (columns) in the matrix.  Which
+statistics are computed depends on the types of the features.  It is important to keep
+in mind the following caveats and restrictions:
+\begin{Enumerate}
+\item  Given a finite set of data records, i.e.~a \emph{sample}, we take their feature
+values and compute their \emph{sample statistics}.  These statistics
+will vary from sample to sample even if the underlying distribution of feature values
+remains the same.  Sample statistics are accurate for the given sample only.
+If the goal is to estimate the \emph{distribution statistics} that are parameters of
+the (hypothesized) underlying distribution of the features, the corresponding sample
+statistics may sometimes be used as approximations, but their accuracy will vary.
+\item  In particular, the accuracy of the estimated distribution statistics will be low
+if the number of values in the sample is small.  That is, for small samples, the computed
+statistics may depend on the randomness of the individual sample values more than on
+the underlying distribution of the features.
+\item  The accuracy will also be low if the sample records cannot be assumed mutually
+independent and identically distributed (i.i.d.), that is, sampled at random from the
+same underlying distribution.  In practice, feature values in one record often depend
+on other features and other records, including unknown ones.
+\item  Most of the computed statistics will have low estimation accuracy in the presence of
+extreme values (outliers) or if the underlying distribution has heavy tails, for example
+obeys a power law.  However, a few of the computed statistics, such as the median and
+\NameStatRho{}, are \emph{robust} to outliers.
+\item  Some sample statistics are reported with their \emph{sample standard errors}
+in an attempt to quantify their accuracy as distribution parameter estimators.  But these
+sample standard errors, in turn, only estimate the underlying distribution's standard
+errors and will have low accuracy for small or \mbox{non-i.i.d.} samples, outliers in samples,
+or heavy-tailed distributions.
+\item  We assume that the quantitative (scale) feature columns do not contain missing
+values, infinite values, \texttt{NaN}s, or coded non-numeric values, unless otherwise
+specified.  We assume that each categorical feature column contains positive integers
+from 1 to the number of categories; for ordinal features, the natural order on
+the integers should coincide with the order on the categories.
+\end{Enumerate}
+
+\input{DescriptiveUnivarStats}
+
+\input{DescriptiveBivarStats}
+
+\input{DescriptiveStratStats}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/a891c0a1/Algorithms Reference/DescriptiveStratStats.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/DescriptiveStratStats.tex b/Algorithms Reference/DescriptiveStratStats.tex
new file mode 100644
index 0000000..716cf35
--- /dev/null
+++ b/Algorithms Reference/DescriptiveStratStats.tex	
@@ -0,0 +1,285 @@
+\subsection{Stratified Bivariate Statistics}
+
+\noindent{\bf Description}
+\smallskip
+
+The {\tt stratstats.dml} script computes common bivariate statistics, such
+as correlation, slope, and their p-value, in parallel for many pairs of input
+variables in the presence of a confounding categorical variable.  The values
+of this confounding variable group the records into strata (subpopulations),
+in which all bivariate pairs are assumed free of confounding.  The script
+uses the same data model as in one-way analysis of covariance (ANCOVA), with
+strata representing population samples.  It also outputs univariate stratified
+and bivariate unstratified statistics.
+
+\begin{table}[t]\hfil
+\begin{tabular}{|l|ll|ll|ll||ll|}
+\hline
+Month of the year & \multicolumn{2}{l|}{October} & \multicolumn{2}{l|}{November} &
+    \multicolumn{2}{l||}{December} & \multicolumn{2}{c|}{Oct$\,$--$\,$Dec} \\
+Customers, millions    & 0.6 & 1.4 & 1.4 & 0.6 & 3.0 & 1.0 & 5.0 & 3.0 \\
+\hline
+Promotion (0 or 1)     & 0   & 1   & 0   & 1   & 0   & 1   & 0   & 1   \\
+Avg.\ sales per 1000   & 0.4 & 0.5 & 0.9 & 1.0 & 2.5 & 2.6 & 1.8 & 1.3 \\
+\hline
+\end{tabular}\hfil
+\caption{Stratification example: the effect of the promotion on average sales
+becomes reversed and amplified (from $+0.1$ to $-0.5$) if we ignore the months.}
+\label{table:stratexample}
+\end{table}
+
+To see how data stratification mitigates confounding, consider an (artificial)
+example in Table~\ref{table:stratexample}.  A highly seasonal retail item
+was marketed with and without a promotion over the final 3~months of the year.
+In each month, average sales were higher with the promotion than without it.
+But during the peak holiday season, when shoppers came in greater numbers and
+bought the item more often, the promotion was less frequently used.  As a result,
+if the 4-th quarter data is pooled together, the promotion's effect becomes
+reversed and magnified.  Stratifying by month restores the positive correlation.
+
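+The reversal in Table~\ref{table:stratexample} is easy to verify (an
+illustrative NumPy check, not part of {\tt stratstats.dml}): pooling the three
+months weights each month's average sales by its customer counts.
+
+\begin{verbatim}
+import numpy as np
+
+# rows: Oct, Nov, Dec; columns: without promotion, with promotion
+customers = np.array([[0.6, 1.4], [1.4, 0.6], [3.0, 1.0]])   # millions
+avg_sales = np.array([[0.4, 0.5], [0.9, 1.0], [2.5, 2.6]])   # per 1000 customers
+
+pooled = (customers * avg_sales).sum(axis=0) / customers.sum(axis=0)
+# pooled -> [1.8, 1.3]: the promotion looks harmful overall, although within
+# every single month it is associated with higher average sales
+\end{verbatim}
+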
+The script computes its statistics in parallel over all possible pairs from two
+specified sets of covariates.  The 1-st covariate is a column in input matrix~$X$
+and the 2-nd covariate is a column in input matrix~$Y$; matrices $X$ and~$Y$ may
+be the same or different.  The columns of interest are given by their index numbers
+in special matrices.  The stratum column, specified in its own matrix, is the same
+for all covariate pairs.
+
+Both covariates in each pair must be numerical, with the 2-nd covariate normally
+distributed given the 1-st covariate (see~Details).  Missing covariate values or
+strata are represented by~``NaN''.  Records with NaNs are selectively omitted
+wherever their NaNs are material to the output statistic.
+
+\smallskip
+\pagebreak[3]
+
+\noindent{\bf Usage}
+\smallskip
+
+{\hangindent=\parindent\noindent\it%
+{\tt{}-f }path/\/{\tt{}stratstats.dml}
+{\tt{} -nvargs}
+{\tt{} X=}path/file
+{\tt{} Xcid=}path/file
+{\tt{} Y=}path/file
+{\tt{} Ycid=}path/file
+{\tt{} S=}path/file
+{\tt{} Scid=}int
+{\tt{} O=}path/file
+{\tt{} fmt=}format
+
+}
+
+
+\smallskip
+\noindent{\bf Arguments}
+\begin{Description}
+\item[{\tt X}:]
+Location (on HDFS) to read matrix $X$ whose columns we want to use as
+the 1-st covariate (i.e.~as the feature variable)
+\item[{\tt Xcid}:] (default:\mbox{ }{\tt " "})
+Location to read the single-row matrix that lists all index numbers
+of the $X$-columns used as the 1-st covariate; the default value means
+``use all $X$-columns''
+\item[{\tt Y}:] (default:\mbox{ }{\tt " "})
+Location to read matrix $Y$ whose columns we want to use as the 2-nd
+covariate (i.e.~as the response variable); the default value means
+``use $X$ in place of~$Y$''
+\item[{\tt Ycid}:] (default:\mbox{ }{\tt " "})
+Location to read the single-row matrix that lists all index numbers
+of the $Y$-columns used as the 2-nd covariate; the default value means
+``use all $Y$-columns''
+\item[{\tt S}:] (default:\mbox{ }{\tt " "})
+Location to read matrix $S$ that has the stratum column.
+Note: the stratum column must contain small positive integers; all fractional
+values are rounded; stratum IDs of value ${\leq}\,0$ or NaN are treated as
+missing.  The default value for {\tt S} means ``use $X$ in place of~$S$''
+\item[{\tt Scid}:] (default:\mbox{ }{\tt 1})
+The index number of the stratum column in~$S$
+\item[{\tt O}:]
+Location to store the output matrix defined in Table~\ref{table:stratoutput}
+\item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv};
+see read/write functions in SystemML Language Reference for details.
+\end{Description}
+
+
+\begin{table}[t]\small\hfil
+\begin{tabular}{|rcl|rcl|}
+\hline
+& Col.\# & Meaning & & Col.\# & Meaning \\
+\hline
+\multirow{9}{*}{\begin{sideways}1-st covariate\end{sideways}}\hspace{-1em}
+& 01     & $X$-column number                & 
+\multirow{9}{*}{\begin{sideways}2-nd covariate\end{sideways}}\hspace{-1em}
+& 11     & $Y$-column number                \\
+& 02     & presence count for $x$           & 
+& 12     & presence count for $y$           \\
+& 03     & global mean $(x)$                & 
+& 13     & global mean $(y)$                \\
+& 04     & global std.\ dev. $(x)$          & 
+& 14     & global std.\ dev. $(y)$          \\
+& 05     & stratified std.\ dev. $(x)$      & 
+& 15     & stratified std.\ dev. $(y)$      \\
+& 06     & $R^2$ for $x \sim {}$strata      & 
+& 16     & $R^2$ for $y \sim {}$strata      \\
+& 07     & adjusted $R^2$ for $x \sim {}$strata      & 
+& 17     & adjusted $R^2$ for $y \sim {}$strata      \\
+& 08     & p-value, $x \sim {}$strata       & 
+& 18     & p-value, $y \sim {}$strata       \\
+& 09--10 & reserved                         & 
+& 19--20 & reserved                         \\
+\hline
+\multirow{9}{*}{\begin{sideways}$y\sim x$, NO strata\end{sideways}}\hspace{-1.15em}
+& 21     & presence count $(x, y)$          &
+\multirow{10}{*}{\begin{sideways}$y\sim x$ AND strata$\!\!\!\!$\end{sideways}}\hspace{-1.15em}
+& 31     & presence count $(x, y, s)$       \\
+& 22     & regression slope                 &
+& 32     & regression slope                 \\
+& 23     & regres.\ slope std.\ dev.        &
+& 33     & regres.\ slope std.\ dev.        \\
+& 24     & correlation${} = \pm\sqrt{R^2}$  &
+& 34     & correlation${} = \pm\sqrt{R^2}$  \\
+& 25     & residual std.\ dev.              &
+& 35     & residual std.\ dev.              \\
+& 26     & $R^2$ in $y$ due to $x$          &
+& 36     & $R^2$ in $y$ due to $x$          \\
+& 27     & adjusted $R^2$ in $y$ due to $x$ &
+& 37     & adjusted $R^2$ in $y$ due to $x$ \\
+& 28     & p-value for ``slope = 0''        &
+& 38     & p-value for ``slope = 0''        \\
+& 29     & reserved                         &
+& 39     & \# strata with ${\geq}\,2$ count \\
+& 30     & reserved                         &
+& 40     & reserved                         \\
+\hline
+\end{tabular}\hfil
+\caption{The {\tt stratstats.dml} output matrix has one row for each distinct
+pair of 1-st and 2-nd covariates, and 40 columns with the statistics described
+here.}
+\label{table:stratoutput}
+\end{table}
+
+
+
+
+\noindent{\bf Details}
+\smallskip
+
+Suppose we have $n$ records of format $(i, x, y)$, where $i\in\{1,\ldots, k\}$ is
+a stratum number and $(x, y)$ are two numerical covariates.  We want to analyze
+the linear relationship between $y$ and $x$ conditioned on~$i$.
+Note that $x$, but not~$y$, may represent a categorical variable if we assign a
+numerical value to each category, for example 0 and 1 for two categories.
+
+We assume a linear regression model for~$y$:
+\begin{equation}
+y_{i,j} \,=\, \alpha_i + \beta x_{i,j} + \eps_{i,j}\,, \quad\textrm{where}\,\,\,\,
+\eps_{i,j} \sim \Normal(0, \sigma^2)
+\label{eqn:stratlinmodel}
+\end{equation}
+Here $i = 1\ldots k$ is a stratum number and $j = 1\ldots n_i$ is a record number
+in stratum~$i$; by $n_i$ we denote the number of records available in stratum~$i$.
+The noise term~$\eps_{i,j}$ is assumed to have the same variance in all strata.
+When $n_i\,{>}\,0$, we can estimate the means of $x_{i, j}$ and $y_{i, j}$ in
+stratum~$i$ as
+\begin{equation*}
+\bar{x}_i \,= \Big(\sum\nolimits_{j=1}^{n_i} \,x_{i, j}\Big) / n_i\,;\quad
+\bar{y}_i \,= \Big(\sum\nolimits_{j=1}^{n_i} \,y_{i, j}\Big) / n_i
+\end{equation*}
+If $\beta$ is known, the best estimate for $\alpha_i$ is $\bar{y}_i - \beta \bar{x}_i$,
+which gives the prediction error sum-of-squares of
+\begin{equation}
+\sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(y_{i,j} - \beta x_{i,j} - (\bar{y}_i - \beta \bar{x}_i)\big)^2
+\,\,=\,\, \beta^{2\,}V_x \,-\, 2\beta \,V_{x,y} \,+\, V_y
+\label{eqn:stratsumsq}
+\end{equation}
+where $V_x$, $V_y$, and $V_{x, y}$ are, correspondingly, the ``stratified'' sample
+estimates of variance $\Var(x)$ and $\Var(y)$ and covariance $\Cov(x,y)$ computed as
+\begin{align*}
+V_x     \,&=\, \sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(x_{i,j} - \bar{x}_i\big)^2; \quad
+V_y     \,=\, \sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(y_{i,j} - \bar{y}_i\big)^2;\\
+V_{x,y} \,&=\, \sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(x_{i,j} - \bar{x}_i\big)\big(y_{i,j} - \bar{y}_i\big)
+\end{align*}
+They are stratified because we compute the sample (co-)variances in each stratum~$i$
+separately, then combine by summation.  The stratified estimates for $\Var(X)$ and $\Var(Y)$
+tend to be smaller than the non-stratified ones (with the global mean instead of $\bar{x}_i$
+and~$\bar{y}_i$) since $\bar{x}_i$ and $\bar{y}_i$ fit closer to $x_{i,j}$ and $y_{i,j}$
+than the global means.  These stratified variances estimate the uncertainty in $x_{i,j}$
+and~$y_{i,j}$ given their stratum~$i$.
+
+Minimizing over~$\beta$ the error sum-of-squares~(\ref{eqn:stratsumsq})
+gives us the regression slope estimate \mbox{$\hat{\beta} = V_{x,y} / V_x$},
+with~(\ref{eqn:stratsumsq}) becoming the residual sum-of-squares~(RSS):
+\begin{equation*}
+\mathrm{RSS} \,\,=\, \,
+\sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(y_{i,j} - 
+\hat{\beta} x_{i,j} - (\bar{y}_i - \hat{\beta} \bar{x}_i)\big)^2
+\,\,=\,\,  V_y \,\big(1 \,-\, V_{x,y}^2 / (V_x V_y)\big)
+\end{equation*}
+The quantity $\hat{R}^2 = V_{x,y}^2 / (V_x V_y)$, called \emph{$R$-squared}, estimates the fraction
+of stratified variance in~$y_{i,j}$ explained by covariate $x_{i, j}$ in the linear 
+regression model~(\ref{eqn:stratlinmodel}).  We define \emph{stratified correlation} as the
+square root of~$\hat{R}^2$ taken with the sign of~$V_{x,y}$.  We also use RSS to estimate
+the residual standard deviation $\sigma$ in~(\ref{eqn:stratlinmodel}) that models the prediction error
+of $y_{i,j}$ given $x_{i,j}$ and the stratum:
+\begin{equation*}
+\hat{\beta}\, =\, \frac{V_{x,y}}{V_x}; \,\,\,\, \hat{R} \,=\, \frac{V_{x,y}}{\sqrt{V_x V_y}};
+\,\,\,\, \hat{R}^2 \,=\, \frac{V_{x,y}^2}{V_x V_y};
+\,\,\,\, \hat{\sigma} \,=\, \sqrt{\frac{\mathrm{RSS}}{n - k - 1}}\,\,\,\,
+\Big(n = \sum_{i=1}^k n_i\Big)
+\end{equation*}
+
+The $t$-test and the $F$-test for the null-hypothesis of ``$\beta = 0$'' are
+obtained by considering the effect of $\hat{\beta}$ on the residual sum-of-squares,
+measured by the decrease from $V_y$ to~RSS.
+The $F$-statistic is the ratio of the ``explained'' sum-of-squares
+to the residual sum-of-squares, divided by their corresponding degrees of freedom.
+There are $n\,{-}\,k$ degrees of freedom for~$V_y$; estimating parameter $\beta$
+reduces that to $n\,{-}\,k\,{-}\,1$ for~RSS, and their difference $V_y - {}$RSS has just 1 degree
+of freedom:
+\begin{equation*}
+F \,=\, \frac{(V_y - \mathrm{RSS})/1}{\mathrm{RSS}/(n\,{-}\,k\,{-}\,1)}
+\,=\, \frac{\hat{R}^2\,(n\,{-}\,k\,{-}\,1)}{1-\hat{R}^2}; \quad
+t \,=\, \hat{R}\, \sqrt{\frac{n\,{-}\,k\,{-}\,1}{1-\hat{R}^2}}.
+\end{equation*}
+The $t$-statistic is simply the square root of the $F$-statistic with the appropriate
+choice of sign.  If the null hypothesis and the linear model are both true, the $t$-statistic
+has Student $t$-distribution with $n\,{-}\,k\,{-}\,1$ degrees of freedom.  We can
+also compute it if we divide $\hat{\beta}$ by its estimated standard deviation:
+\begin{equation*}
+\stdev(\hat{\beta})_{\mathrm{est}} \,=\, \hat{\sigma}\,/\sqrt{V_x} \quad\Longrightarrow\quad
+t \,=\, \hat{R}\sqrt{V_y} \,/\, \hat{\sigma} \,=\, \hat{\beta} \,/\, \stdev(\hat{\beta})_{\mathrm{est}}
+\end{equation*}
+The standard deviation estimate for~$\beta$ is included in {\tt stratstats.dml} output.
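+
+As an illustration only (not the actual DML implementation in {\tt stratstats.dml}),
+the statistics derived above can be transcribed into a small NumPy/SciPy function;
+this sketch omits the NaN handling and the parallel pairing of many covariate
+columns that the script performs:
+\begin{verbatim}
+import numpy as np
+from scipy import stats
+
+def strat_stats(x, y, s):
+    """x, y: numeric covariates; s: stratum IDs (positive integers)."""
+    x, y, s = np.asarray(x, float), np.asarray(y, float), np.asarray(s)
+    Vx = Vy = Vxy = 0.0
+    n = k = 0
+    for stratum in np.unique(s):
+        xs, ys = x[s == stratum], y[s == stratum]
+        n += xs.size; k += 1
+        dx, dy = xs - xs.mean(), ys - ys.mean()   # deviations from stratum means
+        Vx += (dx * dx).sum(); Vy += (dy * dy).sum(); Vxy += (dx * dy).sum()
+    beta  = Vxy / Vx                        # stratified regression slope
+    R2    = Vxy**2 / (Vx * Vy)              # fraction of stratified variance explained
+    R     = np.sign(Vxy) * np.sqrt(R2)      # stratified correlation
+    df    = n - k - 1
+    sigma = np.sqrt(Vy * (1.0 - R2) / df)   # residual standard deviation
+    t     = R * np.sqrt(df / (1.0 - R2))    # t-statistic for "slope = 0"
+    p     = 2.0 * stats.t.sf(abs(t), df)    # two-sided p-value
+    return beta, R, sigma, t, p
+\end{verbatim}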
+
+\smallskip
+\noindent{\bf Returns}
+\smallskip
+
+The output matrix format is defined in Table~\ref{table:stratoutput}.
+
+\smallskip
+\noindent{\bf Examples}
+\smallskip
+
+{\hangindent=\parindent\noindent\tt
+\hml -f stratstats.dml -nvargs X=/user/biadmin/X.mtx Xcid=/user/biadmin/Xcid.mtx
+  Y=/user/biadmin/Y.mtx Ycid=/user/biadmin/Ycid.mtx S=/user/biadmin/S.mtx Scid=2
+  O=/user/biadmin/Out.mtx fmt=csv
+
+}
+{\hangindent=\parindent\noindent\tt
+\hml -f stratstats.dml -nvargs X=/user/biadmin/Data.mtx Xcid=/user/biadmin/Xcid.mtx
+  Ycid=/user/biadmin/Ycid.mtx Scid=7 O=/user/biadmin/Out.mtx
+
+}
+
+%\smallskip
+%\noindent{\bf See Also}
+%\smallskip
+%
+%For non-stratified bivariate statistics with a wider variety of input data types
+%and statistical tests, see \ldots.  For general linear regression, see
+%{\tt LinearRegDS.dml} and {\tt LinearRegCG.dml}.  For logistic regression, appropriate
+%when the response variable is categorical, see {\tt MultiLogReg.dml}.
+