Posted to commits@madlib.apache.org by ri...@apache.org on 2016/09/11 18:51:55 UTC

incubator-madlib git commit: Design doc: Fix SVM writeup, remove old-style format

Repository: incubator-madlib
Updated Branches:
  refs/heads/master e1c99c153 -> ceafc8d71


Design doc: Fix SVM writeup, remove old-style format


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ceafc8d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ceafc8d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ceafc8d7

Branch: refs/heads/master
Commit: ceafc8d71112ccab8b23b888e0aa8cb66f7261b8
Parents: e1c99c1
Author: Rahul Iyer <ri...@apache.org>
Authored: Sun Sep 11 11:51:18 2016 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Sun Sep 11 11:51:18 2016 -0700

----------------------------------------------------------------------
 doc/design/design.tex                           |   3 +-
 doc/design/modules/ARIMA.tex                    |  78 +++---
 doc/design/modules/SVM.tex                      | 276 +++++++++----------
 doc/design/modules/cox-proportional-hazards.tex |  56 ++--
 doc/design/modules/crf.tex                      |  12 +-
 doc/design/modules/lda.tex                      |   6 +-
 doc/design/modules/linear-systems.tex           |   2 +-
 doc/design/modules/regression.tex               |  38 +--
 doc/literature.bib                              |  48 ++--
 9 files changed, 262 insertions(+), 257 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/design.tex
----------------------------------------------------------------------
diff --git a/doc/design/design.tex b/doc/design/design.tex
index dfdf6f6..516a23a 100644
--- a/doc/design/design.tex
+++ b/doc/design/design.tex
@@ -8,11 +8,11 @@
 \usepackage[sumlimits]{amsmath}
 \usepackage{amssymb}
 \usepackage{relsize}
-
 \usepackage[hyperref]{ntheorem}
 \usepackage[
 	bookmarks,
 	colorlinks=false,
+	breaklinks=true,
 	linkcolor=blue,
 	citecolor=blue,
 	pagebackref=false,
@@ -29,6 +29,7 @@
 	firstinits=true
 ]{biblatex}
 \usepackage{scrpage2}                  % Headers and footers
+\usepackage{scrhack}
 \usepackage{color}                     % Colors, possibly only for \todo
 \usepackage{enumitem}                  % enumerate environment
 \usepackage{ctable}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/modules/ARIMA.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/ARIMA.tex b/doc/design/modules/ARIMA.tex
index e50fabe..b70c478 100644
--- a/doc/design/modules/ARIMA.tex
+++ b/doc/design/modules/ARIMA.tex
@@ -13,9 +13,9 @@
 % Abstract. What is the problem we want to solve?
 \section{Introduction}
 
-An ARIMA model is an {\it a}uto-{\it r}egressive {\it i}ntegrated {\it
-  m}oving {\it a}verage model.  An ARIMA model is typically expressed
-in the form  
+An ARIMA model is an \textit{a}uto-\textit{r}egressive \textit{i}ntegrated
+\textit{m}oving \textit{a}verage model.  An ARIMA model is typically expressed
+in the form
 \begin{equation}
 (1 - \phi(B)) Y_t  = (1 + \theta(B)) Z_t,
 \end{equation}
@@ -23,22 +23,22 @@ where $B$ is the backshift operator. The time $t$ is from $1$ to $N$.
 
 ARIMA models involve the following variables:
 \begin{enumerate}
-   \item The lag difference $Y_{t}$, where  $Y_{t} = (1-B)^{d}(X_{t} - \mu)$.  
+   \item The lag difference $Y_{t}$, where  $Y_{t} = (1-B)^{d}(X_{t} - \mu)$.
     \item The values of the time series $X_t$.
     \item $p$, $q$, and $d$ are the parameters of the ARIMA model.
       $d$ is the differencing order, $p$ is the order of the AR
-      operator, and $q$ is the order of the MA operator.   
-    \item The AR operator $\phi(B)$. 
+      operator, and $q$ is the order of the MA operator.
+    \item The AR operator $\phi(B)$.
     \item The MA operator $\theta(B)$.
     \item The mean value $\mu$, which is always set to be zero for
-      $d>0$ or need to be estimated. 
-    \item The error terms $Z_t$.  
+      $d>0$, or otherwise needs to be estimated.
+    \item The error terms $Z_t$.
 \end{enumerate}
 
 \subsection{AR \& MA Operators}
 The  auto regression operator models the prediction for the next
 observation  as some linear combination of the previous observations.
-More formally, an AR operator of order $p$ is defined as  
+More formally, an AR operator of order $p$ is defined as
 \begin{align}
 \phi(B) Y_t= \phi_1 Y_{t-1}   + \dots +  \phi_{p} Y_{t-p}
 \end{align}
@@ -46,7 +46,7 @@ More formally, an AR operator of order $p$ is defined as
 The moving average operator is similar, and it models the prediction
 for the next observation as a linear combination of the previous
 prediction errors.  More formally, the MA operator of order
-$q$ is defined as  
+$q$ is defined as
 \begin{align}
 \theta(B) Z_t =   \theta_{1} Z_{t-1} + \dots + \theta_{q} Z_{t-q}.
 \end{align}
@@ -54,7 +54,7 @@ $q$ is defined as
 \section{Solving for the model parameters}\label{sec:para_est}
 
 \subsection{Least Squares}\label{sec:CLS}
-We assume that 
+We assume that
 \begin{equation}
 \Pr(Z_t) = \frac{1}{\sqrt{2 \pi \sigma^2}} e^{-Z^2_t/2 \sigma^2}, \quad t > 0
 \end{equation}
@@ -84,7 +84,7 @@ l(\phi, \theta) &= \sum_{t = 1}^N \ln \left(\frac{1}{\sqrt{2 \pi \sigma^2}} e^{-
 \end{align}
 Thus, finding the maximum likelihood is equivalent to solving the
 optimization problem (known as the conditional least squares
-formation) 
+formulation)
 \begin{align}
 \min_{\theta, \phi} \sum_{t = 1}^N  Z^2_t.
 \end{align}
@@ -92,7 +92,7 @@ The error term $Z_t$ can be computed iteratively as follows:
 \begin{align}\label{equ:error terms}
 Z_t = X_t - F_t(\phi, \theta, \mu)
 \end{align}
-where 
+where
 \begin{align}
 F_t(\phi, \theta, \mu) = \mu + \sum_{i=1}^p \phi_i (X_{t-i}-\mu) + \sum_{i=1}^q \theta_i Z_{t-i}
 \end{align}
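
As an illustrative aside, this recursion translates directly into code. A minimal NumPy sketch (hypothetical names, not MADlib code) of the residuals Z_t and the conditional-least-squares objective, with Z_t fixed to 0 for t <= p as in the initial condition used later in this section:

    import numpy as np

    def css_residuals(x, phi, theta, mu=0.0):
        """Z_t = X_t - F_t(phi, theta, mu), with Z_t fixed to 0 for t <= p."""
        x = np.asarray(x, dtype=float)
        p, q, n = len(phi), len(theta), len(x)
        z = np.zeros(n)
        for t in range(p, n):
            ar = sum(phi[i] * (x[t - i - 1] - mu) for i in range(p))
            ma = sum(theta[j] * z[t - j - 1] for j in range(q) if t - j - 1 >= 0)
            z[t] = x[t] - (mu + ar + ma)
        return z

    def css_objective(params, x, p, q):
        """Sum of squared residuals, the quantity minimized over (phi, theta, mu)."""
        phi, theta, mu = params[:p], params[p:p + q], params[-1]
        return float(np.sum(css_residuals(x, phi, theta, mu) ** 2))

A generic optimizer, or the Levenberg-Marquardt procedure described next, can then minimize css_objective over the parameter vector.
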
@@ -103,7 +103,7 @@ also known as the damped least-squares (DLS) method, provides a
 numerical solution to the problem of minimizing a function, generally
 nonlinear, over a space of parameters of the function. These
 minimization problems arise especially in least squares curve fitting
-and nonlinear programming. 
+and nonlinear programming.
 
 %Author's (Mark W) Note: This is taken from http://people.duke.edu/~hpgavin/ce281/lm.pdf
 To understand the Levenberg-Marquardt algorithm, it helps to know the
@@ -117,7 +117,7 @@ Levenberg-Marquardt algorithm tries to get the best of both worlds,
 and combine the gradient descent step with Gauss-Newton step in a
 weighted average.  For iterates far from the true solution, the step
 favors the gradient descent step, but as the iterate approaches the
-true solution, the Gauss-Newton step dominates.   
+true solution, the Gauss-Newton step dominates.
 
 %Author's (Mark W) Note: Sudo code taken from http://users.ics.forth.gr/~lourakis/levmar/levmar.pdf
 Like other numeric minimization algorithms, LMA is an iterative
@@ -126,7 +126,7 @@ initial guess for the parameter vector, $p$, as well as some tuning
 parameters $\tau, \epsilon_1, \epsilon_2, \epsilon_3,$ and $k_{max}$.
 Let $Z(p)$ be the vector of calculated errors ($Z_t$'s) for the
 parameter vector $p$, and let $J = (J_{1}, J_{2}, \dots, J_N)^T$
-be a Jacobian matrix.   
+be a Jacobian matrix.
 
 A proposed implementation is as follows:
 \begin{algorithm}
@@ -154,7 +154,7 @@ A proposed implementation is as follows:
 				\State $\rho \leftarrow (\| Z(\vec{\phi},\vec{\theta},\mu)\|^2 - \| Z(\vec{\phi}_{new},\vec{\theta}_{new},\mu_{new})\|^2 )/(\delta^T(u \delta + g))$ \Comment{Calculate improvement of trial step}
 				\If{$\rho > 0$} \Comment{Trial step was good, proceed to next iteration}
 					\State $(\vec{\phi},\vec{\theta},\mu) \leftarrow (\vec{\phi}_{new},\vec{\theta}_{new},\mu_{new})$ \Comment{Update variables}
-					\State Calculate $Z(\vec{\phi},\vec{\theta},\mu)$ with equation \ref{equ:error terms}.  
+					\State Calculate $Z(\vec{\phi},\vec{\theta},\mu)$ with equation \ref{equ:error terms}.
 					\State $A \leftarrow J^T J$
 					\State $g \leftarrow J^T Z(\vec{\phi},\vec{\theta},\mu)$
 					\State $ \text{stop} \leftarrow (\|g\|_{\infty} \le \epsilon_1)$ or $(\| Z(\vec{\phi},\vec{\theta},\mu)^2 \| \le \epsilon_3)$  \Comment{Terminate if we are close to the solution.}
@@ -172,15 +172,15 @@ A proposed implementation is as follows:
 \label{alg: Levenberg-Marquardt}
 \end{algorithm}
 
-Suggested values for the tuning parameters are $\epsilon_1 = \epsilon_2 = \epsilon_3 = 10^{-15}, \tau = 10^{-3}$ and $k_{max} = 100$.  
+Suggested values for the tuning parameters are $\epsilon_1 = \epsilon_2 = \epsilon_3 = 10^{-15}, \tau = 10^{-3}$ and $k_{max} = 100$.
 \subsubsection{Partial Derivatives}\label{sec:partial der}
-The Jacobian matrix $J = (J_{1}, J_{2}, \dots, J_N)^T$ requires the partial derivatives, which are 
+The Jacobian matrix $J = (J_{1}, J_{2}, \dots, J_N)^T$ requires the partial derivatives, which are
 \begin{align}
 J_t = (J_{t, \phi_1}, \dots, J_{t,\phi_p}, J_{t,\theta_1}, \dots,
 J_{t,\theta_q}, J_{t,\mu})^T\ .
 \end{align}
 Here the last term is present only when \texttt{include\_mean} is
-\texttt{True}. 
+\texttt{True}.
 The iteration relations for $J$ are
 \begin{align}
 J_{t, \phi_i} &= \frac{\partial F_t(\phi,\theta)}{\partial \phi_i} =
@@ -204,10 +204,10 @@ $\vec{\phi}$ and $\vec{\theta}$. The initial conditions for the above
 equations are
 \begin{equation}
 J_{t,\phi_i} = J_{t,\theta_j} = J_{t,\mu} = 0 \quad \mbox{for }
-t \leq p, \mbox{ and } i=1,\dots,p; j = 1, \dots, q\ , 
+t \leq p, \mbox{ and } i=1,\dots,p; j = 1, \dots, q\ ,
 \end{equation}
 because we have fixed $Z_t$ for $t\leq p$ to be a constant $0$ in the initial
-condition. Note that $J$ is zero not only for $t\leq 
+condition. Note that $J$ is zero not only for $t\leq
 0$ but also for $t\leq p$.
 
 \subsection{Estimates of  Other Quantities}
@@ -215,7 +215,7 @@ Finally the variance of the residuals is
 \begin{equation}
 \sigma^2 = \frac{1}{N-p}\sum_{t=1}^N Z_t^2\ . \label{eq:s2}
 \end{equation}
-The estimate for the maximized log-likelihood is 
+The estimate for the maximized log-likelihood is
 \begin{equation}
 l = -\frac{N}{2}\left[1 + \log(2\pi\sigma^2)\right]\ , \label{eq:loglik-R}
 \end{equation}
@@ -225,21 +225,21 @@ Eq. (\ref{eq:loglikelihood}), you will get a result slightly different
 from Eq. (\ref{eq:loglik-R}). However, Eq. (\ref{eq:loglik-R}) is what
 R uses for the method \texttt{``CSS''}.
 
-The standard error for coefficient $a$, where $a=\phi_1,\dots,\phi_p,\theta_1,\dots,\theta_q,\mu$, is 
+The standard error for coefficient $a$, where $a=\phi_1,\dots,\phi_p,\theta_1,\dots,\theta_q,\mu$, is
 \begin{equation}
 \mbox{error}_a = \sqrt{(H^{-1})_{aa}}\ .
 \end{equation}
-The Hessian matrix is 
+The Hessian matrix is
 \begin{equation}
 H_{ab} = \frac{\partial^2}{\partial a \partial b}
 \left(\frac{1}{2\sigma^2}\sum_{t=1}^N Z_t^2 \right) =
-\frac{1}{\sigma^2}\sum_{t=1}^N 
-\left(J_{t,a}J_{t,b} - 
+\frac{1}{\sigma^2}\sum_{t=1}^N
+\left(J_{t,a}J_{t,b} -
   Z_t K_{t,ab} \right) = \frac{1}{\sigma^2}\left(
   A - \sum_{t=1}^N Z_t K_{t,ab}\right)\ ,
 \end{equation}
 where $a,b=\phi_1,\dots,\phi_p,\theta_1,\dots,\theta_q,\mu$,
-$\sigma^2$ is given by Eq. (\ref{eq:s2}), $A=J^TJ$ and 
+$\sigma^2$ is given by Eq. (\ref{eq:s2}), $A=J^TJ$ and
 \begin{equation}
 K_{t,ab}=\frac{\partial J_{t,a}}{\partial b} = - \frac{\partial^2
   Z_t}{\partial a \partial b} \ .
@@ -290,7 +290,7 @@ H_{ab} &=&
 H_{aa} &=& \frac{f(a+\Delta) - 2f(a) + f(a-\Delta)}{\Delta^2}
 \end{eqnarray}
 where $\Delta$ is a small number and the coefficients other than $a$
-and $b$ are ignored from the arguments of $f(\cdot)$ for simplicity. 
+and $b$ are ignored from the arguments of $f(\cdot)$ for simplicity.
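
As an illustrative aside, a small sketch (hypothetical names) of this difference approximation: the diagonal entry follows the formula above, and the off-diagonal entry uses the standard four-point central difference.

    import numpy as np

    def numeric_hessian(f, coef, delta=1e-4):
        """Central-difference approximation of the Hessian of f at coef."""
        coef = np.asarray(coef, dtype=float)
        k = len(coef)
        hess = np.zeros((k, k))
        f0 = f(coef)
        for a in range(k):
            ea = np.zeros(k); ea[a] = delta
            # Diagonal entry: (f(a+d) - 2 f(a) + f(a-d)) / d^2, as above.
            hess[a, a] = (f(coef + ea) - 2.0 * f0 + f(coef - ea)) / delta**2
            for b in range(a + 1, k):
                eb = np.zeros(k); eb[b] = delta
                # Off-diagonal entry: standard four-point central difference.
                hess[a, b] = (f(coef + ea + eb) - f(coef + ea - eb)
                              - f(coef - ea + eb) + f(coef - ea - eb)) / (4.0 * delta**2)
                hess[b, a] = hess[a, b]
        return hess
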
 
 
 \subsubsection{Implementation}
@@ -304,21 +304,21 @@ The key of LMA is to compute Jacobian matrix $J$ in each iteration. In order to
 \section{Solving for the optimal model}\label{sec:model_opt}
 
 \subsection{Auto-Correlation Function}
-Note that there several common definitions of the auto-correlation function.  This implementation uses the normalized form.  
+Note that there are several common definitions of the auto-correlation function.  This implementation uses the normalized form.
 
-The auto-correlation function  is a cross-correlation of a function (or time-series) with itself, and is typically used to find periodic behavior in a time-series.  For a real, discrete time series, the auto-correlation  $R(k)$ for lag $k$ of a time series $X$ with $N$ data points  is 
+The auto-correlation function  is a cross-correlation of a function (or time-series) with itself, and is typically used to find periodic behavior in a time-series.  For a real, discrete time series, the auto-correlation  $R(k)$ for lag $k$ of a time series $X$ with $N$ data points  is
 \begin{align}\label{equ:auto corr}
-R_X(k) =\sum_{t=k+1}^N  \frac{(x_t -\mu) (x_{t-k} - \mu)}{N \sigma^2} .  
+R_X(k) =\sum_{t=k+1}^N  \frac{(x_t -\mu) (x_{t-k} - \mu)}{N \sigma^2} .
 \end{align}
-where $\sigma^2$ and $\mu$ are the variance and mean of the time series respectively.    
-For this implementation, the range of desired $k$ values will be small ($\approx 10\log(N)$ ), and the auto-correlation function for the range can be computed naively with equation \ref{equ:auto corr}.  
+where $\sigma^2$ and $\mu$ are the variance and mean of the time series respectively.
+For this implementation, the range of desired $k$ values will be small ($\approx 10\log(N)$ ), and the auto-correlation function for the range can be computed naively with equation \ref{equ:auto corr}.
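
As an illustrative aside, a minimal NumPy sketch (hypothetical names) of this naive computation; max_lag would be on the order of 10 log(N) as noted above:

    import numpy as np

    def autocorrelation(x, max_lag):
        """Normalized auto-correlation R_X(k) for k = 1..max_lag."""
        x = np.asarray(x, dtype=float)
        n, mu, var = len(x), x.mean(), x.var()
        # R(k) = sum_{t=k+1}^{N} (x_t - mu)(x_{t-k} - mu) / (N * sigma^2)
        return np.array([np.sum((x[k:] - mu) * (x[:n - k] - mu)) / (n * var)
                         for k in range(1, max_lag + 1)])
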
 
 \subsection{Partial Auto-Correlation Function}
-%Author's note: this material is taken from http://sfb649.wiwi.hu-berlin.de/fedc_homepage/xplore/tutorials/sfehtmlnode59.html 
+%Author's note: this material is taken from http://sfb649.wiwi.hu-berlin.de/fedc_homepage/xplore/tutorials/sfehtmlnode59.html
 %Some definitions are also from http://sfb649.wiwi.hu-berlin.de/fedc_homepage/xplore/tutorials/sfehtmlnode48.html
-The partial auto-correlation function is a conceptually simple extension to the auto-correlation function, but greatly increases the complexity of the calculations.  The partial auto-correlation is the correlation for lag $k$ after all correlations from lags $<k$ have been accounted for. 
+The partial auto-correlation function is a conceptually simple extension to the auto-correlation function, but greatly increases the complexity of the calculations.  The partial auto-correlation is the correlation for lag $k$ after all correlations from lags $<k$ have been accounted for.
 
-Let 
+Let
 \begin{align}
 R_{(k)} \equiv  \left[ R_X(1), R_X(2), \dots, R_X(k)\right]^T
 \end{align}
@@ -331,11 +331,11 @@ R_X(1) & 1 & \dots & R_X(k-2) \\
 R_X(k-1) &R_X(k-2) & \dots & 1 \end{matrix} \right]
 \end{align}
 
-Then the partial auto-correlation function $\Phi(k)$ for lag $k$ is 
+Then the partial auto-correlation function $\Phi(k)$ for lag $k$ is
 \begin{align}
 \Phi(k) = \frac{ \det P^*_k}{\det P_k}
 \end{align}
-where $P^*_k$ is equal to the matrix $P_k$, except the $k$th column is replaced with $R_{(k)}$.  
+where $P^*_k$ is equal to the matrix $P_k$, except the $k$th column is replaced with $R_{(k)}$.
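
As an illustrative aside, a small NumPy/SciPy sketch (hypothetical names) of this determinant ratio, taking the autocorrelations R_X(1), R_X(2), ... as input (for example, from the sketch in the previous subsection):

    import numpy as np
    from scipy.linalg import toeplitz

    def partial_autocorrelation(acf, k):
        """PACF at lag k from autocorrelations acf = [R(1), R(2), ...]."""
        acf = np.asarray(acf, dtype=float)
        # P_k: symmetric Toeplitz matrix with first column [1, R(1), ..., R(k-1)].
        p_k = toeplitz(np.concatenate(([1.0], acf[:k - 1])))
        # P*_k: same matrix with the k-th column replaced by [R(1), ..., R(k)].
        p_star = p_k.copy()
        p_star[:, k - 1] = acf[:k]
        return np.linalg.det(p_star) / np.linalg.det(p_k)
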
 
 \subsection{Automatic Model Selection}
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/modules/SVM.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/SVM.tex b/doc/design/modules/SVM.tex
index c670694..049bf92 100644
--- a/doc/design/modules/SVM.tex
+++ b/doc/design/modules/SVM.tex
@@ -19,7 +19,13 @@
 
 % Abstract. What is the problem we want to solve?
 
-Support Vector Machines (SVMs) are a commonly used technique for approaching regression and classification problems. SVM models have two particularly desirable features: robustness in the presence of noisy data, and applicability to a variety of data schemes. At its core, a \textit{linear} SVM model is a hyperplane separating the two distinct classes of data (in the case of classification problems), in such a way that the \textit{margin} is maximized. Using kernels, one can approximate a large variety of decision boundaries.
+Support Vector Machines (SVMs) are a commonly used technique for approaching
+regression and classification problems. SVM models have two particularly
+desirable features: robustness in the presence of noisy data, and applicability
+to a variety of data schemes. At its core, a \textit{linear} SVM model is a
+hyperplane separating the two distinct classes of data (in the case of
+classification problems), in such a way that the \textit{margin} is maximized.
+Using kernels, one can approximate a large variety of decision boundaries.
 
 
 
@@ -37,11 +43,26 @@ Support Vector Machines (SVMs) are a commonly used technique for approaching reg
 %Cite Smola's book for this section
 
 
-Suppose we have a classification task with training data $(x_1,y_1),\ldots,(x_n,y_n) \in \R^d\times \{0,1\}$. A hyperplane in $\R^d$ is determined by $w\in R^d$ and $b\in R$: $x\in\R^d$ is on the plane if $\langle w, x\rangle + b = 0$. To solve the classification problem, we seek a hyperplane which separates the data, and therefore coefficients $(w,b) \in \R^d\times\R$ such that  for all $i\leq n$, $y_i = \text{sgn}(\langle w, x_i\rangle + b)$.\\
+Suppose we have a classification task with training data
+$(x_1,y_1),\ldots,(x_n,y_n) \in \R^d\times \{-1,1\}$. A hyperplane in $\R^d$ is
+determined by $w\in \R^d$ and $b\in \R$: $x\in\R^d$ is on the plane if $\langle w,
+x\rangle + b = 0$. To solve the classification problem, we seek a hyperplane
+which separates the data, and therefore coefficients $(w,b) \in \R^d\times\R$
+such that  for all $i\leq n$, $y_i = \text{sgn}(\langle w, x_i\rangle + b)$.\\
 
-The \textit{margin} determined by a hyperplane and the training data is the smallest distance from a data point to the hyperplane. For convenience, the length of $w$ can be modified so that the margin is equal to $\frac{1}{||w||}$. In addition to merely separating the data, it is desirable to find a hyperplane which maximizes the margin; inuitively, a larger margin will lead to better predictions on future data.\\
+The \textit{margin} determined by a hyperplane and the training data is the
+smallest distance from a data point to the hyperplane. For convenience, the
+length of $w$ can be modified so that the margin is equal to $\frac{1}{||w||}$.
+In addition to merely separating the data, it is desirable to find a hyperplane
+which maximizes the margin; intuitively, a larger margin will lead to better
+predictions on future data.\\
 
-The \textit{support vectors} are the members of the training data whose distance to hyperplane is exactly the margin; that is, the training points which are closest to the hyperplane. Learning with SVM lends itself to large datasets in part because the model in depends only on these support vectors; all other training data do not influence the final model. When assigning values to new data, one needs only to check it against the support vectors.\\
+The \textit{support vectors} are the members of the training data whose distance
+to the hyperplane is exactly the margin; that is, the training points which are
+closest to the hyperplane. Learning with SVM lends itself to large datasets in
+part because the model depends only on these support vectors; all other
+training data do not influence the final model. When assigning values to new
+data, one needs only to check it against the support vectors.\\
 
 Thus, we are left with the following convex programming problem:
 
@@ -50,7 +71,10 @@ Thus, we are left with the following convex programming problem:
 \text{subject to } y_i(\langle w, x_i \rangle + b) \geq 1 \text{ for } i=1\ldots n.
 \end{align*}
 
-If the data are not linearly separable, \textit{slack variables} $\xi_i$ are introduced to allow for some points to be classified incorrectly. We do not want to sacrifice too much to capture outliers, so we introduce an extra term in the objective function to control the error. The optimization problem now becomes
+If the data are not linearly separable, \textit{slack variables} $\xi_i$ are
+introduced to allow for some points to be classified incorrectly. We do not want
+to sacrifice too much to capture outliers, so we introduce an extra term in the
+objective function to control the error. The optimization problem now becomes
 
 \begin{align*}
 \underset{w,\vec{\xi},b}{\text{Minimize }} \frac12 ||w||^2 + \frac{C}{n}\sum_{i=1}^n \xi_i \\
@@ -71,8 +95,14 @@ One can formulate the same problem without constraints:
 \end{equation}
 
 
-where $\ell(y,f(x)) = \max(0,1-yf(x))$ is the \textit{hinge loss}. Also, in the case of linear SVM, $f(x) = \langle w, x\rangle + b$ (Note: there are a couple ways to handle the variable $b$; one can include it in the gradient calculations, or add extra feature to the data space.) Also note that this loss function is not smooth. However, one can solve this unconstrained convex problem using the techniques outlined in the ``Convex Programming Framework'' section of this document, for example using IGD (see paper by Feng, et al). In particular, the subgradient of this loss function which is used in our gradient-descent method is:
-
+where $\ell(y,f(x)) = \max(0,1-yf(x))$ is the \textit{hinge loss}. Also, in the
+case of linear SVM, $f(x) = \langle w, x\rangle + b$ (Note: there are a couple
+ways to handle the variable $b$; one can include it in the gradient
+calculations, or add an extra feature to the data space.) Also note that this loss
+function is not smooth. However, one can solve this unconstrained convex problem
+using the techniques outlined in the ``Convex Programming Framework'' section of
+this document. In particular, the subgradient of this loss function which is
+used in our gradient-descent method is:
 $$
 v(y,f(x)) = \left\{
         \begin{array}{ll}
@@ -102,7 +132,12 @@ See \cite{ShSS07} (extended version) for details.
 
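
As an illustrative aside, this subgradient is all that is needed for an incremental gradient step. A minimal NumPy sketch (hypothetical names, not the MADlib implementation), assuming an l2-regularized hinge objective of the form lambda/2 ||w||^2 + l(y, f(x)) and the standard hinge subgradient (-y times the feature when y f(x) < 1, zero otherwise):

    import numpy as np

    def hinge_igd_step(w, b, x, y, stepsize, lam):
        """One gradient-descent update for the regularized hinge loss
        lam/2 * ||w||^2 + max(0, 1 - y * (<w, x> + b)), with y in {-1, 1}."""
        margin = y * (np.dot(w, x) + b)
        if margin < 1:
            # Margin violated: subgradient of the hinge term is -y * x (and -y for b).
            grad_w, grad_b = lam * w - y * x, -y
        else:
            # Margin satisfied: only the regularizer contributes.
            grad_w, grad_b = lam * w, 0.0
        return w - stepsize * grad_w, b - stepsize * grad_b
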
 \subsection{$\epsilon$-Regression}
 
-SVM can also be used to predict the values of an affine function $f(x) = \langle w, x\rangle+b$, given sample input-output pairs $(x_1,y_1),\ldots,(x_n,y_n)$. If we allow ourselves an error bound of $\epsilon>0$, and some error controlled by the slack variables $\xi^*$, it is a matter of simply modifying the above convex problem. By demanding that our function is relatively ``flat," and that it approximates the true $f$ reasonably, the relevant optimization problem is:
+SVM can also be used to predict the values of an affine function $f(x) = \langle
+w, x\rangle+b$, given sample input-output pairs $(x_1,y_1),\ldots,(x_n,y_n)$. If
+we allow ourselves an error bound of $\epsilon>0$, and some error controlled by
+the slack variables $\xi^*$, it is a matter of simply modifying the above convex
+problem. By demanding that our function is relatively ``flat'', and that it
+approximates the true $f$ reasonably, the relevant optimization problem is:
 
 \begin{align*}
 \underset{w,\vec{\xi},\vec{\xi^*_i},b}{\text{Minimize }} & \frac{1}{2}||w||^2 + \frac{C}{n}\sum_{i=1}^n \xi_i + \xi^*_i \\
@@ -111,7 +146,9 @@ SVM can also be used to predict the values of an affine function $f(x) = \langle
 						& \xi_i,\xi^*_i \geq 0
 \end{align*}
 
-One can also formulate $\epsilon$-regression as an unconstrained optimization problem just as we did with classification. In fact, the objective function remains the same, except rather than hinge loss we would  use the loss function
+One can also formulate $\epsilon$-regression as an unconstrained optimization
+problem just as we did with classification. In fact, the objective function
+remains the same, except rather than the hinge loss we would use the loss function
 
 \[ \ell(y,f(x)) = \max(0,|y-f(x)|-\epsilon) \]
 
@@ -127,55 +164,60 @@ v(y,f(x)) = \left\{
     \right.
 $$
 
-
-
-
-
-%Cite Feng et al here
-
-%Data Science would like to see an implementation of string kernels as well.
-%In GPDB, they would like general user defined functions.
-%
-
-
-
-
-
-
-
-
-
-
-
-
 \section{Nonlinear SVM}\label{sec:nonlinear}
 
-One can generalize the above optimization problems using \textit{kernels}. A kernel is nothing more than a continuous, symmetric, positive-definite function $k:X\times X \to \R$. This replacement, known as the \textit{kernel trick}, allows one to compute decision boundaries which are more general than hyperplanes. \\
+One can generalize the above optimization problems using \textit{kernels}. A
+kernel is nothing more than a continuous, symmetric, positive-definite function
+$k:X\times X \to \R$ that replaces the inner product. This substitution, known as
+the \textit{kernel trick}, allows one to compute decision boundaries which are
+more general than hyperplanes. \\
 
 
 
-Two commonly used kernels are polynomial kernels, of the form $k(x,y) = (\langle x,y\rangle +q)^r$, and Gaussian kernels, $k(x,y) = e^{-\gamma||x-y||^2}$, $\gamma$ a free parameter.\\
+Two commonly used kernels are polynomial kernels, of the form $k(x,y) = (\langle
+x,y\rangle +q)^r$, and Gaussian kernels, $k(x,y) = e^{-\gamma||x-y||^2}$,
+$\gamma$ a free parameter.\\
 
-The approach we take to learning with these kernels, following \cite{RR07}, is to approximate them with linear kernels and then apply the above gradient descent based algorithm for learning linear kernels. The approximation takes the following form: we embed the data into a finite dimensional feature space, $z:\R^d\to \R^D$, so that the inner product between transformed points approximates the kernel:
+The approach we take to learning with these kernels, following \cite{RR07}, is
+to approximate them with linear kernels and then apply the above gradient
+descent based algorithm for learning linear kernels. The approximation takes the
+following form: we embed the data into a finite dimensional feature space,
+$z:\R^d\to \R^D$, so that the inner product between transformed points
+approximates the kernel:
 
 \[ k(x,y) \approx \langle z(x), z(y)\rangle.\]
 
-This map $z$ will be a randomly generated (in a manner depending on the kernel) by methods outlined in the papers \cite{KK12,RR07}. We then use $z$ to embed both the training data and the test data into $\R^D$ and run linear SVM on the embedded data.\\
+This map $z$ will be randomly generated (in a manner depending on the kernel)
+by methods outlined in the papers \cite{KK12,RR07}. We then use $z$ to embed
+both the training data and the test data into $\R^D$ and run linear SVM on the
+embedded data.\\
 
 \subsection{Gaussian kernels}
 
-A kernel $k$ is said to be shift-invariant if, for all $x,y,z$, $k(x,y) = k(x-z,y-z)$. In the paper \cite{RR07}, the authors show that for any shift-invariant kernel $k$, there exists a probability measure $p(w)$ on its domain such that
+A kernel $k$ is said to be shift-invariant if, for all $x,y,z$, $k(x,y) =
+k(x-z,y-z)$. In the paper \cite{RR07}, the authors show that for any
+shift-invariant kernel $k$, there exists a probability measure $p(w)$ on its domain
+such that
 
 \[ k(x,y) = k(x-y,0) = \int_X p(w)e^{i\langle w, x-y\rangle} dw = E_w[e^{i\langle w,x\rangle}e^{-i\langle w,y\rangle}]
 \]
 
-where the expectation above is taken with respect to the measure $p$. In the particular case of $k(x,y) = \exp(-\gamma||x-y||^2)$,
+where the expectation above is taken with respect to the measure $p$. In the
+particular case of $k(x,y) = \exp(-\gamma||x-y||^2)$,
 
 \[ p(w) = \left(\dfrac{1}{\sqrt{4\pi\gamma}}\right)^d \exp\left(\frac{-||w||^2}{4\gamma}\right)\]
 
-where $d$ is the dimension of the sample data. In other words, $p(w)$ is a Gaussian distribution with mean zero and standard deviation $\sigma = \sqrt{2\gamma}$.\\
+where $d$ is the dimension of the sample data. In other words, $p(w)$ is a
+Gaussian distribution with mean zero and standard deviation
+$\sigma = \sqrt{2\gamma}$.\\
 
- Since both the kernel and the probability distribution are real-valued in the above integral, the complex exponential can be replaced with $\cos(\langle w, x-y\rangle)$. Define a random function $z: x\mapsto \sqrt{2}\cos(\langle w,x\rangle + b)$, where $w$ is drawn from the distribution $p$ and $b$ uniformly from $[0,2\pi]$. Then $\langle z(x),z(y)\rangle$ is an unbiased estimator of $k(x,y)=k(x-y,0)$. To see this, use the sum of angles formula:
+Since both the kernel and the probability distribution are real-valued in the
+above integral, the complex exponential can be replaced with
+$\cos(\langle w, x-y\rangle)$. Define a random function
+$z: x\mapsto \sqrt{2}\cos(\langle w,x\rangle + b)$,
+where $w$ is drawn from the distribution $p$ and $b$ uniformly
+from $[0,2\pi]$. Then $\langle z(x),z(y)\rangle$ is an unbiased estimator of
+$k(x,y)=k(x-y,0)$. To see this, use the sum of angles formula:
 
 \begin{align*}
 z(x)z(y) &= 2\cos(\langle w,x\rangle+b)\cos(\langle w,y\rangle+b)\\
@@ -200,24 +242,23 @@ E_{w,b}[z(x)z(y)] &= E[\cos(\langle w,x\rangle)\cos(\langle w,y\rangle) + \sin(\
 
 
 
-To lower the variance of our estimate, we sample several $(w,b)$ pairs and average the resulting values of $z$.\\
-
-
-
-
-
-
+To lower the variance of our estimate, we sample several $(w,b)$ pairs and
+average the resulting values of $z$.\\
 
 \subsubsection{Formal Description}
 
 \begin{algorithm}[Random Fourier Features]
-\alginput{Training data $X$, an $n\times d$ matrix representing $n$ data points in dimension $d$,\\  $\gamma$ parameter of Gaussian distribution $e^{-\gamma ||x||^2}$\\
-				dimension of range space, $D$,\\
-				random normal generator seed}
-\algoutput{$X'$, an $n\times D$ dimension matrix of data in feature space to be sent to linear solver, as well as the $\Omega$ and $\vec{b}$ used to compute $X'$, to be used by the predictor.}
+\alginput{Training data $X$, an $n\times d$ matrix representing $n$ data points in dimension $d$,\\
+          $\gamma$ parameter of Gaussian distribution $e^{-\gamma ||x||^2}$\\
+		  dimension of range space, $D$,\\
+		  random normal generator seed}
+\algoutput{$X'$, an $n\times D$ dimension matrix of data in feature space to be sent to linear solver,
+           as well as the $\Omega$ and $\vec{b}$ used to compute $X'$, to be used by the predictor.}
 
 \begin{algorithmic}[1]
-	\State $\Omega \set d\times D $ matrix of samples drawn from the standard normal distribution $ stdnormal$, then scaled by a factor of $\sqrt{2\gamma}$ to simulate a Gaussian with $\sigma=\sqrt{2\gamma}$ (see fit function from RBFsampler class of scikit-learn)
+	\State $\Omega \set d\times D $ matrix of samples drawn from the standard normal distribution $ stdnormal$,
+    then scaled by a factor of $\sqrt{2\gamma}$ to simulate a Gaussian with $\sigma=\sqrt{2\gamma}$
+    (see fit function from RBFsampler class of scikit-learn)
 	\State $\vec{b} \set$ vector of length $D$, each entry a uniform random sample from $[0,2\pi]$
 	\State $X' \set  X\cdot \Omega$
 	\State $X' \set X' + \vec{b} $ ($\vec{b}$ is added to each row)
@@ -230,25 +271,41 @@ To lower the variance of our estimate, we sample several $(w,b)$ pairs and avera
 
 
 \subsubsection{Parallelization}
-Since the random cosine features are generated independently, each coordinate could be computed independently in parallel, as long as each node has access to the distribution $p$.
+Since the random cosine features are generated independently, each coordinate
+could be computed independently in parallel, as long as each node has access to
+the distribution $p$.
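
As an illustrative aside, a compact NumPy sketch (hypothetical names, not the MADlib implementation) of this feature construction; the final cosine and sqrt(2/D) scaling are not shown in the hunk above but follow the definition of z and the averaging over D samples of (w, b):

    import numpy as np

    def random_fourier_features(X, gamma, D, seed=0):
        """Map X (n x d) to an n x D feature space whose inner products
        approximate the Gaussian kernel k(x, y) = exp(-gamma * ||x - y||^2)."""
        n, d = X.shape
        rng = np.random.RandomState(seed)
        # Omega: d x D standard-normal samples scaled to sigma = sqrt(2 * gamma).
        omega = rng.standard_normal((d, D)) * np.sqrt(2.0 * gamma)
        # b: D uniform samples from [0, 2*pi], added to every row of X . Omega.
        b = rng.uniform(0.0, 2.0 * np.pi, size=D)
        # Cosine features, scaled so that z(x) . z(y) averages the D estimators.
        return np.sqrt(2.0 / D) * np.cos(X.dot(omega) + b), omega, b

The returned omega and b must be stored with the model, since the predictor has to embed new data with exactly the same draws.
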
 
 
 \subsection{Dot product kernels}
 
-%Cite the Kar-Karnick paper here
-
-As in the case of Gaussian kernels, we can use random feature maps to approximate polynomial kernels,  $k(x,y)=(\langle x,y\rangle+q)^r$. Again, the embedding takes the following simple form:
+As in the case of Gaussian kernels, we can use random feature maps to
+approximate polynomial kernels,  $k(x,y)=(\langle x,y\rangle+q)^r$. Again, the
+embedding takes the following simple form:
 
 \begin{align*}
 z: \R^d\to \R^D\\
 x\mapsto \frac{1}{\sqrt{D}} (z_1(x), \ldots, z_D(x)).
 \end{align*}
 
-The idea here is to choose each random features $z_i$ such that it satisfies $E[z_i(x)z_i(y)] = k(x, y)$. Then the concentration of measure phenomenon, e.g., as $D$ goes to infinity, ensures $|k(x, y) - z(x)^Tz(y)|$ to be small with high probability. We describe the process of generating $z_i$ below. Note that it applies to any positive definite kernel $k: (x, y) \mapsto f(\langle x, y \rangle)$, where $f$ admits a Maclaurin expansion, i.e., $f(x) = \sum_{N=0}^{\infty} a_N x^N$, where $a_N = f^{(N)}(0) / N!$. For example, in the case of polynomial kernels $k(x,y)=(\langle x,y\rangle+q)^r$, $f(x)$ will be $(x + q)^r$ and $a_N$ will be the N-th derivative of $(x + q)^r$ divided by $N!$.
+The idea here is to choose each random feature $z_i$ such that it satisfies
+$E[z_i(x)z_i(y)] = k(x, y)$. The concentration of measure phenomenon then ensures
+that, as $D$ goes to infinity, $|k(x, y) - z(x)^Tz(y)|$ is small with high
+probability. We describe the process of generating $z_i$ below. Note that it
+applies to any positive definite kernel
+$k: (x, y) \mapsto f(\langle x, y \rangle)$,
+where $f$ admits a Maclaurin expansion, i.e.,
+$f(x) = \sum_{N=0}^{\infty} a_N x^N$,
+where $a_N = f^{(N)}(0) / N!$.
+For example, in the case of polynomial kernels
+$k(x,y)=(\langle x,y\rangle+q)^r$, $f(x)$ will be
+$(x + q)^r$ and $a_N$ will be the $N$-th derivative of $(x + q)^r$ at zero, divided by $N!$.
 
 \begin{enumerate}
-	\item Randomly pick a number $N \in \N \cup \{0\}$ with $\Pcal[N] = \frac{1}{p^{N+1}}$. Here $p$ is a positive number larger than 1 and in practice $p=2$ is often a good choice since it establishes a normalized measure over $\N \cup \{0\}$ \cite{KK12}.
-	\item Pick $N$ independent Rademacher vector $w_1, ..., w_N$, i.e., each component of Rademacher vector is chosen independently using a fair coin toss from the set $\{-1, 1\}$.
+	\item Randomly pick a number $N \in \N \cup \{0\}$ with $\Pcal[N] = \frac{1}{p^{N+1}}$.
+    Here $p$ is a positive number larger than 1 and in practice $p=2$ is often a
+    good choice since it establishes a normalized measure over $\N \cup \{0\}$ \cite{KK12}.
+	\item Pick $N$ independent Rademacher vectors $w_1, ..., w_N$, i.e., each
+    component of each Rademacher vector is chosen independently using a fair coin toss from the set $\{-1, 1\}$.
 	\item Compute $z_i(x)$ as follows,
 	\begin{align}
 	\label{equ:svm-zi}
@@ -260,7 +317,12 @@ The idea here is to choose each random features $z_i$ such that it satisfies $E[
 	\end{align}
 \end{enumerate}
 
-The reason why we pick $N$ from $\N \cup \{0\}$ with probability $\Pcal[N] = \frac{1}{p^{N+1}}$ becomes obvious in the following simple proof. It establishes that the random feature product $z_i(x)z_i(y)$ approximates the kernel product $k(x, y)$ with high probability for any $x, y \in \R^d$, i.e., $E[z_i(x)z_i(y)] = k(x, y)$:
+The reason why we pick $N$ from $\N \cup \{0\}$ with probability $\Pcal[N] =
+\frac{1}{p^{N+1}}$ becomes obvious in the following simple proof. It establishes
+that the random feature product $z_i(x)z_i(y)$ is an unbiased estimator of the
+kernel value $k(x, y)$ for any $x, y \in \R^d$, i.e., $E[z_i(x)z_i(y)]
+= k(x, y)$:
+
 \begin{align*}
     E[z_i(x)z_i(y)]
     &= E_N [E_{w_1, ..., w_N} [z_i(x)z_i(y)] | N]  \\
@@ -273,10 +335,6 @@ The reason why we pick $N$ from $\N \cup \{0\}$ with probability $\Pcal[N] = \fr
 
 See the paper of Kar and Karnick \cite{KK12} for more details.
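
As an illustrative aside, a small sketch (hypothetical names) of this construction for the polynomial kernel (<x,y> + q)^r with p = 2, where each feature uses z_i(x) = sqrt(a_N p^(N+1)) * prod_j <w_j, x> as in the random Maclaurin construction of \cite{KK12}:

    import numpy as np
    from math import comb

    def sample_maclaurin_features(d, q, r, D, p=2, seed=0):
        """Draw the random parameters of D features once; the same draws are
        reused for both training and test data."""
        rng = np.random.RandomState(seed)
        feats = []
        for _ in range(D):
            N = rng.geometric(1.0 / p) - 1             # P[N] = 1/p^(N+1) for p = 2
            W = rng.choice([-1.0, 1.0], size=(N, d))   # N Rademacher vectors
            a_N = comb(r, N) * q ** (r - N) if N <= r else 0.0  # Maclaurin coefficient of (t+q)^r
            feats.append((np.sqrt(a_N * p ** (N + 1)), W))
        return feats

    def maclaurin_transform(X, feats):
        """x -> (1/sqrt(D)) * (z_1(x), ..., z_D(x)), with
        z_i(x) = sqrt(a_N * p^(N+1)) * prod_j <w_j, x>."""
        Z = np.column_stack([scale * np.prod(W.dot(X.T), axis=0)
                             for scale, W in feats])
        return Z / np.sqrt(len(feats))
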
 
-%I am confused about their algorithm: for polynomial functions, f(x) = x^r, it seems like their
-%random Maclaurin algorithm is not well defined, simply because if N != r (in their notation),
-%then a_N = 0 and the algorithm would just return 0.
-
 \subsubsection{Formal Description}
 
 \begin{algorithm}[Random Features for Polynomial Kernels]
@@ -300,86 +358,20 @@ $\Omega, \Ncal$ generated by the algorithm, to be used by the predictor.\\
 \end{algorithm}
 
 \subsubsection{Parallelization}
-In the above algorithm, Step \ref{alg:x-omega} is done by broadcasting the matrix $\Omega$ to each row of $X$, and computing the matrix-vector product locally in parallel for each row; Similarly, Step \ref{alg:x-for} can also be distributed since the computations for each row are independent of the others.
-
-
-
-%
-%\section{Interface} % (fold)
-%\label{sec:interface}
-%The interface will consist of two functions, \texttt{svm\_train}  and \texttt{svm\_predict}. In the training function, if the dependent variable  column contains data which are doubles or floats, then the function will generate a regression model. Otherwise, it will be assumed the user wants a classification model.
-%
-%\subsection{Training} % (fold)
-%\label{sub:training}
-%\begin{sql}
-%    svm_train(
-%            training_table,
-%            independent_columns,
-%            dependent_column,
-%            model_table,
-%            kernel_func,
-%            kernel_parameters,
-%            optimization_parameters
-%            regularization_parameters
-%    )
-%\end{sql}
-%
-%\paragraph{Arguments:}
-%
-%\begin{itemize}
-%    \item \emph{training\_table}: name of the table containing the sample data
-%    \item \emph{independent\_columns}: columns which are to be independent variables
-%     \item \emph{dependent\_columns}: column which is to be the dependent variable
-%    \item \emph{model\_table}: name of output table containing the model data
-%    \item \emph{kernel\_func}: string representing kernel function used for training and predicting
-%    \item \emph{kernel\_parameters}: string of parameters to be used with kernel function (possibly null)
-%    \item \emph{optimization\_parameters}: string of parameters for gradient descent, i.e. step-size $\eta$, max\_iterations, etc.
-%    \item \emph{regularization\_parameters}: string of regularization parameters, $\lambda$
-%   % \item \emph{grouping\_cols}: the name of the columns to group by
-%
-%\end{itemize}
-%
-%\subsection{Prediction}
-%\label{sub:predict}
-%\begin{sql}
-%	svm_predict(
-%		model_table
-%		new_data
-%		prediction_table
-%		verbose
-%	)
-%\end{sql}
-%
-%\paragraph{Arguments:}
-%
-%\begin{itemize}
-%
-%	\item \emph{model\_table}: name of table containing model data\\
-%	data to be included: the decision function determined by $(w,b)$, the kernel, kernel parameters, optimization parameters the randomly generated $\omega$'s for the embedding into random feature space.
-%	\item \emph{new\_data}: name of table containing new data points to be estimated
-%	\item \emph{prediction\_table}: name of table containing prediction values on input data
-%	\item \emph{verbose}: (OPTIONAL) verbose output
-%\end{itemize}
-
-
-
-\subsection{Nystr{\"o}m methods}
-
-%Cite Yang, Li, et al paper here
-
-The Nystr{\"o}m method approximates the kernel matrix $K=(k(x_i,x_j)_{i,j=1\ldots,N})$ by randomly sampling $m \ll N$ training data points  $\hat{x}_1,\ldots, \hat{x}_m$ and constructing a new, low rank matrix from the data. One constructs a low-dimensional feature representation of the form $x\mapsto A(k(x,\hat{x}_1),\ldots, k(x,\hat{x}_m))^{\textbf{T}}$, where $A$ is some matrix constructed from the eigenvectors and eigenvalues of the submatrix of the Gram matrix determined by $\hat{x}_1,\ldots, \hat{x}_m$. The computational complexity of constructing this predictor is $O(m^2n)$, which is much less than the cost of computing the full Gram matrix.
-
+In the above algorithm, Step \ref{alg:x-omega} is done by broadcasting the
+matrix $\Omega$ to each row of $X$, and computing the matrix-vector product
+locally in parallel for each row. Similarly, Step \ref{alg:x-for} can also be
+distributed since the computations for each row are independent of the others.
 
 \section{Novelty Detection}
-Suppose we have training data $x_1, x_2, \ldots x_n \in \R^d$, the goal of novelty detection is to learn a hyperplane in $\R^d$ that separates the training data from the origin with maximum margin. We model this problem as a 
-one-class classification problem by transforming the training data to $(x_1,y_1),\ldots,(x_n,y_n) \in \R^d\times \{1\}$, indicating that the dependent variable of each training instance is assumed to be the same.
-Given such a mapping, we use the SVM classification mechanisms detailed in Sections~\ref{sec:linear} and~\ref{sec:nonlinear} to learn a one-class classification model. See the paper by Sch\"{o}lkopf for more details on one-class 
-SVM~\cite{Scholkopf}.
-
-
-
-
-
-
 
-% section clustered_standard_errors (end)
+Given training data $x_1, x_2, \ldots, x_n \in \R^d$, the goal of
+novelty detection is to learn a hyperplane in $\R^d$ that separates the training
+data from the origin with maximum margin. We model this as a one-class
+classification problem by transforming the training data to
+$(x_1,y_1),\ldots,(x_n,y_n) \in \R^d\times \{1\}$, indicating that the dependent
+variable of each training instance is assumed to be the same. The origin is
+treated as a negative class data point and all input data points are treated as
+part of the positive class. Given such a mapping, we use the SVM classification
+mechanisms detailed in Sections~\ref{sec:linear} and~\ref{sec:nonlinear} to
+learn a one-class classification model.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/modules/cox-proportional-hazards.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/cox-proportional-hazards.tex b/doc/design/modules/cox-proportional-hazards.tex
index 046e3e5..99192c7 100644
--- a/doc/design/modules/cox-proportional-hazards.tex
+++ b/doc/design/modules/cox-proportional-hazards.tex
@@ -44,7 +44,7 @@ $x_i$ denote the observed value of the  $i$th covariate, then the Cox model is
 written as
 
 \begin{equation}
-h(t) = e^{\alpha(t)} e^{\beta_1 x_1 + \beta_2 x_2 + \dots + \beta_m x_m} = e^{\alpha(t)} e^{\bold{\beta^T x}}
+h(t) = e^{\alpha(t)} e^{\beta_1 x_1 + \beta_2 x_2 + \dots + \beta_m x_m} = e^{\alpha(t)} e^{\mathbf{\beta^T x}}
 \end{equation}
 where $\beta_i$ is the coefficient associated with the $i$th covariate.
 
@@ -52,13 +52,13 @@ Many applications take values from multiple observations, measuring the values
 of $x_i$ for each observation.  %The $j$th observation has the hazard function
 
 %\begin{equation}
-%h_j(t) = e^{\alpha(t)} e^{\beta_1 x_{j1} + \beta_2 x_{j2}+ \dots + \beta_k x_{jk}}= e^{\alpha(t)} e^{\bold{\beta^T x_j}}.
+%h_j(t) = e^{\alpha(t)} e^{\beta_1 x_{j1} + \beta_2 x_{j2}+ \dots + \beta_k x_{jk}}= e^{\alpha(t)} e^{\mathbf{\beta^T x_j}}.
 %\end{equation}
 
 In the \textit{proportional-hazard model}, the hazard functions of two
 observations $j$ and $k$ are compared. The ratio of the two is
 \begin{equation}
-\frac{h_j(t)}{h_k(t)} = \frac{e^{\alpha(t)} e^{\bold{\beta^T x_j}} }{e^{\alpha(t)} e^{\bold{\beta^T x_k}} } = \frac{e^{\bold{\beta^T x_j} }}{ e^{\bold{\beta^T x_k}}}
+\frac{h_j(t)}{h_k(t)} = \frac{e^{\alpha(t)} e^{\mathbf{\beta^T x_j}} }{e^{\alpha(t)} e^{\mathbf{\beta^T x_k}} } = \frac{e^{\mathbf{\beta^T x_j} }}{ e^{\mathbf{\beta^T x_k}}}
 \end{equation}
 
 The critical idea here is that the ratio of the two hazard functions is
@@ -82,35 +82,35 @@ at time $t_i$.
 Given that there was a death at time $t_i$ and observation $k$ was alive prior
 to $t_i$, the probability that the death happened to observation $k$  is
 \begin{equation}\label{cox:equ:death-prob}
-\Pr(T_k = t_i | R(t_i)) =  \frac{e^{\bold{\beta^T x_k} }}{ \sum_{j \in R(t_i)} e^{\bold{\beta^T x_j}}}.
+\Pr(T_k = t_i | R(t_i)) =  \frac{e^{\mathbf{\beta^T x_k} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}}.
 \end{equation}
 
 The \textit{partial likelihood function} can now be generated as the product of conditional probabilities.
 More formally,
 \begin{equation}\label{cox:equ:likelihood}
-\mathcal{L} = \prod_{i = 1}^n \left(  \frac{e^{\bold{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\bold{\beta^T x_j}}} \right).
+\mathcal{L} = \prod_{i = 1}^n \left(  \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right).
 \end{equation}
 
  The log-likelihood form of this equation is
 \begin{equation}\label{cox:equ:LLH}
-L = \sum_{i = 1}^n \left[  \bold{\beta^T x_i} - \log\left(\sum_{j \in R(t_i)} e^{\bold{\beta^T x_j} } \right) \right].
+L = \sum_{i = 1}^n \left[  \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right].
 \end{equation}
 
 An estimation of $\beta$ can be found by simply maximizing this log-likelihood.
 To maximize the likelihood, it helps to have the derivative of equation
 \ref{cox:equ:LLH}, which is
 \begin{equation}\label{cox:equ:LLH derivative}
-\frac{\partial L}{\partial \beta_k} = \sum_{i = 1}^n \left( x_{ik} - \frac{\sum_{j \in R(t_i)} x_{jk} e^{\bold{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\bold{\beta^T x_j} } } \right).
+\frac{\partial L}{\partial \beta_k} = \sum_{i = 1}^n \left( x_{ik} - \frac{\sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } } \right).
 \end{equation}
 It follows that the second derivative is
 \begin{equation}\label{cox:equ:LLH second derivative}
 \frac{\partial^2 L}{\partial \beta_k \beta_l} = \sum_{i = 1}^n
             \left(
-                \frac{\left(  \sum_{j \in R(t_i)} x_{jk} e^{\bold{\beta^T x_j} } \right)
-                            \left(  \sum_{j \in R(t_i)} x_{jl} e^{\bold{\beta^T x_j} } \right)}
-                     {\left( \sum_{j \in R(t_i)} e^{\bold{\beta^T x_j} } \right)^2 } -
-                \frac{  \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\bold{\beta^T x_j} } }
-                     {\sum_{j \in R(t_i)} e^{\bold{\beta^T x_j} } }
+                \frac{\left(  \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } \right)
+                            \left(  \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x_j} } \right)}
+                     {\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 } -
+                \frac{  \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } }
+                     {\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } }
             \right).
 \end{equation}
 
@@ -125,7 +125,7 @@ death, and 0 otherwise.
 
 Incorporating this indicator variable into equation \ref{cox:equ:likelihood} gives
 \begin{equation}\label{cox:equ:likelihood-censoring}
-\mathcal{L} = \prod_{i = 1}^n \left(  \frac{e^{\bold{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\bold{\beta^T x_j}}} \right)^{\delta_i}.
+\mathcal{L} = \prod_{i = 1}^n \left(  \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right)^{\delta_i}.
 \end{equation}
 The appropriate changes to the LLH function and its derivatives are trivial.
 
@@ -183,7 +183,7 @@ together. However, this might not work well in this setting. To see why,
 consider parallelizing equation \ref{cox:equ:LLH second derivative}.  Each
 worker task is given one term in the sum, which looks like
 \begin{equation}
- \frac{\left(  \sum_{j \in R(t_i)} x_{jk} e^{\bold{\beta^T x _j} } \right) \left(  \sum_{j \in R(t_i)} x_{jl} e^{\bold{\beta^T x _j} } \right)}{\left( \sum_{j \in R(t_i)} e^{\bold{\beta^T x_j} } \right)^2 }   -  \frac{  \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\bold{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\bold{\beta^T x_j} } }.
+ \frac{\left(  \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x _j} } \right) \left(  \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x _j} } \right)}{\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 }   -  \frac{  \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } }.
 \end{equation}
 
 Note that the sums in the numerator are summing over all the data points in the
@@ -192,9 +192,9 @@ derivative terms as defined in \ref{cox:equ:LLH derivative}. However, we note
 that this sum has a structure that allows it to be computed in linear time (with
 respect to the number of data points) using the following quantities.
 \begin{align}
-H_{i} &=   \sum_{j \in R(t_i)} x_{j} e^{\bold{\beta^T x_j}}\\
-S_{i}&=   \sum_{j \in R(t_i)} e^{\bold{\beta^T x_j}} \\
-V_{i}&=   \sum_{j \in R(t_i)} x_{j}x_{j}^T e^{\bold{\beta^T x_j} }
+H_{i} &=   \sum_{j \in R(t_i)} x_{j} e^{\mathbf{\beta^T x_j}}\\
+S_{i}&=   \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}} \\
+V_{i}&=   \sum_{j \in R(t_i)} x_{j}x_{j}^T e^{\mathbf{\beta^T x_j} }
 \end{align}
 
 Note that $H_{i}$ is a column vector with $m$ elements ( $H_{i}\in
@@ -213,8 +213,8 @@ derivative}, can be reduced to
 Since we assume that the data points are sorted in increasing order, i.e.,
 $R(t_i) = \{i, i+1, \ldots, n \}$, we can calculate the above summation as
 \begin{align}
-H_{i}& =   H_{i+1} +  x_{i} e^{\bold{\beta^T x_i}} \label{eq:numerator_avg}\\
-S_i & = S_{i+1} + e^{\bold{\beta^T x_i}} \label{eq:denominator_avg}\\
+H_{i}& =   H_{i+1} +  x_{i} e^{\mathbf{\beta^T x_i}} \label{eq:numerator_avg}\\
+S_i & = S_{i+1} + e^{\mathbf{\beta^T x_i}} \label{eq:denominator_avg}\\
 V_i & = V_{i+1} +  \frac{H_{i}H_{i}^T }{ S_{i}^2 } -  \frac{V_{i}}{
   S_{i}} \label{eq:hessian}.
 \end{align}
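
As an illustrative aside, a minimal NumPy sketch (hypothetical names) of using these reverse cumulative sums to evaluate the gradient of the partial log-likelihood in linear time, with the censoring indicator delta_i included and rows of X assumed sorted by increasing survival time as above:

    import numpy as np

    def cox_gradient(X, status, beta):
        """Linear-time gradient of the Cox partial log-likelihood.
        `status` is the censoring indicator delta_i (1 = death, 0 = censored)."""
        exps = np.exp(X.dot(beta))                        # e^{beta^T x_j}
        # Reverse cumulative sums give S_i and H_i in one pass each:
        # S_i = S_{i+1} + e^{beta^T x_i},  H_i = H_{i+1} + x_i e^{beta^T x_i}.
        S = np.cumsum(exps[::-1])[::-1]
        H = np.cumsum((X * exps[:, None])[::-1], axis=0)[::-1]
        # Gradient: sum_i delta_i * (x_i - H_i / S_i).
        return np.sum(status[:, None] * (X - H / S[:, None]), axis=0)
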
@@ -225,7 +225,6 @@ can be computed in linear time.
 %Hessian must be computed as well.  Unfortunately, matrix inversion is an
 %expensive operation, and  cannot be parallelized.
 
-
 % SECTION: Stratification Support
 \section{Stratification Support}\label{cox:stratified}
 A crucial component of the Cox Proportional Hazards model is the proportional hazards assumption:
@@ -238,7 +237,7 @@ we use all independent variables as covariates. A stratified Cox regression
 model may then be useful. It offers a way such that we can choose a subset of
 independent variables as covariates while still taking the remaining
 independent variables into account. The stratified Cox regression is available
-in both R~\cite{r-cox} and Stata~\cite{stata-cox}.
+in both R~\cite{rcox} and Stata~\cite{statacox}.
 
 Stratification is used as shorthand for building a Cox model that allows for
 more than one stratum, and hence, allows for more than one baseline hazard
@@ -258,11 +257,11 @@ To explicitly clarify how stratification differentiates from grouping support:
     Cox model for each value of the categorical column, where the baseline
     hazards for each value of the categorical column would be different and
     the estimated coefficient values for each explanatory variable would be
-    \textbf{different} for each value of the categorical column.
+    different for each value of the categorical column.
     \item Stratifying by a categorical column would build a single
     common Cox model, where the baseline hazards for each value of the
     categorical column would be different, but the estimated coefficient values
-    for each explanatory variable would be the \textbf{same} for each value of the stratum.
+    for each explanatory variable would be the same for each value of the stratum.
 \end{itemize}
 
 It is valuable to emphasize that all strata share all coefficients, and that the
@@ -272,7 +271,7 @@ different strata.
 
 \subsection{Estimating A Stratified Cox Model}\label{cox:estimate-stratified}
 The parameter estimation is done by maximizing the product of likelihoods, each
-from a stratum~\cite{stratified-ethz-slides}.
+from a stratum~\cite{stratifiedethzslides}.
 
 Given $n$ observations, each with $m$ covariates and each in one of $K$
 strata~\footnote{Note that this does not mean that we have $K$ variables other
@@ -282,13 +281,13 @@ $\mathit{ST}_{k}$ denote the set of observations in the $k$-th stratum.
 Because, as an objective function, the sum of log-likelihoods is equivalent to
 the product of likelihoods, according to Equation \ref{cox:equ:LLH}, we have the
 log-likelihood associated with the $k$-th stratum,
-\begin{equation}\label{cox:equ:obj-each-stratum}L_{k} = \sum_{i \in \mathit{ST}_{k}} \left[  \bold{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\bold{\beta^T x_j} } \right) \right],
+\begin{equation}\label{cox:equ:obj-each-stratum}L_{k} = \sum_{i \in \mathit{ST}_{k}} \left[  \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right],
 \end{equation}
 where $R_k(t_i)$ is the set of observations in $\mathit{ST}_{k}$ still alive at time $t_i$.
 
 Therefore, the objective function of stratified Cox regression can be expressed as
 \begin{equation}\label{cox:equ:stratified-obj}
-L^{\mathit{stratified}} = \sum_{k=1}^{K} L_{k} = \sum_{k=1}^{K} \sum_{i \in \mathit{ST}_{k}} \left[  \bold{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\bold{\beta^T x_j} } \right) \right].
+L^{\mathit{stratified}} = \sum_{k=1}^{K} L_{k} = \sum_{k=1}^{K} \sum_{i \in \mathit{ST}_{k}} \left[  \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right].
 \end{equation}
 The appropriate changes to gradient, Hessian and censoring are trivial.
 
@@ -357,7 +356,7 @@ transformation of times: \texttt{``km''}, \texttt{``rank''},
 \texttt{``km''} stands for Kaplan-Meier's method.
 \texttt{``rank''} takes the ranks of the times instead of times.
 \texttt{``log''} uses the logarithm of times.
-And \texttt{``identity''} does nothing and directly uses times.\cite{cox-zph}}
+And \texttt{``identity''} does nothing and directly uses times.\cite{coxzph}}
 
 %To quantify whether there is a trend in the Schoenfeld residuals over
 %time, we can compute the correlation between the scaled residuals and the
@@ -428,7 +427,8 @@ times. Here $[\cdot]_{+}$ means sum over all observations with the
 same survival time.
 
 \subsection{Robust Variance Estimators}
-In MADlib, we implement robust variance estimator devised by Lin and Wei~\cite{lin1989robust}.
+In MADlib, we implement the robust variance estimator devised by
+Lin and Wei~\cite{lin1989robust}.
 With our notations above, let
 \begin{eqnarray*}
     \vec{W}_{i}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/modules/crf.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/crf.tex b/doc/design/modules/crf.tex
index 9a1858d..6bdc8a4 100644
--- a/doc/design/modules/crf.tex
+++ b/doc/design/modules/crf.tex
@@ -276,7 +276,7 @@ The following table depicts how the training/testing data is stored in the datab
 \begin {table}[h]
 \caption {Training or Testing data} \label{tab:trainingdata}
 \begin{center}
-    \footnotesize\tt
+    \footnotesize
 \begin{tabular}{llll||llll}
   start\_pos & doc\_id & seg\_text & label & start\_pos & doc\_id & seg\_text & label\\
   \hline
@@ -420,7 +420,7 @@ The dictionary contains all the tokens and the number of times they appear in t
 \begin {table}[h]
 \caption {Feature table} \label{tab:featuretbl}
 \begin{center}
-    \scriptsize\tt
+    \scriptsize
     \begin{tabular}{ | l | l | l | l | l || l | l | l | l | l | }
     \hline
     f\_index & f\_name & prev\_label & label & weight & f\_index & f\_name & prev\_label & label & weight\\
@@ -441,7 +441,7 @@ The dictionary contains all the tokens and the number of times they appear in t
 \begin {table}[h]
 \caption {Dictionary table} \label{tab:title}
 \begin{center}
-    \scriptsize\tt
+    \scriptsize
     \begin{tabular}{ | l | l || l | l || l | l || l | l | }
     \hline
     token     & total & token   & total & token     & total & token       & total\\
@@ -514,7 +514,7 @@ The dictionary contains all the tokens and the number of times they appear in t
 \begin {table}[h]
 \caption {viterbi\_mtbl table} \label{tab:viterbimtbl}
  \begin{center}
-%\tiny\tt
+%\tiny
   \begin{tabular}{l*{6}{c}r}
    token             & 0   & 1   & 2   & 3   & ... & 43 &  44 \\
    \hline
@@ -532,7 +532,7 @@ The dictionary contains all the tokens and the number of times they appear in t
 \begin {table}[h]
 \caption {viterbi\_rtbl table} \label{tab:viterbirtbl}
  \begin{center}
-%\tiny\tt
+%\tiny
   \begin{tabular}{l*{6}{c}r}
    token             & 0   & 1   & 2   & 3   & ... & 43 &  44 \\
    \hline
@@ -577,7 +577,7 @@ observed input sequence.
 \begin {table}[h]
 \caption {Viterbi inference output}
 \begin{center}
-    \scriptsize\tt
+    \scriptsize
     \begin{tabular}{ | l | l | l | l | l | }
     \hline
     doc\_id & start\_pos & token & label & probability     \\\hline

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/modules/lda.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/lda.tex b/doc/design/modules/lda.tex
index 5e4c271..c3fd0ac 100644
--- a/doc/design/modules/lda.tex
+++ b/doc/design/modules/lda.tex
@@ -42,7 +42,7 @@ big datasets.
 Although the derivation of Gibbs sampling for LDA is complicated, the results are very simple. The following equation tells us how to sample a new topic for a word in a corpus:
 
 \begin{equation}
-P(z_i=k|{\bf z}_{-i},{\bf w}) \propto \frac{nwz_{-i,k}^{(w_i)} +\beta}{nz_{-i,k}+W \beta}\times(ndz_{-i,k}^{(d_i)}+\alpha)
+P(z_i=k|\textbf{z}_{-i},\textbf{w}) \propto \frac{nwz_{-i,k}^{(w_i)} +\beta}{nz_{-i,k}+W \beta}\times(ndz_{-i,k}^{(d_i)}+\alpha)
 \end{equation}
 where:
 \begin{itemize}
@@ -51,8 +51,8 @@ where:
 \item $w_i$ - wordid of the $i_{th}$ word
 \item $k$ - the $k_{th}$ topic, where $1 <= k <= T$, and $T$ is the number of topics
 \item $z_i$ - topic assignment for the $i_{th}$ word
-\item ${\bf z}_{-i}$ - topic assignments for other words excluding the $i_{th}$ word
-\item ${\bf w}$ - all words in the corpus
+\item $\textbf{z}_{-i}$ - topic assignments for other words excluding the $i_{th}$ word
+\item $\textbf{w}$ - all words in the corpus
 \item $ndz$ - per-document topic counts
 \item $nwz$ - per-word topic counts
 \item $nz$ - corpus-level topic counts

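The sampling equation in the hunk above maps directly onto a few lines of code. A minimal sketch, assuming dense count arrays ndz (documents x topics), nwz (words x topics) and nz (topics), with W taken to be the vocabulary size and the counts already excluding the current word's old assignment; this is illustrative only, not MADlib's parallel implementation:

    import numpy as np

    def sample_topic(w_i, d_i, ndz, nwz, nz, alpha, beta, W):
        # Unnormalized conditional P(z_i = k | z_{-i}, w) for every topic k.
        p = (nwz[w_i, :] + beta) / (nz + W * beta) * (ndz[d_i, :] + alpha)
        p /= p.sum()                         # normalize to a proper distribution
        return np.random.choice(len(p), p=p)
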
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/modules/linear-systems.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/linear-systems.tex b/doc/design/modules/linear-systems.tex
index 1299370..b64f2af 100644
--- a/doc/design/modules/linear-systems.tex
+++ b/doc/design/modules/linear-systems.tex
@@ -83,7 +83,7 @@ the conjugate gradient method (CG) and the randomized Kaczmarz (RK) method.
 \subsection{Conjugate Gradient (CG)}
 The linear conjugate gradient method (not to be confused with the non-linear
 conjugate gradient method) is used to solve large sparse linear systems with a
-{\it symmetric positive definite} $A$ matrix. Such a system can be stated as:
+\textit{symmetric positive definite} $A$ matrix. Such a system can be stated as:
 \begin{equation}
   \label{eq:sym_linear_system}
   Ax = b

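A textbook sketch of the linear conjugate gradient iteration for a symmetric positive definite A, included only to make the description above concrete (MADlib's distributed version differs):

    import numpy as np

    def conjugate_gradient(A, b, tol=1e-8, max_iter=1000):
        x = np.zeros_like(b, dtype=float)
        r = b - A @ x                      # residual
        p = r.copy()                       # search direction
        rs_old = r @ r
        for _ in range(max_iter):
            Ap = A @ p
            alpha = rs_old / (p @ Ap)
            x = x + alpha * p
            r = r - alpha * Ap
            rs_new = r @ r
            if np.sqrt(rs_new) < tol:
                break
            p = r + (rs_new / rs_old) * p  # next A-conjugate direction
            rs_old = rs_new
        return x
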
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/design/modules/regression.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/regression.tex b/doc/design/modules/regression.tex
index 47ede86..e241f9a 100644
--- a/doc/design/modules/regression.tex
+++ b/doc/design/modules/regression.tex
@@ -42,10 +42,10 @@ Regression models involve the following variables:
 \section{Multinomial Logistic Regression}\label{sec:multi_intro}
 
 Multinomial logistic regression is a widely used regression analysis tool that
-models the outcomes of categorical dependent random variables. {\it Generalized
-linear models} identify key ideas shared by a broad class of distributions
-thereby extending techniques used in linear regression, into the field of
-logistic regression.
+models the outcomes of categorical dependent random variables.
+\textit{Generalized linear models} identify key ideas shared by a broad class of
+distributions, thereby extending techniques used in linear regression into the
+field of logistic regression.
 
 This document provides an overview of the theory of multinomial logistic
 regression models followed by a design specification of how the model can be
@@ -64,7 +64,7 @@ variable $Z_i$.  Analogous to the observed values, we define a set of parameters
 $\bS{\pi}$  as an $N\times J$ matrix with each entry $\pi_{i,j}$ denoting the
 probability of observing the random variable $Z_i$ to fall in the $j^{th}$
 category. In logistic regression, we assume that the random variables $\bS{Z}$
-are explained by a {\it design matrix} of independent random variables $\bS{X}$
+are explained by a \textit{design matrix} of independent random variables $\bS{X}$
 which contains $N$ rows and $(K+1)$ columns. We define a regression coefficient
 $\bS{\beta}$ as a matrix with $K+1$ rows and $J$ columns such that $\beta_{k,j}$
 corresponds to the importance while predicting the $j^{th}$ value of the
@@ -408,16 +408,16 @@ FISTA, and the other is IGD.
 
 \paragraph{FISTA}
 
-Fast Iterative Shrinkage Thresholding Algorithm (FISTA) with {\bf
-  backtracking step size} [4]:
+Fast Iterative Shrinkage Thresholding Algorithm (FISTA) with
+\textbf{backtracking step size} [4]:
 \vspace{0.2in}
 \hrule
 \vspace{0.2in}
-{\bf Step $0$}: Choose $\delta>0$ and $\eta > 1$, and
+\textbf{Step $0$}: Choose $\delta>0$ and $\eta > 1$, and
 $\vec{w}^{(0)} \in R^N$. Set $\vec{v}^{(1)}=\vec{w}^{(0)}$ and
 $t_1=1$.
 
-{\bf Step $k$}: ($k \ge 1$) Find the smallest nonnegative integers
+\textbf{Step $k$}: ($k \ge 1$) Find the smallest nonnegative integers
 $i_k$ such that with $\bar{\delta} = \delta_{k-1}/\eta^{i_k-1}$
 \begin{equation}
 F(p_{\bar{\delta}}(\vec{v}^{(k)})) \le
@@ -489,14 +489,14 @@ where
 \vec{u} = \vec{v} - \delta\,\nabla f(\vec{v})\ .
 \end{equation}
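
When the penalty is the L1 norm, the proximal operator p_delta applied to the gradient step above reduces to elementwise soft-thresholding. A generic sketch under that assumption, not the actual elastic-net code:

    import numpy as np

    def proximal_gradient_step(v, grad_f, delta, lam):
        u = v - delta * grad_f(v)                        # gradient step on f
        # Soft-thresholding: proximal operator of delta * lam * ||.||_1
        return np.sign(u) * np.maximum(np.abs(u) - delta * lam, 0.0)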
 
-{\bf Active set method} is used in our implementation for FISTA to speed up the
+\textbf{Active set method} is used in our implementation for FISTA to speed up the
 computation. Considerable speedup is obtained by organizing the iterations
 around the active set of features - those with nonzero coefficients. After a
 complete cycle through all the variables, we iterate on only the active set till
 convergence. If another complete cycle does not change the active set, we are
 done, otherwise the process is repeated.
 
-{\bf Warm-up method} is also used to speed up the computation. When the option
+\textbf{Warm-up method} is also used to speed up the computation. When the option
 parameter $warmup$ is set to be $True$, a series of lambda values, which is
 strictly descending and ends at the lambda value that the user wants to calculate,
 will be used. The larger lambda gives a very sparse solution, and the sparse
@@ -505,7 +505,7 @@ which will speed up the computation for the next lambda. For larger data sets,
 this can sometimes accelerate the whole computation and might be faster than
 computation on only one lambda value.
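A schematic of the warm-up idea, with solve_one_lambda standing in for a single FISTA or IGD run at a fixed lambda (hypothetical signature, for illustration only):

    import numpy as np

    def warmup_path(solve_one_lambda, X, y, lambda_target, n_steps=10):
        # Strictly decreasing sequence of lambda values ending at the target.
        lambdas = np.geomspace(100.0 * lambda_target, lambda_target, n_steps)
        w = np.zeros(X.shape[1])
        for lam in lambdas:
            # The sparse solution at a larger lambda seeds the next, smaller one.
            w = solve_one_lambda(X, y, lam, init=w)
        return w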
 
-{\bf Note:} Our implementation is a little bit different from the original
+\textbf{Note:} Our implementation is a little bit different from the original
 FISTA. In the original FISTA, during the backtracking procedure, the algorithm
 is searching for a non-negative integer $i_k$ and the new step size is $\delta_k
 = \delta_{k-1}/\eta^{i_k}$. Thus the step size is non-increasing. Here, we allow
@@ -538,7 +538,7 @@ Let $p = 2\log d$ and let $h^{-1}$ be as in Eq. (\ref{eq:f})
 
 Let $\vec{\theta} = \vec{0}$
 
-{\bf for} $k = 1,2,\dots$
+\textbf{for} $k = 1,2,\dots$
 
 \quad $\vec{w} = h^{-1}(\vec{\theta})$
 
@@ -571,9 +571,9 @@ ability to process large data set on multiple machines. So this
 algorithm provides an option  \emph{parallel} to let the user choose
 whether to do parallel computation.
 
-IGD also implements the {\bf warm-up method}.
+IGD also implements the \textbf{warm-up method}.
 
-{\bf Stopping Criteria} Both {\bf FISTA} and {\bf IGD} compute the average difference
+\textbf{Stopping Criteria} Both \textbf{FISTA} and \textbf{IGD} compute the average difference
 between the coefficients of two consecutive iterations, and if the
 difference is smaller than \emph{tolerance} or the iteration number
 is larger than \emph{max\_iter}, the computation stops.
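
The stopping rule just described amounts to a few lines, using the tolerance and max_iter parameters mentioned above (coefficient names are illustrative):

    import numpy as np

    def should_stop(w_new, w_old, iteration, tolerance, max_iter):
        # Average difference between coefficients of two consecutive iterations.
        avg_diff = np.mean(np.abs(w_new - w_old))
        return avg_diff < tolerance or iteration > max_iter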
@@ -595,9 +595,9 @@ speed, although we acquire the ability to process large data set on multiple
 machines. So this algorithm provides an option <em>parallel</em> to let the user
 choose whether to do parallel computation.
 
-IGD also implements the {\bf warm-up method}.
+IGD also implements the \textbf{warm-up method}.
 
-{\bf Stopping Criteria} Both {\bf FISTA} and {\bf IGD} compute the average
+\textbf{Stopping Criteria} Both \textbf{FISTA} and \textbf{IGD} compute the average
 difference between the coefficients of two consecutive iterations, and if the
 difference is smaller than <em>tolerance</em> or the iteration number is larger
 than <em>max\_iter</em>, the computation stops.
@@ -1288,9 +1288,9 @@ target function
 
 The meat part is different
 \begin{equation}
-  M(\vec{\beta}) = \bf{A}^T\bf{A}
+  M(\vec{\beta}) = \mathbf{A}^T\mathbf{A}
 \end{equation}
-where the $m$-th row of $\bf{A}$ is
+where the $m$-th row of $\mathbf{A}$ is
 \begin{equation}
   A_m = \sum_{i\in G_m}\frac{\partial
       l(y_i,\vec{x}_i,\vec{\beta})}{\partial \vec{\beta}}

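The meat matrix in the hunk above, with each row A_m summing per-observation gradients over a group G_m, can be sketched as follows (hypothetical dense inputs, not the in-database implementation):

    import numpy as np

    def clustered_meat(scores, groups):
        # scores: (n, p) array; row i is d l(y_i, x_i, beta) / d beta.
        # groups: length-n array of group labels defining the groups G_m.
        groups = np.asarray(groups)
        A = np.vstack([scores[groups == g].sum(axis=0)
                       for g in np.unique(groups)])
        return A.T @ A                     # M(beta) = A^T A
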
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ceafc8d7/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index 0aee99a..64f5b0d 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -58,29 +58,41 @@
 	Year = {2011},
 	Bdsk-Url-1 = {http://www.postgresql.org/docs/9.1}}
 
-@manual{r-cox,
+@manual{rcox,
 	Author = {John Fox},
-	Title = {{Cox Proportional-Hazards Regression for Survival Data: Appendix to An R and S-PLUS Companion to Applied Regression}},
-	Url = {http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf},
+	Title = {{Cox Proportional Hazards Regression for Survival Data:  Appendix to An R and S-PLUS Companion to Applied Regression}},
 	Year = {2002}}
 
-@manual{stata-cox,
-	Author = {{Stata Press}},
-	Title = {{stcox - Cox proportional hazards model}},
-	Url = {http://www.stata-press.com/manuals/st_stcox.pdf}}
+@manual{statacox,
+	Author = {Stata Press},
+	Title = {{stcox - Cox proportional hazards model}}
+}
 
-@manual{stratified-ethz-slides,
+@article{stratifiedethzslides,
 	Author = {Lisa Borsi and Marc Lickes and Lovro Soldo},
-	Title = {{The stratified Cox Procedure}},
-	Url = {http://stat.ethz.ch/education/semesters/ss2011/seminar/contents/presentation_5.pdf},
-	Year = {2011}}
+	Title = {The stratified Cox Procedure},
+	Volume = {27},
+	Year = {2011}
+}
 
-@manual{cox-zph,
-  author = {Test the Proportional Hazards Assumption of a Cox Regression},
-  title = {http://stat.ethz.ch/R-manual/R-devel/library/survival/html/cox.zph.html}}
+@manual{coxzph,
+  Title = {Test the Proportional Hazards Assumption of a Cox Regression (R manual for Cox Zph)}
+}
 
 @article{V84a,
-	Abstract = {{Several new methods are presented for selecting n records at random without replacement from a file containing N records. Each algorithm selects the records for the sample in a sequential manner---in the same order the records appear in the file. The algorithms are online in that the records for the sample are selected iteratively with no preprocessing. The algorithms require a constant amount of space and are short and easy to implement. The main result of this paper is the design and analysis of Algorithm D, which does the sampling in O(n) time, on the average; roughly n uniform random variates are generated, and approximately n exponentiation operations (of the form ab, for real numbers a and b) are performed during the sampling. This solves an open problem in the literature. CPU timings on a large mainframe computer indicate that Algorithm D is significantly faster than the sampling algorithms in use today.}},
+	Abstract = {{Several new methods are presented for selecting n records at
+	random without replacement from  a file containing N records. Each algorithm
+	selects the records for the sample in a sequential manner---in the same
+	order the records appear in the file. The algorithms are online in that the
+	records for the sample are selected iteratively with no preprocessing. The
+	algorithms require a constant amount of space and are short and easy to
+	implement. The main result of this paper is the design and analysis of
+	Algorithm D, which does the sampling in O(n) time, on the average; roughly n
+	uniform random variates are generated, and approximately n exponentiation
+	operations (of the form ab, for real numbers a and b) are performed during
+	the sampling. This solves an open problem in the literature. CPU timings on
+	a large mainframe computer indicate that Algorithm D is significantly faster
+	than the sampling algorithms in use today.}},
 	Author = {Vitter, Jeffrey Scott},
 	Date-Added = {2012-03-10 23:03:35 -0800},
 	Date-Modified = {2012-03-10 23:03:35 -0800},
@@ -862,7 +874,7 @@ Applied Survival Analysis},
 	journal={Proceedings of AISTATS},
 	year={2012}
 	}
-	
+
 @inproceedings{ShSS07,
 	title={Pegasos: Primal Estimated Sub-Gradient Solver for SVM},
 	author={Shalev-Shwartz, Shai and Singer, Yoram and Srebro, Nathan},
@@ -878,5 +890,5 @@ Applied Survival Analysis},
  number = {7},
  year = {2001},
  pages = {1443--1471},
-} 
-	
+}
+