%\VignetteIndexEntry{Likelihood Calculations for vsn}
%\VignetteDepends{vsn}
%\VignetteKeywords{Expression Analysis}
%\VignettePackage{vsn}
\documentclass[11pt]{article}
\usepackage{color}
\usepackage{amssymb}
\definecolor{darkblue}{rgb}{0.0,0.0,0.75}
\usepackage[%
baseurl={http://www.bioconductor.org},%
pdftitle={Likelihood Calculations for vsn},%
pdfauthor={Wolfgang Huber},%
pdfsubject={vsn},%
pdfkeywords={Bioconductor},%
pagebackref,bookmarks,colorlinks,linkcolor=darkblue,citecolor=darkblue,%
pagecolor=darkblue,raiselinks,plainpages,pdftex]{hyperref}
\usepackage{geometry}
%------------------------------------------------------------
% newcommand
%------------------------------------------------------------
\newcommand{\arsinh}{\mathop{\mathgroup\symoperators arsinh}\nolimits}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rpackage}[1]{\textit{#1}}
\newcommand{\mbs}[1]{{\mbox{\scriptsize #1}}}

\begin{document}
%---------------------------------------------------------------------------
\title{Likelihood Calculations for \Rpackage{vsn}}
%---------------------------------------------------------------------------
\author{Wolfgang Huber}
\maketitle
\tableofcontents

\section{Introduction}
This vignette contains the computations that underlie the numerical code of
\Rpackage{vsn}. If you are a new user looking for an introduction on how to
\textbf{use} \Rpackage{vsn}, please refer to the vignette \emph{Robust
calibration and variance stabilization with \Rpackage{vsn}}, which is
provided separately.

\section{Setup and Notation}
Consider the model
\begin{equation}\label{eq:model}
\arsinh\left(f(b_i)\cdot y_{ki}+a_i\right) = \mu_k + \varepsilon_{ki}
\end{equation}
where $\mu_k$, for $k=1,\ldots,n$, and $a_i$, $b_i$, for $i=1,\ldots,d$, are
real-valued parameters, $f$ is a function $\mathbb{R}\to\mathbb{R}$ (see
below), and $\varepsilon_{ki}$ are i.i.d.\ Normal with mean 0 and variance
$\sigma^2$. The $y_{ki}$ are the data. In applications to $\mu$array data,
$k$ indexes the features and $i$ the arrays and/or colour channels.

Examples for $f$ are $f(b)=b$ and $f(b)=e^b$. The former is the most obvious
choice; in that case we will usually need to require $b_i>0$. The choice
$f(b)=e^b$ ensures that the factor in front of $y_{ki}$ is positive for all
$b\in\mathbb{R}$ and, as it turns out, simplifies some of the computations.
In the following calculations, I will also use the notation
\begin{eqnarray}
Y \equiv Y(y,a,b) &=& f(b)\cdot y+a\\
h \equiv h(y,a,b) &=& \arsinh\left(f(b)\cdot y+a\right).
\end{eqnarray}
The probability of the data $(y_{ki})_{k=1\ldots n,\;i=1\ldots d}$ lying in a
certain volume element of $y$-space (hyperrectangle with sides
$[y_{ki}^\alpha,y_{ki}^\beta]$) is
\begin{equation}
P=\prod_{k=1}^n\prod_{i=1}^d
\int\limits_{y_{ki}^\alpha}^{y_{ki}^\beta} dy_{ki}\;\;
p_{\mbs{Normal}}(h(y_{ki}),\mu_k,\sigma^2)\;\;
\frac{dh}{dy}(y_{ki}),
\end{equation}
where $\mu_k$ is the expectation value for feature $k$ and $\sigma^2$ the
variance. With
\begin{equation}
p_{\mbs{Normal}}(x,\mu,\sigma^2)=\frac{1}{\sqrt{2\pi\sigma^2}}
\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
\end{equation}
the likelihood is
\begin{equation}\label{eq:likelihood}
L=\left(\frac{1}{\sqrt{2\pi\sigma^2}}\right)^{nd}
\prod_{k=1}^n \prod_{i=1}^d
\exp\left(-\frac{(h(y_{ki})-\mu_k)^2}{2\sigma^2}\right)
\cdot\frac{dh}{dy}(y_{ki})\,.
\end{equation}
For the following, I will need the derivatives
\begin{eqnarray}
\frac{\partial Y}{\partial a}&=&1\\
\frac{\partial Y}{\partial b}&=&y\cdot f'(b)\\
\frac{dh}{dy}&=& \frac{f(b)}{\sqrt{1+(f(b)y+a)^2}}= \frac{f(b)}{\sqrt{1+Y^2}},\\
\frac{\partial h}{\partial a}&=&\frac{1}{\sqrt{1+Y^2}},\\
\frac{\partial h}{\partial b}&=&\frac{y}{\sqrt{1+Y^2}}\cdot f'(b).
\end{eqnarray}
Note that for $f(b)=b$, we have $f'(b)=1$, and for $f(b)=e^b$,
$f'(b)=f(b)=e^b$.

%----------------------------------------------------------------
\section{Likelihood for Incremental Normalization}\label{sec:inc}
%----------------------------------------------------------------
Here, \textit{incremental normalization} means that the model parameters
$\mu_1,\ldots,\mu_n$ and $\sigma^2$ are already known from a fit to a previous
set of $\mu$arrays, i.\,e.\ a set of reference arrays. See
Section~\ref{sec:prof} for the profile likelihood approach that is used if
$\mu_1,\ldots,\mu_n$ and $\sigma^2$ are not known and need to be estimated
from the same data. Versions $\ge2.0$ of the \Rpackage{vsn} package implement
both of these approaches; in versions $1.X$, only the profile likelihood
approach was implemented, and it was described in the initial
publication~\cite{HuberISMB2002}.

First, let us note that since $\mu_1,\ldots,\mu_n$ and $\sigma^2$ are known,
the likelihood~(\ref{eq:likelihood}) is simply a product of independent terms
for different $i$, so we can optimize the parameters $(a_i,b_i)$ separately
for each $i=1,\ldots,d$. From the likelihood~(\ref{eq:likelihood}) we get the
$i$-th negative log-likelihood
\begin{eqnarray}\label{eq:nll}
-\log(L) &=&\sum_{i=1}^d \left(-LL_i\right)\\
-LL_i&=&\frac{n}{2}\log\left(2\pi\sigma^2\right)+
\sum_{k=1}^n \left(\frac{(h(y_{ki})-\mu_k)^2}{2\sigma^2}
+\log\frac{\sqrt{1+Y_{ki}^2}}{f(b_i)}\right)\\
&=&\frac{n}{2}\log\left(2\pi\sigma^2\right)
-n\log f(b_i)
+\sum_{k=1}^n\left(\frac{(h(y_{ki})-\mu_k)^2}{2\sigma^2}
+\frac{1}{2}\log\left(1+Y_{ki}^2\right)\right).
\end{eqnarray}
This is the function of $a_i$ and $b_i$ that we want to optimize. The
optimizer benefits from being supplied with the analytical derivatives. The
derivative with respect to $a_i$ is
\begin{eqnarray}
\frac{\partial}{\partial a_i}(-LL_i) &=&
\sum_{k=1}^n \left( \frac{h(y_{ki})-\mu_k}{\sigma^2}
+\frac{Y_{ki}}{\sqrt{1+Y_{ki}^2}} \right)
\cdot\frac{1}{\sqrt{1+Y_{ki}^2}} \nonumber\\
&=& \sum_{k=1}^n \left(\frac{r_{ki}}{\sigma^2}+A_{ki}Y_{ki}\right)A_{ki}
\label{eq:ddanll}
\end{eqnarray}
and with respect to $b_i$
\begin{eqnarray}
\frac{\partial}{\partial b_i}(-LL_i) &=&
-n\frac{f'(b_i)}{f(b_i)}
+\sum_{k=1}^n \left( \frac{h(y_{ki})-\mu_k}{\sigma^2}
+\frac{Y_{ki}}{\sqrt{1+Y_{ki}^2}}\right)
\cdot\frac{y_{ki}}{\sqrt{1+Y_{ki}^2}}\cdot f'(b_i) \nonumber\\
&=& -n\frac{f'(b_i)}{f(b_i)}
+f'(b_i)\sum_{k=1}^n \left(\frac{r_{ki}}{\sigma^2}+A_{ki}Y_{ki}\right)
A_{ki}y_{ki}.
\label{eq:ddbnll}
\end{eqnarray}
Here, I have introduced the shorthand notation
\begin{eqnarray}
r_{ki}&=& h(y_{ki})-\mu_k\\
A_{ki}&=&\frac{1}{\sqrt{1+Y_{ki}^2}}
\end{eqnarray}
for the intermediate results. Variables for these intermediate values are also
used in the C code to organise the computation of the gradient.
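The following small R sketch is not the implementation used in the package
(that is written in C; the function names \texttt{negLL}, \texttt{gradNegLL}
and \texttt{fd} below are ad hoc); it merely illustrates the formulas above
for the choice $f(b)=e^b$, for which $-n\log f(b_i)=-nb_i$ and
$f'(b_i)=f(b_i)$. It evaluates $-LL_i$ from Equation~(\ref{eq:nll}) and the
gradient from Equations~(\ref{eq:ddanll}) and~(\ref{eq:ddbnll}), and compares
the analytical gradient with a central finite-difference approximation on
simulated data.
\begin{verbatim}
## Sketch only -- not the package's C implementation. f(b) = exp(b).
negLL <- function(par, y, mu, sigsq) {
  a <- par[1]; b <- par[2]
  Y <- exp(b) * y + a
  r <- asinh(Y) - mu                        # residuals r_ki
  length(y)/2 * log(2*pi*sigsq) - length(y)*b +
    sum(r^2 / (2*sigsq) + 0.5 * log(1 + Y^2))
}

gradNegLL <- function(par, y, mu, sigsq) {
  a <- par[1]; b <- par[2]
  Y <- exp(b) * y + a
  r <- asinh(Y) - mu
  A <- 1 / sqrt(1 + Y^2)                    # A_ki
  s <- (r/sigsq + A*Y) * A                  # common factor of both derivatives
  c(a = sum(s),                             # Equation (ddanll)
    b = -length(y) + exp(b) * sum(s * y))   # Equation (ddbnll), f(b) = exp(b)
}

## central finite differences, to check the analytical gradient
fd <- function(f, par, ..., eps = 1e-6)
  sapply(seq_along(par), function(j) {
    e <- replace(numeric(length(par)), j, eps)
    (f(par + e, ...) - f(par - e, ...)) / (2*eps)
  })

set.seed(1)
n  <- 100
mu <- runif(n, 4, 10)                       # known in this section
y  <- (sinh(mu + rnorm(n, 0, 0.1)) - 0.2) / exp(-5)  # data from the model
p  <- c(a = 0.3, b = -4.9)
gradNegLL(p, y, mu, sigsq = 0.01)
fd(negLL, p, y = y, mu = mu, sigsq = 0.01)  # should agree closely
\end{verbatim}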
%--------------------------------------------------
\section{Profile Likelihood}\label{sec:prof}
%--------------------------------------------------
If $\mu_1,\ldots,\mu_n$ and $\sigma^2$ are not already known, we can plug in
their maximum likelihood estimates, obtained by optimizing $LL$ with respect
to $\mu_1,\ldots,\mu_n$ and $\sigma^2$,
\begin{eqnarray}
\hat{\mu}_k &=& \frac{1}{d}\sum_{j=1}^d h(y_{kj})\label{eq:muhat}\\
\hat{\sigma}^2 &=& \frac{1}{nd}\sum_{k=1}^n\sum_{j=1}^d
(h(y_{kj})-\hat{\mu}_k)^2\label{eq:sigmahat}
\end{eqnarray}
into the negative log-likelihood. The result is called the negative profile
log-likelihood
\begin{equation}\label{eq:npll}
-PLL= \frac{nd}{2}\log\left(2\pi\hat{\sigma}^2\right)
+\frac{nd}{2}
-n\sum_{j=1}^d\log f(b_j)
+\frac{1}{2}\sum_{k=1}^n\sum_{j=1}^d \log\left(1+Y_{kj}^2\right).
\end{equation}
Note that this no longer decomposes into a sum of terms for each $j$ that are
independent of each other -- the terms for different $j$ are coupled through
Equations~(\ref{eq:muhat}) and (\ref{eq:sigmahat}).

We need the following derivatives; the terms involving
$\partial\hat{\mu}_k/\partial a_i$ and $\partial\hat{\mu}_k/\partial b_i$ drop
out because $\sum_{j=1}^d r_{kj}=0$ by Equation~(\ref{eq:muhat}).
\begin{eqnarray}
\frac{\partial \hat{\sigma}^2}{\partial a_i} &=&
\frac{2}{nd}\sum_{k=1}^n r_{ki}\frac{\partial h(y_{ki})}{\partial a_i}\nonumber\\
&=& \frac{2}{nd} \sum_{k=1}^n r_{ki}A_{ki}\\
\frac{\partial \hat{\sigma}^2}{\partial b_i} &=&
\frac{2}{nd}\cdot f'(b_i) \sum_{k=1}^n r_{ki}A_{ki}y_{ki}
\end{eqnarray}
So, finally,
\begin{eqnarray}
\frac{\partial}{\partial a_i}(-PLL) &=&
\frac{nd}{2\hat{\sigma}^2}\cdot \frac{\partial \hat{\sigma}^2}{\partial a_i}
+\sum_{k=1}^n A_{ki}^2Y_{ki}\nonumber\\
&=&\sum_{k=1}^n
\left(\frac{r_{ki}}{\hat{\sigma}^2}+A_{ki}Y_{ki}\right)A_{ki}
\label{eq:ddanpll}\\
\frac{\partial}{\partial b_i}(-PLL) &=&
-n\frac{f'(b_i)}{f(b_i)} + f'(b_i) \sum_{k=1}^n
\left(\frac{r_{ki}}{\hat{\sigma}^2}+A_{ki}Y_{ki}\right)A_{ki}y_{ki}
\label{eq:ddbnpll}
\end{eqnarray}

%--------------------------------------------------
\newpage
\section{Summary}\label{sec:ggu}
%--------------------------------------------------
Likelihoods, from Equations~(\ref{eq:nll}) and (\ref{eq:npll}):
\begin{eqnarray}
-LL_i&=&
\underbrace{%
\frac{n}{2}\log\left(2\pi\sigma^2\right)
}_{\mbox{scale}} +
\underbrace{%
\sum_{k=1}^n \frac{(h(y_{ki})-\mu_k)^2}{2\sigma^2}
}_{\mbox{residuals}}
\underbrace{%
-n\log f(b_i) + \frac{1}{2}\sum_{k=1}^n \log(1+Y_{ki}^2)
}_{\mbox{jacobian}}\\
-PLL&=&
\underbrace{%
\frac{nd}{2}\log\left(2\pi\hat{\sigma}^2\right)
}_{\mbox{scale}}+
\underbrace{%
\frac{nd}{2}
}_{\mbox{residuals}} +
\underbrace{%
\sum_{i=1}^d\left( -n\log f(b_i) +
\frac{1}{2}\sum_{k=1}^n \log(1+Y_{ki}^2)\right)
}_{\mbox{jacobian}}
\end{eqnarray}
The computations in the C code are organised into steps for computing the
terms ``scale'', ``residuals'' and ``jacobian''.

Partial derivatives with respect to $a_i$, from Equations~(\ref{eq:ddanll})
and (\ref{eq:ddanpll}):
\begin{eqnarray}
\frac{\partial}{\partial a_i}(-LL_i) &=&
\sum_{k=1}^n \left(\frac{r_{ki}}{\sigma^2}+A_{ki}Y_{ki}\right)A_{ki}\\
%
\frac{\partial}{\partial a_i}(-PLL) &=&
\sum_{k=1}^n \left(\frac{r_{ki}}{\hat{\sigma}^2}+A_{ki}Y_{ki}\right)A_{ki}
\end{eqnarray}
Partial derivatives with respect to $b_i$, from Equations~(\ref{eq:ddbnll})
and (\ref{eq:ddbnpll}):
\begin{eqnarray}
\frac{\partial}{\partial b_i}(-LL_i) &=&
-n\frac{f'(b_i)}{f(b_i)}
+f'(b_i)\sum_{k=1}^n
\left(\frac{r_{ki}}{\sigma^2}+A_{ki}Y_{ki}\right)A_{ki}y_{ki}\\
%
\frac{\partial}{\partial b_i}(-PLL) &=&
-n\frac{f'(b_i)}{f(b_i)}
+f'(b_i)\sum_{k=1}^n
\left(\frac{r_{ki}}{\hat{\sigma}^2}+A_{ki}Y_{ki}\right)A_{ki}y_{ki}.
\end{eqnarray}
Note that the terms have many similarities -- this is exploited in the
implementation in the C code.

\begin{thebibliography}{10}
\bibitem{HuberISMB2002}
W. Huber, A. von Heydebreck, H. {S\"ultmann}, A. Poustka, and M. Vingron.
\newblock Variance stabilization applied to microarray data calibration and
to the quantification of differential expression.
\newblock \textit{Bioinformatics}, 18:S96--S104, 2002.

\bibitem{HuberSAGMB2003}
W. Huber, A. von Heydebreck, H. {S\"ultmann}, A. Poustka, and M. Vingron.
\newblock Parameter estimation for the calibration and variance stabilization
of microarray data.
\newblock \textit{Statistical Applications in Genetics and Molecular Biology},
2(1), Article 3, 2003.
\newblock \url{http://www.bepress.com/sagmb/vol2/iss1/art3}
\end{thebibliography}
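As a closing sanity check, here is an R sketch analogous to the one in
Section~\ref{sec:inc} (again ad hoc code, not the package's C implementation,
and again for $f(b)=e^b$). It evaluates $-PLL$ from Equation~(\ref{eq:npll}),
recomputing $\hat{\mu}_k$ and $\hat{\sigma}^2$ from Equations~(\ref{eq:muhat})
and~(\ref{eq:sigmahat}) internally, and confirms numerically that
Equations~(\ref{eq:ddanpll}) and~(\ref{eq:ddbnpll}) give its exact gradient.
\begin{verbatim}
## Sketch only. par = c(a_1,...,a_d, b_1,...,b_d); y is an n x d matrix.
negPLL <- function(par, y) {
  d <- ncol(y); n <- nrow(y)
  a <- par[1:d]; b <- par[d + (1:d)]
  Y <- sweep(sweep(y, 2, exp(b), `*`), 2, a, `+`)
  h <- asinh(Y)
  s2hat <- mean((h - rowMeans(h))^2)        # Equations (muhat), (sigmahat)
  n*d/2 * log(2*pi*s2hat) + n*d/2 - n*sum(b) + 0.5*sum(log(1 + Y^2))
}

gradNegPLL <- function(par, y) {            # Equations (ddanpll), (ddbnpll)
  d <- ncol(y); n <- nrow(y)
  a <- par[1:d]; b <- par[d + (1:d)]
  Y <- sweep(sweep(y, 2, exp(b), `*`), 2, a, `+`)
  h <- asinh(Y)
  r <- h - rowMeans(h)
  A <- 1 / sqrt(1 + Y^2)
  S <- (r / mean(r^2) + A*Y) * A
  c(colSums(S), -n + exp(b) * colSums(S * y))
}

fd <- function(f, par, ..., eps = 1e-6)     # central finite differences
  sapply(seq_along(par), function(j) {
    e <- replace(numeric(length(par)), j, eps)
    (f(par + e, ...) - f(par - e, ...)) / (2*eps)
  })

set.seed(1)
mu <- runif(100, 4, 10)
y  <- sapply(c(-5, -4.8), function(b) sinh(mu + rnorm(100, 0, 0.1)) / exp(b))
p  <- c(0.1, -0.1, -4.9, -4.7)              # (a_1, a_2, b_1, b_2)
max(abs(gradNegPLL(p, y) - fd(negPLL, p, y = y)))  # tiny: numerical error only
\end{verbatim}

\end{document}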