%\VignetteIndexEntry{Supplement. Calculation of the cost matrix} %\VignetteDepends{tilingArray} %\VignetteKeywords{Expression Analysis} %\VignettePackage{tilingArray} \documentclass[11pt]{article} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rpackage}[1]{\textit{#1}} \newcommand{\Rfunction}[1]{\textit{#1}} \newcommand{\Rclass}[1]{\textit{#1}} \begin{document} %------------------------------------------------------------ \title{Calculation of the cost matrix} %------------------------------------------------------------ \author{Wolfgang Huber} \maketitle \section{Problem statement and definitions} Let $y_{nj}$ be the data value at position (genomic coordinate) $n=1,\ldots,N$ for replicate array $j=1,\ldots,J$. Hence we have $J$ arrays and sequences of length $N$. The goal of this note is to describe an $O(NJ)$ algorithm to calculate the cost matrix of a piecewise linear model for the segmentation of the $(1,\ldots,N)$ axis. It is implemented in the function \textit{costMatrix} in the package \textit{tilingArray}. The cost matrix is the input for a dynamic programming algorithm that finds the optimal (least squares) segmentation. The cost matrix $G_{km}$ is the sum of squared residuals for a segment from $m$ to $m+k-1$ (i.\,e.\ including $m+k-1$ but excluding $m+k$), \begin{equation}\label{eq:costFun} G_{km} := \sum_{j=1}^J \sum_{n=m}^{m+k-1} \left(y_{nj} - \hat{\mu}_{km} \right)^2 \end{equation} where $1\le m \le m+k-1\le N$ and $\hat{\mu}_{km}$ is the mean of that segment, \begin{equation}\label{eq:defmu} \hat{\mu}_{km} = \frac{1}{Jk} \sum_{j=1}^J \sum_{n=m}^{m+k-1} y_{nj}. \end{equation} \textit{Sidenote:} a perhaps more straightforward definition of a cost matrix would be $\bar{G}_{m'\,m} = G_{(m'-m)\,m}$, the sum of squared residuals for a segment from $m$ to $m'-1$. I use version~(\ref{eq:costFun}) because it makes it easier to use the condition of maximum segment length $(k<=k_{\mbox{\scriptsize max}})$, which I need to get the algorithm from complexity $O(N^2)$ to $O(N)$. \newpage \section{Algebra} \begin{eqnarray} \lefteqn{G_{km} = \sum_{j=1}^J \sum_{n=m}^{m+k-1} \left(y_{nj} - \hat{\mu}_{km} \right)^2} \\ % &=& \sum_{n,j} y_{nj}^2 - \frac{1}{Jk} \left(\sum_{n',j'} y_{n'j'}\right)^2 \\ % &=& \sum_n q_n - \frac{1}{Jk}\left(\sum_{n'} r_{n'}\right)^2 \label{Gbyqr} \end{eqnarray} with \begin{eqnarray} q_n&:=&\sum_j y_{nj}^2 \\ r_n&:=&\sum_j y_{nj} \end{eqnarray} If \texttt{y} is an $N\times J$ matrix, then the $N$-vectors \texttt{q} and \texttt{r} can be obtained by \begin{Sinput} q = rowSums(y*y) r = rowSums(y) \end{Sinput} Now define \begin{eqnarray} c_\nu &=& \sum_{n=1}^{\nu}r_n\\ d_\nu &=& \sum_{n=1}^{\nu}q_n \end{eqnarray} which be obtained from \begin{Sinput} c = cumsum(r) d = cumsum(q) \end{Sinput} then (\ref{Gbyqr}) becomes \begin{equation} (d_{m+k-1}-d_{m-1}) - \frac{1}{Jk}(c_{m+k-1}-c_{m-1})^2 \end{equation} %% \begin{equation} %% A_{km} = \frac{1}{J}\left(\sum_{n=m}^{m+k-1}r_n\right)^2, %% \end{equation} %% then %% \begin{eqnarray} %% A_{1m} &=& \frac{r_n^2}{J} \\ %% A_{km} &=& \frac{1}{J}\left( %% A_{k-1,m} + r_{m+k-1}^2 + 2 r_{m+k-1} %% \underbrace{\left( \sum_{n=m}^{m+k-2}r_n\right)}_{=:\;c_{m+k-2}-c_{m-1}} %% \right) %% \end{eqnarray} %% where %% Thus we can construct a recursion for $G_{km}$, which can be used to %% build it up row by row: %% \begin{eqnarray} %% G_{1m} &=& \frac{q_n}{J} - \frac{r_n^2}{J^2} \\ %% \end{eqnarray} \end{document}