%
% NOTE -- ONLY EDIT THE .Rnw FILE!!!  The .tex file is
% likely to be overwritten.
%
%
% \VignetteIndexEntry{vbmp Tutorial}
% \VignetteDepends{}
% \VignetteKeywords{}
% \VignettePackage{vbmp}
\documentclass[11pt]{article}
\usepackage{amsmath,epsfig,fullpage}
\usepackage{graphicx}
\usepackage[authoryear,round]{natbib}
\usepackage{hyperref}

\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}

%\parindent 0in

\bibliographystyle{plainnat}

\begin{document}

\title{Variational Bayesian Multinomial Probit Regression for Gaussian Process multi-class classification}
\author{Nicola Lama\\Mark Girolami}
\maketitle

\section*{Introduction}
The \Rpackage{vbmp} package implements multinomial probit regression with Gaussian Process priors and
estimates class-membership posterior probabilities employing variational and sparse approximations to the
full posterior. This software also incorporates feature weighting by means of Automatic Relevance
Determination.\\
For a more detailed description of classification using the multinomial probit regression model with
Gaussian Process priors, refer to \citet{Girolami2006}.\\
To illustrate the potential of the method, two examples are presented: one on a synthetic toy dataset and
one on literature data from the breast cancer study by \citet{Kote-Jarai2006}.

\section*{The \Rfunction{vbmp} function}
The \Rfunction{vbmp} function carries out multi-class classification training and testing.
The \Rfunarg{control} parameter list provides optional control settings.
The default Gaussian Process covariance functions are radial basis-style functions, but other choices are
possible. Whether or not to infer the scale parameters of the covariance functions is an option
(\Rfunarg{bThetaEstimate=T}). The target values \Rfunarg{t.class} and \Rfunarg{t.class.TEST} can be factors
or integer values in \texttt{1:n}, where \texttt{n} is the number of classes.
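As a quick orientation, the call below sketches the interface just described; it is not evaluated, and
\texttt{X.train}, \texttt{y.train}, \texttt{X.test} and \texttt{y.test} are hypothetical placeholder
objects standing in for the user's own data. The control options shown are the ones exercised later in
this tutorial.
<<eval=FALSE>>=
## Minimal sketch of a vbmp call (placeholder objects, not evaluated):
##   X.train, X.test : numeric feature matrices (one row per sample)
##   y.train, y.test : class labels, factors or integers in 1:n
theta <- rep(1., ncol(X.train));  # initial covariance function scale parameters
res <- vbmp(X.train, y.train, X.test, y.test, theta,
            control=list(bThetaEstimate=TRUE,  # infer the scale parameters
                         bMonitor=TRUE,        # print convergence diagnostics
                         maxIts=24));          # maximum number of training iterations
predError(res);   # out-of-sample 0/1 error rate
predClass(res);   # predicted class labels for the test set
@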
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{The synthetic toy dataset}
In this illustrative example the dataset is composed of 2-D data points drawn from three nonlinearly
separable classes, which take the form of two annular rings and one zero-centered bivariate Gaussian with
spherical covariance matrix (see Fig.~\ref{fig:Example1}). Four additional random noise features were
added to the dataset.\\
Gaussian kernel functions were adopted, with the scale parameters inferred by the \Rfunction{vbmp} method
using vague hyperpriors (the Gamma prior over the covariance parameters defaults to $10^{-6}$).
Monitor diagnostics are depicted in Figure~\ref{fig:ARD}, where the progress of Automatic Relevance
Determination (ARD) \citep{Neal1998} is evident: the four irrelevant features are effectively removed from
the model.\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
<<>>=
## Init random number generator
set.seed(123);
@
Define a function to generate the dataset:
<<>>=
genSample <- function(n, noiseVar=0) {
    KF <- 10;            # magnify factor
    nt <- round(n/3, 0);
    ## class 1 and 2 (x ~ U(0,1))
    u <- 4. * matrix(runif(2*KF*n), nrow=KF*n, ncol=2) - 2.;
    i <- which(((u[, 1]^2 + u[, 2]^2) > .1) & ((u[, 1]^2 + u[, 2]^2) < .5));
    j <- which(((u[, 1]^2 + u[, 2]^2) > .6) & ((u[, 1]^2 + u[, 2]^2) < 1));
    X <- u[c(sample(i, nt), sample(j, nt)), ];
    t.class <- c(rep(1, nt), rep(2, nt));
    ## class 3 (x ~ N(0,1))
    x <- 0.1 * matrix(rnorm(2*KF*length(i)), ncol=2, nrow=length(i)*KF);
    k <- which((x[, 1]^2 + x[, 2]^2) < 0.1);
    nt <- n - 2*nt;
    X <- rbind(X, x[sample(k, nt), ]);
    t.class <- c(t.class, rep(3, nt));
    ## limit number
    #n <- min(n, nrow(X));
    i <- sample(1:nrow(X), n);
    X <- X[i, ];
    t.class <- t.class[i];
    ## add random noise columns
    if (noiseVar > 0)
        X <- cbind(X, matrix(rnorm(noiseVar*n), ncol=noiseVar, nrow=nrow(X)));
    structure(list(t.class=t.class, X=X), class="MultiNoisyData");
}
@
Set the number of additional noisy input parameters:
<<>>=
nNoisyInputs <- 4;
@
Set the sample sizes:
<<>>=
Ntrain <- 100;
Ntest <- Ntrain * 5;
@
Generate the training and test samples independently:
<<>>=
dataXtTrain <- genSample(Ntrain, nNoisyInputs);
dataXtTest  <- genSample(Ntest,  nNoisyInputs);
@
Initialize the covariance function scale parameters and the maximum number of training iterations:
<<>>=
theta <- rep(1., ncol(dataXtTrain$X));
max.train.iter <- 12;
@
Train on the training set, predict on the test set and monitor convergence:
<<>>=
library(vbmp);
res <- vbmp(dataXtTrain$X, dataXtTrain$t.class,
            dataXtTest$X, dataXtTest$t.class, theta,
            control=list(bThetaEstimate=T, bMonitor=T, maxIts=max.train.iter));
@
Print the out-of-sample prediction error:
<<>>=
predError(res);
@
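Beyond the overall error rate, the test-set predictions can be cross-tabulated against the true labels to
give a per-class breakdown; the following is a minimal sketch using \Rfunction{predClass} and base R's
\Rfunction{table}.
<<>>=
## cross-tabulate predicted vs. true test-set class labels
table(predicted=predClass(res), true=dataXtTest$t.class);
@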
\begin{figure}
\centering
<<fig=TRUE, echo=FALSE>>=
i.t1 <- which(dataXtTest$t.class == 1);
i.t2 <- which(dataXtTest$t.class == 2);
i.t3 <- which(dataXtTest$t.class == 3);
plot(dataXtTest$X[, 1], dataXtTest$X[, 2], type="n", xlab="X1", ylab="X2");
points(dataXtTest$X[i.t1, 1], dataXtTest$X[i.t1, 2], type="p", col="blue");
points(dataXtTest$X[i.t2, 1], dataXtTest$X[i.t2, 2], type="p", col="red");
points(dataXtTest$X[i.t3, 1], dataXtTest$X[i.t3, 2], type="p", col="green");
@
\caption{Scatter plot of the first two dimensions of the toy dataset. Classes are identified by different
colors.}
\label{fig:Example1}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}
\centering
<<fig=TRUE, echo=FALSE>>=
## plot convergence diagnostics (same as setting bPlotFitting=T)
plotDiagnostics(res);
@
\caption{Monitor diagnostics for the evolution of: posterior means of the Gaussian kernel scale parameters
(top-left), lower bound on the marginal likelihood (top-right), predictive likelihood (bottom-left) and
out-of-sample accuracy ($0/1$-error loss) (bottom-right).}
\label{fig:ARD}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\section*{Breast cancer dataset}
The dataset consists of 8080 genes and 30 samples from three classes of individuals carrying specific gene
mutations. As reported in \citet{Kote-Jarai2006}, using a linear inner product covariance function the
Leave-One-Out (LOO) error is zero, which improves on the multiclass SVM classifier adopted in that study.
<<>>=
library("Biobase");
data("BRCA12");
brca.y <- BRCA12$Target.class;
brca.x <- t(exprs(BRCA12));
@
Define a function that sets up the target class predictor used during cross-validation. It uses a linear
inner product covariance function, 24 training iterations and an iteration threshold of $10^{-8}$. All
theta values are initialized to $1$ and, in this case, it is not necessary to run the importance sampler
to obtain the feature weightings.
<<eval=FALSE>>=
predictVBMP <- function(x) {
    sKernelType <- "iprod";
    Thresh <- 1e-8;
    theta <- rep(1.0, ncol(brca.x));
    max.train.iter <- 24;
    resX <- vbmp(brca.x[!x, ], brca.y[!x], brca.x[x, ], brca.y[x], theta,
                 control=list(bThetaEstimate=F, bPlotFitting=F,
                              maxIts=max.train.iter,
                              sKernelType=sKernelType, Thresh=Thresh));
    predClass(resX);
}
@
<<echo=FALSE>>=
## fake predictVBMP to speed up R CMD check
predictVBMP <- function(x) {
    rep(0, sum(x));
}
@
In the following code, \Robject{Kfold} is the number of cross-validation folds; it is set equal to
\Robject{n}, the number of points in the dataset, in order to obtain the Leave-One-Out approach.
\Robject{res} collects the predicted targets.
<<>>=
n <- nrow(brca.x);
Kfold <- n;   # number of folds; if equal to n then LOO
samps <- sample(rep(1:Kfold, length=n), n, replace=FALSE);
res <- rep(NA, n);
print(paste("Cross-validation started...... (", n, "steps )"));
for (x in 1:Kfold) {
    cat(paste(x, ", ", ifelse(x%%10 == 0, "\n", ""), sep=""));
    flush.console();
    res[samps==x] <- predictVBMP(samps==x);
}
@
The cross-validated error rate can finally be evaluated:
<<>>=
CVerrorRate <- round(sum(res != brca.y)/n, 2);
# the result is not printed here, owing to the fake predictVBMP used for this vignette build
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\bibliography{vbmp}
\end{document}