Skip to content.

bioconductor.org

Bioconductor is an open source and open development software project
for the analysis and comprehension of genomic data.

Sections

Lab11.R

################################################### ### chunk number 1: loadpacks ################################################### library(Biobase) library(annotate) library(golubEsets) library(genefilter) library(Design) library(Hmisc)

################################################### ### chunk number 2: setup ################################################### data(golubTrain) data(golubTest) LS<-exprs(golubTrain) cl<-golubTrain$ALL.AML TS<-exprs(golubTest) clts<-golubTest$ALL.AML

LS[LS<100]<-100 TS[TS<100]<-100 LS[LS>16000]<-16000 TS[TS>16000]<-16000

mmfilt <- function(r=5, d=500, na.rm=TRUE) { function(x) { minval <- min(x, na.rm=na.rm) maxval <- max(x, na.rm=na.rm) (maxval/minval > r) && (maxval-minval > d) } }

mmfun <- mmfilt()

ffun <- filterfun(mmfun) good <- genefilter(cbind(LS, TS), ffun ) sum(good) ## should get 3571

LSsub <- log10(LS[good,]) # log transform of the data TSsub <- log10(TS[good,]) # log transform of the data LTsub <- cbind(LSsub, TSsub) # concatenating the two sets

################################################### ### chunk number 3: filtering ################################################### gf <- gapFilter(900, 1500, .1) ff <- filterfun(gf) xx <- 10^LSsub ##to get back to original units good <- genefilter(xx, gf) sum(good) # should be 200

# Reducing the number of genes

LS2 <- LSsub[good,] TS2 <- TSsub[good,]

################################################### ### chunk number 4: simple ################################################### y=as.integer(cl)-1 x=LS2[99,] aux=sort(x) fit=glm(y~x,family="binomial") auxy=fit$coef[1]+fit$coef[2]*sort(x) auxy=plogis(auxy) plot(x,y) lines(sort(x),auxy) abline(0.5,0)

################################################### ### chunk number 5: design ################################################### X = t(LS2) Y =as.integer(cl)-1 n =nrow(X) p = ncol(X) X.val=t(TS2) Y.val=as.integer(clts)-1 pm <- diag(diag(var(X))) # penalty matrix Penalty <- seq(10,10^5,length=10) reps <- length(Penalty) effective.df <- effective.df2 <- aic <- aic2 <- deviance.val <- Lpenalty <- single(reps) n.t <- round(n^.75) ncv <- 10 # # try 10 reps in cross-val. deviance <- matrix(NA,nrow=reps,ncol=length(ncv)) for(i in 1:reps) { pen <- log10(Penalty[i]) cat(format(pen),"") f.full <- lrm.fit(X, Y, penalty.matrix=pen*pm) Lpenalty[i] <- pen t(f.full$coef[-1]) %% pm %% f.full$coef[-1] f.full.nopenalty <- lrm.fit(X, Y, penalty.matrix=0.05pm, maxit=1) info.matrix.unpenalized <- solve(f.full.nopenalty$var) effective.df[i] <- sum(diag(info.matrix.unpenalized %*% f.full$var)) - 1 lrchisq <- f.full.nopenalty$stats["Model L.R."] # lrm does all this penalty adjustment automatically (for var, d.f., # chi-square) aic[i] <- lrchisq - 2*effective.df[i] # pred <- plogis(f.full$linear.predictors) score.matrix <- cbind(1,X) (Y - pred) sum.u.uprime <- t(score.matrix) %% score.matrix effective.df2[i] <- sum(diag(f.full$var %*% sum.u.uprime)) aic2[i] <- lrchisq - 2*effective.df2[i] # #Shao suggested averaging 2*n cross-validations, but let's do only 40 #and stop along the way to see if fewer is OK dev <- 0 for(j in 1:max(ncv)) { s <- sample(1:n, n.t) cof <- lrm.fit(X[s,],Y[s],penalty.matrix=pen*pm)$coef pred <- cof[1] + (X[-s,] %% cof[-1]) dev <- dev -2sum(Y[-s]*pred + log(1-plogis(pred))) for(k in 1:length(ncv)) if(j==ncv[k]) deviance[i,k] <- dev/j } # pred.val <- f.full$coef[1] + (X.val %*% f.full$coef[-1]) prob.val <- plogis(pred.val) deviance.val[i] <- -2sum(Y.valpred.val + log(1-prob.val)) } par(mfrow=c(1,3)) # to be printed as they are finished plot(Penalty, effective.df, type="l") lines(Penalty, effective.df2, lty=2) plot(Penalty, Lpenalty, type="l") title("Penalty on -2 log L") plot(Penalty, aic, type="l") lines(Penalty, aic2, lty=2)

################################################### ### chunk number 6: logistic ################################################### par(mfrow=c(2,1)) pen=200 f.full <- lrm.fit(X, Y, penalty.matrix=pen*pm) predLS <- plogis(f.full$linear.predictors) yLS=as.integer(cl)-1 yTS=as.integer(clts)-1 val.prob(predLS,yLS,m=1,cex=.5) # plotting the result pred.val <- f.full$coef[1] + (X.val %*% f.full$coef[-1]) predTS <- plogis(pred.val) val.prob(predTS,yTS,m=1,cex=.5) # plotting the result

################################################### ### chunk number 7: con ################################################### con <- function(x,y) { tab <- table(x,y) print(tab) diag(tab) <- 0 cat("error rate = ", round(100*sum(tab)/length(x),2),"%\n") invisible() }

predProbLS=(predLS>0.5) predProbTS=(predTS>0.5) con(as.integer(predProbLS),yLS) # concordance table in the LS con(as.integer(predProbTS),yTS) # concordance table in the TS

################################################### ### chunk number 8: plots ################################################### par(mfrow=c(2,2)) plot.ts(abs(f.full$coef[-1])) plot.ts(sqrt(diag(var(X)))abs(f.full$coef[-1])) hist(f.full$coef[-1],breaks=30) hist(sqrt(diag(var(X)))f.full$coef[-1],breaks=30)

News
2009-10-26

BioC 2.5, consisting of 352 packages and designed to work with R 2.10.z, was released today.

2009-01-07

R, the open source platform used by Bioconductor, featured in a series of articles in the New York Times.