GOstats.R

################################################### ### chunk number 1: setup1 ################################################### options(continue=" ", width=68)

################################################### ### chunk number 2: SubsetDef ################################################### ## For this data we can have ALL1/AF4 or BCR/ABL subsetType <- "ALL1/AF4"

################################################### ### chunk number 3: Setup ################################################### library("ALL") ## coming soon ##library("hgu95av2db") library("hgu95av2") library("annotate") library("genefilter") library("GOstats") library("RColorBrewer") library("xtable") library("Rgraphviz")

################################################### ### chunk number 4: GOcharacteristics1 ################################################### ## FIXME: add a function to abstract this computation ## to allow a different implementation when GO is SQLite-based. goCats <- unlist(eapply(GOTERM, Ontology)) gCnums <- table(goCats)[c("BP","CC", "MF")] gCmat <- matrix(as.integer(gCnums), dimnames=list(c("BP","CC", "MF"), "Number of Terms"))

################################################### ### chunk number 5: GOcharacteristics2 ################################################### xtable.matrix(gCmat, display=c("d","d"), caption="Number of GO terms per ontology.", label="ta:GOprops")

################################################### ### chunk number 6: tfG ################################################### tfG <- GOGraph("GO:0003700", GOMFPARENTS) ## Drop the top-level GO node root <- "all" if (root %in% nodes(tfG)) tfG <- removeNode(root, tfG)

################################################### ### chunk number 7: tfGplot ################################################### nL <- nodes(tfG) nLterm <- unlist(getGOTerm(nL), use.names=FALSE) names(nLterm) <- nL nLterm <- gsub("_", " ", nLterm) tGfnA <- list() tmp <- rep("ellipse", length(nL)) names(tmp) <- nL tGfnA$shape = tmp; tmp <- rep(FALSE, length(nL)) names(tmp) = nL tGfnA$fixedsize = tmp names(nLterm) <- nL tGfnA$label = nLterm plot(tfG, nodeAttrs=tGfnA)

################################################### ### chunk number 8: tfG.children ################################################### tfG.ch <- get("GO:0003700", GOMFCHILDREN) tfg.terms <- getGOTerm(tfG.ch)[["MF"]] tfg.tch <- paste(names(tfg.terms), tfg.terms, sep=": ") for(i in tfg.tch) cat(i, "\n")

################################################### ### chunk number 9: duplicates ################################################### lls <- unlist(as.list(hgu95av2ENTREZID)) tab <- table(table(lls))

################################################### ### chunk number 10: duplicates ################################################### cat("Multiplicity ", sapply(names(tab), function(x) sprintf("%4s", x)), "\nNo. EntrezGene IDs ", sapply(tab, function(x) sprintf("%4d", x)), "\n")

## Cleanup remove(lls)

################################################### ### chunk number 11: getGO ################################################### tfprobes <- get("GO:0003700", hgu95av2GO2PROBE) tfprobesLen <- length(tfprobes) syms <- getSYMBOL(tfprobes, "hgu95av2") alltfprobes <- get("GO:0003700", hgu95av2GO2ALLPROBES) alltfprobesLen <- length(alltfprobes)

## Cleanup remove(tfprobes, alltfprobes)

################################################### ### chunk number 12: universeSizeVsPvalue ################################################### hg_tester <- function(size) { numFound <- 10 numDrawn <- 400 numAtCat <- 40 numNotAtCat <- size - numAtCat phyper(numFound-1, numAtCat, numNotAtCat, numDrawn, lower.tail=FALSE) } pv1000 <- hg_tester(1000) pv5000 <- hg_tester(5000)

################################################### ### chunk number 13: bcrAblOrNegSubset ################################################### data(ALL, package="ALL") Bcell <- grep("^B", as.character(ALL$BT)) bcrAblOrNegIdx <- which(as.character(ALL$mol) %in% c("NEG", subsetType))

bcrAblOrNeg <- ALL[, intersect(Bcell, bcrAblOrNegIdx)] bcrAblOrNeg$mol.biol = factor(bcrAblOrNeg$mol.biol)

################################################### ### chunk number 14: sol1 ################################################### numSamp <- length(bcrAblOrNeg$mol.biol) table(bcrAblOrNeg$mol.biol)

################################################### ### chunk number 15: sol2 ################################################### annotation(bcrAblOrNeg) length(featureNames(bcrAblOrNeg))

################################################### ### chunk number 16: nsFiltering-noEntrez ################################################### ## Remove genes that have no entrezGene id entrezIds <- mget(featureNames(bcrAblOrNeg), envir=hgu95av2ENTREZID) haveEntrezId <- names(entrezIds)[sapply(entrezIds, function(x) !is.na(x))]

bcrAblOrNeg <- bcrAblOrNeg[haveEntrezId, ]

################################################### ### chunk number 17: sol3 ################################################### numNoEntrezId <- length(featureNames(bcrAblOrNeg)) - length(haveEntrezId)

################################################### ### chunk number 18: nsFiltering-noGO ################################################### ## Remove genes with no GO mapping haveGo <- sapply(mget(featureNames(bcrAblOrNeg), hgu95av2GO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE }) numNoGO <- sum(!haveGo) bcrAblOrNeg <- bcrAblOrNeg[haveGo, ]

################################################### ### chunk number 19: nsFiltering-I ################################################### ## Non-specific filtering based on IQR iqrCutoff <- 0.5 bcrAblOrNegIqr <- apply(exprs(bcrAblOrNeg), 1, IQR) selected <- bcrAblOrNegIqr > iqrCutoff

################################################### ### chunk number 20: nsFiltering-Y ################################################### chrN <- mget(featureNames(bcrAblOrNeg), envir=hgu95av2CHR) onY <- sapply(chrN, function(x) any(x=="Y")) onY[is.na(onY)] <- FALSE selected <- selected & !onY

nsFiltered <- bcrAblOrNeg[selected, ]

################################################### ### chunk number 21: nsFiltering-unique ################################################### numNsWithDups <- length(featureNames(nsFiltered)) nsFilteredIqr <- bcrAblOrNegIqr[selected] uniqGenes <- findLargest(featureNames(nsFiltered), nsFilteredIqr, "hgu95av2") nsFiltered <- nsFiltered[uniqGenes, ] numSelected <- length(featureNames(nsFiltered))

################################################### ### chunk number 22: defineGeneUniverse ################################################### ## Define gene universe based on results of non-specific filtering affyUniverse <- featureNames(nsFiltered) entrezUniverse <- unlist(mget(affyUniverse, hgu95av2ENTREZID)) if (any(duplicated(entrezUniverse))) stop("error in gene universe: can't have duplicate Entrez Gene Ids")

################################################### ### chunk number 23: altUniv ################################################### ## an alternate universe based on the entire chip chipAffyUniverse <- featureNames(bcrAblOrNeg) chipEntrezUniverse <- mget(chipAffyUniverse, hgu95av2ENTREZID) chipEntrezUniverse <- unique(unlist(chipEntrezUniverse))

################################################### ### chunk number 24: parametric1 ################################################### ttestCutoff <- 0.05 ttests = rowttests(nsFiltered, "mol.biol")

smPV = ttests$p.value < ttestCutoff

pvalFiltered <- nsFiltered[smPV, ] selectedEntrezIds <- unlist(mget(featureNames(pvalFiltered), hgu95av2ENTREZID))

################################################### ### chunk number 25: sumPV ################################################### sumpv <- sum(smPV)

################################################### ### chunk number 26: withYourData1 eval=FALSE ################################################### ## entrezUniverse <- unlist(mget(featureNames(yourData), ## hgu95av2ENTREZID)) ## if (any(duplicated(entrezUniverse))) ## stop("error in gene universe: can't have duplicate Entrez Gene Ids") ## ## pvalFiltered <- yourData[hasSmallPvalue, ] ## selectedEntrezIds <- unlist(mget(featureNames(pvalFiltered), ## hgu95av2ENTREZID))

################################################### ### chunk number 27: standardHyperGeo ################################################### hgCutoff <- 0.001 params <- new("GOHyperGParams", geneIds=selectedEntrezIds, universeGeneIds=entrezUniverse, annotation="hgu95av2", ontology="BP", pvalueCutoff=hgCutoff, conditional=FALSE, testDirection="over")

################################################### ### chunk number 28: standardHGTEST ################################################### hgOver <- hyperGTest(params)

################################################### ### chunk number 29: HGTestAns ################################################### hgOver

################################################### ### chunk number 30: summaryEx ################################################### df <- summary(hgOver, categorySize=250) df$Term <- substr(df$Term, 0, 30) sizeOrd <- order(df$Size, decreasing=TRUE) names(df) # the columns df[sizeOrd, c("Count", "Size", "Term")]

################################################### ### chunk number 31: resultAccessors ################################################### pvalues(hgOver)[1:3]

oddsRatios(hgOver)[1:3]

expectedCounts(hgOver)[1:3]

geneCounts(hgOver)[1:3] universeCounts(hgOver)[1:3]

length(geneIds(hgOver)) length(geneIdUniverse(hgOver)[[3]])

## GOHyperGResult only ## (NB: edges go from parent to child) goDag(hgOver)

geneMappedCount(hgOver) universeMappedCount(hgOver)

isConditional(hgOver) testDirection(hgOver) testName(hgOver)

################################################### ### chunk number 32: htmlReportExample ################################################### htmlReport(hgOver, file="ALL_hgo.html")

################################################### ### chunk number 33: termGraphs eval=FALSE ################################################### ## sigSub <- termGraphs(hgOver)

################################################### ### chunk number 34: inducedTermGraph ################################################### cellCom <- inducedTermGraph(hgOver, id = c("GO:0007154", "GO:0007165"), parents=FALSE) cellCom

################################################### ### chunk number 35: cellComFig ################################################### plotGOTermGraph(cellCom, hgOver, y="neato")

################################################### ### chunk number 36: cellComGenes ################################################### cellComGenes <- unlist(selectedGenes(hgOver, "GO:0007154")) sigTraGenes <- unlist(selectedGenes(hgOver, "GO:0007165")) length(cellComGenes) length(sigTraGenes) length(intersect(cellComGenes, sigTraGenes))

################################################### ### chunk number 37: condHyperGeo ################################################### paramsCond <- params conditional(paramsCond) <- TRUE

################################################### ### chunk number 38: condhgRun ################################################### hgCond <- hyperGTest(paramsCond)

################################################### ### chunk number 39: condhgResult ################################################### hgCond

################################################### ### chunk number 40: condhgSummary ################################################### dfcond <- summary(hgCond, categorySize=50) sizeOrd <- order(dfcond$Size, decreasing=TRUE) dfcond[sizeOrd, c("Count", "Size", "Term")]

################################################### ### chunk number 41: compareResults ################################################### stdIds <- sigCategories(hgOver) condIds <- sigCategories(hgCond) setdiff(stdIds, condIds)

################################################### ### chunk number 42: kegg1 ################################################### kparams <- new("KEGGHyperGParams", geneIds=selectedEntrezIds, universeGeneIds=entrezUniverse, annotation="hgu95av2", pvalueCutoff=hgCutoff, testDirection="over")

kans <- hyperGTest(kparams)

################################################### ### chunk number 43: kegg2 ################################################### summary(kans)

kparamsUnder <- kparams testDirection(kparamsUnder) <- "under"

################################################### ### chunk number 44: keggUnder ################################################### kansUnder <- hyperGTest(kparamsUnder)

################################################### ### chunk number 45: kegg3 ################################################### summary(kansUnder)

################################################### ### chunk number 46: pfam1 ################################################### pparams <- new("PFAMHyperGParams", geneIds=selectedEntrezIds, universeGeneIds=entrezUniverse, annotation="hgu95av2", pvalueCutoff=hgCutoff, testDirection="over")

pans <- hyperGTest(pparams)

################################################### ### chunk number 47: pfam2 ################################################### summary(pans)

################################################### ### chunk number 48: info ################################################### toLatex(sessionInfo())

bioconductor.org

Sections

GOstats.R

Navigation

News