R Script: affyCluster
Approx. CPU requirement: minutes
Description:
| affyCluster | R Documentation |
Clustering of Affymetrix CEL Files
Description
This script clusters CEL files into groups. Clustering can be useful in quality assessment (e.g., do the CEL files group according to some unanticipated factor such as date of processing) and exploratory analysis.
Usage
affyCluster(dataset, preprocess = c("rma", "vsnrma"),
method = c("hclust", "kmeans"), groups = 2L)
Arguments
dataset |
A ‘zip’ archive containing (a) Affymetrix CEL files and (b) a ‘targets.csv’ file. The ‘targets.csv’ file is a comma-separated table, as might be exported from Excel. The file must contain a column ‘FileName’ that references each CEL file in the archive to be included in the analysis. |
preprocess |
The 'preprocess' option is one of 'rma', or 'vsnrma'. 'rma' pre-processing is implemented in the affy Bioconductor package. 'vsnrma' is implemented in the vsn Bioconductor package. In addition, only the probesets with between-sample variability in the top 20th quantile are retained. |
method |
The 'method' for clustering is either 'hclust' or 'kmeans'. 'kmeans' uses the Hartigan-Wong algorithm, with results based on 25 starts. |
groups |
Specify 'groups' as the number of groups to be identified. |
Value
A clusteringOutput class, as defined in the MLInterfaces package.
Author(s)
Martin Morgan <mtmorgan@fhcrc.org>
Examples
clusters <-
affyCluster(dataset, preprocess, method, groups)
partition <- clusters@partition
partition
if (method != "kmeans") {
plot(RObject(clusters))
plot(clusters@silhouette)
plot(clusters@prcomp)
plot(clusters@prcomp$x[, 1], clusters@prcomp$x[, 2],
col = clusters@partition)
}
Source code:
1 version <- '0.0.6' 2 #<crdata_text>Script version: </crdata_text> 3 #<crdata_object>version</crdata_object> 4 CRDATA_SOURCE_R_FILES <- TRUE 5 ## setup, if source'd in scripts 6 7 if (interactive() || 8 (exists("CRDATA_SOURCE_R_FILES") && get("CRDATA_SOURCE_R_FILES"))) 9 { 10 require(Biobase) 11 require(Category) 12 require(KEGG.db) 13 require(MLInterfaces) 14 require(RColorBrewer) 15 require(affy) 16 require(annotate) 17 require(bioDist) 18 require(genefilter) 19 require(limma) 20 require(multicore) 21 require(vsn) 22 } 23 24 ## utilities 25 26 .stopf <- function(fmt, ...) 27 stop(sprintf(fmt, ...), call.=FALSE) 28 29 ## steps 30 31 .affyPhenoData <- 32 function(dataset, covariate) 33 ## 'covariate' provides a test allowing for early exit 34 { 35 pData <- read.csv(dataset) 36 ## validation 37 if (0L == nrow(pData)) 38 .stopf("%s has 0 rows", "dataset") 39 requiredNames <- c("FileName", covariate) 40 if (!all(requiredNames %in% names(pData))) 41 .stopf("'%s' must contain names '%s'", basename(dataset), 42 paste(requiredNames, collapse="' '")) 43 if (!is.factor(pData[["FileName"]])) 44 .stopf("'%s' column '%s' must be %s", basename(dataset), 45 "FileName", "factor()") 46 if (1L == length(covariate) && !is.factor(pData[[covariate]])) 47 .stopf("'%s' column '%s' must be %s", basename(dataset), 48 covariate, "factor()") 49 50 new("AnnotatedDataFrame", data=pData) 51 } 52 53 .affyPreprocess <- 54 function(dataset, algorithm, covariate=character(0)) 55 { 56 dir <- tempfile() 57 if (!dir.create(dir)) 58 .stopf("failed to create directory '%s'", dir) 59 on.exit(unlink(dir)) 60 61 tryCatch(unzip(dataset, exdir=dir), 62 error=function(err) { 63 .stopf("failed to unzip dataset '%s': %s", 64 dataset, conditionMessage(err)) 65 }) 66 target <- list.files(dir, ".*\\.csv$", full=TRUE) 67 if (1L != length(target)) 68 .stopf("dataset '%s' does not contain a '.csv' file", 69 dataset) 70 phenoData <- .affyPhenoData(target, covariate) 71 72 fnames <- as.character(phenoData[["FileName"]]) 73 if (!all(ok <- file.exists(file.path(dir, fnames)))) 74 .stopf("'%s' does not contain file(s) listed in .csv:\n %s", 75 basename(dataset), 76 paste(basename(fnames)[!ok], collapse="\n ")) 77 switch(algorithm, 78 rma = { 79 justRMA(filenames=fnames, phenoData=phenoData, 80 celfile.path=dir) 81 }, 82 vsnrma = { 83 vsnrma(ReadAffy(filenames=fnames, phenoData=phenoData, 84 celfile.path=dir)) 85 }) 86 } 87 88 .esetNonSpecificFilter <- 89 function(eset, ...) 90 { 91 nsFilter(eset, ...)$eset 92 } 93 94 .esetScale <- 95 function(eset) 96 { 97 iqr <- esApply(eset, 1, IQR) 98 scale(t(exprs(eset)), rowMedians(eset), iqr[featureNames(eset)]) 99 } 100 101 .esetDifferentialExpression <- 102 function(eset, design) 103 { 104 efit <- eBayes(lmFit(eset, design)) 105 topTable(efit, coef=2, number=Inf) 106 } 107 108 .esetDistance <- 109 function(eset, metric) 110 { 111 switch(metric, 112 manhattan=, 113 euclidean= dist(.esetScale(eset), method=metric), 114 spearman= spearman.dist(eset), 115 .stopf("'%s' metric unknown", metric)) 116 } 117 118 .distHeatmap <- 119 function(dist) 120 { 121 col <- rev(colorRampPalette(brewer.pal(10, "RdBu"))(256)) 122 heatmap(as.matrix(dist), sym=TRUE, col=col, 123 distfun=function(x) as.dist(x)) 124 } 125 126 .esetCluster <- 127 function(eset, method, groups) 128 { 129 data <- as.data.frame(.esetScale(eset)) 130 switch(method, 131 hclust=MLearn(~., data, hclustI(dist, list(k=groups))), 132 kmeans=MLearn(~., data, kmeansI, centers=groups, 133 nstart=25L, algorithm="Hartigan-Wong"), 134 .stopf("'%s' method unknown", method)) 135 } 136 137 .esetClassify <- 138 function(eset, method, covariate, parameters) 139 { 140 data <- cbind(as.data.frame(.esetScale(eset)), eset[[covariate]]) 141 names(data)[ncol(data)] <- covariate 142 formula <- as.formula(sprintf("%s ~ .", covariate)) 143 xvalSpec <- xvalSpec("LOO") 144 switch(method, 145 knn={ 146 vars <- c("numberOfNeighbors", "minimumVote") 147 if (!all(ok <- vars %in% names(parameters))) 148 .stopf("'%s' must have values for '%s'", 149 "parameters", 150 paste(vars[!ok], collapse="', '")) 151 MLearn(formula, data, 152 knnI(k=parameters[["numberOfNeighbors"]], 153 l=parameters[["minimumVote"]]), 154 xvalSpec) 155 }, 156 svm=MLearn(formula, data, svmI, xvalSpec), 157 lda=MLearn(formula, data, ldaI, xvalSpec), 158 dlda=MLearn(formula, data, dldaI, xvalSpec), 159 .stopf("'%s' method unknown", method)) 160 } 161 162 .esetContinuousGeneSet <- 163 function(eset, pathway, covariate, 164 parameters=list(minimumGenes=10L)) 165 { 166 ## FIXME: Permuation significance 167 ttests <- rowttests(eset, covariate) 168 probeIds <- findLargest(featureNames(eset), abs(ttests$statistic), 169 annotation(eset)) 170 havePath <- probes2Path(probeIds, annotation(eset)) 171 probeIds <- probeIds[probeIds %in% names(havePath)] 172 eset1 <- eset[probeIds,] 173 174 adjMatrix <- local({ 175 m <- t(PWAmat(annotation(eset))) 176 m <- m[, names(probeIds), drop=FALSE] 177 idx <- rowSums(m) >= parameters[["minimumGenes"]] 178 m <- m[idx,, drop=FALSE] 179 }) 180 181 idx <- colSums(adjMatrix) != 0L 182 probeIds <- probeIds[idx] 183 adjMatrix <- adjMatrix[, idx, drop=FALSE] 184 185 z <- local({ 186 x <- adjMatrix %*% ttests[probeIds, "statistic"] 187 x / sqrt(rowSums(adjMatrix)) 188 }) 189 logFC <- adjMatrix %*% ttests[probeIds, "dm"] 190 191 pathwayName <- 192 mappedRkeys(KEGGPATHID2NAME[row.names(z)]) 193 genesInPathway <- rowSums(adjMatrix) 194 df <- data.frame(N=genesInPathway, logFC=logFC, z=z, 195 Pathway=pathwayName, 196 row.names=row.names(z)) 197 df[order(abs(df[["z"]]), decreasing=TRUE),,drop=FALSE] 198 } 199 200 ## work flows 201 ## 'force()' used to prompt early error for missing parameters 202 203 affyPreprocess <- 204 function(dataset, preprocess=c("rma", "vsnrma")) 205 { 206 force(dataset) 207 preprocess <- match.arg(preprocess) 208 .affyPreprocess(dataset, preprocess) 209 } 210 211 affyHeatmap <- 212 function(dataset, preprocess=c("rma", "vsnrma"), 213 metric=c("manhattan", "euclidean", "spearman")) 214 { 215 force(dataset) 216 preprocess <- match.arg(preprocess) 217 metric <- match.arg(metric) 218 219 eset <- .affyPreprocess(dataset, preprocess) 220 eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8) 221 .distHeatmap(.esetDistance(eset, metric)) 222 } 223 224 affyDifferentialExpression <- 225 function(dataset, preprocess=c("rma", "vsnrma"), 226 covariate) 227 { 228 force(dataset) 229 preprocess <- match.arg(preprocess) 230 force(covariate) 231 232 eset <- .affyPreprocess(dataset, preprocess, covariate) 233 eset <- .esetNonSpecificFilter(eset, var.cutoff=0.5) 234 235 formula <- as.formula(sprintf("~ %s", covariate)) 236 design <- model.matrix(formula, pData(eset)) 237 de <- .esetDifferentialExpression(eset, design) 238 239 eid <- getAnnMap("ENTREZID", annotation(eset)) 240 gid <- getAnnMap("GENENAME", annotation(eset)) 241 de <- merge(de, 242 merge(toTable(eid[de$ID]), toTable(gid[de$ID])), 243 by.x="ID", by.y="probe_id", sort=FALSE) 244 idx <- match(c("ID", "gene_id", "gene_name"), names(de)) 245 names(de)[idx] <- c("ProbesetId", "EntrezId", "GeneName") 246 de 247 } 248 249 affyCluster <- 250 function(dataset, preprocess=c("rma", "vsnrma"), 251 method=c("hclust", "kmeans"), 252 groups=2L) 253 { 254 force(dataset) 255 preprocess <- match.arg(preprocess) 256 method <- match.arg(method) 257 258 eset <- .affyPreprocess(dataset, preprocess) 259 eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8) 260 .esetCluster(eset, method, groups) 261 } 262 263 affyClassify <- 264 function(dataset, preprocess=c("rma", "vsnrma"), 265 method=c("knn", "svm", "lda", "dlda"), covariate, 266 parameters=list()) 267 { 268 force(dataset) 269 ## uses leave-one-out cross-validation 270 preprocess <- match.arg(preprocess) 271 method <- match.arg(method) 272 force(covariate) 273 274 eset <- .affyPreprocess(dataset, preprocess, covariate) 275 eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8) 276 .esetClassify(eset, method, covariate, parameters) 277 } 278 279 affyContinuousGeneSet <- 280 function(dataset, preprocess=c("rma", "vsnrma"), covariate) 281 { 282 force(dataset) 283 preprocess <- match.arg(preprocess) 284 pathway <- "KEGG" # FIXME: support for more pathways 285 force(covariate) 286 287 eset <- .affyPreprocess(dataset, preprocess, covariate) 288 eset <- .esetNonSpecificFilter(eset, var.cutoff=0.5) 289 .esetContinuousGeneSet(eset, pathway, covariate) 290 } 291 #<crdata_title>Clustering of Affymetrix CEL Files</crdata_title> 292 293 #<crdata_text>This script clusters CEL files into groups. Clustering can be useful in quality assessment (e.g., do the CEL files group according to some unanticipated factor such as date of processing) and exploratory analysis.</crdata_text> 294 295 #<crdata_text>The 'preprocess' option is one of 'rma', or 'vsnrma'. 'rma' pre-processing is implemented in the affy Bioconductor package. 'vsnrma' is implemented in the vsn Bioconductor package. In addition, only the probesets with between-sample variability in the top 20th quantile are retained.</crdata_text> 296 297 #<crdata_text>The 'method' for clustering is either 'hclust' or 'kmeans'. 'kmeans' uses the Hartigan-Wong algorithm, with results based on 25 starts.</crdata_text> 298 299 #<crdata_text>Specify 'groups' as the number of groups to be identified.</crdata_text> 300 301 #<crdata_text>Supplied parameters:</crdata_text> 302 303 #<crdata_text>celArchive: </crdata_text> 304 #<crdata_object>celArchive</crdata_object> 305 306 #<crdata_text>preprocess: </crdata_text> 307 #<crdata_object>preprocess</crdata_object> 308 309 #<crdata_text>method: </crdata_text> 310 #<crdata_object>method</crdata_object> 311 312 #<crdata_text>groups: </crdata_text> 313 #<crdata_object>groups</crdata_object> 314 315 clusters <- 316 affyCluster(celArchive, preprocess, method, groups) 317 partition <- clusters@partition 318 #<crdata_object>partition</crdata_object> 319 320 if (method != "kmeans") { 321 322 #<crdata_text>The following dendrogram summarizes relationships between samples</crdata_text> 323 #<crdata_image> 324 print(plot(RObject(clusters))) 325 #</crdata_image> 326 327 #<crdata_text>A silhouette plot summarizes how well clustered the data are -- horizontal bars extending to the right represent samples that are closer to the mediod of samples in a different cluster than they are to samples in the cluster to which they were assigned.</crdata_text> 328 #<crdata_image> 329 print(plot(clusters@silhouette)) 330 #</crdata_image> 331 332 #<crdata_text>Principle components plots summarize how well reduces high-dimensional data is reduced to lower dimensions; relatively few components explaining most of the variation correspond to bars that are largest on the left of the figure.</crdata_text> 333 #<crdata_image> 334 print(plot(clusters@prcomp)) 335 #</crdata_image> 336 337 #<crdata_text>The following figure plots each sample in terms of its first two principle components, with a point colored by the partition to which it is assigned.</crdata_text> 338 #<crdata_image> 339 print(plot(clusters@prcomp$x[, 1], clusters@prcomp$x[, 2], 340 col = clusters@partition)) 341 #</crdata_image> 342 343 }
Back