R Script: affyCluster

Approx. CPU requirement: minutes
Description:

affyCluster R Documentation

Clustering of Affymetrix CEL Files

Description

This script clusters CEL files into groups. Clustering can be useful in quality assessment (e.g., do the CEL files group according to some unanticipated factor such as date of processing) and exploratory analysis.

Usage

affyCluster(dataset, preprocess = c("rma", "vsnrma"),
            method = c("hclust", "kmeans"), groups = 2L)

Arguments

dataset A ‘zip’ archive containing (a) Affymetrix CEL files and (b) a ‘targets.csv’ file. The ‘targets.csv’ file is a comma-separated table, as might be exported from Excel. The file must contain a column ‘FileName’ that references each CEL file in the archive to be included in the analysis.
preprocess The 'preprocess' option is one of 'rma', or 'vsnrma'. 'rma' pre-processing is implemented in the affy Bioconductor package. 'vsnrma' is implemented in the vsn Bioconductor package. In addition, only the probesets with between-sample variability in the top 20th quantile are retained.
method The 'method' for clustering is either 'hclust' or 'kmeans'. 'kmeans' uses the Hartigan-Wong algorithm, with results based on 25 starts.
groups Specify 'groups' as the number of groups to be identified.

Value

A clusteringOutput class, as defined in the MLInterfaces package.

Author(s)

Martin Morgan <mtmorgan@fhcrc.org>

Examples

clusters <-
    affyCluster(dataset, preprocess, method, groups)
partition <- clusters@partition
partition

if (method != "kmeans") {
    plot(RObject(clusters))
    plot(clusters@silhouette)
    plot(clusters@prcomp)
    plot(clusters@prcomp$x[, 1], clusters@prcomp$x[, 2],
         col = clusters@partition)
}

Source code:


   1  version <- '0.0.6'
   2  #<crdata_text>Script version: </crdata_text>
   3  #<crdata_object>version</crdata_object>
   4  CRDATA_SOURCE_R_FILES <- TRUE
   5  ## setup, if source'd in scripts
   6  
   7  if (interactive() ||
   8      (exists("CRDATA_SOURCE_R_FILES") && get("CRDATA_SOURCE_R_FILES")))
   9  {
  10      require(Biobase)
  11      require(Category)
  12      require(KEGG.db)
  13      require(MLInterfaces)
  14      require(RColorBrewer)
  15      require(affy)
  16      require(annotate)
  17      require(bioDist)
  18      require(genefilter)
  19      require(limma)
  20      require(multicore)
  21      require(vsn)
  22  }
  23  
  24  ## utilities
  25  
  26  .stopf <- function(fmt, ...)
  27      stop(sprintf(fmt, ...), call.=FALSE)
  28  
  29  ## steps
  30  
  31  .affyPhenoData <-
  32      function(dataset, covariate)
  33      ## 'covariate' provides a test allowing for early exit
  34  {
  35      pData <- read.csv(dataset)
  36      ## validation
  37      if (0L == nrow(pData))
  38          .stopf("%s has 0 rows", "dataset")
  39      requiredNames <- c("FileName", covariate)
  40      if (!all(requiredNames %in% names(pData)))
  41          .stopf("'%s' must contain names '%s'", basename(dataset),
  42                 paste(requiredNames, collapse="' '"))
  43      if (!is.factor(pData[["FileName"]]))
  44          .stopf("'%s' column '%s' must be %s", basename(dataset),
  45                 "FileName", "factor()")
  46      if (1L == length(covariate) && !is.factor(pData[[covariate]]))
  47          .stopf("'%s' column '%s' must be %s", basename(dataset),
  48                 covariate, "factor()")
  49  
  50      new("AnnotatedDataFrame", data=pData)
  51  }
  52  
  53  .affyPreprocess <-
  54      function(dataset, algorithm, covariate=character(0))
  55  {
  56      dir <- tempfile()
  57      if (!dir.create(dir))
  58          .stopf("failed to create directory '%s'", dir)
  59      on.exit(unlink(dir))
  60  
  61      tryCatch(unzip(dataset, exdir=dir),
  62               error=function(err) {
  63                   .stopf("failed to unzip dataset '%s': %s",
  64                          dataset, conditionMessage(err))
  65               })
  66      target <- list.files(dir, ".*\\.csv$", full=TRUE)
  67      if (1L != length(target))
  68          .stopf("dataset '%s' does not contain a '.csv' file",
  69                 dataset)
  70      phenoData <- .affyPhenoData(target, covariate)
  71  
  72      fnames <- as.character(phenoData[["FileName"]])
  73      if (!all(ok <- file.exists(file.path(dir, fnames))))
  74          .stopf("'%s' does not contain file(s) listed in .csv:\n    %s",
  75                 basename(dataset),
  76                 paste(basename(fnames)[!ok], collapse="\n    "))
  77      switch(algorithm,
  78             rma = {
  79                 justRMA(filenames=fnames, phenoData=phenoData,
  80                         celfile.path=dir)
  81             },
  82             vsnrma = {
  83                 vsnrma(ReadAffy(filenames=fnames, phenoData=phenoData,
  84                                 celfile.path=dir))
  85             })
  86  }
  87  
  88  .esetNonSpecificFilter <-
  89      function(eset, ...)
  90  {
  91      nsFilter(eset, ...)$eset
  92  }
  93  
  94  .esetScale <-
  95      function(eset)
  96  {
  97      iqr <- esApply(eset, 1, IQR)
  98      scale(t(exprs(eset)), rowMedians(eset), iqr[featureNames(eset)])
  99  }
 100  
 101  .esetDifferentialExpression <-
 102      function(eset, design)
 103  {
 104      efit <- eBayes(lmFit(eset, design))
 105      topTable(efit, coef=2, number=Inf)
 106  }
 107  
 108  .esetDistance <-
 109      function(eset, metric)
 110  {
 111      switch(metric,
 112             manhattan=,
 113             euclidean= dist(.esetScale(eset), method=metric),
 114             spearman= spearman.dist(eset),
 115             .stopf("'%s' metric unknown", metric))
 116  }
 117  
 118  .distHeatmap <-
 119      function(dist)
 120  {
 121      col <- rev(colorRampPalette(brewer.pal(10, "RdBu"))(256))
 122      heatmap(as.matrix(dist), sym=TRUE, col=col,
 123              distfun=function(x) as.dist(x))
 124  }
 125  
 126  .esetCluster <-
 127      function(eset, method, groups)
 128  {
 129      data <- as.data.frame(.esetScale(eset))
 130      switch(method,
 131             hclust=MLearn(~., data, hclustI(dist, list(k=groups))),
 132             kmeans=MLearn(~., data, kmeansI, centers=groups,
 133                           nstart=25L, algorithm="Hartigan-Wong"),
 134             .stopf("'%s' method unknown", method))
 135  }
 136  
 137  .esetClassify <-
 138      function(eset, method, covariate, parameters)
 139  {
 140      data <- cbind(as.data.frame(.esetScale(eset)), eset[[covariate]])
 141      names(data)[ncol(data)] <- covariate
 142      formula <- as.formula(sprintf("%s ~ .", covariate))
 143      xvalSpec <- xvalSpec("LOO")
 144      switch(method,
 145             knn={
 146                 vars <- c("numberOfNeighbors", "minimumVote")
 147                 if (!all(ok <- vars %in% names(parameters)))
 148                     .stopf("'%s' must have values for '%s'",
 149                            "parameters",
 150                            paste(vars[!ok], collapse="', '"))
 151                 MLearn(formula, data,
 152                        knnI(k=parameters[["numberOfNeighbors"]],
 153                             l=parameters[["minimumVote"]]),
 154                        xvalSpec)
 155             },
 156             svm=MLearn(formula, data, svmI, xvalSpec),
 157             lda=MLearn(formula, data, ldaI, xvalSpec),
 158             dlda=MLearn(formula, data, dldaI, xvalSpec),
 159             .stopf("'%s' method unknown", method))
 160  }
 161  
 162  .esetContinuousGeneSet <-
 163      function(eset, pathway, covariate,
 164               parameters=list(minimumGenes=10L))
 165  {
 166      ## FIXME: Permuation significance
 167      ttests <- rowttests(eset, covariate)
 168      probeIds <- findLargest(featureNames(eset), abs(ttests$statistic),
 169                            annotation(eset))
 170      havePath <- probes2Path(probeIds, annotation(eset))
 171      probeIds <- probeIds[probeIds %in% names(havePath)]
 172      eset1 <- eset[probeIds,]
 173  
 174      adjMatrix <- local({
 175          m <- t(PWAmat(annotation(eset)))
 176          m <- m[, names(probeIds), drop=FALSE]
 177          idx <- rowSums(m) >= parameters[["minimumGenes"]]
 178          m <- m[idx,, drop=FALSE]
 179      })
 180  
 181      idx <- colSums(adjMatrix) != 0L
 182      probeIds <- probeIds[idx]
 183      adjMatrix <- adjMatrix[, idx, drop=FALSE]
 184  
 185      z <- local({
 186          x <- adjMatrix %*% ttests[probeIds, "statistic"]
 187          x / sqrt(rowSums(adjMatrix))
 188      })
 189      logFC <- adjMatrix %*% ttests[probeIds, "dm"]
 190      
 191      pathwayName <-
 192          mappedRkeys(KEGGPATHID2NAME[row.names(z)])
 193      genesInPathway <- rowSums(adjMatrix)
 194      df <- data.frame(N=genesInPathway, logFC=logFC, z=z,
 195                       Pathway=pathwayName,
 196                       row.names=row.names(z))
 197      df[order(abs(df[["z"]]), decreasing=TRUE),,drop=FALSE]
 198  }
 199  
 200  ## work flows
 201  ## 'force()' used to prompt early error for missing parameters
 202  
 203  affyPreprocess <-
 204      function(dataset, preprocess=c("rma", "vsnrma"))
 205  {
 206      force(dataset)
 207      preprocess <- match.arg(preprocess)
 208      .affyPreprocess(dataset, preprocess)
 209  }
 210  
 211  affyHeatmap <-
 212      function(dataset, preprocess=c("rma", "vsnrma"),
 213               metric=c("manhattan", "euclidean", "spearman"))
 214  {
 215      force(dataset)
 216      preprocess <- match.arg(preprocess)
 217      metric <- match.arg(metric)
 218  
 219      eset <- .affyPreprocess(dataset, preprocess)
 220      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8)
 221      .distHeatmap(.esetDistance(eset, metric))
 222  }
 223  
 224  affyDifferentialExpression <-
 225      function(dataset, preprocess=c("rma", "vsnrma"),
 226               covariate)
 227  {
 228      force(dataset)
 229      preprocess <- match.arg(preprocess)
 230      force(covariate)
 231  
 232      eset <- .affyPreprocess(dataset, preprocess, covariate)
 233      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.5)
 234  
 235      formula <- as.formula(sprintf("~ %s", covariate))
 236      design <- model.matrix(formula, pData(eset))
 237      de <- .esetDifferentialExpression(eset, design)
 238  
 239      eid <- getAnnMap("ENTREZID", annotation(eset))
 240      gid <- getAnnMap("GENENAME", annotation(eset))
 241      de <- merge(de,
 242                  merge(toTable(eid[de$ID]), toTable(gid[de$ID])),
 243                  by.x="ID", by.y="probe_id", sort=FALSE)
 244      idx <- match(c("ID", "gene_id", "gene_name"), names(de))
 245      names(de)[idx] <- c("ProbesetId", "EntrezId", "GeneName")
 246      de
 247  }
 248  
 249  affyCluster <-
 250      function(dataset, preprocess=c("rma", "vsnrma"),
 251               method=c("hclust", "kmeans"),
 252               groups=2L)
 253  {
 254      force(dataset)
 255      preprocess <- match.arg(preprocess)
 256      method <- match.arg(method)
 257  
 258      eset <- .affyPreprocess(dataset, preprocess)
 259      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8)
 260      .esetCluster(eset, method, groups)
 261  }
 262  
 263  affyClassify <-
 264      function(dataset, preprocess=c("rma", "vsnrma"),
 265               method=c("knn", "svm", "lda", "dlda"), covariate,
 266               parameters=list())
 267  {
 268      force(dataset)
 269      ## uses leave-one-out cross-validation
 270      preprocess <- match.arg(preprocess)
 271      method <- match.arg(method)
 272      force(covariate)
 273  
 274      eset <- .affyPreprocess(dataset, preprocess, covariate)
 275      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8)
 276      .esetClassify(eset, method, covariate, parameters)
 277  }
 278  
 279  affyContinuousGeneSet <-
 280      function(dataset, preprocess=c("rma", "vsnrma"), covariate)
 281  {
 282      force(dataset)
 283      preprocess <- match.arg(preprocess)
 284      pathway <- "KEGG"               # FIXME: support for more pathways
 285      force(covariate)
 286  
 287      eset <- .affyPreprocess(dataset, preprocess, covariate)
 288      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.5)
 289      .esetContinuousGeneSet(eset, pathway, covariate)
 290  }
 291  #<crdata_title>Clustering of Affymetrix CEL Files</crdata_title>
 292  
 293  #<crdata_text>This script clusters CEL files into groups. Clustering can be useful in quality assessment (e.g., do the CEL files group according to some unanticipated factor such as date of processing) and exploratory analysis.</crdata_text>
 294  
 295  #<crdata_text>The 'preprocess' option is one of 'rma', or 'vsnrma'. 'rma' pre-processing is implemented in the affy Bioconductor package. 'vsnrma' is implemented in the vsn Bioconductor package. In addition, only the probesets with between-sample variability in the top 20th quantile are retained.</crdata_text>
 296  
 297  #<crdata_text>The 'method' for clustering is either 'hclust' or 'kmeans'. 'kmeans' uses the Hartigan-Wong algorithm, with results based on 25 starts.</crdata_text>
 298  
 299  #<crdata_text>Specify 'groups' as the number of groups to be identified.</crdata_text>
 300  
 301  #<crdata_text>Supplied parameters:</crdata_text>
 302  
 303  #<crdata_text>celArchive: </crdata_text> 
 304  #<crdata_object>celArchive</crdata_object>
 305  
 306  #<crdata_text>preprocess: </crdata_text> 
 307  #<crdata_object>preprocess</crdata_object>
 308  
 309  #<crdata_text>method: </crdata_text> 
 310  #<crdata_object>method</crdata_object>
 311      
 312  #<crdata_text>groups: </crdata_text> 
 313  #<crdata_object>groups</crdata_object>
 314  
 315  clusters <-
 316      affyCluster(celArchive, preprocess, method, groups)
 317  partition <- clusters@partition
 318  #<crdata_object>partition</crdata_object>
 319  
 320  if (method != "kmeans") {
 321  
 322  #<crdata_text>The following dendrogram summarizes relationships between samples</crdata_text>
 323  #<crdata_image>
 324      print(plot(RObject(clusters)))
 325  #</crdata_image>
 326  
 327  #<crdata_text>A silhouette plot summarizes how well clustered the data are -- horizontal bars extending to the right represent samples that are closer to the mediod of samples in a different cluster than they are to samples in the cluster to which they were assigned.</crdata_text>
 328  #<crdata_image>
 329      print(plot(clusters@silhouette))
 330  #</crdata_image>
 331  
 332  #<crdata_text>Principle components plots summarize how well  reduces high-dimensional data is reduced to lower dimensions; relatively few components explaining most of the variation correspond to bars that are largest on the left of the figure.</crdata_text>
 333  #<crdata_image>
 334      print(plot(clusters@prcomp))
 335  #</crdata_image>
 336  
 337  #<crdata_text>The following figure plots each sample in terms of its first two principle components, with a point colored by the partition to which it is assigned.</crdata_text>
 338  #<crdata_image>
 339      print(plot(clusters@prcomp$x[, 1], clusters@prcomp$x[, 2],
 340           col = clusters@partition))
 341  #</crdata_image>
 342  
 343  }




Back