R Script: affyHeatmap

Approx. CPU requirement: minutes
Description:

affyHeatmap R Documentation

Heatmap: Distance Between CEL Files

Description

This script produces a heat map showing the distance between Affymetrix arrays. Arrays are preprocessed and distances calculated using the specified method. Clustering uses the Hartigan-Wong algorithm.

Usage

affyHeatmap(dataset, preprocess = c("rma", "vsnrma"),
            metric = c("manhattan", "euclidean", "spearman"))

Arguments

dataset A ‘zip’ archive containing (a) Affymetrix CEL files and (b) a ‘targets.csv’ file. The ‘targets.csv’ file is a comma-separated table, as might be exported from Excel. The file must contain a column ‘FileName’ that references each CEL file in the archive to be included in the analysis.
preprocess The 'preprocess' option is one of 'rma', or 'vsnrma'. 'rma' pre-processing is implemented in the affy Bioconductor package. 'vsnrma' is implemented in the vsn Bioconductor package. In addition, only the probesets with between-sample variability in the top 20th quantile are retained.
metric The distance used to describe distance between N expression values. ‘manhattan’ measures distance between two samples (represented by coordinates in N-dimensional space) as the sum of the changes in each of the N dimensions, much as one would measure distances as the edges of blocks in Manhattan. The ‘euclidean’ distance is the minimum length between two points in N-dimensional space. The ‘spearman’ distance is the correlation across coordinates N-dimensional space.

Value

A heatmap, where color is proportional to distnace between arrays.

Author(s)

Martin Morgan <mtmorgan@fhcrc.org>

Examples

affyHeatmap(dataset, preprocess, metric)

Source code:


   1  version <- '0.0.6'
   2  #<crdata_text>Script version: </crdata_text>
   3  #<crdata_object>version</crdata_object>
   4  CRDATA_SOURCE_R_FILES <- TRUE
   5  ## setup, if source'd in scripts
   6  
   7  if (interactive() ||
   8      (exists("CRDATA_SOURCE_R_FILES") && get("CRDATA_SOURCE_R_FILES")))
   9  {
  10      require(Biobase)
  11      require(Category)
  12      require(KEGG.db)
  13      require(MLInterfaces)
  14      require(RColorBrewer)
  15      require(affy)
  16      require(annotate)
  17      require(bioDist)
  18      require(genefilter)
  19      require(limma)
  20      require(multicore)
  21      require(vsn)
  22  }
  23  
  24  ## utilities
  25  
  26  .stopf <- function(fmt, ...)
  27      stop(sprintf(fmt, ...), call.=FALSE)
  28  
  29  ## steps
  30  
  31  .affyPhenoData <-
  32      function(dataset, covariate)
  33      ## 'covariate' provides a test allowing for early exit
  34  {
  35      pData <- read.csv(dataset)
  36      ## validation
  37      if (0L == nrow(pData))
  38          .stopf("%s has 0 rows", "dataset")
  39      requiredNames <- c("FileName", covariate)
  40      if (!all(requiredNames %in% names(pData)))
  41          .stopf("'%s' must contain names '%s'", basename(dataset),
  42                 paste(requiredNames, collapse="' '"))
  43      if (!is.factor(pData[["FileName"]]))
  44          .stopf("'%s' column '%s' must be %s", basename(dataset),
  45                 "FileName", "factor()")
  46      if (1L == length(covariate) && !is.factor(pData[[covariate]]))
  47          .stopf("'%s' column '%s' must be %s", basename(dataset),
  48                 covariate, "factor()")
  49  
  50      new("AnnotatedDataFrame", data=pData)
  51  }
  52  
  53  .affyPreprocess <-
  54      function(dataset, algorithm, covariate=character(0))
  55  {
  56      dir <- tempfile()
  57      if (!dir.create(dir))
  58          .stopf("failed to create directory '%s'", dir)
  59      on.exit(unlink(dir))
  60  
  61      tryCatch(unzip(dataset, exdir=dir),
  62               error=function(err) {
  63                   .stopf("failed to unzip dataset '%s': %s",
  64                          dataset, conditionMessage(err))
  65               })
  66      target <- list.files(dir, ".*\\.csv$", full=TRUE)
  67      if (1L != length(target))
  68          .stopf("dataset '%s' does not contain a '.csv' file",
  69                 dataset)
  70      phenoData <- .affyPhenoData(target, covariate)
  71  
  72      fnames <- as.character(phenoData[["FileName"]])
  73      if (!all(ok <- file.exists(file.path(dir, fnames))))
  74          .stopf("'%s' does not contain file(s) listed in .csv:\n    %s",
  75                 basename(dataset),
  76                 paste(basename(fnames)[!ok], collapse="\n    "))
  77      switch(algorithm,
  78             rma = {
  79                 justRMA(filenames=fnames, phenoData=phenoData,
  80                         celfile.path=dir)
  81             },
  82             vsnrma = {
  83                 vsnrma(ReadAffy(filenames=fnames, phenoData=phenoData,
  84                                 celfile.path=dir))
  85             })
  86  }
  87  
  88  .esetNonSpecificFilter <-
  89      function(eset, ...)
  90  {
  91      nsFilter(eset, ...)$eset
  92  }
  93  
  94  .esetScale <-
  95      function(eset)
  96  {
  97      iqr <- esApply(eset, 1, IQR)
  98      scale(t(exprs(eset)), rowMedians(eset), iqr[featureNames(eset)])
  99  }
 100  
 101  .esetDifferentialExpression <-
 102      function(eset, design)
 103  {
 104      efit <- eBayes(lmFit(eset, design))
 105      topTable(efit, coef=2, number=Inf)
 106  }
 107  
 108  .esetDistance <-
 109      function(eset, metric)
 110  {
 111      switch(metric,
 112             manhattan=,
 113             euclidean= dist(.esetScale(eset), method=metric),
 114             spearman= spearman.dist(eset),
 115             .stopf("'%s' metric unknown", metric))
 116  }
 117  
 118  .distHeatmap <-
 119      function(dist)
 120  {
 121      col <- rev(colorRampPalette(brewer.pal(10, "RdBu"))(256))
 122      heatmap(as.matrix(dist), sym=TRUE, col=col,
 123              distfun=function(x) as.dist(x))
 124  }
 125  
 126  .esetCluster <-
 127      function(eset, method, groups)
 128  {
 129      data <- as.data.frame(.esetScale(eset))
 130      switch(method,
 131             hclust=MLearn(~., data, hclustI(dist, list(k=groups))),
 132             kmeans=MLearn(~., data, kmeansI, centers=groups,
 133                           nstart=25L, algorithm="Hartigan-Wong"),
 134             .stopf("'%s' method unknown", method))
 135  }
 136  
 137  .esetClassify <-
 138      function(eset, method, covariate, parameters)
 139  {
 140      data <- cbind(as.data.frame(.esetScale(eset)), eset[[covariate]])
 141      names(data)[ncol(data)] <- covariate
 142      formula <- as.formula(sprintf("%s ~ .", covariate))
 143      xvalSpec <- xvalSpec("LOO")
 144      switch(method,
 145             knn={
 146                 vars <- c("numberOfNeighbors", "minimumVote")
 147                 if (!all(ok <- vars %in% names(parameters)))
 148                     .stopf("'%s' must have values for '%s'",
 149                            "parameters",
 150                            paste(vars[!ok], collapse="', '"))
 151                 MLearn(formula, data,
 152                        knnI(k=parameters[["numberOfNeighbors"]],
 153                             l=parameters[["minimumVote"]]),
 154                        xvalSpec)
 155             },
 156             svm=MLearn(formula, data, svmI, xvalSpec),
 157             lda=MLearn(formula, data, ldaI, xvalSpec),
 158             dlda=MLearn(formula, data, dldaI, xvalSpec),
 159             .stopf("'%s' method unknown", method))
 160  }
 161  
 162  .esetContinuousGeneSet <-
 163      function(eset, pathway, covariate,
 164               parameters=list(minimumGenes=10L))
 165  {
 166      ## FIXME: Permuation significance
 167      ttests <- rowttests(eset, covariate)
 168      probeIds <- findLargest(featureNames(eset), abs(ttests$statistic),
 169                            annotation(eset))
 170      havePath <- probes2Path(probeIds, annotation(eset))
 171      probeIds <- probeIds[probeIds %in% names(havePath)]
 172      eset1 <- eset[probeIds,]
 173  
 174      adjMatrix <- local({
 175          m <- t(PWAmat(annotation(eset)))
 176          m <- m[, names(probeIds), drop=FALSE]
 177          idx <- rowSums(m) >= parameters[["minimumGenes"]]
 178          m <- m[idx,, drop=FALSE]
 179      })
 180  
 181      idx <- colSums(adjMatrix) != 0L
 182      probeIds <- probeIds[idx]
 183      adjMatrix <- adjMatrix[, idx, drop=FALSE]
 184  
 185      z <- local({
 186          x <- adjMatrix %*% ttests[probeIds, "statistic"]
 187          x / sqrt(rowSums(adjMatrix))
 188      })
 189      logFC <- adjMatrix %*% ttests[probeIds, "dm"]
 190      
 191      pathwayName <-
 192          mappedRkeys(KEGGPATHID2NAME[row.names(z)])
 193      genesInPathway <- rowSums(adjMatrix)
 194      df <- data.frame(N=genesInPathway, logFC=logFC, z=z,
 195                       Pathway=pathwayName,
 196                       row.names=row.names(z))
 197      df[order(abs(df[["z"]]), decreasing=TRUE),,drop=FALSE]
 198  }
 199  
 200  ## work flows
 201  ## 'force()' used to prompt early error for missing parameters
 202  
 203  affyPreprocess <-
 204      function(dataset, preprocess=c("rma", "vsnrma"))
 205  {
 206      force(dataset)
 207      preprocess <- match.arg(preprocess)
 208      .affyPreprocess(dataset, preprocess)
 209  }
 210  
 211  affyHeatmap <-
 212      function(dataset, preprocess=c("rma", "vsnrma"),
 213               metric=c("manhattan", "euclidean", "spearman"))
 214  {
 215      force(dataset)
 216      preprocess <- match.arg(preprocess)
 217      metric <- match.arg(metric)
 218  
 219      eset <- .affyPreprocess(dataset, preprocess)
 220      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8)
 221      .distHeatmap(.esetDistance(eset, metric))
 222  }
 223  
 224  affyDifferentialExpression <-
 225      function(dataset, preprocess=c("rma", "vsnrma"),
 226               covariate)
 227  {
 228      force(dataset)
 229      preprocess <- match.arg(preprocess)
 230      force(covariate)
 231  
 232      eset <- .affyPreprocess(dataset, preprocess, covariate)
 233      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.5)
 234  
 235      formula <- as.formula(sprintf("~ %s", covariate))
 236      design <- model.matrix(formula, pData(eset))
 237      de <- .esetDifferentialExpression(eset, design)
 238  
 239      eid <- getAnnMap("ENTREZID", annotation(eset))
 240      gid <- getAnnMap("GENENAME", annotation(eset))
 241      de <- merge(de,
 242                  merge(toTable(eid[de$ID]), toTable(gid[de$ID])),
 243                  by.x="ID", by.y="probe_id", sort=FALSE)
 244      idx <- match(c("ID", "gene_id", "gene_name"), names(de))
 245      names(de)[idx] <- c("ProbesetId", "EntrezId", "GeneName")
 246      de
 247  }
 248  
 249  affyCluster <-
 250      function(dataset, preprocess=c("rma", "vsnrma"),
 251               method=c("hclust", "kmeans"),
 252               groups=2L)
 253  {
 254      force(dataset)
 255      preprocess <- match.arg(preprocess)
 256      method <- match.arg(method)
 257  
 258      eset <- .affyPreprocess(dataset, preprocess)
 259      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8)
 260      .esetCluster(eset, method, groups)
 261  }
 262  
 263  affyClassify <-
 264      function(dataset, preprocess=c("rma", "vsnrma"),
 265               method=c("knn", "svm", "lda", "dlda"), covariate,
 266               parameters=list())
 267  {
 268      force(dataset)
 269      ## uses leave-one-out cross-validation
 270      preprocess <- match.arg(preprocess)
 271      method <- match.arg(method)
 272      force(covariate)
 273  
 274      eset <- .affyPreprocess(dataset, preprocess, covariate)
 275      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.8)
 276      .esetClassify(eset, method, covariate, parameters)
 277  }
 278  
 279  affyContinuousGeneSet <-
 280      function(dataset, preprocess=c("rma", "vsnrma"), covariate)
 281  {
 282      force(dataset)
 283      preprocess <- match.arg(preprocess)
 284      pathway <- "KEGG"               # FIXME: support for more pathways
 285      force(covariate)
 286  
 287      eset <- .affyPreprocess(dataset, preprocess, covariate)
 288      eset <- .esetNonSpecificFilter(eset, var.cutoff=0.5)
 289      .esetContinuousGeneSet(eset, pathway, covariate)
 290  }
 291  #<crdata_title>Heatmap: Distance Between CEL Files</crdata_title>
 292  
 293  #<crdata_text>This script produces a heat map showing the distance between Affymetrix arrays. Arrays are preprocessed and distances calculated using the specified method. Clustering uses the Hartigan-Wong algorithm.</crdata_text>
 294  
 295  #<crdata_text>Supplied parameters:</crdata_text>
 296  
 297  #<crdata_text>celArchive: </crdata_text> 
 298  #<crdata_object>celArchive</crdata_object>
 299  
 300  #<crdata_text>preprocess: </crdata_text> 
 301  #<crdata_object>preprocess</crdata_object>
 302  
 303  #<crdata_text>metric: </crdata_text> 
 304  #<crdata_object>metric</crdata_object>
 305  
 306  #<crdata_image>
 307  affyHeatmap(celArchive, preprocess, metric)
 308  #</crdata_image>




Back