R Script: sequenceCountsPerTranscript (RNA-seq)

Approx. CPU requirement: minutes
Description:

sequenceCountsPerTranscript R Documentation

RNA-Seq Sequence Counts Per Transcript

Description

This script summarized the number of reads aligning to different genomic features. Features are retrieved from the UCSC genome browser. Reads are present in one or more BAM files; the BAM files can be located locally, or accessible through http or anonymous ftp.

Usage

sequenceCountsPerTranscript(bamFiles, genome, tableName,
                            transcriptSequenceNames,
                            bamSequenceNames)

Arguments

bamFiles The file name(s) or url(s) of bam (samtools binary alignment) files contained aligned reads.
genome The name of the UCSC genome from which transcripts are to be extracted. Valid genome names can be discovered by omitting this argument.
tableName The name of the UCSC genome table to be parsed for transcript information. Valid table names can be discovered by omitting this argument.
transcriptSequenceNames The names of the sequences (e.g., chromosomes) as they appear in the transcript annotation table. For instance, Yeast has transcriptSequenceNames chrI, chrII, chrIII, etc. Include only those names relevant to the query. The number of sequences names must equal the number of sequence names provided as the bamSequenceNames. There must be a one-to-one correspondence between the ordering of names in transcriptSequenceNames and bamSequenceNames.
bamSequenceNames The names of the sequences (e.g., chromosomes) as they appear in the bam file. For instance Scchr01, Sschr02, Scchr03, etc. Include only those names relevant to the query. The number of sequences names must equal the number of sequence names provided as the transcriptSequenceNames. There must be a one-to-one correspondence between the ordering of names in transcriptSequenceNames and bamSequenceNames.

Value

A data.frame with the following columns:

seqnames The names of the sequences (e.g., chromosomes) on which the features of interest occur
ranges The start and end coordinates (1-based, closed interval) of the features of interest
strand The strand on which the feature of interest occurs, possibly including ‘*’ (strand irrelevant).
tx_id The (internal) identifier used to identify transcripts.
tx_name The name of the feature, as specified in the annotation table.
Additional columns Counts of the number of transcripts overlapping each region, in each file.
Total The total number of reads in each region of interest, summed over all samples.

 

Author(s)

Martin Morgan <mtmorgan@fhcrc.org>

Examples

## Discovery
sequenceCountsPerTranscript()

## Count
tx <- sequenceCountsPerTranscript(bamFiles, genome, tableName,
                                  transcriptSequenceNames,
                                  bamSequenceNames)
df <- as.data.frame(tx)
df$Total <- rowSums(df[,8:ncol(df), drop=FALSE])
idx <- order(df$Total, decreasing=TRUE)
abundant <- head(df[idx,], 20)
abundant

counts <- xyplot(Total~width, df[df$Total!=0,], scales=list(log=TRUE),
                 xlab="Transcript Width (log10)",
                 ylab="Total Count (log10)")
counts

## write.table(df, file="SequenceCountsPerTranscript.txt")

Source code:


   1  version <- '0.0.6'
   2  #<crdata_text>Script version: </crdata_text>
   3  #<crdata_object>version</crdata_object>
   4  CRDATA_SOURCE_R_FILES <- TRUE
   5  if (interactive() ||
   6      (exists("CRDATA_SOURCE_R_FILES") && get("CRDATA_SOURCE_R_FILES")))
   7  {
   8      library(DESeq)
   9      library(GenomicFeatures)
  10      library(GenomicRanges)
  11      library(IRanges)
  12  }
  13  
  14  sequenceDifferentialExpression <-
  15      function(dataset, condition1, condition2)
  16  {
  17      conn <- file(dataset, "r")
  18      conditions <- scan(conn, "character", nlines=1)
  19      counts <- read.delim(conn, header=TRUE, stringsAsFactors=TRUE,
  20                           row.names=1)
  21      close(conn)
  22  
  23      cds <- newCountDataSet( counts, conditions )
  24      cds <- estimateSizeFactors( cds )
  25      cds <- estimateVarianceFunctions( cds )
  26      topTable <- nbinomTest( cds, condition1, condition2)[,c(1:4, 6:8)]
  27      names(topTable) <-
  28          c("Id", "NormalizedMean",
  29            "NormalizedMean.Condition1", "NormalizedMean.Condition2", 
  30            "Log2FoldChange", "PValue", "AdjustedPValue")
  31      data.frame(topTable[order(topTable$AdjustedPValue),], row.names=NULL)
  32  }
  33  
  34  sequenceCountsPerTranscript <- 
  35      function(bamFiles, genome, tableName,
  36               transcriptSequenceNames,
  37               bamSequenceNames)
  38  {
  39      if (missing(genome) ||
  40          (length(genome) == 1 && !nzchar(genome))) {
  41          require(rtracklayer)
  42          return(list(genome=ucscGenomes(),
  43                      supportedTables=supportedUCSCtables()))
  44      }
  45      nameMap <- transcriptSequenceNames
  46      names(nameMap) <- bamSequenceNames
  47      txdb <-
  48          makeTranscriptDbFromUCSC(genome=genome, tablename=tableName)
  49      tx <- transcripts(txdb)
  50      txLevels <- levels(seqnames(tx))
  51      if (!all(idx <- nameMap %in% txLevels))
  52          stop("'transcriptSequenceNames' do not exist in transcript list: '",
  53               paste(nameMap[!idx], collapse="' '"), "'")
  54      for (bamFile in bamFiles) {
  55          bam <- readGappedAlignments(bamFile)
  56          idx <- match(rname(bam), names(nameMap), nomatch=0L) >= 0L
  57          bam <- bam[idx,]
  58          rname(bam) <- factor(nameMap[as.character(rname(bam))],
  59                               levels=nameMap)
  60          values(tx)[[basename(bamFile)]] <- countOverlaps(tx, bam)
  61      }
  62      df <- as.data.frame(tx)
  63      df$Total <- rowSums(df[,8:ncol(df), drop=FALSE])
  64      df
  65  }
  66  #<crdata_title>Sequence Counts Per Transcript</crdata_title>
  67  
  68  #<crdata_text>Supplied parameters:</crdata_text>
  69  
  70  bamFiles <- strsplit(bamFiles, ", *")[[1]]
  71  #<crdata_text>bamFiles: </crdata_text>
  72  #<crdata_object>bamFiles</crdata_object>
  73  
  74  #<crdata_text>genome: </crdata_text>
  75  #<crdata_object>genome</crdata_object>
  76  
  77  #<crdata_text>tableName: </crdata_text>
  78  #<crdata_object>tableName</crdata_object>
  79  
  80  transcriptSequenceNames <- strsplit(transcriptSequenceNames, ", *")[[1]]
  81  #<crdata_text>transcriptSequenceNames: </crdata_text>
  82  #<crdata_object>transcriptSequenceNames</crdata_object>
  83  
  84  bamSequenceNames <- strsplit(bamSequenceNames, ", *")[[1]]
  85  #<crdata_text>bamSequenceNames: </crdata_text>
  86  #<crdata_object>bamSequenceNames</crdata_object>
  87  
  88  if (length(transcriptSequenceNames) != length(bamSequenceNames))
  89      stop("'transcriptSequenceNames' and 'bamSequenceNames' must be the same length")
  90  
  91      ## analysis
  92      df <- sequenceCountsPerTranscript(bamFiles, genome, tableName,
  93                                        transcriptSequenceNames,
  94                                        bamSequenceNames)
  95  
  96      idx <- order(df$Total, decreasing=TRUE)
  97      abundant <- head(df[idx,], 20)
  98  #<crdata_text>Most abundant transcripts:</crdata_text>
  99  #<crdata_object>abundant</crdata_object>
 100  
 101      library(lattice)
 102      counts <- xyplot(Total~width, df[df$Total!=0,], scales=list(log=TRUE),
 103                       xlab="Transcript Width (log10)",
 104                       ylab="Total Count (log10)")
 105  #<crdata_image>
 106        print(counts)
 107  #</crdata_image>
 108  
 109  #<crdata_text>Saved to file 'job-JOBID-SequenceCountsPerTranscript.txt'</crdata_text>
 110  #<crdata_object>df</crdata_object>
 111  write.table(df, file="SequenceCountsPerTranscript.txt")




Back