In this post, I will continue with reproducing the analysis from Shao et al and compare spliced reads across various 4sU labelled samples. The assumption is that in the short metabolic labelling experiments, the majority of reads should come from unspliced pre-mRNAs.
In order to extract spliced reads from the data, we will extract information in the 6th column of sam file (the CIGAR string) - see code below. The CIGAR string provides information on how your reads align to the reference sequence. Gaps in the alignments (indicative of splicing) are indicated as “N” in the CIGAR string. I refer you to the following blog for a really clear information on the CIGAR
samtools view -h alignment.bam | awk '$6 ~ /N/ || $1 ~ /^@/' | samtools view -bS - > spliced.bam
The first part of the statement uses samtools view to convert the bam to sam and include the header (-h). The awk command is then used to filter reads that contain N in the 6th column (the CIGAR string) or start with @ (therefore are header). The second view statement converts the output back to bam format.
To measure spliced reads in TT-seq I then followed Shao’s script with slight modifications:
setwd("your/working/dir")
source("dir/with/shaos/utils.R")
pacman::p_load(Rsubread, org.Mm.eg.db, TxDb.Mmusculus.UCSC.mm10.knownGene)
# get genes intervals
gene.gr <- GenomicFeatures::genes(TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene)
gene.gr <- gene.gr[!duplicated(ranges(gene.gr))]
ks <- keys(org.Mm.eg.db, keytype = "ENSEMBL")
res <- biomaRt::select(org.Mm.eg.db, keys = ks, keytype = "ENSEMBL",
columns = c("ENTREZID", "SYMBOL"))
gene.ann <- data.frame(GeneID = res$ENSEMBL[match(gene.gr$gene_id, res$ENTREZID)],
Chr = seqnames(gene.gr),
Start = start(gene.gr),
End = end(gene.gr),
Strand = strand(gene.gr))
gene.ann <- gene.ann[!is.na(gene.ann$GeneID), ]
Next, get sample sizes method. I used DESeq2
library(DESeq2)
#read in the file previously created for spike counts
spike_matrix_label=read.table(spike_matrix_label, file="spikematrix_L.txt", row.names=TRUE, col.names=TRUE)
samples_L<-c("LRNA_2i_2d_rep1","LRNA_2i_2d_rep2", "LRNA_2i_7d_rep1","LRNA_SL_rep1", "LRNA_SL_rep2")
cond_L<-c("2d","2d","7d","SL", "SL")
meta_L<-as.matrix(data.frame(samples_L,cond_L))
#here input from counts.rmd
dds_L<-DESeqDataSetFromMatrix(countData=spike_matrix_label,
colData=meta_L,
design= ~ cond_L)
dds_L <- estimateSizeFactors(dds_L)
LRNA.sizefactor<-sizeFactors(dds_L)
Count total and spliced reads
# count total reads
bam_files <- list.files("/your/working/dir/bams", pattern = ".bam$", full.names = T)
fc_PE <- Rsubread::featureCounts(bam_files, annot.ext=gene.ann, isPairedEnd=TRUE)
#size factor required here
#divide each count by appropriate size factor
readCounts<-fc_PE$counts
readCounts_scaled<-sweep(readCounts, 2, LRNA.sizefactor, '/')
# count spliced reads
spliced_bam_files <- list.files("/your/working/dir//spliced/", pattern = ".bam$", full.names = T)
fc_PE_spliced <- Rsubread::featureCounts(spliced_bam_files, annot.ext=gene.ann, isPairedEnd=TRUE)
#size factor required here
#divide each count by appropriate size factor
readCounts_spliced<-fc_PE_spliced$counts
readCounts_spliced_scaled<-sweep(readCounts_spliced, 2, LRNA.sizefactor, '/')
# get spliced ratio
spliced_ratio <- readCounts_spliced_scaled / readCounts_scaled
spliced_ratio <- spliced_ratio[rowSums(readCounts_scaled) > 0, ] # remove inactive genes
spliced_ratio <- spliced_ratio[!apply(spliced_ratio, 1, function(x) any(is.na(x) | is.infinite(x))), ]
# remove unspliced genes, counts true or false, so
#we have 6 types of samples, at least 6 x>0
spliced_ratio <- spliced_ratio[apply(spliced_ratio, 1, function(x) sum(x > 0) > 4), ] # remove unspliced genes
colnames(spliced_ratio)<- c("FRNA_2i_2d_rep1", "FRNA_2i_2d_rep2", "FRNA_2i_7d_rep1", "FRNA_SL_rep1","FRNA_SL_rep2", "LRNA_2i_2d_rep1", "LRNA_2i_2d_rep2", "LRNA_2i_7d_rep1", "LRNA_SL_rep1","LRNA_SL_rep2")
### make the table for data - table with medians etc.
#create matrix
cellCountsmine=data.frame(
# Taking sequence of elements
c("FRNA_2i_2d_rep1", "FRNA_2i_2d_rep2", "FRNA_2i_7d_rep1", "FRNA_SL_rep1","FRNA_SL_rep2", "LRNA_2i_2d_rep1", "LRNA_2i_2d_rep2", "LRNA_2i_7d_rep1", "LRNA_SL_rep1","LRNA_SL_rep2"),
# No of rows
nrow = 10,
# No of columns
ncol = 1,
# By default matrices are in column-wise order
# So this parameter decides how to arrange the matrix
byrow = TRUE
)
colnames(cellCountsmine) = c("Samples")
cellCountsmine$Spliced <- matrixStats::colMedians(spliced_ratio)[match(cellCountsmine$Samples, colnames(spliced_ratio))]
cellCountsmine$Spliced_lower <- apply(spliced_ratio, 2, function(x) quantile(x, 0.25))[match(cellCountsmine$Samples, colnames(spliced_ratio))]
cellCountsmine$Spliced_upper <- apply(spliced_ratio, 2, function(x) quantile(x, 0.75))[match(cellCountsmine$Samples, colnames(spliced_ratio))]
cellCountsmine$Spliced_sd <- colSds(spliced_ratio)[match(cellCountsmine$Samples, colnames(spliced_ratio))]
cellCountsmine$Spliced_mean <- colMeans(spliced_ratio)[match(cellCountsmine$Samples, colnames(spliced_ratio))]
cellCountsmine$Color <- gsub("(.*)\\_rep.*", "\\1", cellCountsmine$Samples)
write.table(cellCountsmine, "/your/working/dir/Splicing_cell_number.txt",
quote = F, row.names = F, col.names = T, sep = "\t")
Statistical tests and plotting
spliced_ratio2 <- spliced_ratio[, !grepl("2i_7d", colnames(spliced_ratio))]
spliced_ratio_mean <- sapply(c("2i_2d", "SL"),
function(x) rowMeans(spliced_ratio2[, grep(x, colnames(spliced_ratio2))])) %>%
as.data.frame()
colMedians(as.matrix(spliced_ratio_mean))
colMeans(as.matrix(spliced_ratio_mean))
plot(density(log(spliced_ratio_mean$`2i_2d`/ spliced_ratio_mean$SL), na.rm = T))
median(na.omit((spliced_ratio_mean$`2i_2d` / spliced_ratio_mean$SL))) # 0.9686969
inf.omit = function(x) x[is.infinite(x)]
##Go through each row and determine if a value is zero
row_sub = apply(spliced_ratio_mean, 1, function(row) all(row !=0 ))
##Subset as usual
spliced_ratio_mean=spliced_ratio_mean[row_sub,]
t.test(na.omit( log2(spliced_ratio_mean$SL / spliced_ratio_mean$`2i_2d`))) # p-value < 2.2e-16
MASS::glm.nb(SL ~ `2i_2d`, data = spliced_ratio_mean) %>% summary()
# --------------------------------------------------------------------------------------------
cellCounts <- read.table("/your/working/dir/Splicing_cell_number.txt", header = T)
cellCounts$Samples <- factor(cellCounts$Samples, unique(cellCounts$Samples))
# plot
ggplot(cellCounts, aes(x = Samples, y = Spliced * 100, fill = Color)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin=Spliced_lower * 100, ymax=Spliced_upper * 100)) +
xlab("") +
ylab("Spliced %\n") +
ylim(0, 20) +
labs(fill='Samples') +
theme_setting +
theme(axis.text.x = element_text(size = 11, angle = 45, hjust=1))
ggsave(filename = "splicing.pdf", path = "/your/working/dir/analysis/", device = "pdf",
width = 5, height = 4)
The code results in the following plot:
References: 1) Shao R, Kumar B, Lidschreiber K, Lidschreiber M, Cramer P, Elsässer SJ (2022). "Distinct transcription kinetics of pluripotent cell states." Molecular Systems Biology (2022)18:e10407 2) Love MI, Huber W, Anders S (2014). “Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2.” Genome Biology, 15, 550