For this tutorial, we will be analyzing a single-cell ATAC-seq dataset of adult mouse brain cells provided by 10x Genomics. The following files are used in this vignette, all available through the 10x Genomics website:
This vignette echoes the commands run in the introductory Signac vignette on human PBMC. We provide a secondary analysis in a second system to demonstrate performance on a second system, and to provide a non-human example.
First load in Signac, Seurat, and some other packages we will be using for analyzing human data.
The vignette below presents a detailed workflow for the pre-processing, QC, clustering, visualization, integration, and interpretation of single-cell ATAC-seq data. While there are many steps (some of which are computationally intensive), we note that the core analytical components are similar to single-cell RNA-seq, and can be run quickly in a few lines. For example, the following lines encapsulate our scATAC-seq workflow (but does not include the calculation of advanced QC metrics, or integration with scRNA-seq data).
library(dplyr) counts <- Read10X_h5("/home/stuartt/github/chrom/vignette_data/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5") metadata <- read.csv("/home/stuartt/github/chrom/vignette_data/atac_v1_adult_brain_fresh_5k_singlecell.csv", row.names = 1) brain_fast <- CreateSeuratObject(counts = counts,meta.data = metadata) %>% subset(peak_region_fragments > 1000) %>% RunTFIDF() %>% FindTopFeatures(cutoff = 'q75') %>% RunSVD(reduction.name='lsi') %>% FindNeighbors(reduction='lsi', dims = 1:30) %>% FindClusters(resolution = 0.5,verbose = FALSE) %>% RunUMAP(reduction='lsi', dims = 1:30) DimPlot(brain_fast, label = FALSE)
Below, we describe these steps in detail, beginning with pre-processing.
counts <- Read10X_h5("/home/stuartt/github/chrom/vignette_data/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5") metadata <- read.csv( file = "/home/stuartt/github/chrom/vignette_data/atac_v1_adult_brain_fresh_5k_singlecell.csv", header = TRUE, row.names = 1 ) brain <- CreateSeuratObject( counts = counts, assay = 'peaks', project = 'ATAC', min.cells = 1, meta.data = metadata )
## Warning in CreateSeuratObject(counts = counts, assay = "peaks", project = ## "ATAC", : Some cells in meta.data not present in provided counts matrix.
fragment.path <- '/home/stuartt/github/chrom/vignette_data/atac_v1_adult_brain_fresh_5k_fragments.tsv.gz' brain <- SetFragments( object = brain, file = fragment.path )
Optional step: Filtering the fragment file
To make downstream steps that use this file faster, we can filter the fragments file to contain only reads from cells that we retain in the analysis. This is optional, but slow, and only needs to be performed once. Running this command writes a new file to disk and indexes the file so it is ready to be used by Signac.
fragment_file_filtered <- "/home/stuartt/github/chrom/vignette_data/atac_v1_adult_brain_fresh_5k_fragments_filtered.tsv.bgz"
FilterFragments( fragment.path = fragment.path cells = colnames(brain), output.path = fragment_file_filtered )
brain <- SetFragments(object = brain, file = fragment_file_filtered)
brain <- NucleosomeSignal(object = brain)
brain$pct_reads_in_peaks <- brain$peak_region_fragments / brain$total * 100 brain$blacklist_ratio <- brain$blacklist_region_fragments / brain$peak_region_fragments plot1 <- VlnPlot( object = brain, features = c('pct_reads_in_peaks', 'blacklist_ratio', 'nucleosome_signal'), pt.size = 0.1) + NoLegend() plot2_a <- VlnPlot( object = brain, features = 'peak_region_fragments', pt.size = 0.1, log = TRUE) + NoLegend() plot2_b <- FeatureScatter(brain, "peak_region_fragments", "nucleosome_signal", pt.size = 0.1) + NoLegend() plot2_c <- FeatureScatter(brain,"peak_region_fragments", "blacklist_ratio", pt.size = 0.1) + NoLegend() plot2 <- CombinePlots(plots = list(plot2_a,plot2_b,plot2_c), ncol = 3) CombinePlots(list(plot1,plot2), ncol = 1)
We can also look at the fragment length periodicity for all the cells, and group by cells with high or low nucleosomal signal strength. You can see that cells which are outliers for the mononucleosomal/ nucleosome-free ratio (based off the plots above) have different banding patterns. The remaining cells exhibit a pattern that is typical for a successful ATAC-seq experiment.
We remove cells that are outliers for these QC metrics.
brain$nucleosome_group <- ifelse(brain$nucleosome_signal > 10, 'NS > 10', 'NS < 10') PeriodPlot(object = brain, group.by = 'nucleosome_group', region = 'chr1-1-10000000')
#filter out cells brain <- subset(brain, subset = peak_region_fragments > 1000 & peak_region_fragments < 100000 & pct_reads_in_peaks > 15 & blacklist_ratio < 0.025 & nucleosome_signal < 10)
Now that the cells are embedded in a low-dimensional space, we can use methods commonly applied for the analysis of scRNA-seq data to perform graph-based clustering, and non-linear dimension reduction for visualization. The functions
FindClusters all come from the Seurat package.
brain <- RunUMAP( object = brain, reduction = 'lsi', dims = 1:30 ) brain <- FindNeighbors( object = brain, reduction = 'lsi', dims = 1:30 ) brain <- FindClusters( object = brain, verbose = FALSE ) DimPlot(object = brain, label = TRUE) + NoLegend()
#extract gene coordinates from Ensembl, and ensure name formatting is consistent with Seurat object gene.coords <- genes(EnsDb.Mmusculus.v79, filter = ~ gene_biotype == "protein_coding") seqlevelsStyle(gene.coords) <- 'UCSC' genebody.coords <- keepStandardChromosomes(gene.coords, pruning.mode = 'coarse') genebodyandpromoter.coords <- Extend(x = gene.coords, upstream = 2000, downstream = 0) # build a gene by cell matrix gene.activities <- FeatureMatrix( fragments = fragment_file_filtered, features = genebodyandpromoter.coords, cells = colnames(brain), chunk = 10 ) # convert rownames from chromsomal coordinates into gene names gene.key <- genebodyandpromoter.coords$gene_name names(gene.key) <- GRangesToString(grange = genebodyandpromoter.coords) rownames(gene.activities) <- make.unique(gene.key[rownames(gene.activities)]) gene.activities <- gene.activities[rownames(gene.activities)!="",] #Add the gene activity matrix to the Seurat object as a new assay, and normalize it brain[['RNA']] <- CreateAssayObject(counts = gene.activities) brain <- NormalizeData( object = brain, assay = 'RNA', normalization.method = 'LogNormalize', scale.factor = median(brain$nCount_RNA) )
DefaultAssay(brain) <- 'RNA' FeaturePlot( object = brain, features = c('Sst','Pvalb',"Gad2","Neurod6","Rorb","Syt6"), pt.size = 0.1, max.cutoff = 'q95', ncol = 3 )
To help interpret the scATAC-seq data, we can classify cells based on an scRNA-seq experiment from the same biological system (the adult mouse brain). We utilize methods for cross-modality integration and label transfer, described here, with a more in-depth tutorial here.
You can download the raw data for this experiment from the Allen Institute website, and view the code used to construct this object on GitHub. Alternatively, you can download the pre-processed Seurat object here.
# Load the pre-processed scRNA-seq data allen_rna <- readRDS("/home/stuartt/github/chrom/vignette_data/allen_brain.rds") allen_rna <- FindVariableFeatures( object = allen_rna, nfeatures = 5000 ) transfer.anchors <- FindTransferAnchors( reference = allen_rna, query = brain, reduction = 'cca', dims = 1:40 ) predicted.labels <- TransferData( anchorset = transfer.anchors, refdata = allen_rna$subclass, weight.reduction = brain[['lsi']], dims = 1:40 ) brain <- AddMetaData(object = brain, metadata = predicted.labels)
Why did we change default parameters?
We changed default parameters for
FindVariableFeatures (including more features and dimensions). You can run the analysis both ways, and observe very similar results. However, when using default parameters we mislabel cluster 11 cells as Vip-interneurons, when they are in fact a Meis2 expressing CGE-derived interneuron population recently described by us and others. The reason is that this subset is exceptionally rare in the scRNA-seq data (0.3%), and so the genes define this subset (for example, Meis2) were too lowly expressed to be selected in the initial set of variable features. We therefore need more genes and dimensions to facilitate cross-modality mapping. Interestingly, this subset is 10-fold more abundant in the scATAC-seq data compared to the scRNA-seq data (see this paper for possible explanations.)
You can see that the RNA-based classifications are entirely consistent with the UMAP visualization, computed only on the ATAC-seq data. We can now easily annotate our scATAC-seq derived clusters (alternately, we could use the RNA classifications themselves). We note three small clusters (13, 20, 21) which represent subdivisions of the scRNA-seq labels. Try transferring the ‘cluster’ label (which shows finer distinctions) from the allen scRNA-seq dataset, to annotate them!
Here, we find DA regions between excitatory neurons in different layers of the cortex.
#switch back to working with peaks instead of gene activities DefaultAssay(brain) <- 'peaks' da_peaks <- FindMarkers( object = brain, ident.1 = c("L2/3 IT"), ident.2 = c("L4", "L5 IT", "L6 IT"), min.pct = 0.4, test.use = 'LR', latent.vars = 'peak_region_fragments' ) head(da_peaks)
## p_val avg_logFC pct.1 pct.2 p_val_adj ## chr15:87605281-87607659 5.767600e-54 0.5370435 0.444 0.083 9.066841e-49 ## chr13:69329933-69331707 6.275780e-54 -0.4555658 0.136 0.419 9.865715e-49 ## chr2:118700082-118704897 6.326604e-50 0.3151431 0.571 0.173 9.945611e-45 ## chr3:137056475-137058371 5.417237e-49 0.3664840 0.481 0.098 8.516059e-44 ## chr19:28380552-28383664 6.386696e-48 0.4108124 0.426 0.077 1.004008e-42 ## chr10:107751762-107753240 6.445321e-48 0.4083500 0.571 0.174 1.013224e-42
plot1 <- VlnPlot( object = brain, features = rownames(da_peaks), ncol = 3, pt.size = 0.1, idents = c("L4","L5 IT","L2/3 IT") ) plot2 <- FeaturePlot( object = brain, features = rownames(da_peaks), ncol = 3, pt.size = 0.1, max.cutoff = 'q95' ) CombinePlots(list(plot1,plot2))
open_l23 <- rownames(da_peaks[da_peaks$avg_logFC > 0.25, ]) open_l456 <- rownames(da_peaks[da_peaks$avg_logFC < -0.25, ]) closest_l23 <- ClosestFeature(regions = open_l23, annotation = EnsDb.Mmusculus.v79, sep = c(':', '-')) closest_l456 <- ClosestFeature(regions = open_l456, annotation = EnsDb.Mmusculus.v79, sep = c(':', '-')) head(closest_l23)
## gene_id gene_name gene_biotype ## ENSMUSG00000054863 ENSMUSG00000054863 Fam19a5 protein_coding ## ENSMUSG00000078137 ENSMUSG00000078137 Ankrd63 protein_coding ## ENSMUSG00000028161 ENSMUSG00000028161 Ppp3ca protein_coding ## ENSMUSG00000052942 ENSMUSG00000052942 Glis3 protein_coding ## ENSMUSG00000091455 ENSMUSG00000091455 Otogl protein_coding ## ENSMUSG00000030592 ENSMUSG00000030592 Ryr1 protein_coding ## seq_coord_system symbol entrezid ## ENSMUSG00000054863 chromosome Fam19a5 106014 ## ENSMUSG00000078137 chromosome Ankrd63 383787 ## ENSMUSG00000028161 chromosome Ppp3ca 19055 ## ENSMUSG00000052942 chromosome Glis3 226075 ## ENSMUSG00000091455 chromosome Otogl 628870 ## ENSMUSG00000030592 chromosome Ryr1 20190 ## region distance ## ENSMUSG00000054863 chr15-87625230-87759336 17570 ## ENSMUSG00000078137 chr2-118699103-118703963 0 ## ENSMUSG00000028161 chr3-136670124-136937727 118747 ## ENSMUSG00000052942 chr19-28258851-28680077 0 ## ENSMUSG00000091455 chr10-107762223-107912134 8982 ## ENSMUSG00000030592 chr7-29003340-29125151 0
## gene_id gene_name gene_biotype ## ENSMUSG00000034575 ENSMUSG00000034575 Papd7 protein_coding ## ENSMUSG00000027827 ENSMUSG00000027827 Kcnab1 protein_coding ## ENSMUSG00000020866 ENSMUSG00000020866 Cacna1g protein_coding ## ENSMUSG00000022603 ENSMUSG00000022603 Mroh4 protein_coding ## ENSMUSG00000032456 ENSMUSG00000032456 Nmnat3 protein_coding ## ENSMUSG00000000399 ENSMUSG00000000399 Ndufa9 protein_coding ## seq_coord_system symbol entrezid ## ENSMUSG00000034575 chromosome Papd7 210106 ## ENSMUSG00000027827 chromosome Kcnab1 16497 ## ENSMUSG00000020866 chromosome Cacna1g 12291 ## ENSMUSG00000022603 chromosome Mroh4 69439 ## ENSMUSG00000032456 chromosome Nmnat3 74080 ## ENSMUSG00000000399 chromosome Ndufa9 66108 ## region distance ## ENSMUSG00000034575 chr13-69497959-69533839 166251 ## ENSMUSG00000027827 chr3-65109384-65378223 0 ## ENSMUSG00000020866 chr11-94408391-94474198 0 ## ENSMUSG00000022603 chr15-74606029-74636353 0 ## ENSMUSG00000032456 chr9-98287435-98420438 0 ## ENSMUSG00000000399 chr6-126821863-126849144 37363
We can also create coverage plots grouped by cluster around any genomic region using the
CoveragePlot function. These represent ‘pseudo-bulk’ accessibility tracks, where all cells within a cluster have been averaged together, in order to visualize a more robust chromatin landscape.
# set plotting order levels(brain) <- c("L2/3 IT","L4","L5 IT","L5 PT","L6 CT", "L6 IT","NP","Sst","Pvalb","Vip","Lamp5","Meis2","Oligo","Astro","Endo","VLMC","Macrophage") region1 <- rownames(da_peaks) region2 <- GRangesToString(subset(gene.coords, symbol=="Gad2")) CoveragePlot( object = brain, region = c(region1, region2), sep = c(":", "-"), annotation = EnsDb.Mmusculus.v79, extend.upstream = 5000, extend.downstream = 5000, ncol = 1 )