In this vignette, we’ll demonstrate how to jointly analyze a single-cell dataset measuring both DNA accessibility and gene expression in the same cells using Signac and Seurat. In this vignette we’ll be using a publicly available 10x Genomic Multiome dataset for human PBMCs.
View data download code
You can download the required data by running the following lines in a shell:
wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5 wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi
# load the RNA and ATAC data counts <- Read10X_h5("../vignette_data/multiomic/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5") fragpath <- "../vignette_data/multiomic/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz"
# get gene annotations for hg38 annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) seqlevelsStyle(annotation) <- "UCSC" genome(annotation) <- "hg38" # create a Seurat object containing the RNA adata pbmc <- CreateSeuratObject( counts = counts$`Gene Expression`, assay = "RNA" ) # create ATAC assay and add it to the object pbmc[["ATAC"]] <- CreateChromatinAssay( counts = counts$Peaks, sep = c(":", "-"), fragments = fragpath, annotation = annotation )
## An object of class Seurat ## 144978 features across 11909 samples within 2 assays ## Active assay: RNA (36601 features, 0 variable features) ## 1 other assay present: ATAC
We can compute per-cell quality control metrics using the DNA accessibility data and remove cells that are outliers for these metrics, as well as cells with low or unusually high counts for either the RNA or ATAC assay.
DefaultAssay(pbmc) <- "ATAC" pbmc <- NucleosomeSignal(pbmc) pbmc <- TSSEnrichment(pbmc) VlnPlot( object = pbmc, features = c("nCount_RNA", "nCount_ATAC", "TSS.enrichment", "nucleosome_signal"), ncol = 4, pt.size = 0 )
# filter out low quality cells pbmc <- subset( x = pbmc, subset = nCount_ATAC < 100000 & nCount_RNA < 25000 & nCount_ATAC > 1000 & nCount_RNA > 1000 & nucleosome_signal < 2 & TSS.enrichment > 1 ) pbmc
## An object of class Seurat ## 144978 features across 11331 samples within 2 assays ## Active assay: ATAC (108377 features, 0 variable features) ## 1 other assay present: RNA
The set of peaks identified using Cellranger often merges distinct peaks that are close together. This can create a problem for certain analyses, particularly motif enrichment analysis and peak-to-gene linkage. To identify a more accurate set of peaks, we can call peaks using MACS2 with the
CallPeaks() function. Here we call peaks on all cells together, but we could identify peaks for each group of cells separately by setting the
group.by parameter, and this can help identify peaks specific to rare cell populations.
# call peaks using MACS2 peaks <- CallPeaks(pbmc, macs2.path = "/home/stuartt/miniconda3/envs/signac/bin/macs2") # remove peaks on nonstandard chromosomes and in genomic blacklist regions peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse") peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_hg38_unified, invert = TRUE) # quantify counts in each peak macs2_counts <- FeatureMatrix( fragments = Fragments(pbmc), features = peaks, cells = colnames(pbmc) ) # create a new assay using the MACS2 peak set and add it to the Seurat object pbmc[["peaks"]] <- CreateChromatinAssay( counts = macs2_counts, fragments = fragpath, annotation = annotation )
We can normalize the gene expression data using SCTransform, and reduce the dimensionality using PCA.
Here we process the DNA accessibility assay the same way we would process a scATAC-seq dataset, by performing latent semantic indexing (LSI).
To annotate cell types in the dataset we can transfer cell labels from an existing PBMC reference dataset using tools in the Seurat package. See the Seurat reference mapping vignette for more information.
We’ll use an annotated PBMC reference dataset from Hao et al. (2020), available for download here: https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat
Note that the SeuratDisk package is required to load the reference dataset. Installation instructions for SeuratDisk can be found here.
library(SeuratDisk) # load PBMC reference reference <- LoadH5Seurat("../vignette_data/multiomic/pbmc_multimodal.h5seurat") DefaultAssay(pbmc) <- "SCT" # transfer cell type labels from reference to query transfer_anchors <- FindTransferAnchors( reference = reference, query = pbmc, normalization.method = "SCT", reference.reduction = "spca", recompute.residuals = FALSE, dims = 1:50 ) predictions <- TransferData( anchorset = transfer_anchors, refdata = reference$celltype.l2, weight.reduction = pbmc[['pca']], dims = 1:50 ) pbmc <- AddMetaData( object = pbmc, metadata = predictions ) # set the cell identities to the cell type predictions Idents(pbmc) <- "predicted.id" # set a reasonable order for cell types to be displayed when plotting levels(pbmc) <- c("CD4 Naive", "CD4 TCM", "CD4 CTL", "CD4 TEM", "CD4 Proliferating", "CD8 Naive", "dnT", "CD8 TEM", "CD8 TCM", "CD8 Proliferating", "MAIT", "NK", "NK_CD56bright", "NK Proliferating", "gdT", "Treg", "B naive", "B intermediate", "B memory", "Plasmablast", "CD14 Mono", "CD16 Mono", "cDC1", "cDC2", "pDC", "HSPC", "Eryth", "ASDC", "ILC", "Platelet")
Using the weighted nearest neighbor methods in Seurat v4, we can compute a joint neighbor graph that represent both the gene expression and DNA accessibility measurements.
# build a joint neighbor graph using both assays pbmc <- FindMultiModalNeighbors( object = pbmc, reduction.list = list("pca", "lsi"), dims.list = list(1:50, 2:40), modality.weight.name = "RNA.weight", verbose = TRUE ) # build a joint UMAP visualization pbmc <- RunUMAP( object = pbmc, nn.name = "weighted.nn", assay = "RNA", verbose = TRUE ) DimPlot(pbmc, label = TRUE, repel = TRUE, reduction = "umap") + NoLegend()