This vignette demonstrates new features that allow users to analyze and explore multi-modal data with Seurat. While this represents an initial release, we are excited to release significant new functionality for multi-modal datasets in the future.
Here, we analyze a dataset of 8,617 cord blood mononuclear cells (CBMCs), produced with CITE-seq, where we simultaneously measure the single cell transcriptomes alongside the expression of 11 surface proteins, whose levels are quantified with DNA-barcoded antibodies. First, we load in two count matrices : one for the RNA measurements, and one for the antibody-derived tags (ADT). You can download the ADT file here and the RNA file here
# Load in the RNA UMI matrix
# Note that this dataset also contains ~5% of mouse cells, which we can use
# as negative controls for the protein measurements. For this reason, the
# gene expression matrix has HUMAN_ or MOUSE_ appended to the beginning of
# each gene.
cbmc.rna <- read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz",
sep = ",", header = TRUE, row.names = 1)
# To make life a bit easier going forward, we're going to discard all but
# the top 100 most highly expressed mouse genes, and remove the 'HUMAN_'
# from the CITE-seq prefix
cbmc.rna.collapsed <- CollapseSpeciesExpressionMatrix(cbmc.rna)
# Load in the ADT UMI matrix
cbmc.adt <- read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz",
sep = ",", header = TRUE, row.names = 1)
# To avoid any confusion where genes and proteins might have the same name,
# we'll append 'CITE_' to each of the ADT rownames. This is not strictly
# necessary, but it helps for clarity
cbmc.citeseq <- cbmc.adt
rownames(cbmc.citeseq) <- paste0("CITE_", rownames(cbmc.adt))
# Lastly, we observed poor enrichments for CCR5, CCR7, and CD10 - and
# therefore remove them from the matrix (optional)
cbmc.citeseq <- cbmc.citeseq[setdiff(rownames(cbmc.citeseq), c("CITE_CCR5",
"CITE_CCR7", "CITE_CD10")), ]
The steps below represent a quick clustering of the PBMCs based on the scRNA-seq data. For more detail on individual steps or more advanced options, see our PBMC clustering guided tutorial here
cbmc <- CreateSeuratObject(raw.data = cbmc.rna.collapsed)
# standard log-normalization
cbmc <- NormalizeData(cbmc)
# choose ~1k variable genes
cbmc <- FindVariableGenes(cbmc, do.plot = FALSE, y.cutoff = 0.5)
# standard scaling (no regression)
cbmc <- ScaleData(cbmc, display.progress = FALSE)
# Run PCA, select 13 PCs for tSNE visualization and graph-based clustering
cbmc <- RunPCA(cbmc, pcs.print = 0)
PCElbowPlot(cbmc)
cbmc <- FindClusters(cbmc, dims.use = 1:13, print.output = FALSE)
cbmc <- RunTSNE(cbmc, dims.use = 1:13)
# Find the markers that define each cluster, and use these to annotate the
# clusters, we use max.cells.per.ident to speed up the process
cbmc.rna.markers <- FindAllMarkers(cbmc, max.cells.per.ident = 100, logfc.threshold = log(2),
only.pos = TRUE, min.diff.pct = 0.3, do.print = F)
current.cluster.ids <- 0:15
# Note, for simplicity we are merging two CD14+ Mono clusters (that differ
# in the expression of HLA-DR genes), and two NK clusters (that differ in
# cell cycle stage)
new.cluster.ids <- c("CD4 T", "CD14+ Mono", "CD14+ Mono", "NK", "Mouse", "B",
"CD8 T", "CD16+ Mono", "Unknown", "CD34+", "Mk", "Eryth", "DC", "Mouse",
"pDC", "NK")
cbmc@ident <- plyr::mapvalues(x = cbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(cbmc, do.label = TRUE, pt.size = 0.5)
Seurat v2.1 allows you to store information from multiple assays in the same object, as long as the data is multi-modal (collected on the same set of cells). You can use the SetAssayData
and GetAssayData
accessor functions to add and fetch data from additional assays.
# We will define a CITE assay, and store raw data for it. Note that it's
# convenient, but not required, to use the same name as the rowname prefix
# we defined earlier.
# If you are interested in how these data are internally stored, you can
# check out the @assay slot, and the assay class, which is defined in
# multimodal.R Note that RNA data is still stored in its normal slots, but
# can also be accessed using GetAssayData and SetAssayData, using the 'RNA'
# assay
cbmc <- SetAssayData(cbmc, assay.type = "CITE", slot = "raw.data", new.data = cbmc.citeseq)
# Now we can repeat the preprocessing (normalization and scaling) steps that
# we typically run with RNA, but modifying the 'assay.type' argument. For
# CITE-seq data, we do not recommend typical LogNormalization. Instead, we
# use a centered log-ratio (CLR) normalization, computed independently for
# each gene. This is a slightly improved procedure from the original
# publication, and we will release more advanced versions of CITE-seq
# normalizations soon.
cbmc <- NormalizeData(cbmc, assay.type = "CITE", normalization.method = "genesCLR")
cbmc <- ScaleData(cbmc, assay.type = "CITE", display.progress = FALSE)
You can use the names of any ADT markers, (i.e. “CITE_CD4”), in FetchData, FeaturePlot, RidgePlot, GenePlot, DoHeatmap, or any other visualization features
# in this plot, protein (ADT) levels are on top, and RNA levels are on the
# bottom
FeaturePlot(cbmc, features.plot = c("CITE_CD3", "CITE_CD11c", "CITE_CD8", "CITE_CD16",
"CD3E", "ITGAX", "CD8A", "FCGR3A"), min.cutoff = "q05", max.cutoff = "q95",
nCol = 4, cols.use = c("lightgrey", "blue"), pt.size = 0.5)
RidgePlot(cbmc, features.plot = c("CITE_CD3", "CITE_CD11c", "CITE_CD8", "CITE_CD16"),
nCol = 2)
par(mfrow = c(1, 2))
# Draw ADT scatter plots (like biaxial plots for FACS). Note that you can
# even 'gate' cells if desired by setting do.identify = TRUE
GenePlot(cbmc, gene1 = "CITE_CD19", gene2 = "CITE_CD3", cex = 0.5)
# view relationship between protein and RNA
GenePlot(cbmc, gene1 = "CITE_CD3", gene2 = "CD3E", cex.use = 0.5)
# Let's plot CD4 vs CD8 levels in T cells
tcells <- SubsetData(cbmc, ident.use = c("CD4 T", "CD8 T"))
par(mfrow = c(1, 2))
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", cex = 0.5)
# Let's look at the raw (non-normalized) ADT counts. You can see the values
# are quite high, particularly in comparison to RNA values. This is due to
# the significantl higher protein copy number in cells, which significantly
# reduces 'drop-out' in ADT data
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", use.raw = TRUE, cex = 0.5)
# If you look a bit more closely, you'll see that our CD8 T cell cluster is
# enriched for CD8 T cells, but still contains many CD4+ CD8- T cells. This
# is because Naive CD4 and CD8 T cells are quite similar transcriptomically,
# and the RNA dropout levels for CD4 and CD8 are quite high. This
# demonstrates the challenge of defining subtle immune cell differences from
# scRNA-seq data alone.
mono.markers <- FindMarkers(cbmc, "CD14+ Mono", "CD16+ Mono", assay.type = "CITE",
logfc.threshold = log(1.5))
head(mono.markers)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## CITE_CD16 5.295377e-114 -1.0542509 1 1 5.295377e-113
## CITE_CD14 6.081424e-56 0.4259318 1 1 6.081424e-55
# Note that we observe CD14 protein expression only on CD4+ T cells, as has
# been previously observed in the literature
tcell.markers <- FindMarkers(cbmc, ident.1 = "CD4 T", ident.2 = "CD8 T", assay.type = "CITE",
logfc.threshold = log(1.5))
head(tcell.markers)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## CITE_CD4 6.920999e-108 1.0355947 1 1 6.920999e-107
## CITE_CD8 8.917315e-107 -2.7302936 1 1 8.917315e-106
## CITE_CD14 2.958142e-98 0.4868748 1 1 2.958142e-97
# Downsample the clusters to a maximum of 300 cells each (makes the heatmap
# easier to see for small clusters)
cbmc.small <- SubsetData(cbmc, max.cells.per.ident = 300)
# Find protein markers for all clusters, and draw a heatmap
adt.markers <- FindAllMarkers(cbmc.small, assay.type = "CITE", only.pos = TRUE,
print.bar = F)
DoHeatmap(cbmc.small, genes.use = unique(adt.markers$gene), assay.type = "CITE",
slim.col.label = TRUE, remove.key = TRUE, group.label.rot = TRUE)
# You can see that our unknown cells co-express both myeloid and lymphoid
# markers (true at the RNA level as well). They are likely cell clumps
# (multiplets) that should be discarded. We'll remove the mouse cells now as
# well
cbmc <- SubsetData(cbmc, ident.remove = c("Unknown", "Mouse"))
You can also run dimensional reduction and graph-based clustering directly on CITE-seq data
# We will store the results in a new object, cbmc_cite.
cbmc_cite <- RunPCA(cbmc, pc.genes = rownames(cbmc.citeseq), assay.type = "CITE",
pcs.print = 0)
PCAPlot(cbmc_cite, pt.size = 0.5)
# Since we only have 10 markers, instead of doing PCA, we'll just use a
# standard euclidean distance matrix here. Also, this provides a good
# opportunity to demonstrate how to do visualization and clustering using a
# custom distance matrix in Seurat
adt.data <- GetAssayData(cbmc_cite, assay.type = "CITE", slot = "data")
adt.dist <- as.matrix(dist(t(adt.data)))
# Before we recluster the data on ADT levels, we'll stash the RNA cluster
# IDs for later
cbmc_cite <- StashIdent(cbmc_cite, "rnaClusterID")
# Now, we rerun tSNE using our distance matrix defined only on ADT (protein)
# levels.
cbmc_cite <- RunTSNE(cbmc_cite, distance.matrix = adt.dist)
# We can also rerun clustering using the same distance matrix. We'll start
# with a very coarse clustering (resolution=0.2)
cbmc_cite <- FindClusters(cbmc_cite, distance.matrix = adt.dist, print.output = FALSE,
resolution = 0.2)
# We can compare the RNA and protein clustering, and use this to annotate
# the protein clustering (we could also of course use FindMarkers)
clustering.table <- table(cbmc_cite@ident, cbmc_cite@meta.data$rnaClusterID)
B | CD14+ Mono | CD16+ Mono | CD34+ | CD4 T | CD8 T | DC | Eryth | Mk | NK | pDC | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 4 | 0 | 3 | 2982 | 57 | 0 | 3 | 26 | 31 | 2 |
1 | 4 | 2206 | 37 | 2 | 3 | 0 | 69 | 13 | 35 | 4 | 2 |
2 | 3 | 1 | 0 | 2 | 2 | 5 | 2 | 2 | 3 | 932 | 0 |
3 | 316 | 8 | 2 | 4 | 0 | 0 | 0 | 1 | 3 | 2 | 0 |
4 | 1 | 0 | 0 | 1 | 82 | 216 | 0 | 0 | 3 | 4 | 0 |
5 | 0 | 7 | 0 | 131 | 4 | 2 | 5 | 62 | 19 | 2 | 1 |
6 | 0 | 177 | 10 | 0 | 8 | 0 | 3 | 0 | 1 | 0 | 1 |
7 | 0 | 12 | 165 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
8 | 0 | 63 | 5 | 0 | 1 | 2 | 0 | 0 | 1 | 0 | 0 |
9 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 44 |
10 | 24 | 3 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
current.cluster.ids <- 0:10
# Note, for simplicity we are merging two CD14+ Mono clusters (that differ
# in the expression of HLA-DR genes), and two NK clusters (that differ in
# cell cycle stage)
new.cluster.ids <- c("CD4 T", "CD14+ Mono", "NK", "B", "CD8 T", "CD34+", "Unknown1",
"CD16+ Mono", "Unknown2", "pDC", "Unknown3")
cbmc_cite@ident <- plyr::mapvalues(x = cbmc_cite@ident, from = current.cluster.ids,
to = new.cluster.ids)
tsne_rnaClusters <- TSNEPlot(cbmc_cite, do.return = TRUE, group.by = "rnaClusterID",
pt.size = 0.5)
tsne_rnaClusters <- tsne_rnaClusters + ggtitle("Clustering based on scRNA-seq") +
theme(plot.title = element_text(hjust = 0.5))
tsne_adtClusters <- TSNEPlot(cbmc_cite, do.return = TRUE, pt.size = 0.5)
tsne_adtClusters <- tsne_adtClusters + ggtitle("Clustering based on ADT signal") +
theme(plot.title = element_text(hjust = 0.5))
# Note: for this comparison, both the RNA and protein clustering are
# visualized on a tSNE generated using the ADT distance matrix.
plot_grid(tsne_rnaClusters, tsne_adtClusters, ncol = 2)
The ADT-based clustering yields similar results, but with a few differences
tcells <- SubsetData(cbmc_cite, ident.use = c("CD4 T", "CD8 T"))
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", use.raw = F, cex = 0.5)
RidgePlot(cbmc_cite, features.plot = c("CITE_CD11c", "CITE_CD8", "CITE_CD16",
"CITE_CD4", "CITE_CD19", "CITE_CD14"), nCol = 2)
Seurat is also able to analyze data from other multi-modal technologies like REAP-seq. Below we recreate above the same plots that we generated with CITE-seq data, and observe the same qualitative trends. You can download the data from GEO here.
reap_rna_2 <- read.table("~/Downloads/GSM2685238_mRNA_2_PBMCs_matrix.txt.gz",
row.names = 1)
reap_rna_3 <- read.table("~/Downloads/GSM2685239_mRNA_3_PBMCs_matrix.txt.gz",
row.names = 1)
reap_protein_2 <- read.table("~/Downloads/GSM2685243_protein_2_PBMCs_matrix.txt.gz",
row.names = 1)
reap_protein_3 <- read.table("~/Downloads/GSM2685244_protein_3_PBMCs_matrix.txt.gz",
row.names = 1)
reap_rna_combined <- Matrix(as.matrix(cbind(reap_rna_2, reap_rna_3)), sparse = TRUE)
reap_protein_combined <- Matrix(as.matrix(cbind(reap_protein_2, reap_protein_3)),
sparse = TRUE)
rownames(reap_protein_combined) <- make.unique(paste0("REAP_", sapply(rownames(reap_protein_combined),
ExtractField)))
reap <- CreateSeuratObject(raw.data = reap_rna_combined, min.genes = 250)
reap <- NormalizeData(reap, display.progress = FALSE)
reap <- SetAssayData(reap, assay.type = "REAP", slot = "raw.data", new.data = reap_protein_combined[,
reap@cell.names])
reap <- NormalizeData(reap, assay.type = "REAP", normalization.method = "genesCLR",
display.progress = FALSE)
par(mfrow = c(1, 3))
GenePlot(reap, gene1 = "REAP_CD19", gene2 = "REAP_CD3", cex.use = 0.5)
GenePlot(reap, gene1 = "REAP_CD4", gene2 = "REAP_CD8a", cex.use = 0.5)
GenePlot(reap, gene1 = "REAP_CD3", gene2 = "CD3E", cex.use = 0.5)