Using Seurat with multi-modal data

Load in the data

This vignette demonstrates new features that allow users to analyze and explore multi-modal data with Seurat. While this represents an initial release, we are excited to release significant new functionality for multi-modal datasets in the future.

Here, we analyze a dataset of 8,617 cord blood mononuclear cells (CBMCs), produced with CITE-seq, where we simultaneously measure the single cell transcriptomes alongside the expression of 11 surface proteins, whose levels are quantified with DNA-barcoded antibodies. First, we load in two count matrices : one for the RNA measurements, and one for the antibody-derived tags (ADT). You can download the ADT file here and the RNA file here

# Load in the RNA UMI matrix

# Note that this dataset also contains ~5% of mouse cells, which we can use
# as negative controls for the protein measurements. For this reason, the
# gene expression matrix has HUMAN_ or MOUSE_ appended to the beginning of
# each gene.
cbmc.rna <- read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", 
    sep = ",", header = TRUE, row.names = 1)

# To make life a bit easier going forward, we're going to discard all but
# the top 100 most highly expressed mouse genes, and remove the 'HUMAN_'
# from the CITE-seq prefix
cbmc.rna.collapsed <- CollapseSpeciesExpressionMatrix(cbmc.rna)

# Load in the ADT UMI matrix
cbmc.adt <- read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz", 
    sep = ",", header = TRUE, row.names = 1)

# To avoid any confusion where genes and proteins might have the same name,
# we'll append 'CITE_' to each of the ADT rownames. This is not strictly
# necessary, but it helps for clarity
cbmc.citeseq <- cbmc.adt
rownames(cbmc.citeseq) <- paste0("CITE_", rownames(cbmc.adt))

# Lastly, we observed poor enrichments for CCR5, CCR7, and CD10 - and
# therefore remove them from the matrix (optional)
cbmc.citeseq <- cbmc.citeseq[setdiff(rownames(cbmc.citeseq), c("CITE_CCR5", 
    "CITE_CCR7", "CITE_CD10")), ]

Setup a Seurat object, and cluster cells based on RNA expression

The steps below represent a quick clustering of the PBMCs based on the scRNA-seq data. For more detail on individual steps or more advanced options, see our PBMC clustering guided tutorial here

cbmc <- CreateSeuratObject(raw.data = cbmc.rna.collapsed)

# standard log-normalization
cbmc <- NormalizeData(cbmc)

# choose ~1k variable genes
cbmc <- FindVariableGenes(cbmc, do.plot = FALSE, y.cutoff = 0.5)

# standard scaling (no regression)
cbmc <- ScaleData(cbmc, display.progress = FALSE)

# Run PCA, select 13 PCs for tSNE visualization and graph-based clustering
cbmc <- RunPCA(cbmc, pcs.print = 0)
PCElbowPlot(cbmc)

cbmc <- FindClusters(cbmc, dims.use = 1:13, print.output = FALSE)
cbmc <- RunTSNE(cbmc, dims.use = 1:13)

# Find the markers that define each cluster, and use these to annotate the
# clusters, we use max.cells.per.ident to speed up the process
cbmc.rna.markers <- FindAllMarkers(cbmc, max.cells.per.ident = 100, logfc.threshold = log(2), 
    only.pos = TRUE, min.diff.pct = 0.3, do.print = F)

current.cluster.ids <- 0:15
# Note, for simplicity we are merging two CD14+ Mono clusters (that differ
# in the expression of HLA-DR genes), and two NK clusters (that differ in
# cell cycle stage)
new.cluster.ids <- c("CD4 T", "CD14+ Mono", "CD14+ Mono", "NK", "Mouse", "B", 
    "CD8 T", "CD16+ Mono", "Unknown", "CD34+", "Mk", "Eryth", "DC", "Mouse", 
    "pDC", "NK")
cbmc@ident <- plyr::mapvalues(x = cbmc@ident, from = current.cluster.ids, to = new.cluster.ids)

TSNEPlot(cbmc, do.label = TRUE, pt.size = 0.5)

Add the protein expression levels to the Seurat object

Seurat v2.1 allows you to store information from multiple assays in the same object, as long as the data is multi-modal (collected on the same set of cells). You can use the SetAssayData and GetAssayData accessor functions to add and fetch data from additional assays.

# We will define a CITE assay, and store raw data for it. Note that it's
# convenient, but not required, to use the same name as the rowname prefix
# we defined earlier.

# If you are interested in how these data are internally stored, you can
# check out the @assay slot, and the assay class, which is defined in
# multimodal.R Note that RNA data is still stored in its normal slots, but
# can also be accessed using GetAssayData and SetAssayData, using the 'RNA'
# assay

cbmc <- SetAssayData(cbmc, assay.type = "CITE", slot = "raw.data", new.data = cbmc.citeseq)

# Now we can repeat the preprocessing (normalization and scaling) steps that
# we typically run with RNA, but modifying the 'assay.type' argument.  For
# CITE-seq data, we do not recommend typical LogNormalization. Instead, we
# use a centered log-ratio (CLR) normalization, computed independently for
# each gene.  This is a slightly improved procedure from the original
# publication, and we will release more advanced versions of CITE-seq
# normalizations soon.
cbmc <- NormalizeData(cbmc, assay.type = "CITE", normalization.method = "genesCLR")
cbmc <- ScaleData(cbmc, assay.type = "CITE", display.progress = FALSE)

Visualize protein levels on RNA clusters

You can use the names of any ADT markers, (i.e. “CITE_CD4”), in FetchData, FeaturePlot, RidgePlot, GenePlot, DoHeatmap, or any other visualization features

# in this plot, protein (ADT) levels are on top, and RNA levels are on the
# bottom
FeaturePlot(cbmc, features.plot = c("CITE_CD3", "CITE_CD11c", "CITE_CD8", "CITE_CD16", 
    "CD3E", "ITGAX", "CD8A", "FCGR3A"), min.cutoff = "q05", max.cutoff = "q95", 
    nCol = 4, cols.use = c("lightgrey", "blue"), pt.size = 0.5)

RidgePlot(cbmc, features.plot = c("CITE_CD3", "CITE_CD11c", "CITE_CD8", "CITE_CD16"), 
    nCol = 2)

par(mfrow = c(1, 2))
# Draw ADT scatter plots (like biaxial plots for FACS). Note that you can
# even 'gate' cells if desired by setting do.identify = TRUE
GenePlot(cbmc, gene1 = "CITE_CD19", gene2 = "CITE_CD3", cex = 0.5)

# view relationship between protein and RNA
GenePlot(cbmc, gene1 = "CITE_CD3", gene2 = "CD3E", cex.use = 0.5)

# Let's plot CD4 vs CD8 levels in T cells
tcells <- SubsetData(cbmc, ident.use = c("CD4 T", "CD8 T"))

par(mfrow = c(1, 2))
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", cex = 0.5)

# Let's look at the raw (non-normalized) ADT counts. You can see the values
# are quite high, particularly in comparison to RNA values. This is due to
# the significantl higher protein copy number in cells, which significantly
# reduces 'drop-out' in ADT data
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", use.raw = TRUE, cex = 0.5)

# If you look a bit more closely, you'll see that our CD8 T cell cluster is
# enriched for CD8 T cells, but still contains many CD4+ CD8- T cells.  This
# is because Naive CD4 and CD8 T cells are quite similar transcriptomically,
# and the RNA dropout levels for CD4 and CD8 are quite high.  This
# demonstrates the challenge of defining subtle immune cell differences from
# scRNA-seq data alone.

Identify differentially expressed proteins between clusters

mono.markers <- FindMarkers(cbmc, "CD14+ Mono", "CD16+ Mono", assay.type = "CITE", 
    logfc.threshold = log(1.5))
head(mono.markers)

##                   p_val  avg_logFC pct.1 pct.2     p_val_adj
## CITE_CD16 5.295377e-114 -1.0542509     1     1 5.295377e-113
## CITE_CD14  6.081424e-56  0.4259318     1     1  6.081424e-55

# Note that we observe CD14 protein expression only on CD4+ T cells, as has
# been previously observed in the literature
tcell.markers <- FindMarkers(cbmc, ident.1 = "CD4 T", ident.2 = "CD8 T", assay.type = "CITE", 
    logfc.threshold = log(1.5))
head(tcell.markers)

##                   p_val  avg_logFC pct.1 pct.2     p_val_adj
## CITE_CD4  6.920999e-108  1.0355947     1     1 6.920999e-107
## CITE_CD8  8.917315e-107 -2.7302936     1     1 8.917315e-106
## CITE_CD14  2.958142e-98  0.4868748     1     1  2.958142e-97

# Downsample the clusters to a maximum of 300 cells each (makes the heatmap
# easier to see for small clusters)
cbmc.small <- SubsetData(cbmc, max.cells.per.ident = 300)
# Find protein markers for all clusters, and draw a heatmap
adt.markers <- FindAllMarkers(cbmc.small, assay.type = "CITE", only.pos = TRUE, 
    print.bar = F)

DoHeatmap(cbmc.small, genes.use = unique(adt.markers$gene), assay.type = "CITE", 
    slim.col.label = TRUE, remove.key = TRUE, group.label.rot = TRUE)

# You can see that our unknown cells co-express both myeloid and lymphoid
# markers (true at the RNA level as well). They are likely cell clumps
# (multiplets) that should be discarded. We'll remove the mouse cells now as
# well
cbmc <- SubsetData(cbmc, ident.remove = c("Unknown", "Mouse"))

Cluster directly on protein levels

You can also run dimensional reduction and graph-based clustering directly on CITE-seq data

# We will store the results in a new object, cbmc_cite.
cbmc_cite <- RunPCA(cbmc, pc.genes = rownames(cbmc.citeseq), assay.type = "CITE", 
    pcs.print = 0)
PCAPlot(cbmc_cite, pt.size = 0.5)

# Since we only have 10 markers, instead of doing PCA, we'll just use a
# standard euclidean distance matrix here.  Also, this provides a good
# opportunity to demonstrate how to do visualization and clustering using a
# custom distance matrix in Seurat

adt.data <- GetAssayData(cbmc_cite, assay.type = "CITE", slot = "data")
adt.dist <- as.matrix(dist(t(adt.data)))

# Before we recluster the data on ADT levels, we'll stash the RNA cluster
# IDs for later
cbmc_cite <- StashIdent(cbmc_cite, "rnaClusterID")

# Now, we rerun tSNE using our distance matrix defined only on ADT (protein)
# levels.
cbmc_cite <- RunTSNE(cbmc_cite, distance.matrix = adt.dist)

# We can also rerun clustering using the same distance matrix. We'll start
# with a very coarse clustering (resolution=0.2)
cbmc_cite <- FindClusters(cbmc_cite, distance.matrix = adt.dist, print.output = FALSE, 
    resolution = 0.2)

# We can compare the RNA and protein clustering, and use this to annotate
# the protein clustering (we could also of course use FindMarkers)
clustering.table <- table(cbmc_cite@ident, cbmc_cite@meta.data$rnaClusterID)

	B	CD14+ Mono	CD16+ Mono	CD34+	CD4 T	CD8 T	DC	Eryth	Mk	NK	pDC
0	0	4	0	3	2982	57	0	3	26	31	2
1	4	2206	37	2	3	0	69	13	35	4	2
2	3	1	0	2	2	5	2	2	3	932	0
3	316	8	2	4	0	0	0	1	3	2	0
4	1	0	0	1	82	216	0	0	3	4	0
5	0	7	0	131	4	2	5	62	19	2	1
6	0	177	10	0	8	0	3	0	1	0	1
7	0	12	165	0	0	0	0	0	1	1	0
8	0	63	5	0	1	2	0	0	1	0	0
9	0	0	0	0	1	0	1	0	1	0	44
10	24	3	0	0	3	0	0	0	0	0	0

current.cluster.ids <- 0:10
# Note, for simplicity we are merging two CD14+ Mono clusters (that differ
# in the expression of HLA-DR genes), and two NK clusters (that differ in
# cell cycle stage)
new.cluster.ids <- c("CD4 T", "CD14+ Mono", "NK", "B", "CD8 T", "CD34+", "Unknown1", 
    "CD16+ Mono", "Unknown2", "pDC", "Unknown3")
cbmc_cite@ident <- plyr::mapvalues(x = cbmc_cite@ident, from = current.cluster.ids, 
    to = new.cluster.ids)

tsne_rnaClusters <- TSNEPlot(cbmc_cite, do.return = TRUE, group.by = "rnaClusterID", 
    pt.size = 0.5)
tsne_rnaClusters <- tsne_rnaClusters + ggtitle("Clustering based on scRNA-seq") + 
    theme(plot.title = element_text(hjust = 0.5))

tsne_adtClusters <- TSNEPlot(cbmc_cite, do.return = TRUE, pt.size = 0.5)
tsne_adtClusters <- tsne_adtClusters + ggtitle("Clustering based on ADT signal") + 
    theme(plot.title = element_text(hjust = 0.5))

# Note: for this comparison, both the RNA and protein clustering are
# visualized on a tSNE generated using the ADT distance matrix.

plot_grid(tsne_rnaClusters, tsne_adtClusters, ncol = 2)

The ADT-based clustering yields similar results, but with a few differences

Clustering is improved for CD4/CD8 T cell populations, based on the robust ADT data for CD4, CD8, CD14, and CD45RA
However, some clusters for which the ADT data does not contain good distinguishing protein markers (i.e. Mk/Ery/DC) lose separation
Notably, our unknown populations again correspond to rare populations that co-express markers for different lineages
Unknown 1 (Myeloid/CD4), (Myeloid/CD8), and 3 (B/CD4) likely correspond to additional doublets, whose signal was too subtle to cluster separately in scRNA-seq
You can verify this using FindMarkers at the RNA level, as well

tcells <- SubsetData(cbmc_cite, ident.use = c("CD4 T", "CD8 T"))
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", use.raw = F, cex = 0.5)

RidgePlot(cbmc_cite, features.plot = c("CITE_CD11c", "CITE_CD8", "CITE_CD16", 
    "CITE_CD4", "CITE_CD19", "CITE_CD14"), nCol = 2)

Loading data from other multi-modal technologies

Seurat is also able to analyze data from other multi-modal technologies like REAP-seq. Below we recreate above the same plots that we generated with CITE-seq data, and observe the same qualitative trends. You can download the data from GEO here.

reap_rna_2 <- read.table("~/Downloads/GSM2685238_mRNA_2_PBMCs_matrix.txt.gz", 
    row.names = 1)
reap_rna_3 <- read.table("~/Downloads/GSM2685239_mRNA_3_PBMCs_matrix.txt.gz", 
    row.names = 1)
reap_protein_2 <- read.table("~/Downloads/GSM2685243_protein_2_PBMCs_matrix.txt.gz", 
    row.names = 1)
reap_protein_3 <- read.table("~/Downloads/GSM2685244_protein_3_PBMCs_matrix.txt.gz", 
    row.names = 1)

reap_rna_combined <- Matrix(as.matrix(cbind(reap_rna_2, reap_rna_3)), sparse = TRUE)
reap_protein_combined <- Matrix(as.matrix(cbind(reap_protein_2, reap_protein_3)), 
    sparse = TRUE)

rownames(reap_protein_combined) <- make.unique(paste0("REAP_", sapply(rownames(reap_protein_combined), 
    ExtractField)))

reap <- CreateSeuratObject(raw.data = reap_rna_combined, min.genes = 250)
reap <- NormalizeData(reap, display.progress = FALSE)
reap <- SetAssayData(reap, assay.type = "REAP", slot = "raw.data", new.data = reap_protein_combined[, 
    reap@cell.names])
reap <- NormalizeData(reap, assay.type = "REAP", normalization.method = "genesCLR", 
    display.progress = FALSE)
par(mfrow = c(1, 3))
GenePlot(reap, gene1 = "REAP_CD19", gene2 = "REAP_CD3", cex.use = 0.5)
GenePlot(reap, gene1 = "REAP_CD4", gene2 = "REAP_CD8a", cex.use = 0.5)
GenePlot(reap, gene1 = "REAP_CD3", gene2 = "CD3E", cex.use = 0.5)