Load in the data

This vignette demonstrates new features that allow users to analyze and explore multi-modal data with Seurat. While this represents an initial release, we are excited to release significant new functionality for multi-modal datasets in the future.

Here, we analyze a dataset of 8,617 cord blood mononuclear cells (CBMCs), produced with CITE-seq, where we simultaneously measure the single cell transcriptomes alongside the expression of 11 surface proteins, whose levels are quantified with DNA-barcoded antibodies. First, we load in two count matrices : one for the RNA measurements, and one for the antibody-derived tags (ADT). You can download the ADT file here and the RNA file here

# Load in the RNA UMI matrix

# Note that this dataset also contains ~5% of mouse cells, which we can use
# as negative controls for the protein measurements. For this reason, the
# gene expression matrix has HUMAN_ or MOUSE_ appended to the beginning of
# each gene.
cbmc.rna <- read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", 
    sep = ",", header = TRUE, row.names = 1)

# To make life a bit easier going forward, we're going to discard all but
# the top 100 most highly expressed mouse genes, and remove the 'HUMAN_'
# from the CITE-seq prefix
cbmc.rna.collapsed <- CollapseSpeciesExpressionMatrix(cbmc.rna)

# Load in the ADT UMI matrix
cbmc.adt <- read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz", 
    sep = ",", header = TRUE, row.names = 1)

# To avoid any confusion where genes and proteins might have the same name,
# we'll append 'CITE_' to each of the ADT rownames. This is not strictly
# necessary, but it helps for clarity
cbmc.citeseq <- cbmc.adt
rownames(cbmc.citeseq) <- paste0("CITE_", rownames(cbmc.adt))

# Lastly, we observed poor enrichments for CCR5, CCR7, and CD10 - and
# therefore remove them from the matrix (optional)
cbmc.citeseq <- cbmc.citeseq[setdiff(rownames(cbmc.citeseq), c("CITE_CCR5", 
    "CITE_CCR7", "CITE_CD10")), ]

Setup a Seurat object, and cluster cells based on RNA expression

The steps below represent a quick clustering of the PBMCs based on the scRNA-seq data. For more detail on individual steps or more advanced options, see our PBMC clustering guided tutorial here

cbmc <- CreateSeuratObject(raw.data = cbmc.rna.collapsed)

# standard log-normalization
cbmc <- NormalizeData(cbmc)

# choose ~1k variable genes
cbmc <- FindVariableGenes(cbmc, do.plot = FALSE, y.cutoff = 0.5)

# standard scaling (no regression)
cbmc <- ScaleData(cbmc, display.progress = FALSE)

# Run PCA, select 13 PCs for tSNE visualization and graph-based clustering
cbmc <- RunPCA(cbmc, pcs.print = 0)
PCElbowPlot(cbmc)

cbmc <- FindClusters(cbmc, dims.use = 1:13, print.output = FALSE)
cbmc <- RunTSNE(cbmc, dims.use = 1:13)

# Find the markers that define each cluster, and use these to annotate the
# clusters, we use max.cells.per.ident to speed up the process
cbmc.rna.markers <- FindAllMarkers(cbmc, max.cells.per.ident = 100, logfc.threshold = log(2), 
    only.pos = TRUE, min.diff.pct = 0.3, do.print = F)

current.cluster.ids <- 0:15
# Note, for simplicity we are merging two CD14+ Mono clusters (that differ
# in the expression of HLA-DR genes), and two NK clusters (that differ in
# cell cycle stage)
new.cluster.ids <- c("CD4 T", "CD14+ Mono", "CD14+ Mono", "NK", "Mouse", "B", 
    "CD8 T", "CD16+ Mono", "Unknown", "CD34+", "Mk", "Eryth", "DC", "Mouse", 
    "pDC", "NK")
cbmc@ident <- plyr::mapvalues(x = cbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(cbmc, do.label = TRUE, pt.size = 0.5)

Add the protein expression levels to the Seurat object

Seurat v2.1 allows you to store information from multiple assays in the same object, as long as the data is multi-modal (collected on the same set of cells). You can use the SetAssayData and GetAssayData accessor functions to add and fetch data from additional assays.

# We will define a CITE assay, and store raw data for it. Note that it's
# convenient, but not required, to use the same name as the rowname prefix
# we defined earlier.

# If you are interested in how these data are internally stored, you can
# check out the @assay slot, and the assay class, which is defined in
# multimodal.R Note that RNA data is still stored in its normal slots, but
# can also be accessed using GetAssayData and SetAssayData, using the 'RNA'
# assay

cbmc <- SetAssayData(cbmc, assay.type = "CITE", slot = "raw.data", new.data = cbmc.citeseq)

# Now we can repeat the preprocessing (normalization and scaling) steps that
# we typically run with RNA, but modifying the 'assay.type' argument.  For
# CITE-seq data, we do not recommend typical LogNormalization. Instead, we
# use a centered log-ratio (CLR) normalization, computed independently for
# each gene.  This is a slightly improved procedure from the original
# publication, and we will release more advanced versions of CITE-seq
# normalizations soon.
cbmc <- NormalizeData(cbmc, assay.type = "CITE", normalization.method = "genesCLR")
cbmc <- ScaleData(cbmc, assay.type = "CITE", display.progress = FALSE)

Visualize protein levels on RNA clusters

You can use the names of any ADT markers, (i.e. “CITE_CD4”), in FetchData, FeaturePlot, RidgePlot, GenePlot, DoHeatmap, or any other visualization features

# in this plot, protein (ADT) levels are on top, and RNA levels are on the
# bottom
FeaturePlot(cbmc, features.plot = c("CITE_CD3", "CITE_CD11c", "CITE_CD8", "CITE_CD16", 
    "CD3E", "ITGAX", "CD8A", "FCGR3A"), min.cutoff = "q05", max.cutoff = "q95", 
    nCol = 4, cols.use = c("lightgrey", "blue"), pt.size = 0.5)

RidgePlot(cbmc, features.plot = c("CITE_CD3", "CITE_CD11c", "CITE_CD8", "CITE_CD16"), 
    nCol = 2)

par(mfrow = c(1, 2))
# Draw ADT scatter plots (like biaxial plots for FACS). Note that you can
# even 'gate' cells if desired by setting do.identify = TRUE
GenePlot(cbmc, gene1 = "CITE_CD19", gene2 = "CITE_CD3", cex = 0.5)

# view relationship between protein and RNA
GenePlot(cbmc, gene1 = "CITE_CD3", gene2 = "CD3E", cex.use = 0.5)

# Let's plot CD4 vs CD8 levels in T cells
tcells <- SubsetData(cbmc, ident.use = c("CD4 T", "CD8 T"))

par(mfrow = c(1, 2))
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", cex = 0.5)

# Let's look at the raw (non-normalized) ADT counts. You can see the values
# are quite high, particularly in comparison to RNA values. This is due to
# the significantl higher protein copy number in cells, which significantly
# reduces 'drop-out' in ADT data
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", use.raw = TRUE, cex = 0.5)

# If you look a bit more closely, you'll see that our CD8 T cell cluster is
# enriched for CD8 T cells, but still contains many CD4+ CD8- T cells.  This
# is because Naive CD4 and CD8 T cells are quite similar transcriptomically,
# and the RNA dropout levels for CD4 and CD8 are quite high.  This
# demonstrates the challenge of defining subtle immune cell differences from
# scRNA-seq data alone.

Identify differentially expressed proteins between clusters

mono.markers <- FindMarkers(cbmc, "CD14+ Mono", "CD16+ Mono", assay.type = "CITE", 
    logfc.threshold = log(1.5))
head(mono.markers)
##                   p_val  avg_logFC pct.1 pct.2     p_val_adj
## CITE_CD16 5.295377e-114 -1.0542509     1     1 5.295377e-113
## CITE_CD14  6.081424e-56  0.4259318     1     1  6.081424e-55
# Note that we observe CD14 protein expression only on CD4+ T cells, as has
# been previously observed in the literature
tcell.markers <- FindMarkers(cbmc, ident.1 = "CD4 T", ident.2 = "CD8 T", assay.type = "CITE", 
    logfc.threshold = log(1.5))
head(tcell.markers)
##                   p_val  avg_logFC pct.1 pct.2     p_val_adj
## CITE_CD4  6.920999e-108  1.0355947     1     1 6.920999e-107
## CITE_CD8  8.917315e-107 -2.7302936     1     1 8.917315e-106
## CITE_CD14  2.958142e-98  0.4868748     1     1  2.958142e-97
# Downsample the clusters to a maximum of 300 cells each (makes the heatmap
# easier to see for small clusters)
cbmc.small <- SubsetData(cbmc, max.cells.per.ident = 300)
# Find protein markers for all clusters, and draw a heatmap
adt.markers <- FindAllMarkers(cbmc.small, assay.type = "CITE", only.pos = TRUE, 
    print.bar = F)

DoHeatmap(cbmc.small, genes.use = unique(adt.markers$gene), assay.type = "CITE", 
    slim.col.label = TRUE, remove.key = TRUE, group.label.rot = TRUE)

# You can see that our unknown cells co-express both myeloid and lymphoid
# markers (true at the RNA level as well). They are likely cell clumps
# (multiplets) that should be discarded. We'll remove the mouse cells now as
# well
cbmc <- SubsetData(cbmc, ident.remove = c("Unknown", "Mouse"))

Cluster directly on protein levels

You can also run dimensional reduction and graph-based clustering directly on CITE-seq data

# We will store the results in a new object, cbmc_cite.
cbmc_cite <- RunPCA(cbmc, pc.genes = rownames(cbmc.citeseq), assay.type = "CITE", 
    pcs.print = 0)
PCAPlot(cbmc_cite, pt.size = 0.5)

# Since we only have 10 markers, instead of doing PCA, we'll just use a
# standard euclidean distance matrix here.  Also, this provides a good
# opportunity to demonstrate how to do visualization and clustering using a
# custom distance matrix in Seurat

adt.data <- GetAssayData(cbmc_cite, assay.type = "CITE", slot = "data")
adt.dist <- as.matrix(dist(t(adt.data)))

# Before we recluster the data on ADT levels, we'll stash the RNA cluster
# IDs for later
cbmc_cite <- StashIdent(cbmc_cite, "rnaClusterID")

# Now, we rerun tSNE using our distance matrix defined only on ADT (protein)
# levels.
cbmc_cite <- RunTSNE(cbmc_cite, distance.matrix = adt.dist)

# We can also rerun clustering using the same distance matrix. We'll start
# with a very coarse clustering (resolution=0.2)
cbmc_cite <- FindClusters(cbmc_cite, distance.matrix = adt.dist, print.output = FALSE, 
    resolution = 0.2)

# We can compare the RNA and protein clustering, and use this to annotate
# the protein clustering (we could also of course use FindMarkers)
clustering.table <- table(cbmc_cite@ident, cbmc_cite@meta.data$rnaClusterID)
B CD14+ Mono CD16+ Mono CD34+ CD4 T CD8 T DC Eryth Mk NK pDC
0 0 4 0 3 2982 57 0 3 26 31 2
1 4 2206 37 2 3 0 69 13 35 4 2
2 3 1 0 2 2 5 2 2 3 932 0
3 316 8 2 4 0 0 0 1 3 2 0
4 1 0 0 1 82 216 0 0 3 4 0
5 0 7 0 131 4 2 5 62 19 2 1
6 0 177 10 0 8 0 3 0 1 0 1
7 0 12 165 0 0 0 0 0 1 1 0
8 0 63 5 0 1 2 0 0 1 0 0
9 0 0 0 0 1 0 1 0 1 0 44
10 24 3 0 0 3 0 0 0 0 0 0
current.cluster.ids <- 0:10
# Note, for simplicity we are merging two CD14+ Mono clusters (that differ
# in the expression of HLA-DR genes), and two NK clusters (that differ in
# cell cycle stage)
new.cluster.ids <- c("CD4 T", "CD14+ Mono", "NK", "B", "CD8 T", "CD34+", "Unknown1", 
    "CD16+ Mono", "Unknown2", "pDC", "Unknown3")
cbmc_cite@ident <- plyr::mapvalues(x = cbmc_cite@ident, from = current.cluster.ids, 
    to = new.cluster.ids)

tsne_rnaClusters <- TSNEPlot(cbmc_cite, do.return = TRUE, group.by = "rnaClusterID", 
    pt.size = 0.5)
tsne_rnaClusters <- tsne_rnaClusters + ggtitle("Clustering based on scRNA-seq") + 
    theme(plot.title = element_text(hjust = 0.5))

tsne_adtClusters <- TSNEPlot(cbmc_cite, do.return = TRUE, pt.size = 0.5)
tsne_adtClusters <- tsne_adtClusters + ggtitle("Clustering based on ADT signal") + 
    theme(plot.title = element_text(hjust = 0.5))

# Note: for this comparison, both the RNA and protein clustering are
# visualized on a tSNE generated using the ADT distance matrix.

plot_grid(tsne_rnaClusters, tsne_adtClusters, ncol = 2)

The ADT-based clustering yields similar results, but with a few differences

  • Clustering is improved for CD4/CD8 T cell populations, based on the robust ADT data for CD4, CD8, CD14, and CD45RA
  • However, some clusters for which the ADT data does not contain good distinguishing protein markers (i.e. Mk/Ery/DC) lose separation
  • Notably, our unknown populations again correspond to rare populations that co-express markers for different lineages
  • Unknown 1 (Myeloid/CD4), (Myeloid/CD8), and 3 (B/CD4) likely correspond to additional doublets, whose signal was too subtle to cluster separately in scRNA-seq
  • You can verify this using FindMarkers at the RNA level, as well
tcells <- SubsetData(cbmc_cite, ident.use = c("CD4 T", "CD8 T"))
GenePlot(tcells, gene1 = "CITE_CD4", gene2 = "CITE_CD8", use.raw = F, cex = 0.5)

RidgePlot(cbmc_cite, features.plot = c("CITE_CD11c", "CITE_CD8", "CITE_CD16", 
    "CITE_CD4", "CITE_CD19", "CITE_CD14"), nCol = 2)

Loading data from other multi-modal technologies

Seurat is also able to analyze data from other multi-modal technologies like REAP-seq. Below we recreate above the same plots that we generated with CITE-seq data, and observe the same qualitative trends. You can download the data from GEO here.

reap_rna_2 <- read.table("~/Downloads/GSM2685238_mRNA_2_PBMCs_matrix.txt.gz", 
    row.names = 1)
reap_rna_3 <- read.table("~/Downloads/GSM2685239_mRNA_3_PBMCs_matrix.txt.gz", 
    row.names = 1)
reap_protein_2 <- read.table("~/Downloads/GSM2685243_protein_2_PBMCs_matrix.txt.gz", 
    row.names = 1)
reap_protein_3 <- read.table("~/Downloads/GSM2685244_protein_3_PBMCs_matrix.txt.gz", 
    row.names = 1)

reap_rna_combined <- Matrix(as.matrix(cbind(reap_rna_2, reap_rna_3)), sparse = TRUE)
reap_protein_combined <- Matrix(as.matrix(cbind(reap_protein_2, reap_protein_3)), 
    sparse = TRUE)

rownames(reap_protein_combined) <- make.unique(paste0("REAP_", sapply(rownames(reap_protein_combined), 
    ExtractField)))

reap <- CreateSeuratObject(raw.data = reap_rna_combined, min.genes = 250)
reap <- NormalizeData(reap, display.progress = FALSE)
reap <- SetAssayData(reap, assay.type = "REAP", slot = "raw.data", new.data = reap_protein_combined[, 
    reap@cell.names])
reap <- NormalizeData(reap, assay.type = "REAP", normalization.method = "genesCLR", 
    display.progress = FALSE)
par(mfrow = c(1, 3))
GenePlot(reap, gene1 = "REAP_CD19", gene2 = "REAP_CD3", cex.use = 0.5)
GenePlot(reap, gene1 = "REAP_CD4", gene2 = "REAP_CD8a", cex.use = 0.5)
GenePlot(reap, gene1 = "REAP_CD3", gene2 = "CD3E", cex.use = 0.5)