Demultiplexing with hashtag oligos (HTOs)

Developed in collaboration with the Technology Innovation Group at NYGC, Cell Hashing uses oligo-tagged antibodies against ubuquitously expressed surface proteins to place a “sample barcode” on each single cell, enabling different samples to be multiplexed together and run in a single experiment. For more information, please refer to our preprint: https://www.biorxiv.org/content/early/2017/12/21/237693

This vignette will give a brief demonstration on how to work with data produced with Cell Hashing in Seurat. Applied to two datasets, we can successfully demultiplex cells to their the original sample-of-origin, and identify cross-sample doublets.

8-HTO dataset from human PBMCs

Dataset description:

Data represent peripheral blood mononuclear cells (PBMCs) from eight different donors.
Cells from each donor are uniquely labeled, using CD45 as a hashing antibody.
Samples were subsequently pooled, and run on a single lane of the the 10X Chromium v2 system.
You can download the count matrices for RNA and HTO here, or the FASTQ files from GEO

Basic setup

Load packages

library(Seurat)

Read in data

setwd("~/Downloads/HTODemuxFiles/")
# Load in the UMI matrix
pbmc_umi_sparse <- readRDS("pbmc_umi_mtx.rds")

# For generating a hashtag count matrix from fastq files, please refer to https://github.com/Hoohm/CITE-seq-Count.
# Load in the HTO count matrix
pbmc_hto <- readRDS("pbmc_hto_mtx.rds")

# Select cell barcodes detected by both RNA and HTO
# In the example datasets we have already filtered the cells for you, but perform this step for clarity.
joint_bcs <- intersect(colnames(pbmc_umi_sparse),colnames(pbmc_hto))

# Subset RNA and HTO counts by joint cell barcodes
pbmc_umi_sparse <- pbmc_umi_sparse[,joint_bcs]
pbmc_hto <- as.matrix(pbmc_hto[,joint_bcs])

# Confirm that the HTO have the correct names
print (rownames(pbmc_hto))

## [1] "HTO_A" "HTO_B" "HTO_C" "HTO_D" "HTO_E" "HTO_F" "HTO_G" "HTO_H"

Setup Seurat object and add in the HTO data

# Setup Seurat object
pbmc_hashtag <- CreateSeuratObject(raw.data = pbmc_umi_sparse)

# Normalize RNA data with log normalization
pbmc_hashtag <- NormalizeData(pbmc_hashtag,display.progress = FALSE)
# Find and scale variable genes
pbmc_hashtag <- FindVariableGenes(pbmc_hashtag,do.plot = F,display.progress = FALSE)
pbmc_hashtag <- ScaleData(pbmc_hashtag,genes.use = pbmc_hashtag@var.genes,display.progress = FALSE)

Adding HTO data as an independent assay

You can read more about working with multi-modal data here

# Add HTO data as a new assay independent from RNA
pbmc_hashtag <- SetAssayData(pbmc_hashtag,assay.type = "HTO",slot = "raw.data",new.data = pbmc_hto)
# Normalize HTO data, here we use centered log-ratio (CLR) transformation
pbmc_hashtag <- NormalizeData(pbmc_hashtag,assay.type = "HTO",normalization.method = "genesCLR",display.progress = FALSE)

Demultiplex cells based on HTO enrichment

Here we use the Seurat function HTODemux() to assign single cells back to their sample origins.

# If you have a very large dataset we suggest using k_function = "clara". This is a k-medoid clustering function for large applications
# You can also play with additional parameters (see documentation for HTODemux()) to adjust the threshold for classification
# Here we are using the default settings
pbmc_hashtag <- HTODemux(pbmc_hashtag,assay.type = "HTO",positive_quantile =  0.99,print.output = FALSE)

Visualize demultiplexing results

Output from running HTODemux() is saved in the object metadata. We can visualize how many cells are classified as singlets, doublets and negative/ambiguous cells.

# Global classification results
print (table(pbmc_hashtag@meta.data$hto_classification_global))

## 
##  Doublet Negative  Singlet 
##     2598      346    13972

Visualize enrichment for selected HTOs with ridge plots

# Group cells based on the max HTO signal
pbmc_hashtag <- SetAllIdent(pbmc_hashtag,id = "hash_maxID")
RidgePlot(pbmc_hashtag,features.plot = rownames(GetAssayData(pbmc_hashtag,assay.type = "HTO"))[1:2],nCol = 2)

Visualize pairs of HTO signals to confirm mutual exclusivity in singlets

GenePlot(pbmc_hashtag,"HTO_A","HTO_B",use.raw=FALSE,cex.use = 0.6,col.use = "black")

Compare number of UMIs for singlets, doublets and negative cells

pbmc_hashtag <- SetAllIdent(pbmc_hashtag,"hto_classification_global")
VlnPlot(pbmc_hashtag,"nUMI",point.size.use = 0.1,y.log = T)

Generate a two dimensional tSNE embedding for HTOs.Here we are grouping cells by singlets and doublets for simplicity.

#First, we will remove negative cells from the object
pbmc_hashtag <- SetAllIdent(pbmc_hashtag,"hto_classification_global")
pbmc_hashtag_subset <- SubsetData(pbmc_hashtag,ident.remove = "Negative")

# Calculate a distance matrix using HTO
hto_dist_mtx <- as.matrix(dist(t(GetAssayData(pbmc_hashtag_subset,assay.type = "HTO",slot = "data"))))

# Calculate tSNE embeddings with a distance matrix
pbmc_hashtag_subset <- RunTSNE(pbmc_hashtag_subset, distance.matrix = hto_dist_mtx,perplexity=100)
TSNEPlot(pbmc_hashtag_subset,pt.size = 0.2)

#You can also visualize the more detailed classification result by running SetAllIdent(object,"hto_classification") beofre plotting. Here, you can see that each of the small clouds on the tSNE plot corresponds to one of the 28 possible doublet combinations.

Create an HTO heatmap, based on Figure 1C in the Cell Hashing paper.

#To increase the efficiency of plotting, you can subsample cells using the num.cells argument
HTOHeatmap(pbmc_hashtag,num.cells = 5000,batch.names = c("A","B","C","D","E","F","G","H"))

Cluster and visualize cells using the usual scRNA-seq workflow, and examine for the potential presence of batch effects.

# Extract the singlets
pbmc_hashtag <- SetAllIdent(pbmc_hashtag,id = "hto_classification_global")
pbmc_singlet <- SubsetData(pbmc_hashtag,ident.use = "Singlet")

# Select the top 1000 most variable genes
pbmc_singlet <- FindVariableGenes(pbmc_singlet,do.plot = F)

# Scaling RNA data, we only scale the variable genes here for efficiency
pbmc_singlet <- ScaleData(pbmc_singlet,genes.use = pbmc_singlet@var.genes,display.progress = FALSE)

# Run PCA
pbmc_singlet <- RunPCA(pbmc_singlet,pc.genes = pbmc_singlet@var.genes,pcs.print = 0)

# We select the top 10 PCs for clustering and tSNE based on PCElbowPlot
pbmc_singlet <- FindClusters(pbmc_singlet,reduction.type = "pca",dims.use = 1:10,print.output = F,resolution = 0.6)
pbmc_singlet <- RunTSNE(pbmc_singlet,reduction.use = "pca",dims.use = 1:10)

# Projecting singlet identities on TSNE visualization
TSNEPlot(pbmc_singlet,group.by = "hto_classification")

12-HTO dataset from four human cell lines

Dataset description:

Data represent single cells collected from four cell lines: HEK, K562, KG1 and THP1
Each cell line was further split into three samples (12 samples in total).
Each sample was labeled with a hashing antibody mixture (CD29 and CD45), pooled, and run on a single lane of 10X.
Based on this design, we should be able to detect doublets both across and within cell types
You can download the count matrices for RNA and HTO here, and we are currently uploading data to GEO

Create Seurat object, add HTO data and perform normalization

setwd("~/Downloads/HTODemuxFiles/")

# Read in UMI count matrix for RNA
hto12_umi <- readRDS("hto12_umi_mtx.rds")

# Read in HTO count matrix
hto12_hto_raw <- readRDS("hto12_hto_mtx.rds")

# Select cell barcodes detected in both RNA and HTO
cells_use <- intersect(rownames(hto12_hto_raw),colnames(hto12_umi))

# Create Seurat object and add HTO data
hto12 <- CreateSeuratObject(hto12_umi[,cells_use],min.genes = 300)
hto12 <- SetAssayData(hto12,assay.type = "HTO",slot = "raw.data",new.data = t(hto12_hto_raw[hto12@cell.names,1:12]))

# Normalize data
hto12 <- NormalizeData(hto12,display.progress = FALSE)
hto12 <- NormalizeData(hto12,assay.type = "HTO",normalization.method = "genesCLR",display.progress = FALSE)

Demultiplex data

hto12 <- HTODemux(hto12,assay.type = "HTO",positive_quantile =  0.99, print.output = TRUE)

## [1] "Cutoff for HEK_A : 4 reads"
## [1] "Cutoff for HEK_B : 8 reads"
## [1] "Cutoff for HEK_C : 6 reads"
## [1] "Cutoff for THP1_A : 19 reads"
## [1] "Cutoff for THP1_B : 13 reads"
## [1] "Cutoff for THP1_C : 13 reads"
## [1] "Cutoff for K562_A : 4 reads"
## [1] "Cutoff for K562_B : 9 reads"
## [1] "Cutoff for K562_C : 5 reads"
## [1] "Cutoff for KG1_A : 26 reads"
## [1] "Cutoff for KG1_B : 46 reads"
## [1] "Cutoff for KG1_C : 41 reads"
## 
##  Doublet Negative  Singlet 
##      889      318     6984

Visualize demultiplexing results

Distribution of selected HTOs grouped by classification, displayed by ridge plots

hto_plot <- c("HEK_A","K562_B","KG1_A","THP1_C")
RidgePlot(hto12,features.plot = hto_plot,nCol = 2)

Visualize HTO signals in a heatmap

HTOHeatmap(hto12,assay.type = "HTO",num.cells = length(hto12@cell.names))

Visualize RNA clustering

Below, we cluster the cells using our standard scRNA-seq workflow. As expected we see four major clusters, corresponding to the cell lines

In addition, we see small clusters in between, representing mixed transcriptomes that are correctly annotated as doublets.

We also see within-cell type doublets, that are (perhaps unsurprisingly) intermixed with singlets of the same cell type

# Remove the negative cells
hto12 <- SubsetData(hto12,ident.remove = "Negative")

# Run PCA on most variable genes
hto12 <- FindVariableGenes(hto12,do.plot = F)
hto12 <- ScaleData(hto12,genes.use = hto12@var.genes,display.progress = FALSE)
hto12 <- RunPCA(hto12,do.print = FALSE)

hto12 <- RunTSNE(hto12,dims.use = 1:5,perplexity = 100)
TSNEPlot(hto12)