Load in the data

This vignette demonstrates some useful features for interacting with the Seurat object. For demonstration purposes, we will be using the 2,700 PBMC object that is created in the first guided tutorial. You can download the pre-computed object here. To simulate the scenario where we have two replicates, we will randomly assign half the cells in each cluster to be from “rep1” and other half from “rep2”.

library(Seurat)
library(dplyr)
pbmc <- readRDS(file = "~/Projects/datasets/pbmc3k_final.rds")

# pretend that cells were originally assigned to one of two replicates (we
# assign randomly here) if your cells do belong to multiple replicates, and
# you want to add this info to the Seurat object create a data frame with
# this information (similar to replicate.info below)
set.seed(seed = 42)
replicate.info <- data.frame(replicate = sample(x = c("rep1", "rep2"), size = length(x = pbmc@cell.names), 
    replace = TRUE), row.names = pbmc@cell.names)
pbmc <- AddMetaData(object = pbmc, metadata = replicate.info)

Switch identity class between cluster ID and replicate

# Plot tSNE, coloring cells by cell type (currently stored in object@ident)
TSNEPlot(object = pbmc)

# How do I create a tSNE plot where cells are colored by replicate?  First,
# store the current identities in a new column of meta.data called CellType
pbmc <- StashIdent(object = pbmc, save.name = "CellType")
# Next, switch the identity class of all cells to reflect replicate ID
pbmc <- SetAllIdent(object = pbmc, id = "replicate")
TSNEPlot(object = pbmc)

# alternately : TSNEPlot(pbmc, group.by = 'replicate') you can pass the
# pt.shape to label points by both replicate and cell type

# Switch back to cell type labels
pbmc <- SetAllIdent(object = pbmc, id = "CellType")

Tabulate cells by cluster ID, replicate, or both

# How many cells are in each cluster
table(pbmc@ident)
## 
##           B cells       CD4 T cells       CD8 T cells   CD14+ Monocytes 
##               342              1151               308               479 
##   Dendritic cells FCGR3A+ Monocytes    Megakaryocytes          NK cells 
##                32               157                14               155
# How many cells are in each replicate?
table(pbmc@meta.data$replicate)
## 
## rep1 rep2 
## 1327 1311
# restore cluster ID
pbmc <- SetAllIdent(object = pbmc, id = "CellType")

# What proportion of cells are in each cluster?
prop.table(x = table(pbmc@ident))
## 
##           B cells       CD4 T cells       CD8 T cells   CD14+ Monocytes 
##       0.129643669       0.436315390       0.116755118       0.181576952 
##   Dendritic cells FCGR3A+ Monocytes    Megakaryocytes          NK cells 
##       0.012130402       0.059514784       0.005307051       0.058756634
# How does cluster membership vary by replicate?
table(pbmc@ident, pbmc@meta.data$replicate)
##                    
##                     rep1 rep2
##   B cells            164  178
##   CD4 T cells        563  588
##   CD8 T cells        176  132
##   CD14+ Monocytes    240  239
##   Dendritic cells     19   13
##   FCGR3A+ Monocytes   80   77
##   Megakaryocytes       4   10
##   NK cells            81   74
prop.table(x = table(pbmc@ident, pbmc@meta.data$replicate), margin = 2)
##                    
##                            rep1        rep2
##   B cells           0.123587038 0.135774218
##   CD4 T cells       0.424265260 0.448512586
##   CD8 T cells       0.132629992 0.100686499
##   CD14+ Monocytes   0.180859081 0.182303585
##   Dendritic cells   0.014318011 0.009916095
##   FCGR3A+ Monocytes 0.060286360 0.058733791
##   Megakaryocytes    0.003014318 0.007627765
##   NK cells          0.061039940 0.056445461

Selecting particular cells and subsetting the Seurat object

# What are the cell names of all NK cells?
WhichCells(object = pbmc, ident = "NK cells")
##   [1] "AAACCGTGTATGCG" "AAATTCGATTCTCA" "AACCTTACGCGAGA" "AACGCCCTCGTACA"
##   [5] "AACGTCGAGTATCG" "AAGATTACCTCAAG" "AAGCAAGAGGTGTT" "AAGTAGGATACAGC"
##   [9] "AATACTGAATTGGC" "AATCCTTGGTGAGG" "AATCTCTGCTTTAC" "ACAAATTGTTGCGA"
##  [13] "ACAACCGAGGGATG" "ACAATTGATGACTG" "ACACCCTGGTGTTG" "ACAGGTACTGGTGT"
##  [17] "ACCTGGCTAAGTAG" "ACGAACACCTTGTT" "ACGATCGAGGACTT" "ACGCAATGGTTCAG"
##  [21] "ACGCTGCTGTTCTT" "ACGGAACTCAGATC" "ACGTGATGTGACAC" "ACGTTGGAGCCAAT"
##  [25] "ACTGCCACTCCGTC" "ACTGGCCTTCAGTG" "ACTTCAACGTAGGG" "AGAACAGAAATGCC"
##  [29] "AGATATACCCGTAA" "AGATTCCTGTTCAG" "AGCCTCTGCCAATG" "AGCGATTGAGATCC"
##  [33] "AGGATGCTTTAGGC" "AGGGACGAGTCAAC" "AGTAATACATCACG" "AGTCACGATGAGCT"
##  [37] "AGTTTGCTACTGGT" "ATACCACTGCCAAT" "ATACTCTGGTATGC" "ATCCCGTGCAGTCA"
##  [41] "ATCTTTCTTGTCCC" "ATGAAGGACTTGCC" "ATGATAACTTCACT" "ATGATATGGTGCTA"
##  [45] "ATGGACACGCATCA" "ATGGGTACATCGGT" "ATTAACGATGAGAA" "ATTCCAACTTAGGC"
##  [49] "CAAGGTTGTCTGGA" "CAATCTACTGACTG" "CACCACTGGCGAAG" "CACGGGTGGAGGAC"
##  [53] "CAGATGACATTCTC" "CAGCAATGGAGGGT" "CAGCGGACCTTTAC" "CAGCTCTGTGTGGT"
##  [57] "CAGTTTACACACGT" "CATCAGGACTTCCG" "CATCAGGATAGCCA" "CATGAGACGTTGAC"
##  [61] "CATTACACCAACTG" "CATTTCGAGATACC" "CCTCGAACACTTTC" "CGACCACTAAAGTG"
##  [65] "CGACCACTGCCAAT" "CGAGGCTGACGCTA" "CGCCGAGAGCTTAG" "CGGCGAACGACAAA"
##  [69] "CGGCGAACTACTTC" "CGGGCATGTCTCTA" "CGTACCTGGCATCA" "CGTGTAGACGATAC"
##  [73] "CGTGTAGAGTTACG" "CGTGTAGATTCGGA" "CTAAACCTCTGACA" "CTAACGGAACCGAT"
##  [77] "CTACGCACTGGTCA" "CTACTCCTATGTCG" "CTAGTTACGAAACA" "CTATACTGCTACGA"
##  [81] "CTATACTGTCTCAT" "CTCGACTGGTTGAC" "CTGAGAACGTAAAG" "CTTTAGTGACGGGA"
##  [85] "GAACCAACTTCCGC" "GAAGTGCTAAACGA" "GAATGCACCTTCGC" "GAATTAACGTCGTA"
##  [89] "GACGGCACACGGGA" "GAGCGCTGAAGATG" "GAGGTACTGACACT" "GAGGTGGATCCTCG"
##  [93] "GATAGAGAAGGGTG" "GATCCCTGACCTTT" "GCACACCTGTGCTA" "GCACCACTTCCTTA"
##  [97] "GCACTAGAGTCGTA" "GCAGGGCTATCGAC" "GCCGGAACGTTCTT" "GCCTACACAGTTCG"
## [101] "GCGCATCTTGCTCC" "GCGCGATGGTGCAT" "GGAAGGTGGCGAGA" "GGACGCTGTCCTCG"
## [105] "GGAGGCCTCGTTGA" "GGCAAGGAAAAAGC" "GGCATATGCTTATC" "GGCCGAACTCTAGG"
## [109] "GGCTAAACACCTGA" "GGGTTAACGTGCAT" "GGTGGAGAAACGGG" "GGTGGAGACAGATC"
## [113] "GTAGTGTGAGCGGA" "GTCGACCTGAATGA" "GTGATTCTGGTTCA" "GTTAAAACCGAGAG"
## [117] "GTTCAACTGGGACA" "GTTGACGAGCCCTT" "GTTGACGATATCGG" "TAACTCACTCTACT"
## [121] "TAAGAGGACTTGTT" "TAATGCCTCGTCTC" "TACGCAGAGAATCC" "TACGGCCTGGGACA"
## [125] "TACTCTGAATCGAC" "TACTGTTGAGGCGA" "TAGCATCTCAGCTA" "TAGCCCACAGCTAC"
## [129] "TAGTGGTGAAGTGA" "TAGTTAGAACCACA" "TATCGACTACTAGC" "TATGAATGGAGGAC"
## [133] "TATGGGTGCATCAG" "TATTTCCTGGAGGT" "TCAACACTGTTTGG" "TCAGACGACGTTAG"
## [137] "TCCCGAACACAGTC" "TCCTAAACCGCATA" "TCGATTTGCAGCTA" "TCTAACACCAGTTG"
## [141] "TGATAAACTCCGTC" "TGCACAGACGACAT" "TGCCACTGCGATAC" "TGCTGAGAGAGCAG"
## [145] "TGGAACACAAACAG" "TGGTAGACCCTCAC" "TGTAATGACACAAC" "TGTAATGAGGTAAA"
## [149] "TTACTCGATCTACT" "TTAGTCTGCCAACA" "TTCCAAACTCCCAC" "TTCCCACTTGAGGG"
## [153] "TTCTAGTGGAGAGC" "TTCTGATGGAGACG" "TTGTCATGGACGGA"
# How can I extract expression matrix for all NK cells (perhaps, to load
# into another package)
nk.raw.data <- as.matrix(x = pbmc@raw.data[, WhichCells(object = pbmc, ident = "NK cells")])

# Can I create a Seurat object of just the NK cells and B cells?
nk.subset <- SubsetData(object = pbmc, ident.use = c("NK cells", "B cells"))

# Can I create a Seurat object of all cells except the NK cells and B cells?
other.subset <- SubsetData(object = pbmc, ident.remove = c("NK cells", "B cells"))

# note that if you wish to perform additional rounds of clustering after
# SubsetData we recommend re-running FindVariableGenes() and ScaleData()

Calculating the average gene expression within a cluster

# How can I calculate the average expression of all cells within a cluster?
cluster.averages <- AverageExpression(object = pbmc)
## [1] "Finished averaging RNA for cluster B cells"
## [1] "Finished averaging RNA for cluster CD4 T cells"
## [1] "Finished averaging RNA for cluster CD8 T cells"
## [1] "Finished averaging RNA for cluster CD14+ Monocytes"
## [1] "Finished averaging RNA for cluster Dendritic cells"
## [1] "Finished averaging RNA for cluster FCGR3A+ Monocytes"
## [1] "Finished averaging RNA for cluster Megakaryocytes"
## [1] "Finished averaging RNA for cluster NK cells"
head(x = cluster.averages[, 1:5])
##                  B cells CD4 T cells CD8 T cells CD14+ Monocytes
## AL627309.1    0.00000000 0.006198564  0.01807769      0.04864473
## AP006222.2    0.00000000 0.003443558  0.01048354      0.01090743
## RP11-206L10.2 0.02077107 0.004513297  0.00000000      0.00000000
## RP11-206L10.9 0.00000000 0.000000000  0.00000000      0.01052309
## LINC00115     0.03911281 0.018897675  0.02850651      0.03761574
## NOC2L         0.55560804 0.457757658  0.49811901      0.27827017
##               Dendritic cells
## AL627309.1         0.00000000
## AP006222.2         0.00000000
## RP11-206L10.2      0.08462847
## RP11-206L10.9      0.00000000
## LINC00115          0.00000000
## NOC2L              0.27913384
# Return this information as a Seurat object (enables downstream plotting
# and analysis)
cluster.averages <- AverageExpression(object = pbmc, return.seurat = TRUE, show.progress = FALSE)

# How can I plot the average expression of NK cells vs. CD8 T cells?  Pass
# do.hover = T for an interactive plot to identify gene outliers
CellPlot(object = cluster.averages, cell1 = "NK cells", cell2 = "CD8 T cells")

# How can I calculate expression averages separately for each replicate?
cluster.averages <- AverageExpression(object = pbmc, return.seurat = TRUE, add.ident = "replicate", 
    show.progress = FALSE)
CellPlot(object = cluster.averages, cell1 = "CD14+ Monocytes_rep1", cell2 = "CD14+ Monocytes_rep2")

# You can also plot heatmaps of these 'in silico' bulk datasets to visualize
# agreement between replicates
DoHeatmap(object = cluster.averages, genes.use = PCTopGenes(object = pbmc, pc.use = 1, 
    do.balanced = TRUE), group.label.rot = TRUE, group.cex = 0)