Load existing Seurat object
load("~/seurat_files/nbt_tutorial_part1.Robj")
Pulling data from a Seurat object
# First, we introduce the fetch.data function, a very useful way to pull information from the dataset.
# Essentially it is a wrapper to pull from nbt@data, nbt@ident, nbt@pca.rot, nbt@data.info, etc...
# Pulls the identity class (cluster ID), PC1 scores, # of genes, original identity class (parsed from the cell name), gene expression levels for SOX1 and ACTB
# Info is pulled for all cells, but displayed for the last 5
# Note that the GW16 neurons fall into three clusters (1-3), with variable expression of PAX6 and DLX2 (see below for more)
my.data=fetch.data(nbt,c("ident","PC1","nGene","orig.ident","PAX6","DLX2","ACTB"))
tail(my.data,5)
## ident PC1 nGene orig.ident PAX6 DLX2 ACTB
## Hi_GW16_22 3 0.03747778 3519 GW16 4.592996 0.000000 9.007845
## Hi_GW16_23 1 0.09227503 2670 GW16 0.000000 3.355502 9.503926
## Hi_GW16_24 2 0.06869470 2938 GW16 4.699935 0.000000 9.351865
## Hi_GW16_25 3 0.06927854 3863 GW16 4.089500 0.000000 9.407497
## Hi_GW16_26 1 0.10876099 3084 GW16 0.000000 6.492603 9.023716
# Can use the which.cells function to pull the names of cells that were experimentally marked as iPS cells, and find that they have been placed in cluster 7
ips.cells=which.cells(nbt,"iPS",id = "orig.ident")
my.data=fetch.data(nbt,"ident",cells.use = ips.cells)
head(my.data,5)
## ident
## Hi_iPS_1 7
## Hi_iPS_2 7
## Hi_iPS_3 7
## Hi_iPS_4 7
## Hi_iPS_5 7
# You can also switch easily switch the cell's identity (for example, going back to the original annotation)
nbt = set.all.ident(nbt, "orig.ident")
# And switch back - to the cluster ID defined by the tree building
nbt = set.all.ident(nbt, "tree.ident")
# Also see set.ident for changing identity classes for only a subset of cells
Finding differentially expressed genes (cluster biomarkers)
#find all markers of cluster 8
#thresh.use speeds things up (increase value to increase speed) by only testing genes whose average expression is > thresh.use between cluster
#Note that Seurat finds both positive and negative markers (avg_diff either >0 or <0)
ips.markers=find.markers(nbt,7,thresh.use = 2)
print(head(ips.markers,5))
## p_val avg_diff
## TDGF1 0 7.225206
## ESRG 0 5.248835
## CRABP1 0 5.037518
## D21S2088E 0 4.737998
## FOXG1 0 -4.549359
# note that Seurat has four tests for differential expression:
# ROC test ("roc"), t-test ("t"), LRT test based on zero-inflated data ("bimod", default), LRT test based on tobit-censoring models ("tobit")
# The ROC test returns the 'classification power' for any individual marker (ranging from 0 - random, to 1 - perfect). Though not a statistical test, it is often very useful for finding clean markers.
ips.markers=find.markers(nbt,7,thresh.use = 2,test.use = "roc")
#Visualize these new markers with a violin plot
vlnPlot(nbt,c("CRABP1","LINC-ROR"))
#plots a correlation analysis of gene/gene (ie. 'FACS' plot - cells colored by cluster number)
genePlot(nbt,"CRABP1","LINC-ROR")
# Neuronal cells in the dataset (GW represents gestational week) cluster into three groups (1-3) on the phylogenetic tree, let's explore these grouos
plotClusterTree(nbt)
# Names of cells in C1
which.cells(nbt,1)
## [1] "Hi_GW21.2_3" "Hi_GW16_2" "Hi_GW16_7" "Hi_GW16_9" "Hi_GW16_10"
## [6] "Hi_GW16_23" "Hi_GW16_26"
# Find markers of C1 (compared to a background of all other cells)
c1.markers=find.markers(nbt,1,thresh.use = 2,test.use = "roc")
# Find markers of C1 (compared to a background of clusters 2 and 3)
# As noted in Pollen et al., Many of these markers (DLX6, DLX2, GAD2, etc.) are known markers of inhibitory interneurons
c1.markers=find.markers(nbt,1,c(2,3),thresh.use = 2,test.use = "roc")
print(head(c1.markers,10))
## myAUC avg_diff power
## DLX6-AS1 1.000 5.315307 1.000
## ERBB4 0.983 4.371347 0.966
## DLX2 0.967 2.930245 0.934
## TRIM16 0.967 3.703620 0.934
## MEIS2 0.043 -5.315592 0.914
## TPI1 0.053 -5.000135 0.894
## PABPC1 0.066 -4.207054 0.868
## HSPE1 0.073 -2.473312 0.854
## ENO1 0.080 -6.525642 0.840
## GAD2 0.919 3.418394 0.838
# Find markers that distinguish clusters 2 and 3 (markers with +avg_diff distinguish C2, markers with -avg_diff characterize C3)
# As noted in Pollen et al., these markers are consistent with different neuronal maturation states
c2.markers=find.markers(nbt,2,3,thresh.use = 2,test.use = "roc")
print(head(c2.markers,10))
## myAUC avg_diff power
## STMN2 0.995 3.865081 0.990
## TFAP2C 0.020 -3.286907 0.960
## CXADR 0.971 3.897852 0.942
## MLLT3 0.962 2.492868 0.924
## NEUROD2 0.942 3.745887 0.884
## MIAT 0.940 3.327704 0.880
## SOX9 0.062 -3.002205 0.876
## AUTS2 0.930 3.421453 0.860
## PTPRD 0.928 5.136168 0.856
## PLXNA2 0.926 3.524949 0.852
Visualizing markers of different clusters
par(mfrow=c(1,2))
genes.viz=c("DLX6-AS1","NEUROD2","GAD2","TFAP2C")
vlnPlot(nbt, genes.viz)
# Note that you can also view the violin plot grouping the cells by the original annotation - where these markers appear highly heterogeneous
# vlnPlot(nbt, genes.viz,group.by="orig.ident")
# View which cells express a given gene (red is high expression) on either a tSNE plot
feature.plot(nbt,genes.viz,pt.size = 1)
# Can also visualize on a PCA plot using:
# feature.plot(nbt,genes.viz,pt.size = 1,reduction.use = "pca")
# A different visualization, where the size of the dot is the percentage of cells expressing, and the color is the average expression level (green is high)
# Visualize markers that define cluster 2 vs cluster 3
dot.plot(nbt,genes.plot = rownames(c2.markers)[1:10],cex.use=4)
# Find markers for all clusters, set do.print=TRUE to output progress
markers.all=find_all_markers(nbt,thresh.test = 3,test.use = "roc", do.print = TRUE)
## [1] "Calculating cluster 1"
## [1] "Calculating cluster 2"
## [1] "Calculating cluster 3"
## [1] "Calculating cluster 4"
## [1] "Calculating cluster 5"
## [1] "Calculating cluster 6"
## [1] "Calculating cluster 7"
## [1] "Calculating cluster 8"
## [1] "Calculating cluster 9"
## [1] "Calculating cluster 10"
## [1] "Calculating cluster 11"
head(markers.all)
## myAUC avg_diff power cluster gene
## DLX6-AS1 1.000 6.005142 1.000 1 DLX6-AS1
## ERBB4 0.998 5.920021 0.996 1 ERBB4
## TPI1 0.008 -5.763074 0.984 1 TPI1
## PABPC1 0.010 -5.490769 0.980 1 PABPC1
## ENO1 0.012 -7.594403 0.976 1 ENO1
## COX6B1 0.016 -5.936334 0.968 1 COX6B1
# Select markers for plotting on a Heatmap (positive markers with high discriminatory power)
markers.use=subset(markers.all,avg_diff>0&power>0.8)$gene
markers.use.neuronal=subset(markers.all,avg_diff>0&power>0.8&(cluster<4|cluster==8))$gene
# Draw a heatmap of all cells for these marker genes
# Gene names are a bit too small to see in the tutorial, but you can blow up the heatmap in R
doHeatMap(nbt,genes.use = markers.use,slim.col.label = TRUE,remove.key = TRUE,cexRow=0.1)
## Warning in hmFunction(data.use, Rowv = NA, Colv = NA, trace = "none", col =
## col.use, : Discrepancy: Rowv is FALSE, while dendrogram is `both'. Omitting
## row dendogram.
## Warning in hmFunction(data.use, Rowv = NA, Colv = NA, trace = "none", col =
## col.use, : Discrepancy: Colv is FALSE, while dendrogram is `none'. Omitting
## column dendogram.
# Just for the neuronal cells
doHeatMap(nbt,genes.use = markers.use.neuronal,slim.col.label = TRUE,remove.key = TRUE,cells.use = which.cells(nbt,c(1,2,3,8)))
## Warning in hmFunction(data.use, Rowv = NA, Colv = NA, trace = "none", col =
## col.use, : Discrepancy: Rowv is FALSE, while dendrogram is `both'. Omitting
## row dendogram.
## Warning in hmFunction(data.use, Rowv = NA, Colv = NA, trace = "none", col =
## col.use, : Discrepancy: Colv is FALSE, while dendrogram is `none'. Omitting
## column dendogram.
A few more tricks
# Since we have placed cells into clusters, we can look at and compare the average expression within a cluster
# For example, we can compare cluster 7 (ES cells), and cluster 8 (NPCs)
nbt.avg=average.expression(nbt)
colnames(nbt.avg)=paste("c",colnames(nbt.avg),sep="")
plot(nbt.avg[,"c7"],nbt.avg[,"c8"],pch=16,cex=0.8)
# uncomment the next line to identify individual genes
# identify(nbt.avg[,"c7"],nbt.avg[,"c8"],labels = rownames(nbt.avg))
# What if we want to define markers of a clade, instead of an individual cluster? Re-examine the cluster tree above
# Find markers of the largest tree split (defined by node 12)
# Markers with a avg_diff>0 mark the left branch, avg_diff<0 mark the right branch
node12.markers=find.markers.node(nbt,12,thresh.use = 2,test.use = "roc")
head(node12.markers,5)
## myAUC avg_diff power
## NNAT 0.999 7.025542 0.998
## MAP1B 0.997 3.282430 0.994
## TUBA1A 0.996 3.568423 0.992
## DPYSL2 0.991 2.786088 0.982
## DCX 0.989 5.728801 0.978
# These are markers which are shared between clusters (1:3).
vlnPlot(nbt,c("NNAT","TUBA1A","DCX","FXYD5"))
# Find markers of a middle split (node 14). These separate clusters 4:6 from clusters 7:11
node14.markers=find.markers.node(nbt,14,thresh.use = 2,test.use = "roc")
vlnPlot(nbt,c("LCP1","NGFRAP1","PTPRF","CNN3"))
# Rename a cluster - for example let's rename cluster 1 to be Interneurons
# Now 1 will be replaced with Interneurons for future plots
nbt=rename.ident(nbt,1,"Interneurons")
# Final visualization! Splits a 'feature plot' into clusters, very useful for seeing lots of info across many clusters
# Applied here to PC scores. You can see that cluster 3 is uniquely marked by PC8, and cluster 8 is uniquely marked by PC11, etc.
pcs.plot=paste("PC",1:11,sep="")
feature.heatmap(nbt,pcs.plot,cols.use = heat.colors(10), pt.size = 2)
## Warning: attributes are not identical across measure variables; they will
## be dropped