Setup Seurat Object

# Load in dataset from Pollen et al. 2014
nbt.data=read.table("~/seurat_files/HiSeq301-RSEM-linear values.txt",sep="\t",header=TRUE,row.names=1)

# transform data to log scale
nbt.data=log(nbt.data+1)

# Look at the transformed data matrix
corner(nbt.data)
##          Hi_2338_1  Hi_2338_2 Hi_2338_3 Hi_2338_4 Hi_2338_5
## A1BG      2.310553 0.00000000  0.000000 1.0116009         0
## A1BG-AS1  0.000000 0.00000000  1.497388 0.3074847         0
## A1CF      0.000000 0.04879016  0.000000 0.0000000         0
## A2LD1     0.000000 0.00000000  0.000000 0.2546422         0
## A2M       0.000000 0.00000000  0.000000 0.0000000         0
dim(nbt.data)
## [1] 23730   301
nbt=new("seurat",raw.data=nbt.data)

# Take all genes in > 3 cells, all cells with > 1k genes, use an expression threshold of 1
# Cell type is encoded in the second _ field, will be stored in nbt@ident and also placed in the "orig.ident" field of object@data.info
nbt=setup(nbt,project="NBT",min.cells = 3,names.field = 2,names.delim = "_",min.genes = 1000,is.expr=1,)

nbt
## An object of class seurat in project NBT
##  16842 genes across 301 samples.

Basic exploration of data

# Look at some canonical marker genes and metrics
vlnPlot(nbt,c("DPPA4","GATA1","BMP3","nGene"))

# Plot two cells against each other
# Set do.ident=TRUE to identify outliers by clicking on points (ESC to exit after)

par(mfrow=c(2,2))
cellPlot(nbt,nbt@cell.names[1],nbt@cell.names[2],do.ident = FALSE)
cellPlot(nbt,nbt@cell.names[3],nbt@cell.names[4],do.ident = FALSE)

# Plot two genes against each other, can do this in limited groups of cells
genePlot(nbt,"DLX1","DLX2",cex.use = 1)
genePlot(nbt,"DLX1","DLX2",cell.ids = which.cells(nbt,"GW16"),cex.use = 1)


Identify variable genes across the single cells

# Genes placed into 20 bins based on X-axis (average expression). Y-axis is within-bin z-score of log(Variance/mean).
# Running this sets object@var.genes by default
nbt=mean.var.plot(nbt,y.cutoff = 2,x.low.cutoff = 2,fxn.x = expMean,fxn.y = logVarDivMean)

length(nbt@var.genes)
## [1] 377

Perform linear dimensional reduction (PCA)

# Run a PCA using the variable genes as input (to change the input gene set, use the pc.genes argument)
# For example, try running PCA on all genes - i.e. , nbt=pca(nbt,pc.genes=rownames(nbt@data)) - which does not perform as well
nbt=pca(nbt,do.print=FALSE)
pca.plot(nbt,1,2,pt.size = 2)

# Examine  and visualize PCA results a few different ways
print.pca(nbt,1)
## [1] "PC1"
##  [1] "LGALS1"    "TIMP1"     "KRT18"     "IFI30"     "ARHGDIB"
##  [6] "IFI27"     "UCA1"      "HIST1H2BK" "KRT15"     "LCN2"
## [11] "S100A9"    "KRT81"     "ALDH1A3"   "KLK5"      "CEACAM6"
## [1] ""
##  [1] "SOX11"     "TUBB2B"    "DCX"       "GPM6A"     "CRMP1"
##  [6] "RTN1"      "NNAT"      "C1orf61"   "STMN2"     "FABP7"
## [11] "LOC150568" "41520"     "TMSB15A"   "PPP2R2B"   "GAP43"
## [16] "NREP"
## [1] ""
## [1] ""
viz.pca(nbt,1:2)

# Draw a heatmap where both cells and genes are ordered by PCA score
# Options to explore include do.balanced (show equal # of genes with +/- PC scores), and use.full (after projection, see below)
pcHeatmap(nbt,pc.use = 1,do.balanced = FALSE)
## Warning in hmFunction(data.use, Rowv = NA, Colv = NA, trace = "none", col =
## col.use, : Discrepancy: Rowv is FALSE, while dendrogram is `none'. Omitting
## row dendogram.


Determine statistically significant principal components

# Do 200 random samplings to find significant genes, each time randomly permute 1% of genes
# This returns a 'p-value' for each gene in each PC, based on how likely the gene/PC score woud have been observed by chance
# Note that in this case we get the same result with 200 or 1000 samplings, so we do 200 here for expediency
nbt=jackStraw(nbt,num.replicate = 200,do.print = FALSE)

# The jackStraw plot compares the distribution of P-values for each PC with a uniform distribution (dashed line)
# 'Significant' PCs will have a strong enrichment of genes with low p-values (solid curve above dashed line)
# In this case PC1-9 are strongly significant
jackStrawPlot(nbt,PCs = 1:12)


Optional step, grow gene list through PCA projection

# Previous analysis was performed on < 400 variable genes. To identify a larger gene set that may drive
# biological differences, but did not pass our mean/variability thresholds, we first calculate PCA scores
# for all genes (PCA projection)
nbt=project.pca(nbt,do.print=FALSE)

# Visualize the full projected PCA, which now includes new genes which were not previously included (use.full=TRUE)
pcHeatmap(nbt,pc.use = 1,use.full = TRUE,do.balanced = TRUE,remove.key = TRUE)

# Choose significant genes for PC1-9, allow each PC to contribute a max of 200 genes (to avoid one PC swamping the analysis)
nbt.sig.genes=pca.sig.genes(nbt,1:9,pval.cut = 1e-5,max.per.pc = 200)
length(nbt.sig.genes)
## [1] 1601
# Now redo the PCA analysis with the new gene list
nbt=pca(nbt,pc.genes=nbt.sig.genes,do.print = FALSE)

# Redo random sampling, PCs 1-11 are significant (additional PCs are borderline, but signal disappears with 1k replicates, though we do 200 here for expediency)
nbt=jackStraw(nbt,num.replicate = 200,do.print = FALSE)
jackStrawPlot(nbt,PCs = 1:15)


Run Non-linear dimensional reduction (tSNE)

# When we run tSNE using our 11 significant PCs as input (spectral tSNE), we get distinct point clouds
# All cell types cluster togther, with the exception of GW16/21 (gestational week 16/21 neurons)
# These cluster separately by maturation state and subtype (see part 2)
nbt=run_tsne(nbt,dims.use = 1:11,max_iter=2000)
## sigma summary: Min. : 0.316 |1st Qu. : 0.5964 |Median : 0.7123 |Mean : 0.7132 |3rd Qu. : 0.8674 |Max. : 1.195 |
## Epoch: Iteration #100 error is: 9.70132741958134
## Epoch: Iteration #200 error is: 0.127010473892875
## Epoch: Iteration #300 error is: 0.153273654208696
## Epoch: Iteration #400 error is: 0.245469561193061
## Epoch: Iteration #500 error is: 0.316057847931056
## Epoch: Iteration #600 error is: 0.340189760254242
## Epoch: Iteration #700 error is: 0.292026401119793
## Epoch: Iteration #800 error is: 0.28717404359992
## Epoch: Iteration #900 error is: 0.299151020873036
## Epoch: Iteration #1000 error is: 0.302867342591712
## Epoch: Iteration #1100 error is: 0.295406575735116
## Epoch: Iteration #1200 error is: 0.293464973497574
## Epoch: Iteration #1300 error is: 0.280668805882324
## Epoch: Iteration #1400 error is: 0.266135652742922
## Epoch: Iteration #1500 error is: 0.26283877111651
## Epoch: Iteration #1600 error is: 0.25005151881014
## Epoch: Iteration #1700 error is: 0.242511532375138
## Epoch: Iteration #1800 error is: 0.244544924387362
## Epoch: Iteration #1900 error is: 0.237581728690564
## Epoch: Iteration #2000 error is: 0.234277392035558
tsne.plot(nbt,pt.size = 1)

# You can also run tSNE at the gene level (set genes.use instead of dims.use), but this fails to acheive clean separation between cell types
# Note that, depending on the perplexity parameter, this approach may be sub-optimal for finding very rare subtypes (~5 cells or less)
# in this case, Seurat could be complementary with other clustering approaches.

Cluster the cells

# Density cluster the tSNE map - note that the G.use parameter is the density parameter for the clustering - lower G.use to get finer settings
# Cells which are 'unassigned' are put in cluster 1 - though in this case there are none
# Assigned cluster will be placed in the 'DBClust.ident' field of nbt@data.info. Putting set.ident=TRUE means that the assigned clusters will also be stored in nbt@ident
nbt=DBclust_dimension(nbt,1,2,reduction.use = "tsne",G.use = 8,set.ident = TRUE)
tsne.plot(nbt,pt.size = 1)

# Build a phylogenetic tree, and rename/reorder cluster names according to their position on the tree
# See help for details on tree building strategy
# This gives closely related clusters similar cluster IDs, which is occasionally useful for visualization later on
# Assigned cluster will be placed in the 'tree.ident' field of nbt@data.info, and also stored in nbt@ident
nbt=buildClusterTree(nbt,do.reorder = TRUE,reorder.numeric = TRUE,pcs.use = 1:11)

# View the t-SNE plot with the new labels, in a slightly different format
tsne.plot(nbt,do.label = TRUE,label.pt.size = 0.5)

#Save the Seurat object for further analysis in part 2
save(nbt,file="~/seurat_files/nbt_tutorial_part1.Robj")