Setup Seurat Object

#Read in log-space expression matrix. Has been pre-computed and normalized (see manuscript for exact details)
#The data was run in three batches (zf1, zf2, zf3), which is denoted in the column name
zfish.data=read.table("~/seurat_files/zdata.matrix.txt",sep="\t",header=TRUE)

#Due to the high dynamic range in RNA-seq, transform data to log-space. Not required for Seurat, but highly recommended
zfish.log=log(zfish.data+1)

#Create and setup the Seurat object. Include all genes detected in > 3 cells (expression >0.01), and all cells with > 2k genes
#Cells will be initially assigned to an identity class (grouping) based on the first field (underscore-delimited)
zf.all=new("seurat",raw.data=zfish.log)
zf.all=setup(zf.all,project="zfish",min.cells = 3,min.genes = 2000,is.expr=0.01,names.field = 1,names.delim = "_")

zf.all
## An object of class seurat in project zfish
##  10945 genes across 945 samples.

Basic exploration of data

#View the expression of genes and basic metrics across samples
vlnPlot(zf.all,c("ACTB2","VENT","CHD","nGene"))

#Plot two cells against each other
#Set do.ident=TRUE to identify outliers by clicking on points (ESC to exit after)

par(mfrow=c(2,2))
cellPlot(zf.all,zf.all@cell.names[1],zf.all@cell.names[2],do.ident = FALSE)
cellPlot(zf.all,zf.all@cell.names[3],zf.all@cell.names[4],do.ident = FALSE)

#Plot two genes against each other, can do this in limited groups of cells
genePlot(zf.all,"MIXL1","OSR1",cex.use = 1)
genePlot(zf.all,"MIXL1","SOX3",cell.ids = which.cells(zf.all,"zf1"),cex.use = 1)


Identify variable genes across the single cells

#Genes placed into 20 bins based on X-axis (average expression). Y-axis is within-bin z-score of log(Variance/mean).
zf.all=mean.var.plot(zf.all,y.cutoff = 2,do.plot=TRUE,x.low.cutoff=0.25,x.high.cutoff=7,fxn.x = expMean,fxn.y=logVarDivMean)

#remove markers which primarily define batch
markers.remove=batch.gene(zf.all,idents.use = c("zf1","zf2","zf3"),genes.use=zf.all@var.genes,auc.cutoff = 0.7)
zf.all@var.genes=zf.all@var.genes[!(zf.all@var.genes%in%markers.remove)]
length(zf.all@var.genes)
## [1] 160

Run PCA, examine results

#To demonstrate, run PCA on all genes (note that the result isn't great, PC1/2 contain little known biology)
zf.all=pca(zf.all,pc.genes = rownames(zf.all@data),pcs.print = 3,genes.print = 5)
## [1] "PC1"
## [1] "CNBPA"  "SNRPD2" "PARN"   "NOP56"  "HN1L"
## [1] ""
## [1] "LYRM2"   "NRXN1B"  "WDR38"   "SAT1A.2" "PGR"     "ST3GAL4"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "NANOG"  "ACP5A"  "RBM4.3" "EPCAM"  "PFN2"
## [1] ""
## [1] "RPS13"     "RPS19"     "HMGB2B"    "MARCKSL1B" "DYNLL1"    "EEF1A1L1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "H3F3D" "FXYD6" "CX43"  "OSR1"  "C1QB"
## [1] ""
## [1] "CXCR4B"   "SOX3"     "ZIC2B"    "ALDOB"    "SOX19A"   "FAM212AA"
## [1] ""
## [1] ""
#Instead, run PCA on zf.all@var.genes (default value for pc.genes).
zf.all=pca(zf.all,pcs.print = 3,genes.print = 5)
## [1] "PC1"
## [1] "SOX3"    "ID1"     "ALCAMB"  "CXCR4A"  "SULT6B1"
## [1] ""
## [1] "OSR1"     "MIXL1"    "DKK1B"    "KIRREL3L" "ZIC2A"    "APLNRB"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "KRT18"      "KRT4"       "WU:FB15G10" "KRT8"       "CEBPB"
## [1] ""
## [1] "ID1"    "SOX3"   "CXCR4A" "SP5L"   "NNR"    "RGCC"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "FRZB"  "CHD"   "GSC"   "OTX1A" "SIX3B"
## [1] ""
## [1] "VED"    "EVE1"   "HES6"   "APOC1L" "WNT8A"  "WNT8-2"
## [1] ""
## [1] ""
pca.plot(zf.all,pt.size = 2)

#If desired, project PCA structure across entire dataset (so all genes are included)
zf.all=project.pca(zf.all, do.print = FALSE) 
#visualize top genes for each PC, use.full selects the projected PCA. See also print.pca
#Explicit thanks to Olga Botvinnik, who showed me this visualization in her Flotilla package
viz.pca(zf.all,pcs.use = 1:3,num.genes = 10,use.full = TRUE,nCol = 3)


Create new object without EVL cells

#Identify EVL cells (see Manuscript for further detail)
plot(zf.all@pca.rot[,1],zf.all@pca.rot[,2],pch=16,xlab="PC1",ylab="PC2")

x=seq(-0.2,0.2,.01); lines(x,-x*0.5-0.04,lwd=2,lty=2,col="red")
evl.quant=zf.all@pca.rot[,1]+2*zf.all@pca.rot[,2]+0.08; names(evl.quant)=colnames(zf.all@data)
not.evl=names(evl.quant[evl.quant>0]); is.evl=names(evl.quant[evl.quant<0])

points(zf.all@pca.rot[is.evl,1],zf.all@pca.rot[is.evl,2],pch=16,col="red")

#subsetData allows you to create a new Seurat object with a subset of the data
zf=subsetData(zf.all,cells.use = not.evl)

zf
## An object of class seurat in project zfish
##  10945 genes across 851 samples.

Save the data

#So we can move through the tutorial in parts if needed
#Note that you can now send/e-mail/etc. the object file, enabling easy collaboration (receiver uses load)
save(zf,file="~/seurat_files/output_part1.Robj")