#Read in log-space expression matrix. Has been pre-computed and normalized (see manuscript for exact details)
#The data was run in three batches (zf1, zf2, zf3), which is denoted in the column name
zfish.data=read.table("~/seurat_files/zdata.matrix.txt",sep="\t",header=TRUE)
#Due to the high dynamic range in RNA-seq, transform data to log-space. Not required for Seurat, but highly recommended
zfish.log=log(zfish.data+1)
#Create and setup the Seurat object. Include all genes detected in > 3 cells (expression >0.01), and all cells with > 2k genes
#Cells will be initially assigned to an identity class (grouping) based on the first field (underscore-delimited)
zf.all=new("seurat",raw.data=zfish.log)
zf.all=setup(zf.all,project="zfish",min.cells = 3,min.genes = 2000,is.expr=0.01,names.field = 1,names.delim = "_")
zf.all
## An object of class seurat in project zfish
## 10945 genes across 945 samples.
#View the expression of genes and basic metrics across samples
vlnPlot(zf.all,c("ACTB2","VENT","CHD","nGene"))
#Plot two cells against each other
#Set do.ident=TRUE to identify outliers by clicking on points (ESC to exit after)
par(mfrow=c(2,2))
cellPlot(zf.all,zf.all@cell.names[1],zf.all@cell.names[2],do.ident = FALSE)
cellPlot(zf.all,zf.all@cell.names[3],zf.all@cell.names[4],do.ident = FALSE)
#Plot two genes against each other, can do this in limited groups of cells
genePlot(zf.all,"MIXL1","OSR1",cex.use = 1)
genePlot(zf.all,"MIXL1","SOX3",cell.ids = which.cells(zf.all,"zf1"),cex.use = 1)
#Genes placed into 20 bins based on X-axis (average expression). Y-axis is within-bin z-score of log(Variance/mean).
zf.all=mean.var.plot(zf.all,y.cutoff = 2,do.plot=TRUE,x.low.cutoff=0.25,x.high.cutoff=7,fxn.x = expMean,fxn.y=logVarDivMean)
#remove markers which primarily define batch
markers.remove=batch.gene(zf.all,idents.use = c("zf1","zf2","zf3"),genes.use=zf.all@var.genes,auc.cutoff = 0.7)
zf.all@var.genes=zf.all@var.genes[!(zf.all@var.genes%in%markers.remove)]
length(zf.all@var.genes)
## [1] 160
#To demonstrate, run PCA on all genes (note that the result isn't great, PC1/2 contain little known biology)
zf.all=pca(zf.all,pc.genes = rownames(zf.all@data),pcs.print = 3,genes.print = 5)
## [1] "PC1"
## [1] "CNBPA" "SNRPD2" "PARN" "NOP56" "HN1L"
## [1] ""
## [1] "LYRM2" "NRXN1B" "WDR38" "SAT1A.2" "PGR" "ST3GAL4"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "NANOG" "ACP5A" "RBM4.3" "EPCAM" "PFN2"
## [1] ""
## [1] "RPS13" "RPS19" "HMGB2B" "MARCKSL1B" "DYNLL1" "EEF1A1L1"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "H3F3D" "FXYD6" "CX43" "OSR1" "C1QB"
## [1] ""
## [1] "CXCR4B" "SOX3" "ZIC2B" "ALDOB" "SOX19A" "FAM212AA"
## [1] ""
## [1] ""
#Instead, run PCA on zf.all@var.genes (default value for pc.genes).
zf.all=pca(zf.all,pcs.print = 3,genes.print = 5)
## [1] "PC1"
## [1] "SOX3" "ID1" "ALCAMB" "CXCR4A" "SULT6B1"
## [1] ""
## [1] "OSR1" "MIXL1" "DKK1B" "KIRREL3L" "ZIC2A" "APLNRB"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "KRT18" "KRT4" "WU:FB15G10" "KRT8" "CEBPB"
## [1] ""
## [1] "ID1" "SOX3" "CXCR4A" "SP5L" "NNR" "RGCC"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "FRZB" "CHD" "GSC" "OTX1A" "SIX3B"
## [1] ""
## [1] "VED" "EVE1" "HES6" "APOC1L" "WNT8A" "WNT8-2"
## [1] ""
## [1] ""
pca.plot(zf.all,pt.size = 2)
#If desired, project PCA structure across entire dataset (so all genes are included)
zf.all=project.pca(zf.all, do.print = FALSE)
#visualize top genes for each PC, use.full selects the projected PCA. See also print.pca
#Explicit thanks to Olga Botvinnik, who showed me this visualization in her Flotilla package
viz.pca(zf.all,pcs.use = 1:3,num.genes = 10,use.full = TRUE,nCol = 3)
#Identify EVL cells (see Manuscript for further detail)
plot(zf.all@pca.rot[,1],zf.all@pca.rot[,2],pch=16,xlab="PC1",ylab="PC2")
x=seq(-0.2,0.2,.01); lines(x,-x*0.5-0.04,lwd=2,lty=2,col="red")
evl.quant=zf.all@pca.rot[,1]+2*zf.all@pca.rot[,2]+0.08; names(evl.quant)=colnames(zf.all@data)
not.evl=names(evl.quant[evl.quant>0]); is.evl=names(evl.quant[evl.quant<0])
points(zf.all@pca.rot[is.evl,1],zf.all@pca.rot[is.evl,2],pch=16,col="red")
#subsetData allows you to create a new Seurat object with a subset of the data
zf=subsetData(zf.all,cells.use = not.evl)
zf
## An object of class seurat in project zfish
## 10945 genes across 851 samples.
Save the data
#So we can move through the tutorial in parts if needed
#Note that you can now send/e-mail/etc. the object file, enabling easy collaboration (receiver uses load)
save(zf,file="~/seurat_files/output_part1.Robj")