Identify genes to use for building gene expression models

# Restore the object saved at the end of Part 1, so the tutorial can be started from Part 2.
load("~/seurat_files/output_part1.Robj")

# The EVL cells have been removed, so recompute the set of variable genes.
zf <- mean.var.plot(
  zf,
  y.cutoff = 2,
  do.plot = FALSE,
  x.low.cutoff = 1,
  x.high.cutoff = 7,
  fxn.x = expMean,
  fxn.y = logVarDivMean,
  set.var.genes = TRUE
)

# Drop from the variable genes anything batch.gene() returns for the zf1/zf2/zf3 identities.
markers.remove <- batch.gene(
  zf,
  idents.use = c("zf1", "zf2", "zf3"),
  genes.use = zf@var.genes
)
zf@var.genes <- zf@var.genes[!(zf@var.genes %in% markers.remove)]

# Redo the PCA on the updated variable genes.
zf <- pca(zf, do.print = FALSE)

# Run a 'random' PCA 1,000 times, scrambling a random 2.5% of the data each time.
# This identifies statistically significant PCs (in this case, 1:3) and the genes
# with significant PC scores.
zf <- jackStraw(zf, num.replicate = 1000, prop.freq = 0.025)
jackStrawPlot(zf)

# Project the PCA, then collect the genes with significant scores on PCs 1-3.
zf <- project.pca(zf, do.print = FALSE, do.center = FALSE)
genes.sig <- pca.sig.genes(
  zf,
  pcs.use = c(1, 2, 3),
  pval.cut = 1e-2,
  use.full = TRUE
)

# Two PCA scatter plots (default PCs, and PC1 vs PC3) shown side by side.
plot.1 <- pca.plot(zf, do.return = TRUE)
plot.2 <- pca.plot(zf, 1, 3, do.return = TRUE)
multiplotList(list(plot.1, plot.2), cols = 2)

# Visualise 10 genes for each of PCs 1-3, laid out in three columns.
viz.pca(zf, pcs.use = 1:3, num.genes = 10, nCol = 3)


Build models of gene expression

Matrices of gene expression were generated from published in situ stainings, and saved in an Excel file (which eases data entry). So, we import this data and add it to the Seurat object.

# Load in the Excel file; the sheet names are used as the landmark gene names.
# NOTE(review): loadWorkbook()/getSheets()/wb[i] look like the XLConnect API -
# confirm that package is attached before this chunk runs.
wb <- loadWorkbook("~/seurat_files/Spatial_ReferenceMap.xlsx", create = FALSE)
insitu.genes <- getSheets(wb)

# Flatten rows 2-9 x columns 2-9 of every sheet into one numeric column
# (8 x 8 = 64 values per gene). seq_along() avoids the 1:0 trap of
# 1:length() on an empty workbook, and vapply() guarantees a 64-row numeric
# result, failing loudly if a sheet does not yield exactly 64 values.
insitu.matrix <- data.frame(
  vapply(
    seq_along(insitu.genes),
    function(x) as.numeric(as.matrix(wb[x][2:9, 2:9])),
    numeric(64)
  )
)
insitu.genes <- toupper(insitu.genes)
colnames(insitu.matrix) <- insitu.genes

# Then, we store this information in the Seurat object.
# drop = FALSE keeps a data frame even when the workbook has a single sheet.
zf@insitu.matrix <- insitu.matrix[, insitu.genes, drop = FALSE]

Now build models for these in situ genes and predict robust expression values

# We fit models for the landmark genes using the 'structured' genes
# (those with significant PCA scores) together with the variable genes.
lasso.genes.use <- unique(c(genes.sig, zf@var.genes))

zf <- addImputedScore(
  zf,
  genes.use = lasso.genes.use,
  genes.fit = insitu.genes,
  do.print = FALSE,
  s.use = 40,
  gram = FALSE
)

Demonstrate the benefit of imputation

# Two panels side by side: MIXL1 vs OSR1 before and after imputation.
# The two genes should be tightly co-expressed.
par(mfrow = c(1, 2))

# Left panel: before imputation.
genePlot(zf, "MIXL1", "OSR1", col = "black", cex.use = 1)

# Right panel: after imputation.
genePlot(zf, "MIXL1", "OSR1", use.imputed = TRUE, col = "black", cex.use = 1)

Save the data so we can move on to the next Rmd file without having to recompute it

# Write the zf object to disk as the Part 2 output.
save(list = "zf", file = "~/seurat_files/output_part2.Robj")