Build Mixture models of Gene Expression

The in situ patterns that we use to provide geographical information are scored in a binary on/off format. In order to translate the continuous RNAseq data into this form, we model it as mixtures of 2 normal distributions that represent the on state and off state. We then use this to estimate whether each cell should be considered on or off for each gene.

#load previous data object, enables you to start tutorial from Part 3.
load("~/seurat_files/output_part2.Robj")

insitu.genes=colnames(zf@insitu.matrix)
for(i in rev(insitu.genes)) zf=fit.gene.k(zf,i,do.plot=FALSE,do.k = 2,start.pct=mean(zf@insitu.matrix[,i]),num.iter = 1)

#show an example mixture model
par(mfrow=c(2,2))
zf_temp=fit.gene.k(zf,"SOX3",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"SOX3"]))
zf_temp=fit.gene.k(zf,"OSR1",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"OSR1"]))
zf_temp=fit.gene.k(zf,"BAMBIA",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"BAMBIA"]))
zf_temp=fit.gene.k(zf,"SEBOX",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"SEBOX"]))


Project each cell into its proper location

#Perform an initial mapping of cells to bins
zf <- initial.mapping(zf)

#Now, perform the quantitative refinement procedure (see manuscript for details)

#first identify the genes to use for the refinement (top 3 genes in both directions for PC1-3)
num.pc=3; num.genes=6
genes.use=pcTopGenes(zf,pc.use = 1:num.pc,num.genes = num.genes,use.full = TRUE,do.balanced = TRUE)

#impute values for these genes if needed
new.imputed=genes.use[!genes.use%in%rownames(zf@imputed)]
lasso.genes.use=unique(c(zf@var.genes,pca.sig.genes(zf,pcs.use = c(1,2,3), pval.cut = 1e-2, use.full = TRUE,)))
zf <- addImputedScore(zf, genes.use=lasso.genes.use,genes.fit=new.imputed, do.print=FALSE, s.use=40, gram=FALSE)

#refine the mapping with quantitative models that also consider gene covariance
zf <- refined.mapping(zf,genes.use)

Brief analysis of the mapped cells

#view the raw mapping probabilities
corner(zf@final.prob)
##   zf1_p7_S31_umi zf2_dis_S1238_umi zf1_p5_S91_umi zf2_dis_S322_umi
## 1   8.663956e-39      5.460885e-20   1.018919e-31     1.175099e-53
## 2   2.049824e-39      7.846050e-34   2.868368e-24     2.268786e-68
## 3   1.761869e-49      2.197068e-31   7.929891e-14     1.720273e-71
## 4   3.233612e-90      2.237164e-24   1.620428e-21    8.469051e-105
## 5   7.718742e-47      6.362071e-31   7.557316e-19     8.506210e-46
##   zf2_dis_S1179_umi
## 1      2.094456e-84
## 2      1.890547e-84
## 3      2.359875e-97
## 4     9.393634e-157
## 5      4.739701e-61
#calculate centroids for each cell
zf.centroids=get.centroids(zf); colnames(zf.centroids)=c("bin.tier","bin.dv")
corner(zf.centroids)
##                   bin.tier   bin.dv
## zf1_p7_S31_umi    7.908600 3.499695
## zf2_dis_S1238_umi 1.107902 3.002494
## zf1_p5_S91_umi    4.571245 6.522931
## zf2_dis_S322_umi  7.999830 3.220121
## zf2_dis_S1179_umi 7.999473 2.009662
#add this info into the Seurat object
#note that you can do this for any metaData info (alignment rate, external measurements for each cell, etc.)
zf=addMetaData(zf,zf.centroids)

#We can use genePlot to examine relationships between different types of variables
#Note that individual genes (or PCs) display strong, but noisy and non-linear, relationships with Seurat's mapping positions along both axes
par(mfrow=c(2,2))
genePlot(zf,"OSR1","bin.tier",cex.use = 1)
genePlot(zf,"BAMBIA","bin.dv",cex.use=1)
genePlot(zf,"PC1","bin.tier",cex.use=1)
genePlot(zf,"PC2","bin.dv",cex.use=1)

Save the data so we can move to the next RMD without having to Reload the data

save(zf,file="~/seurat_files/output_part3.Robj")