The in situ patterns that we use to provide geographical information are scored in a binary on/off format. In order to translate the continuous RNAseq data into this form, we model it as mixtures of 2 normal distributions that represent the on state and off state. We then use this to estimate whether each cell should be considered on or off for each gene.
#load previous data object, enables you to start tutorial from Part 3.
load("~/seurat_files/output_part2.Robj")
insitu.genes=colnames(zf@insitu.matrix)
for(i in rev(insitu.genes)) zf=fit.gene.k(zf,i,do.plot=FALSE,do.k = 2,start.pct=mean(zf@insitu.matrix[,i]),num.iter = 1)
#show an example mixture model
par(mfrow=c(2,2))
zf_temp=fit.gene.k(zf,"SOX3",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"SOX3"]))
zf_temp=fit.gene.k(zf,"OSR1",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"OSR1"]))
zf_temp=fit.gene.k(zf,"BAMBIA",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"BAMBIA"]))
zf_temp=fit.gene.k(zf,"SEBOX",do.plot=TRUE,do.k = 2,start.pct=mean(zf@insitu.matrix[,"SEBOX"]))
#Perform an initial mapping of cells to bins
zf <- initial.mapping(zf)
#Now, perform the quantitative refinement procedure (see manuscript for details)
#first identify the genes to use for the refinement (top 3 genes in both directions for PC1-3)
num.pc=3; num.genes=6
genes.use=pcTopGenes(zf,pc.use = 1:num.pc,num.genes = num.genes,use.full = TRUE,do.balanced = TRUE)
#impute values for these genes if needed
new.imputed=genes.use[!genes.use%in%rownames(zf@imputed)]
lasso.genes.use=unique(c(zf@var.genes,pca.sig.genes(zf,pcs.use = c(1,2,3), pval.cut = 1e-2, use.full = TRUE,)))
zf <- addImputedScore(zf, genes.use=lasso.genes.use,genes.fit=new.imputed, do.print=FALSE, s.use=40, gram=FALSE)
#refine the mapping with quantitative models that also consider gene covariance
zf <- refined.mapping(zf,genes.use)
#view the raw mapping probabilities
corner(zf@final.prob)
## zf1_p7_S31_umi zf2_dis_S1238_umi zf1_p5_S91_umi zf2_dis_S322_umi
## 1 8.663956e-39 5.460885e-20 1.018919e-31 1.175099e-53
## 2 2.049824e-39 7.846050e-34 2.868368e-24 2.268786e-68
## 3 1.761869e-49 2.197068e-31 7.929891e-14 1.720273e-71
## 4 3.233612e-90 2.237164e-24 1.620428e-21 8.469051e-105
## 5 7.718742e-47 6.362071e-31 7.557316e-19 8.506210e-46
## zf2_dis_S1179_umi
## 1 2.094456e-84
## 2 1.890547e-84
## 3 2.359875e-97
## 4 9.393634e-157
## 5 4.739701e-61
#calculate centroids for each cell
zf.centroids=get.centroids(zf); colnames(zf.centroids)=c("bin.tier","bin.dv")
corner(zf.centroids)
## bin.tier bin.dv
## zf1_p7_S31_umi 7.908600 3.499695
## zf2_dis_S1238_umi 1.107902 3.002494
## zf1_p5_S91_umi 4.571245 6.522931
## zf2_dis_S322_umi 7.999830 3.220121
## zf2_dis_S1179_umi 7.999473 2.009662
#add this info into the Seurat object
#note that you can do this for any metaData info (alignment rate, external measurements for each cell, etc.)
zf=addMetaData(zf,zf.centroids)
#We can use genePlot to examine relationships between different types of variables
#Note that individual genes (or PCs) display strong, but noisy and non-linear, relationships with Seurat's mapping positions along both axes
par(mfrow=c(2,2))
genePlot(zf,"OSR1","bin.tier",cex.use = 1)
genePlot(zf,"BAMBIA","bin.dv",cex.use=1)
genePlot(zf,"PC1","bin.tier",cex.use=1)
genePlot(zf,"PC2","bin.dv",cex.use=1)