Getting Start
Examples of scPhyloX and steps are provided in Jupyter Notebook under notebooks folder. For start, please refer to records analyzing simulation, hek293T, Fly organs, Human HSC/MPPs and mouse CRC tumor datasets.
ScPhyloX on New Dataset
ScPhyloX provides an integrated function for Phylodynamics Inference by default whilst specific configurations might need to be adjusted accordingly.
1. import package
import scPhyloX as spx
2. Data preprocessing
2.1 Analysis with phylogenetic tree (rooted, with branch length)
from bio import Phylo
tree = Phylo.read('path_to_your_tree.nwk', format='newick')
depths = tree.get_depths()
lr_dist = [depths[i] for i in tree.get_terminals()]
lp_dist = [i.branch_length for i in tree.get_terminals()]
2.2 Analysis with character matrix
import pandas as pd
charater_matrix = pd.read_csv('path_to_your_mat.csv', index_col=0)
lr_dist = spx.data_factory.get_mutnum(charater_matrix)
lp_dist = spx.data_factory.get_branchlen(charater_matrix)
3. Estimation of mutation rate and cell generation
import arviz as az
idata_bl = spx.est_mr.mutation_rate_mcmc(lp_dist, draw=500, tune=500)
ge = spx.est_mr.GenerationEst(lr_dist, az.summary(idata_bl).loc['mu']['mean'])
gen_num = ge.estimate(cell_number) ## Cell number is the total number of cells in the study tissue, determined by prior knowledge.
4. Perform phylodynamics inference
4.1 Tissue development model
res_de = spx.est_tissue.para_inference_DE(gen_num, T=35) ## T is the time of tissue growth, unlimited time unit.
axh, bxh, rh, dh, kh, t0h, c0 = res_de[0][-1]
dh = 10**(-dh)
mcmc_prior = (axh, bxh, rh, dh, kh, t0h)
idata = spx.est_tissue.mcmc_inference(gen_num, mcmc_prior, T=35, c0=c0, sigma=100) ## sigma is hyper-parameter, determined by total cell number
4.2 Tumor growth model
res_de = spx.est_tumor.para_inference_DE(gen_num, T=35, c0=100)
rh, ah, sh, uh, c0 = res_de[0][-1]
uh = 10**(-uh)
mcmc_prior = (rh, ah, sh, uh)
idata = spx.est_tumor.mcmc_inference(gen_num, mcmc_prior, T=35, c0=c0, sigma=100)
5. Convergence analysis of MCMC
print(az.summary(idata_bl))
print(az.summary(idata))
fig = plt.figure(layout='constrained',figsize=(11,3))
gs = GridSpec(2, 3, figure=fig)
ax1 = fig.add_subplot(gs[:,0])
ls = 'solid,dotted,dashed,dashdot'.split(',')
for i, l in enumerate(ls):
sns.kdeplot(idata_bl.posterior['mu'].to_numpy()[i], linestyle=l, ax=ax1, label=f'Chain {i+1}')
ax1.vlines(2, 0, 1.7, color='black')
ax1.legend(fontsize=10,loc=2)
ax1.set_title(r'$\mu$ distribution')
ax2 = fig.add_subplot(gs[:,1])
for i, l in enumerate(ls):
sns.kdeplot(idata_bl.posterior['delta'].to_numpy()[i], linestyle=l, ax=ax2, label=f'Chain {i+1}')
ax2.set_title(r'$\delta$ distribution')
ax3 = fig.add_subplot(gs[0,2])
for i, l in enumerate(ls):
ax3.plot(idata_bl.posterior['mu'][i], linestyle=l, label=f'Chain {i+1}')
ax3.set_title('$\mu$ mcmc trace')
ax3.set_xlabel('step')
ax4 = fig.add_subplot(gs[1,2])
for i, l in enumerate(ls):
ax4.plot(idata_bl.posterior['delta'][i], linestyle=l, label=f'Chain {i+1}')
ax4.set_title(r'$\delta$ mcmc trace')
ax4.set_xlabel('step')
fig = plt.figure(figsize=(12,7))
plt.rcParams['font.size'] = 12
gt = [0.5, 0.6, 0.2, 1e-3]
gs = GridSpec(4, 3, figure=fig)
ls = 'solid.dotted.dashed.dashdot.solid'.split('.')
for ind, sym in enumerate('r,a,s,u'.split(',')):
if ind >= 2:
ax = fig.add_subplot(gs[2:, ind-2])
else:
ax = fig.add_subplot(gs[:2, ind])
for i, l in enumerate(ls):
sns.kdeplot(idata.posterior[sym].to_numpy()[i], linestyle=l, ax=ax, label=f'Chain {i+1}')
ax.legend(fontsize=10,loc=2)
ylim = ax.get_ylim()
ax.vlines(gt[ind], *ylim, color='black')
ax.set_ylim(ylim)
ax.set_title(fr'${sym}$ distribution')
ax0 = fig.add_subplot(gs[-1, 2])
for i, l in enumerate(ls):
ax0.plot(idata.posterior['u'][i], linestyle=l, label=f'Chain {i+1}')
ax0.set_title(fr'$u$ mcmc trace')
for ind, sym in enumerate('r,a,s'.split(',')):
ax = fig.add_subplot(gs[ind, 2], sharex=ax0)
for i, l in enumerate(ls):
ax.plot(idata.posterior[sym][i], linestyle=l, label=f'Chain {i+1}')
ax.set_title(fr'${sym}$ mcmc trace')
plt.setp(ax.get_xticklabels(), visible=False)
ax0.set_xlabel('step')
plt.tight_layout()
6. Phylodynamics inference results visualization
theta_h = az.summary(idata).loc['ax,bx,r,d,k,t0'.split(',')]['mean'].to_numpy()
x0 = [c0, 0]
n_stemcells = np.array([[spx.est_tissue.ncyc(i, j, c0, *theta_h) for j in range(T)] for i in range(max(100, len(gen_num)))])
n_nonstemcells = np.array([[spx.est_tissue.nnc(i, j, c0, *theta_h) for j in range(T)] for i in range(max(100, len(gen_num)))])
fig, ax = plt.subplots()
ax.set_xlabel('Time')
ax.set_ylabel('Cell number')
ax.plot(n_stemcells.sum(0), c='#9098d9', lw=4, label='stem cell')
ax.plot(n_nonstemcells.sum(0), c='#ed9e44', lw=4, label='Non-stem cell')
ax.ticklabel_format (style='sci', scilimits= (-1,2), axis='y')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend(loc=2)
ax.ticklabel_format (style='sci', scilimits= (-1,2), axis='y', useMathText=True)
[ ]: