Getting Start

Examples of scPhyloX and steps are provided in Jupyter Notebook under notebooks folder. For start, please refer to records analyzing simulation, hek293T, Fly organs, Human HSC/MPPs and mouse CRC tumor datasets.

ScPhyloX on New Dataset

ScPhyloX provides an integrated function for Phylodynamics Inference by default whilst specific configurations might need to be adjusted accordingly.

1. import package

import scPhyloX as spx

2. Data preprocessing

2.1 Analysis with phylogenetic tree (rooted, with branch length)

from bio import Phylo
tree = Phylo.read('path_to_your_tree.nwk', format='newick')
depths = tree.get_depths()
lr_dist = [depths[i] for i in tree.get_terminals()]
lp_dist = [i.branch_length for i in tree.get_terminals()]

2.2 Analysis with character matrix

import pandas as pd
charater_matrix = pd.read_csv('path_to_your_mat.csv', index_col=0)
lr_dist = spx.data_factory.get_mutnum(charater_matrix)
lp_dist = spx.data_factory.get_branchlen(charater_matrix)

3. Estimation of mutation rate and cell generation

import arviz as az
idata_bl = spx.est_mr.mutation_rate_mcmc(lp_dist, draw=500, tune=500)
ge = spx.est_mr.GenerationEst(lr_dist, az.summary(idata_bl).loc['mu']['mean'])
gen_num = ge.estimate(cell_number) ## Cell number is the total number of cells in the study tissue, determined by prior knowledge.

4. Perform phylodynamics inference

4.1 Tissue development model

res_de = spx.est_tissue.para_inference_DE(gen_num, T=35) ## T is the time of tissue growth, unlimited time unit.
axh, bxh, rh, dh, kh, t0h, c0 = res_de[0][-1]
dh = 10**(-dh)
mcmc_prior = (axh, bxh, rh, dh, kh, t0h)
idata = spx.est_tissue.mcmc_inference(gen_num, mcmc_prior, T=35, c0=c0, sigma=100) ## sigma is hyper-parameter, determined by total cell number

4.2 Tumor growth model

res_de = spx.est_tumor.para_inference_DE(gen_num, T=35, c0=100)
rh, ah, sh, uh, c0 = res_de[0][-1]
uh = 10**(-uh)
mcmc_prior = (rh, ah, sh, uh)
idata = spx.est_tumor.mcmc_inference(gen_num, mcmc_prior, T=35, c0=c0, sigma=100)

5. Convergence analysis of MCMC

print(az.summary(idata_bl))
print(az.summary(idata))
fig = plt.figure(layout='constrained',figsize=(11,3))
gs = GridSpec(2, 3, figure=fig)
ax1 = fig.add_subplot(gs[:,0])
ls = 'solid,dotted,dashed,dashdot'.split(',')
for i, l in enumerate(ls):
    sns.kdeplot(idata_bl.posterior['mu'].to_numpy()[i], linestyle=l, ax=ax1, label=f'Chain {i+1}')
ax1.vlines(2, 0, 1.7, color='black')
ax1.legend(fontsize=10,loc=2)
ax1.set_title(r'$\mu$ distribution')

ax2 = fig.add_subplot(gs[:,1])
for i, l in enumerate(ls):
    sns.kdeplot(idata_bl.posterior['delta'].to_numpy()[i], linestyle=l, ax=ax2, label=f'Chain {i+1}')
ax2.set_title(r'$\delta$ distribution')
ax3 = fig.add_subplot(gs[0,2])
for i, l in enumerate(ls):
    ax3.plot(idata_bl.posterior['mu'][i], linestyle=l, label=f'Chain {i+1}')
ax3.set_title('$\mu$ mcmc trace')
ax3.set_xlabel('step')
ax4 = fig.add_subplot(gs[1,2])
for i, l in enumerate(ls):
    ax4.plot(idata_bl.posterior['delta'][i], linestyle=l, label=f'Chain {i+1}')
ax4.set_title(r'$\delta$ mcmc trace')
ax4.set_xlabel('step')

fig = plt.figure(figsize=(12,7))
plt.rcParams['font.size'] = 12
gt = [0.5, 0.6, 0.2, 1e-3]
gs = GridSpec(4, 3, figure=fig)
ls = 'solid.dotted.dashed.dashdot.solid'.split('.')
for ind, sym in enumerate('r,a,s,u'.split(',')):
    if ind >= 2:
        ax = fig.add_subplot(gs[2:, ind-2])
    else:
        ax = fig.add_subplot(gs[:2, ind])
    for i, l in enumerate(ls):
        sns.kdeplot(idata.posterior[sym].to_numpy()[i], linestyle=l, ax=ax, label=f'Chain {i+1}')
    ax.legend(fontsize=10,loc=2)
    ylim = ax.get_ylim()
    ax.vlines(gt[ind], *ylim, color='black')
    ax.set_ylim(ylim)
    ax.set_title(fr'${sym}$ distribution')

ax0 = fig.add_subplot(gs[-1, 2])
for i, l in enumerate(ls):
    ax0.plot(idata.posterior['u'][i], linestyle=l, label=f'Chain {i+1}')
    ax0.set_title(fr'$u$ mcmc trace')
for ind, sym in enumerate('r,a,s'.split(',')):
    ax = fig.add_subplot(gs[ind, 2], sharex=ax0)
    for i, l in enumerate(ls):
        ax.plot(idata.posterior[sym][i], linestyle=l, label=f'Chain {i+1}')
        ax.set_title(fr'${sym}$ mcmc trace')
    plt.setp(ax.get_xticklabels(), visible=False)
ax0.set_xlabel('step')
plt.tight_layout()

6. Phylodynamics inference results visualization

theta_h = az.summary(idata).loc['ax,bx,r,d,k,t0'.split(',')]['mean'].to_numpy()
x0 = [c0, 0]
n_stemcells = np.array([[spx.est_tissue.ncyc(i, j, c0, *theta_h) for j in range(T)] for i in range(max(100, len(gen_num)))])
n_nonstemcells = np.array([[spx.est_tissue.nnc(i, j, c0, *theta_h) for j in range(T)] for i in range(max(100, len(gen_num)))])
fig, ax = plt.subplots()
ax.set_xlabel('Time')
ax.set_ylabel('Cell number')
ax.plot(n_stemcells.sum(0), c='#9098d9', lw=4, label='stem cell')
ax.plot(n_nonstemcells.sum(0), c='#ed9e44', lw=4, label='Non-stem cell')
ax.ticklabel_format (style='sci', scilimits= (-1,2), axis='y')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend(loc=2)
ax.ticklabel_format (style='sci', scilimits= (-1,2), axis='y', useMathText=True)
[ ]: