从AlphaFold3的`full_data.json`到互作热图:手把手教你用Python解析并可视化接触概率矩阵

张开发
2026/4/15 10:32:03 15 分钟阅读

分享文章

从AlphaFold3的`full_data.json`到互作热图:手把手教你用Python解析并可视化接触概率矩阵
从AlphaFold3的full_data.json到互作热图Python解析与可视化接触概率矩阵全指南1. 理解AlphaFold3输出中的接触概率矩阵AlphaFold3作为蛋白质结构预测领域的里程碑工具其输出数据中隐藏着远比静态结构更丰富的信息。full_data.json文件中的contact_probs矩阵便是其中最具价值的宝藏之一——它记录了模型对残基间相互作用的概率预测为研究者提供了传统结构坐标无法提供的动态视角。接触概率矩阵的核心价值体现在三个维度置信度量化不同于简单的距离阈值判断如8Å每个数值代表模型对残基接触的确信程度0-1范围动态互作网络矩阵整体呈现蛋白质内部及分子间的潜在相互作用网络柔性区域识别低概率区域0.2-0.5可能指示构象动态性或预测不确定性import numpy as np contact_probs np.load(full_data.json)[contact_probs] print(f矩阵维度{contact_probs.shape} | 数值范围{contact_probs.min():.2f}-{contact_probs.max():.2f})典型接触概率矩阵的特征特征生物学意义可视化表现对角线高值残基自身接触对角线亮带邻近残基高值二级结构连续邻近对角线的亮区远距残基高值三级结构互作非对角线的离散亮点整体低值区柔性/无序区域大面积暗区2. 数据预处理从JSON到结构化数组原始JSON文件需要经过系统化处理才能用于分析。关键步骤包括2.1 数据加载与校验import json import numpy as np def load_contact_probs(json_path): with open(json_path) as f: data json.load(f) # 关键字段验证 required_fields [contact_probs, token_chain_ids, token_res_ids] if not all(field in data for field in required_fields): raise ValueError(JSON文件缺少必要字段) # 转换为NumPy数组 contact_matrix np.array(data[contact_probs]) chain_ids np.array(data[token_chain_ids]) res_ids np.array(data[token_res_ids]) return contact_matrix, chain_ids, res_ids2.2 矩阵标准化处理接触概率矩阵常需以下预处理def preprocess_matrix(matrix): # 对角线归零避免自接触干扰可视化 np.fill_diagonal(matrix, 0) # 可选高斯平滑减少噪声 from scipy.ndimage import gaussian_filter smoothed gaussian_filter(matrix, sigma0.7) # 数据缩放适应不同可视化工具 normalized (matrix - matrix.min()) / (matrix.max() - matrix.min()) return normalized2.3 链特异性数据提取def extract_chain_data(matrix, chain_ids, target_chain): chain_mask (chain_ids target_chain) intra_chain matrix[chain_mask, :][:, chain_mask] inter_chain matrix[chain_mask, :][:, ~chain_mask] return intra_chain, inter_chain3. 基础可视化热图绘制技巧3.1 Matplotlib基础热图import matplotlib.pyplot as plt def plot_basic_heatmap(matrix, titleContact Probability Matrix): plt.figure(figsize(10,8)) im plt.imshow(matrix, cmapviridis, originlower) plt.colorbar(im, labelContact Probability) plt.title(title) plt.xlabel(Residue Index) plt.ylabel(Residue Index) plt.show()3.2 增强型热图带链注释def plot_annotated_heatmap(matrix, chain_ids, res_ids): fig, ax plt.subplots(figsize(12,10)) # 主热图 im ax.imshow(matrix, cmapplasma, interpolationnearest, originlower) # 添加链分隔线 unique_chains np.unique(chain_ids) chain_boundaries np.where(np.diff(chain_ids))[0] 1 for boundary in chain_boundaries: ax.axhline(boundary-0.5, colorwhite, linestyle--, linewidth0.5) ax.axvline(boundary-0.5, colorwhite, linestyle--, linewidth0.5) # 添加颜色条 cbar fig.colorbar(im, axax, shrink0.8) cbar.set_label(Contact Probability, rotation270, labelpad20) # 设置坐标轴 ax.set_xlabel(Residue Index, fontsize12) ax.set_ylabel(Residue Index, fontsize12) ax.set_title(Annotated Contact Probability Matrix, pad20) # 添加链标签 for i, chain in enumerate(unique_chains): chain_pos np.mean(np.where(chain_ids chain)[0]) ax.text(-0.05, chain_pos, chain, haright, vacenter, transformax.get_yaxis_transform(), fontsize10) ax.text(chain_pos, -0.05, chain, hacenter, vatop, transformax.get_xaxis_transform(), fontsize10) plt.tight_layout() plt.show()3.3 交互式热图Plotlyimport plotly.graph_objects as go def interactive_contact_map(matrix, chain_ids): fig go.Figure(datago.Heatmap( zmatrix, colorscaleViridis, hoverongapsFalse, xchain_ids, ychain_ids )) fig.update_layout( titleInteractive Contact Probability Map, xaxis_titleResidue Chain Index, yaxis_titleResidue Chain Index, width800, height700 ) fig.show()4. 高级分析技术4.1 接触网络构建import networkx as nx def build_contact_network(matrix, threshold0.7): 将接触概率矩阵转换为图网络 G nx.Graph() n_residues matrix.shape[0] # 添加节点 G.add_nodes_from(range(n_residues)) # 添加边仅保留高概率接触 for i in range(n_residues): for j in range(i1, n_residues): if matrix[i,j] threshold: G.add_edge(i, j, weightmatrix[i,j]) return G def plot_contact_network(G, chain_ids): pos nx.spring_layout(G, seed42) node_colors [plt.cm.tab20(i%20) for i in range(len(chain_ids))] plt.figure(figsize(12,10)) nx.draw_networkx_nodes(G, pos, node_size50, node_colornode_colors) nx.draw_networkx_edges(G, pos, alpha0.2) plt.title(Residue Contact Network) plt.show()4.2 8Å阈值分析def analyze_threshold(matrix, thresholdsnp.linspace(0.1, 0.9, 9)): results [] for thresh in thresholds: n_contacts np.sum(matrix thresh) density n_contacts / (matrix.size - len(matrix)) results.append((thresh, n_contacts, density)) # 绘制阈值敏感性曲线 fig, ax1 plt.subplots() ax1.plot([r[0] for r in results], [r[1] for r in results], b-) ax1.set_xlabel(Probability Threshold) ax1.set_ylabel(Number of Contacts, colorb) ax2 ax1.twinx() ax2.plot([r[0] for r in results], [r[2] for r in results], r--) ax2.set_ylabel(Contact Density, colorr) plt.title(Threshold Sensitivity Analysis) plt.show() return results4.3 与实验结构比对def compare_with_experimental(pred_matrix, exp_distances, cutoff8.0): 将预测接触概率与实验距离比对 exp_contacts (exp_distances cutoff).astype(float) # 计算性能指标 from sklearn.metrics import roc_auc_score auroc roc_auc_score(exp_contacts.flatten(), pred_matrix.flatten()) # 绘制ROC曲线 from sklearn.metrics import roc_curve fpr, tpr, _ roc_curve(exp_contacts.flatten(), pred_matrix.flatten()) plt.plot(fpr, tpr, labelfAUROC {auroc:.3f}) plt.plot([0,1], [0,1], k--) plt.xlabel(False Positive Rate) plt.ylabel(True Positive Rate) plt.title(ROC Curve: Predicted vs Experimental Contacts) plt.legend() plt.show() return auroc5. 出版级可视化优化5.1 复合热图示例def publication_quality_plot(matrix, chain_ids): # 创建图形和网格布局 fig plt.figure(figsize(15,12)) gs fig.add_gridspec(2, 2, width_ratios[1,0.05], height_ratios[1,0.2], hspace0.1, wspace0.05) # 主热图 ax fig.add_subplot(gs[0,0]) im ax.imshow(matrix, cmapcoolwarm, aspectauto, originlower, vmin0, vmax1) # 添加链分隔线 for boundary in np.where(np.diff(chain_ids))[0]: ax.axhline(boundary0.5, colorblack, linewidth1, linestyle--) ax.axvline(boundary0.5, colorblack, linewidth1, linestyle--) # 颜色条 cax fig.add_subplot(gs[0,1]) cbar fig.colorbar(im, caxcax) cbar.set_label(Contact Probability, rotation270, labelpad20) # 添加链注释 ax_anno fig.add_subplot(gs[1,0]) unique_chains np.unique(chain_ids) colors plt.cm.Paired(np.linspace(0,1,len(unique_chains))) for i, chain in enumerate(unique_chains): mask (chain_ids chain) ax_anno.bar(np.where(mask)[0], np.ones(sum(mask)), colorcolors[i], labelchain) ax_anno.set_xlim(ax.get_xlim()) ax_anno.set_yticks([]) ax_anno.legend(locupper center, bbox_to_anchor(0.5,1.5), ncollen(unique_chains), frameonFalse) # 美化 ax.set_ylabel(Residue Index) ax.set_xlabel(Residue Index) ax.set_title(Inter-Residue Contact Probabilities, pad20) plt.savefig(contact_matrix_publication.png, dpi300, bbox_inchestight) plt.show()5.2 3D结构映射def plot_3d_contacts(pdb_file, contact_pairs, threshold0.7): 在3D结构中高亮显示重要接触 from Bio.PDB import PDBParser from mpl_toolkits.mplot3d import Axes3D # 解析PDB结构 parser PDBParser() structure parser.get_structure(protein, pdb_file) # 提取CA原子坐标 ca_atoms [] for model in structure: for chain in model: for residue in chain: if CA in residue: ca_atoms.append(residue[CA].get_coord()) ca_coords np.array(ca_atoms) # 创建3D图 fig plt.figure(figsize(12,10)) ax fig.add_subplot(111, projection3d) # 绘制蛋白质骨架 ax.plot(ca_coords[:,0], ca_coords[:,1], ca_coords[:,2], gray, alpha0.3, linewidth1) # 绘制高概率接触 for i,j,prob in contact_pairs: if prob threshold: ax.plot([ca_coords[i,0], ca_coords[j,0]], [ca_coords[i,1], ca_coords[j,1]], [ca_coords[i,2], ca_coords[j,2]], colorplt.cm.plasma(prob), linewidth2*prob) # 添加颜色条 sm plt.cm.ScalarMappable(cmapplasma, normplt.Normalize(vminthreshold, vmax1)) sm.set_array([]) cbar fig.colorbar(sm, axax, shrink0.8) cbar.set_label(Contact Probability, rotation270, labelpad20) ax.set_title(3D Visualization of High-Probability Contacts) plt.tight_layout() plt.show()6. 实战案例从数据到生物学洞见案例1蛋白质-配体相互作用分析def analyze_protein_ligand_interaction(contact_matrix, chain_ids, protein_chainA, ligand_chainB): # 提取蛋白-配体互作子矩阵 protein_mask (chain_ids protein_chain) ligand_mask (chain_ids ligand_chain) interaction_matrix contact_matrix[protein_mask, :][:, ligand_mask] # 找出高概率互作残基 threshold 0.6 high_prob_interactions np.where(interaction_matrix threshold) # 生成结果报告 print(f发现 {len(high_prob_interactions[0])} 个高概率互作({threshold})) for prot_idx, lig_idx in zip(*high_prob_interactions): prot_res prot_idx 1 # 假设残基索引从1开始 lig_res lig_idx 1 prob interaction_matrix[prot_idx, lig_idx] print(f蛋白质残基 {prot_res} - 配体残基 {lig_res}: 概率{prob:.2f}) # 可视化互作谱 plt.figure(figsize(10,4)) plt.imshow(interaction_matrix.T, cmaphot, aspectauto) plt.colorbar(labelInteraction Probability) plt.xlabel(Protein Residue Index) plt.ylabel(Ligand Residue Index) plt.title(fProtein-Ligand Interaction Profile (Chain {protein_chain} vs {ligand_chain})) plt.show()案例2突变效应预测def predict_mutation_effect(contact_matrix, chain_ids, wild_type, mutants): 预测突变对接触网络的影响 # 获取野生型残基索引 wt_chain, wt_pos wild_type[0], int(wild_type[1:])-1 # 假设格式如A123 wt_idx np.where((chain_ids wt_chain) (np.arange(len(chain_ids)) wt_pos))[0][0] # 分析野生型接触 wt_contacts contact_matrix[wt_idx, :] strong_contacts np.where(wt_contacts 0.7)[0] print(f野生型 {wild_type} 有 {len(strong_contacts)} 个强接触:) for contact_idx in strong_contacts: contact_chain chain_ids[contact_idx] contact_pos contact_idx - np.where(chain_ids contact_chain)[0][0] 1 print(f- {contact_chain}{contact_pos} (概率{wt_contacts[contact_idx]:.2f})) # 模拟突变效应简化示例 print(\n突变效应预测:) for mut in mutants: # 这里应使用更精细的能量函数此处仅为示例 effect 破坏性 if np.random.rand() 0.5 else 中性/稳定 print(f{wild_type}→{mut}: {effect})案例3多结构域蛋白质分析def analyze_domain_interaction(contact_matrix, chain_ids, domain_boundaries): 分析多结构域蛋白质的域间互作 # 创建域掩码 domains {} for domain, (start, end) in domain_boundaries.items(): domains[domain] (chain_ids A) (np.arange(len(chain_ids)) start) (np.arange(len(chain_ids)) end) # 计算域内/域间接触密度 results {} for dom1 in domains: for dom2 in domains: submatrix contact_matrix[domains[dom1], :][:, domains[dom2]] avg_prob np.mean(submatrix) results[f{dom1}-{dom2}] avg_prob # 可视化域互作网络 import networkx as nx G nx.Graph() for domain in domains: G.add_node(domain) for (dom1, dom2), prob in results.items(): if dom1 ! dom2 and prob 0.2: # 仅显示显著域间互作 G.add_edge(dom1, dom2, weightprob*10) plt.figure(figsize(8,6)) pos nx.spring_layout(G) nx.draw_networkx_nodes(G, pos, node_size2000, node_colorlightblue) nx.draw_networkx_edges(G, pos, width[d[weight] for _,_,d in G.edges(dataTrue)]) nx.draw_networkx_labels(G, pos, font_size12) plt.title(Inter-Domain Interaction Network) plt.axis(off) plt.show() return results

更多文章