diff --git a/surface_analyses/commandline_electrostatic.py b/surface_analyses/commandline_electrostatic.py index a0f6b0a..9c5cd72 100644 --- a/surface_analyses/commandline_electrostatic.py +++ b/surface_analyses/commandline_electrostatic.py @@ -55,6 +55,7 @@ def parse_args(argv=None): parser.add_argument('--apbs_dir', help="Directory in which intermediate files are stored when running APBS. Will be created if it does not exist.", type=str, default=None) parser.add_argument('--probe_radius', type=float, help='Probe radius in nm', default=0.14) parser.add_argument('-o', '--out', default=None, type=str, help='Output csv file.') + parser.add_argument('-ro', '--resout', default=None, type=str, help='Residue csv file.') parser.add_argument( '-c', '--patch_cutoff', type=float, @@ -148,6 +149,7 @@ def run_electrostatics( apbs_dir: str = None, probe_radius: float = 0.14, out: str = None, + resout: str = None, patch_cutoff: tuple = (2., -2.), integral_cutoff: tuple = (0.3, -0.3), surface_type: str = "sas", @@ -174,6 +176,11 @@ def run_electrostatics( else: csv_outfile = open(out, "w") + if resout is None: + res_outfile = sys.stdout + else: + res_outfile = open(resout, "w") + ion_species = get_ion_species(ion_species) # Renumber residues, takes care of insertion codes in PDB residue numbers for i, res in enumerate(traj.top.residues,start=1): @@ -237,27 +244,28 @@ def run_electrostatics( # The patch searching print('Finding patches') - # the griddata object is in Angstroms - values = griddata.interpolate(columns, surf.vertices * 10)[columns[0]] - patches = pd.DataFrame({ - 'positive': assign_patches(surf.faces, values > patch_cutoff[0]), - 'negative': assign_patches(surf.faces, values < patch_cutoff[1]), - 'area': vert_areas, - 'atom': closest_atom, - 'residue': residues[closest_atom], - 'cdr': np.isin(residues, cdrs)[closest_atom], - 'value': values, - }) + values = griddata.interpolate(columns, surf.vertices * 10)[columns[0]] + # save values and atom in surf for consistency with commandline_hydrophobic - surf['values'] = values + surf['positive'] = assign_patches(surf.faces, values > patch_cutoff[0]) + neg_patches = assign_patches(surf.faces, values < patch_cutoff[1]) + # Give numbers higher than every positive patch to the negative patches + neg_patches[neg_patches != -1] += max(surf['positive']) + surf['negative'] = neg_patches + surf['value'] = values surf['atom'] = closest_atom + surf['area'] = vert_areas + surf['residue'] = np.array(residues[closest_atom]) + surf['cdr'] = np.isin(residues, cdrs)[closest_atom] + patches = surf.vertices_to_df() #keep args.n_patches largest patches (n > 0) or smallest patches (n < 0) or patches with an area over the size cutoff if n_patches != 0 or size_cutoff != 0: #interesting interaction: setting a -n cutoff and size cutoff should yield the n smallest patches with an area over the size cutoff replace_vals = {} - for patch_type in ('negative', 'positive'): + max_previous = 0 + for patch_type in ('positive', 'negative'): # sort patches by area and filter top n patches (or bottom n patches for n < 0) # also we apply the size cutoff here. It defaults to 0, so should not do anything if not explicitly set as all areas should be > 0. area = patches.query(f'{patch_type} != -1').groupby(f'{patch_type}').sum(numeric_only=True)['area'] @@ -267,16 +275,22 @@ def run_electrostatics( filtered = order[:n_patches] if n_patches > 0 else order[n_patches:] # set patches not in filtered to -1 patches.loc[~patches[patch_type].isin(filtered), patch_type] = -1 - # build replacement dict to renumber patches in df according to size - order_map = {elem: i for i, elem in enumerate(order[:n_patches])} + order_map = {elem: i for i, elem in enumerate(filtered, start=max_previous)} + max_previous = max(order_map, key=order_map.get) replace_vals[patch_type] = order_map patches.replace(replace_vals, inplace=True) - output = csv.writer(csv_outfile) - output.writerow(['type', 'npoints', 'area', 'cdr', 'main_residue']) - write_patches(patches, output, 'positive') - write_patches(patches, output, 'negative') + # output patches information + patch_summ = summarize_patches(patches) + if not check_cdrs: + patch_summ = patch_summ.drop(columns='cdr') + patch_summ.to_csv(csv_outfile, index=False) + + # output residues involved in each patch + residue_output = csv.writer(res_outfile) + residue_output.writerow(['nr', 'residues']) + write_residues(patches, residue_output) # Compute the total solvent-accessible potential. within_range, closest_atom, distance = grid.distance_to_spheres(centers=traj.xyz[0], rmax=1., radii=radii) @@ -313,6 +327,8 @@ def run_electrostatics( # close user output file, but not stdout if out is not None: csv_outfile.close() + if resout is not None: + res_outfile.close() return { 'surface': surf, @@ -388,18 +404,35 @@ def get_apbs_potential_from_mdtraj(traj, apbs_dir, pH, ion_species): griddata = load_dx(dxfile, colname='DX') return griddata -def write_patches(df, out, column): - groups = dict(list(df[df[column] != -1].groupby(column))) - for patch in sorted(groups.values(), key=lambda df: -df['area'].sum()): - out.writerow([ - column, - len(patch), - patch['area'].sum(), - np.any(patch['cdr']), - biggest_residue_contribution(patch), - ]) - - +def summarize_patches(df, cols=['positive','negative']): + ix = 1 + Row = namedtuple('Row', 'nr type npoints area value cdr main_residue') + output = [] + for column in cols: + groups = dict(list(df[df[column] != -1].groupby(column))) + for patch in sorted(groups.values(), key=lambda df: -df['area'].sum()): + output.append(Row( + ix, + column, + len(patch), + patch['area'].sum(), + patch['value'].sum(), + np.any(patch['cdr']), + biggest_residue_contribution(patch) + )) + ix += 1 + return pd.DataFrame(output) + +def write_residues(df, out, cols=['positive','negative']): + ix = 1 + for column in cols: + groups = dict(list(df[df[column] != -1].groupby(column))) + for patch in sorted(groups.values(), key=lambda df: -df['area'].sum()): + out.writerow([ + ix, ' '.join(sorted(patch['residue'].unique(), key=lambda x: int(x[3:]))) + ]) + ix += 1 + def biggest_residue_contribution(df): """Find the element in df['residue'] with the highest total contribution in df['area'].""" return ( @@ -411,7 +444,6 @@ def biggest_residue_contribution(df): .index[-1] ) - def run_pdb2pqr(pdbfile, cwd=".", ff="AMBER", name_base="apbs", pH=None): if not isinstance(cwd, pathlib.Path): cwd = pathlib.Path(cwd) diff --git a/surface_analyses/surface.py b/surface_analyses/surface.py index dff8ec0..e8483b1 100644 --- a/surface_analyses/surface.py +++ b/surface_analyses/surface.py @@ -18,6 +18,7 @@ import matplotlib.colors import numpy as np import plyfile +import pandas as pd import msms.wrapper as msms @@ -188,6 +189,15 @@ def to_dict(self, basename="surface"): out[f"{basename}:data:{element}"] = self.data[element] return out + def vertices_to_df(self): + """ + Output surface vertices and data as pandas DataFrame + """ + out = {} + out['x'], out['y'], out['z'] = self.vertices.T + for element in self.data: + out[f"{element}"] = self.data[element] + return pd.DataFrame(out) def surfaces_from_dict(dictionary, basename="surfaces"): """Load several surfaces from a dictionary. diff --git a/test/commandline_electrostatic_test.py b/test/commandline_electrostatic_test.py index 8ca709d..07b60c1 100644 --- a/test/commandline_electrostatic_test.py +++ b/test/commandline_electrostatic_test.py @@ -38,10 +38,10 @@ def run_commandline(pdb, dx, *args, **kwargs): kwargs_list = [] for k, v in kwargs.items(): kwargs_list.extend(["--" + str(k), str(v)]) - print(kwargs_list) + args_list = [str(arg) for arg in args] with redirect_stdout(output): # using the pdb as "topology" and "trajectory" - main([str(pdb), str(pdb), "--dx", str(dx)] + list(args) + kwargs_list) + main([str(pdb), str(pdb), "--dx", str(dx)] + args_list + kwargs_list) return output.getvalue() @@ -58,7 +58,8 @@ def test_parser_options_match_python_interface(): for opt in parser_options: assert parser_options[opt] == python_options[opt].default -@pytest.fixture(params=['without', 'with']) + +@pytest.fixture(params=[[], ['--check_cdrs']]) def with_or_without_cdrs(request): yield request.param @@ -73,14 +74,13 @@ def test_trastuzumab_sas_integrals(with_or_without_cdrs): -1.86765861091, ] ) - if with_or_without_cdrs == 'with': + resout_fname = TRASTUZUMAB_PATH / 'resout.csv' + args = ['--resout', resout_fname] + with_or_without_cdrs + if "--check_cdrs" in args: if shutil.which('hmmscan') is None: pytest.skip('hmmscan was not found') if not HAS_ANARCI: pytest.skip('ANARCI is not available') - args = ['--check_cdrs'] - else: - args = [] stdout = run_commandline( TRASTUZUMAB_PATH / 'apbs-input.pdb', TRASTUZUMAB_PATH / 'apbs-potential.dx', @@ -95,9 +95,16 @@ def test_trastuzumab_sas_integrals(with_or_without_cdrs): patches = pd.read_csv(str(TRASTUZUMAB_PATH / 'apbs-patches.csv')) exp_fname = 'apbs-patches-msms.save' if msms.msms_available() else 'apbs-patches.save' expected_patches = pd.read_csv(str(TRASTUZUMAB_PATH / exp_fname)) - if with_or_without_cdrs == 'without': - expected_patches['cdr'] = False + if "--check_cdrs" not in args: + del expected_patches['cdr'] assert_frame_equal(patches, expected_patches) + resout_df = pd.read_csv(resout_fname) + expected_n_patches = 32 if msms.msms_available() else 36 + assert len(resout_df) == expected_n_patches + assert resout_df.iloc[0]["residues"].startswith( + "TYR33 ARG50 ASN55 TYR57 THR58 ARG59 TYR60 ALA61 ASP62 LYS65 GLY66 TRP99 " + "ASP122 ILE123 GLN148 HIS212 TYR213 THR214 THR215 PRO216" # PRO217 is missing w/o msms + ) def get_nth_line(file, n): @@ -112,7 +119,7 @@ def test_trastuzumab_ply_out(): TRASTUZUMAB_PATH / 'apbs-input.pdb', TRASTUZUMAB_PATH / 'apbs-potential.dx', '--out', - str(TRASTUZUMAB_PATH / 'apbs-patches.csv'), + str(TRASTUZUMAB_PATH / 'apbs-patches-ply.csv'), '--ply_out', str(TRASTUZUMAB_PATH / 'apbs'), ] diff --git a/test/trastuzumab/apbs-patches-msms.save b/test/trastuzumab/apbs-patches-msms.save index 231a753..2e3a3a9 100644 --- a/test/trastuzumab/apbs-patches-msms.save +++ b/test/trastuzumab/apbs-patches-msms.save @@ -1,33 +1,33 @@ -type,npoints,area,cdr,main_residue -positive,1725,4.182278508729034,True,ARG59 -positive,1345,3.3018772409740245,True,LYS166 -positive,474,1.1227447014770573,False,LYS224 -positive,212,0.4780537234772543,False,PRO14 -positive,196,0.42350239444585136,False,GLN82 -positive,153,0.26352678455568534,True,THR74 -positive,131,0.2623972855174914,True,ARG187 -positive,113,0.2239198468055985,True,GLU1 -positive,91,0.18047135288680483,False,ARG182 -positive,82,0.17243666156532797,False,ARG187 -positive,85,0.11763809617857156,True,TYR170 -positive,24,0.04135049908954891,False,THR195 -positive,10,0.027526301024288966,False,SER85 -positive,12,0.02167615367273435,False,SER198 -positive,12,0.019511131671895172,False,THR69 -positive,7,0.015586532451777404,False,ARG87 -positive,18,0.01556431292971975,True,PHE27 -positive,8,0.006207044981287775,True,PHE219 -positive,5,0.0058994152378983485,False,SER197 -positive,3,0.005203025153503235,False,ARG182 -positive,6,0.00519997560517816,False,ARG182 -positive,3,0.003164115750466868,True,ARG145 -positive,2,0.0025863522808405913,True,LYS30 -negative,241,0.6363343407004047,True,ASP31 -negative,268,0.6162607965834226,False,GLU89 -negative,27,0.07296024063999074,False,ASP138 -negative,25,0.04096139412400949,False,GLU226 -negative,23,0.022811577815085794,True,THR32 -negative,17,0.016368615194651954,True,LEU175 -negative,14,0.015925282524936708,False,GLN13 -negative,7,0.011160988364316836,True,GLY100 -negative,6,0.010733508585959266,False,ALA40 +nr,type,npoints,area,value,cdr,main_residue +1,positive,1725,4.182278508729034,5837.588495521277,True,ARG59 +2,positive,1345,3.3018772409740245,4457.250193616104,True,LYS166 +3,positive,474,1.1227447014770573,1615.377352677039,False,LYS224 +4,positive,212,0.4780537234772543,551.4412348285368,False,PRO14 +5,positive,196,0.42350239444585136,675.6514743166699,False,GLN82 +6,positive,153,0.26352678455568534,540.226705945201,True,THR74 +7,positive,131,0.2623972855174914,318.4519895515599,True,ARG187 +8,positive,113,0.2239198468055985,265.7312049146501,True,GLU1 +9,positive,91,0.18047135288680483,242.82859600619585,False,ARG182 +10,positive,82,0.17243666156532797,188.3076970571323,False,ARG187 +11,positive,85,0.11763809617857156,217.4192353502932,True,TYR170 +12,positive,24,0.04135049908954891,70.28936614643578,False,THR195 +13,positive,10,0.027526301024288966,20.7597361666323,False,SER85 +14,positive,12,0.02167615367273435,25.6625185537019,False,SER198 +15,positive,12,0.019511131671895172,25.213701850502137,False,THR69 +16,positive,7,0.015586532451777404,14.131560782100696,False,ARG87 +17,positive,18,0.01556431292971975,37.40732945242014,True,PHE27 +18,positive,8,0.006207044981287775,17.677805714979243,True,PHE219 +19,positive,5,0.0058994152378983485,10.40878235947409,False,SER197 +20,positive,3,0.005203025153503235,6.0562606150358524,False,ARG182 +21,positive,6,0.00519997560517816,12.244182295189493,False,ARG182 +22,positive,3,0.003164115750466868,6.137556290116853,True,ARG145 +23,positive,2,0.0025863522808405913,4.018059952417971,True,LYS30 +24,negative,241,0.6363343407004047,-648.0796775146749,True,ASP31 +25,negative,268,0.6162607965834226,-626.4097689813246,False,GLU89 +26,negative,27,0.07296024063999074,-60.37267556867309,False,ASP138 +27,negative,25,0.04096139412400949,-55.94862378146032,False,GLU226 +28,negative,23,0.022811577815085794,-52.56402908640423,True,THR32 +29,negative,17,0.016368615194651954,-37.110362416262035,True,LEU175 +30,negative,14,0.015925282524936708,-31.015360920890828,False,GLN13 +31,negative,7,0.011160988364316836,-14.59490620019469,True,GLY100 +32,negative,6,0.010733508585959266,-13.172523925351646,False,ALA40 diff --git a/test/trastuzumab/apbs-patches.save b/test/trastuzumab/apbs-patches.save index 4e97ca3..ec97f44 100644 --- a/test/trastuzumab/apbs-patches.save +++ b/test/trastuzumab/apbs-patches.save @@ -1,37 +1,37 @@ -type,npoints,area,cdr,main_residue -positive,2706,4.0555237192411315,True,ARG59 -positive,2050,3.1358180610606072,True,LYS166 -positive,714,1.0785564850219775,False,LYS224 -positive,269,0.4349030355542176,False,PRO14 -positive,269,0.4006016732560038,False,GLN82 -positive,164,0.24193543018723437,True,ARG187 -positive,146,0.23264465711205656,True,THR74 -positive,138,0.2145502520054379,True,GLU1 -positive,120,0.1716530068706561,False,ARG187 -positive,119,0.1706755639411952,False,ARG182 -positive,66,0.10186810414633858,True,TYR170 -positive,22,0.033606791475904174,False,THR195 -positive,30,0.030287938567198577,True,ALA79 -positive,15,0.023222786999516153,False,SER85 -positive,9,0.015245096806514388,False,TYR60 -positive,14,0.011946238389706801,False,SER198 -positive,6,0.007587021401074405,True,PHE27 -positive,3,0.00633904083467011,False,ARG87 -positive,2,0.004148372012423352,False,ARG139 -positive,1,0.001185125942962865,True,THR218 -positive,6,0.0010143504841835236,False,LYS166 -positive,6,0.0006065153211238794,False,ILE34 -positive,6,0.00025960786297218874,True,LYS30 -positive,6,6.43316927835258e-05,False,LYS166 -positive,6,6.1429446418515e-08,True,TRP99 -negative,379,0.6080976480519841,True,ASP31 -negative,375,0.5670296154251784,False,GLU89 -negative,44,0.06279480835187692,False,ASP138 -negative,18,0.03518602688154715,False,GLU226 -negative,20,0.016015456201785128,True,THR32 -negative,5,0.00913510504324222,False,LEU11 -negative,5,0.0078237506319662,False,ALA40 -negative,3,0.004150241366005503,True,ASP108 -negative,5,0.003132833357085474,True,LEU175 -negative,4,0.0016012810156098565,False,SER181 -negative,6,0.001596131180122029,False,ALA40 +nr,type,npoints,area,value,cdr,main_residue +1,positive,2706,4.0555237192411315,8906.732408338641,True,ARG59 +2,positive,2050,3.1358180610606072,6378.02794764717,True,LYS166 +3,positive,714,1.0785564850219775,2366.2263496670826,False,LYS224 +4,positive,269,0.4349030355542176,663.5351588924126,False,PRO14 +5,positive,269,0.4006016732560038,820.109078686167,False,GLN82 +6,positive,164,0.24193543018723437,384.8357004032635,True,ARG187 +7,positive,146,0.23264465711205656,442.7531807008791,True,THR74 +8,positive,138,0.2145502520054379,308.98002828476854,True,GLU1 +9,positive,120,0.1716530068706561,266.72787666433067,False,ARG187 +10,positive,119,0.1706755639411952,307.27723743167127,False,ARG182 +11,positive,66,0.10186810414633858,163.63140165864482,True,TYR170 +12,positive,22,0.033606791475904174,54.60327632139041,False,THR195 +13,positive,30,0.030287938567198577,232.70025916176803,True,ALA79 +14,positive,15,0.023222786999516153,30.820802103676705,False,SER85 +15,positive,9,0.015245096806514388,18.922786593196502,False,TYR60 +16,positive,14,0.011946238389706801,30.10462995764804,False,SER198 +17,positive,6,0.007587021401074405,12.370451415593246,True,PHE27 +18,positive,3,0.00633904083467011,6.04540847008538,False,ARG87 +19,positive,2,0.004148372012423352,4.1426211246149105,False,ARG139 +20,positive,1,0.001185125942962865,2.110696531605022,True,THR218 +21,positive,6,0.0010143504841835236,14.350174996542579,False,LYS166 +22,positive,6,0.0006065153211238794,34.48725109895096,False,ILE34 +23,positive,6,0.00025960786297218874,34.820354009268065,True,LYS30 +24,positive,6,6.43316927835258e-05,17.98491281121401,False,LYS166 +25,positive,6,6.1429446418515e-08,49.21569313975243,True,TRP99 +26,negative,379,0.6080976480519841,-990.0929767279357,True,ASP31 +27,negative,375,0.5670296154251784,-869.4265885055943,False,GLU89 +28,negative,44,0.06279480835187692,-97.02849908589296,False,ASP138 +29,negative,18,0.03518602688154715,-38.661829480754555,False,GLU226 +30,negative,20,0.016015456201785128,-43.77045246436204,True,THR32 +31,negative,5,0.00913510504324222,-10.87626482898136,False,LEU11 +32,negative,5,0.0078237506319662,-10.770950367116772,False,ALA40 +33,negative,3,0.004150241366005503,-6.058838802786218,True,ASP108 +34,negative,5,0.003132833357085474,-10.606429320031229,True,LEU175 +35,negative,4,0.0016012810156098565,-8.446501503306715,False,SER181 +36,negative,6,0.001596131180122029,-15.624449931069849,False,ALA40