From 9b01420ce2f791a7ab362f9f37bee5dbe8efbfe1 Mon Sep 17 00:00:00 2001 From: Franz Waibl Date: Wed, 5 Feb 2025 15:15:58 +0100 Subject: [PATCH 1/5] Fix the scaling of the hydrophobicity output, and add a unit test * There is now a unit test to check that the scale of the written surface roughly matches that of the atomic coordinates. This is to avoid unit conversion errors in the future. * Updated the README with some information on PLY output and units. --- README.md | 5 +++ surface_analyses/commandline_hydrophobic.py | 2 +- surface_analyses/surface.py | 30 +++++++++++---- test/commandline_electrostatic_test.py | 41 +++++++++++++-------- test/commandline_hydrophobic_test.py | 31 +++++++++++++--- 5 files changed, 80 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index fb2d6b0..5924234 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,11 @@ pre-defined. See Eisenberg and McLachlan (1986) or Wildman and Crippen (1999) for references. Additionally, using `rdkit-crippen` will use a SMILES input to assign Crippen parameters using the RdKit (useful for small molecules). +## Output formats +Both the electrostatics and hydrophobicity script allow writing surfaces to .ply files, using the plyfile library (https://github.com/dranjan/python-plyfile). The surfaces is colored using either the patches (where each patch gets a different color), or using the raw data (using the electrostatic potential at the vertices, or the hydrohobic potential approach of Heiden et al. https://doi.org/10.1007/BF00124359). The raw data will also be written to PLY properties, and the coordinate units are chosen to match Pymol (units of $10^-8$m). + +Additionally, the hydrophobicity script supports output in the numpy npz format, which is more efficient when writing many surfaces. + ## Examples Visualize the hydrophobic potential on a non-protein molecule: ``` diff --git a/surface_analyses/commandline_hydrophobic.py b/surface_analyses/commandline_hydrophobic.py index c486641..441dc84 100644 --- a/surface_analyses/commandline_hydrophobic.py +++ b/surface_analyses/commandline_hydrophobic.py @@ -244,7 +244,7 @@ def run_hydrophobic( if ply_out: fnames = ply_filenames(ply_out, len(surfs)) for surf, fname in zip(surfs, fnames): - surf.write_ply(fname, coordinate_scaling=10) + surf.write_ply(fname) output.update(surfaces_to_dict(surfs, basename="hydrophobic_potential")) if out is not None: np.savez(out, **output) diff --git a/surface_analyses/surface.py b/surface_analyses/surface.py index e8483b1..bf29357 100644 --- a/surface_analyses/surface.py +++ b/surface_analyses/surface.py @@ -90,10 +90,10 @@ def vertex_areas(self): def __repr__(self): return f"{self.__class__.__name__}({self.n_vertices} vertices, {self.n_faces} faces)" - def as_plydata(self, text=True, units_per_angstrom=0.1): + def as_plydata(self, text=True, units_per_nm=0.1): """Convert to a plyfile.PlyData object, while scaling coordinates - the default scaling units_per_angstrom=0.1 matches the PyMol CGO + the default scaling units_per_nm=0.1 matches the PyMol CGO scaling factor of 1:100 and the nm -> A conversion. """ vertex_dtype = [("x", "f4"), ("y", "f4"), ("z", "f4")] @@ -107,7 +107,7 @@ def as_plydata(self, text=True, units_per_angstrom=0.1): vertex_dtype.append((k, PLY_DTYPES[dtype.kind])) vertex_data = [] for i in range(self.n_vertices): - vert = list(self.vertices[i] * units_per_angstrom) + vert = list(self.vertices[i] * units_per_nm) for k in keys: vert.append(self.data[k][i]) vertex_data.append(tuple(vert)) @@ -121,14 +121,28 @@ def as_plydata(self, text=True, units_per_angstrom=0.1): out = plyfile.PlyData([vertex, face], text=text) return out - def write_ply(self, fname, coordinate_scaling=1.): - "Write a PLY file, scaling is relative to the default of 0.1." + def write_ply(self, fname, units_per_nm=0.1): + "Write a PLY file, scaling of 0.1 matches nm -> PyMol [10^-8 m]." with open(fname, mode="wb") as f: - self.as_plydata(units_per_angstrom=coordinate_scaling*0.1).write(f) + self.as_plydata(units_per_nm=units_per_nm).write(f) @classmethod - def from_plydata(cls, plydata): - pass + def from_plydata(cls, plydata, units_per_nm=0.1): + vert = plydata['vertex'] + xyz = np.stack([vert['x'], vert['y'], vert['z']], axis=-1) / units_per_nm + faces = np.array(list(plydata['face']['vertex_indices'])) + surf = Surface(xyz, faces) + props = set([p.name for p in vert.properties]) - {'x', 'y', 'z'} + for prop in props: + surf[prop] = vert[prop] + return surf + + @classmethod + def read_ply(cls, fname, units_per_nm=0.1): + return cls.from_plydata( + plyfile.PlyData.read(fname), + units_per_nm=units_per_nm, + ) @classmethod def isosurface(cls, grid, values, isovalue, gradient_direction='descent'): diff --git a/test/commandline_electrostatic_test.py b/test/commandline_electrostatic_test.py index 07b60c1..14ba198 100644 --- a/test/commandline_electrostatic_test.py +++ b/test/commandline_electrostatic_test.py @@ -2,6 +2,8 @@ # -*- coding: utf-8 -*- from surface_analyses.commandline_electrostatic import main, biggest_residue_contribution, parse_args, run_electrostatics +from surface_analyses.surface import Surface + from contextlib import redirect_stdout import inspect import io @@ -9,7 +11,9 @@ import os import shutil from unittest.mock import patch +from tempfile import TemporaryDirectory +import mdtraj as md import pandas as pd from pandas.testing import assert_frame_equal import numpy as np @@ -115,21 +119,28 @@ def get_nth_line(file, n): def test_trastuzumab_ply_out(): - args = [ - TRASTUZUMAB_PATH / 'apbs-input.pdb', - TRASTUZUMAB_PATH / 'apbs-potential.dx', - '--out', - str(TRASTUZUMAB_PATH / 'apbs-patches-ply.csv'), - '--ply_out', - str(TRASTUZUMAB_PATH / 'apbs'), - ] - run_commandline(*args, surface_type="sas") - # check the number of vertices in the output - with open(TRASTUZUMAB_PATH / 'apbs-potential.ply') as f: - if msms.msms_available(): - assert get_nth_line(f, 2).strip() == "element vertex 40549" - else: - assert get_nth_line(f, 2).strip() == "element vertex 67252" + with TemporaryDirectory() as tmp_str: + tmp = Path(tmp_str) + pdb_name = str(TRASTUZUMAB_PATH / 'apbs-input.pdb') + pot_name = str(TRASTUZUMAB_PATH / 'apbs-potential.dx') + ply_name = str(tmp / 'apbs') + csv_name = str(tmp / 'apbs-patches-ply.csv') + args = [pdb_name, pot_name, '--out', csv_name, '--ply_out', ply_name] + run_commandline(*args, surface_type='sas') + # check the number of vertices in the output + with open(ply_name + '-potential.ply') as f: + if msms.msms_available(): + assert get_nth_line(f, 2).strip() == 'element vertex 40549' + else: + assert get_nth_line(f, 2).strip() == 'element vertex 67252' + surf = Surface.read_ply(ply_name + '-potential.ply') + crd = md.load(pdb_name).xyz[0] + surf_extent = surf.vertices.max(axis=0) - surf.vertices.min(axis=0) + crd_extent = crd.max(axis=0) - crd.min(axis=0) + scale_ratios = surf_extent / crd_extent + assert np.all(scale_ratios > 1) + assert np.all(scale_ratios < 1.2) + def test_biggest_residue_contribution(): diff --git a/test/commandline_hydrophobic_test.py b/test/commandline_hydrophobic_test.py index 0082d5c..56bd2c6 100644 --- a/test/commandline_hydrophobic_test.py +++ b/test/commandline_hydrophobic_test.py @@ -5,12 +5,14 @@ import inspect import os.path import shutil -import tempfile +from tempfile import TemporaryDirectory import pytest import numpy as np from surface_analyses.commandline_hydrophobic import main, parse_args, run_hydrophobic +from surface_analyses.surface import Surface +import mdtraj as md TrastuzumabRun = namedtuple('TrastuzumabRun', 'scale method expected_data') @@ -60,7 +62,8 @@ def trastuzumab_run(request): return def run_commandline(parm, traj, scale, outfile, *args): - main([str(parm), str(traj), '--scale', str(scale), '--out', str(outfile)] + list(args)) + args_s = [str(a) for a in args] + main([str(parm), str(traj), '--scale', str(scale), '--out', str(outfile)] + args_s) def test_output_consistent(trastuzumab_run): runtype = trastuzumab_run.method @@ -86,7 +89,7 @@ def test_output_consistent(trastuzumab_run): if scale == 'wimley-white': scale = TRASTUZUMAB_PATH / 'wimley-white-scaled.csv' - with tempfile.TemporaryDirectory() as tmp: + with TemporaryDirectory() as tmp: out = Path(tmp) / 'out.npz' run_commandline(parm7, rst7, scale, out, *args) with np.load(out, allow_pickle=True) as npz: @@ -96,18 +99,36 @@ def test_output_consistent(trastuzumab_run): def test_output_with_sc_norm(): scale = TRASTUZUMAB_PATH / 'glmnet.csv' args = ['--surfscore', '--surftype', 'sc_norm'] - parm7 = TRASTUZUMAB_PATH / 'input.parm7' + parm7 = TRASTUZUMAB_PATH / 'input.parm9' rst7 = TRASTUZUMAB_PATH / 'input.rst7' expected_fname = TRASTUZUMAB_PATH / 'jain-surfscore-sc-norm.npz' with np.load(expected_fname, allow_pickle=True) as npz: expected = dict(npz) - with tempfile.TemporaryDirectory() as tmp: + with TemporaryDirectory() as tmp: out = Path(tmp) / "out.npz" run_commandline(parm7, rst7, scale, out, *args) with np.load(out, allow_pickle=True) as npz: assert_outputs_equal(npz, expected) +def test_surface_size(): + scale = TRASTUZUMAB_PATH / 'glmnet.csv' + parm7 = TRASTUZUMAB_PATH / 'input.parm7' + rst7 = TRASTUZUMAB_PATH / 'input.rst7' + with TemporaryDirectory() as tmp: + out = Path(tmp) / "out.npz" + ply_out = Path(tmp) / "out.ply" + args = ['--potential', '--ply_out', ply_out] + run_commandline(parm7, rst7, scale, out, *args) + surf = Surface.read_ply(ply_out) + crd = md.load(rst7, top=parm7).xyz[0] + surf_extent = surf.vertices.max(axis=0) - surf.vertices.min(axis=0) + crd_extent = crd.max(axis=0) - crd.min(axis=0) + scale_ratios = surf_extent / crd_extent + assert np.all(scale_ratios > 1) + assert np.all(scale_ratios < 1.2) + + def test_rdkit_crippen(): pdb = TESTS_PATH / '1csa-model1.pdb' args = ['--surfscore', '--smiles', CsA_SMILES] From bef08cbbb642ff426d3973b54619db89eddddaa9 Mon Sep 17 00:00:00 2001 From: Franz Waibl <44898997+fwaibl@users.noreply.github.com> Date: Wed, 5 Feb 2025 15:19:43 +0100 Subject: [PATCH 2/5] Update README.md formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5924234..e9a7101 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ for references. Additionally, using `rdkit-crippen` will use a SMILES input to assign Crippen parameters using the RdKit (useful for small molecules). ## Output formats -Both the electrostatics and hydrophobicity script allow writing surfaces to .ply files, using the plyfile library (https://github.com/dranjan/python-plyfile). The surfaces is colored using either the patches (where each patch gets a different color), or using the raw data (using the electrostatic potential at the vertices, or the hydrohobic potential approach of Heiden et al. https://doi.org/10.1007/BF00124359). The raw data will also be written to PLY properties, and the coordinate units are chosen to match Pymol (units of $10^-8$m). +Both the electrostatics and hydrophobicity script allow writing surfaces to .ply files, using the [plyfile](https://github.com/dranjan/python-plyfile) library. The surfaces is colored using either the patches (where each patch gets a different color), or using the raw data (using the electrostatic potential at the vertices, or the hydrohobic potential approach of [Heiden et al.](https://doi.org/10.1007/BF00124359)). The raw data will also be written to PLY properties, and the coordinate units are chosen to match Pymol (units of 10^(-8)m). Additionally, the hydrophobicity script supports output in the numpy npz format, which is more efficient when writing many surfaces. From dba10b07c89667e36dde46933236a1786067427b Mon Sep 17 00:00:00 2001 From: Franz Waibl Date: Wed, 5 Feb 2025 15:22:47 +0100 Subject: [PATCH 3/5] Fix surface_test.py --- test/surface_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/surface_test.py b/test/surface_test.py index be82cb7..50ea511 100644 --- a/test/surface_test.py +++ b/test/surface_test.py @@ -17,7 +17,7 @@ def test_repr_no_error(minimal): repr(minimal) def test_convert_to_ply(minimal): - plydat = minimal.as_plydata(units_per_angstrom=1.) + plydat = minimal.as_plydata(units_per_nm=1.) assert len(plydat['vertex']) == minimal.n_vertices for i_dim, dim in enumerate('xyz'): assert np.allclose(plydat['vertex'][dim], minimal.vertices[:, i_dim]) From 5f8c93063610a4307f132e00f545895d90d5aebc Mon Sep 17 00:00:00 2001 From: Franz Waibl Date: Wed, 5 Feb 2025 15:25:28 +0100 Subject: [PATCH 4/5] Fix commandline_hydrophobic_test.py --- test/commandline_hydrophobic_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/commandline_hydrophobic_test.py b/test/commandline_hydrophobic_test.py index 56bd2c6..44970ae 100644 --- a/test/commandline_hydrophobic_test.py +++ b/test/commandline_hydrophobic_test.py @@ -132,7 +132,7 @@ def test_surface_size(): def test_rdkit_crippen(): pdb = TESTS_PATH / '1csa-model1.pdb' args = ['--surfscore', '--smiles', CsA_SMILES] - with tempfile.TemporaryDirectory() as tmp: + with TemporaryDirectory() as tmp: out = Path(tmp) / "out.npz" run_commandline(pdb, pdb, 'rdkit-crippen', out, *args) with np.load(out, allow_pickle=True) as npz: From 89ff494ce5c48d02662b2a64cc8d9a885ee3acf0 Mon Sep 17 00:00:00 2001 From: Franz Waibl Date: Wed, 5 Feb 2025 15:27:42 +0100 Subject: [PATCH 5/5] Fix commandline_hydrophobic_test.py --- test/commandline_hydrophobic_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/commandline_hydrophobic_test.py b/test/commandline_hydrophobic_test.py index 44970ae..ade1c2f 100644 --- a/test/commandline_hydrophobic_test.py +++ b/test/commandline_hydrophobic_test.py @@ -99,7 +99,7 @@ def test_output_consistent(trastuzumab_run): def test_output_with_sc_norm(): scale = TRASTUZUMAB_PATH / 'glmnet.csv' args = ['--surfscore', '--surftype', 'sc_norm'] - parm7 = TRASTUZUMAB_PATH / 'input.parm9' + parm7 = TRASTUZUMAB_PATH / 'input.parm7' rst7 = TRASTUZUMAB_PATH / 'input.rst7' expected_fname = TRASTUZUMAB_PATH / 'jain-surfscore-sc-norm.npz' with np.load(expected_fname, allow_pickle=True) as npz: