From 48853b594bd786ffc6ee7b904787be85e9cdbd9c Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Mon, 6 Oct 2025 14:30:02 +0100 Subject: [PATCH 01/56] Added machine learning prediction capability to Sails --- package/_pyproject.toml | 7 +- package/pyproject.toml | 7 +- package/src/sails/clean.py | 73 +++++++ package/src/sails/install.py | 77 +++++++ package/src/sails/logs.py | 9 + package/src/sails/prediction/__init__.py | 0 package/src/sails/prediction/arguments.py | 77 +++++++ package/src/sails/prediction/config.py | 25 +++ package/src/sails/prediction/errors.py | 29 +++ package/src/sails/prediction/grid_tools.py | 87 ++++++++ package/src/sails/prediction/load.py | 75 +++++++ package/src/sails/prediction/model.py | 225 +++++++++++++++++++++ package/src/sails/prediction/predict.py | 219 ++++++++++++++++++++ package/src/sails/prediction/save.py | 10 + package/src/sails/prediction/util.py | 41 ++++ 15 files changed, 959 insertions(+), 2 deletions(-) create mode 100644 package/src/sails/clean.py create mode 100644 package/src/sails/install.py create mode 100644 package/src/sails/logs.py create mode 100644 package/src/sails/prediction/__init__.py create mode 100644 package/src/sails/prediction/arguments.py create mode 100644 package/src/sails/prediction/config.py create mode 100644 package/src/sails/prediction/errors.py create mode 100644 package/src/sails/prediction/grid_tools.py create mode 100644 package/src/sails/prediction/load.py create mode 100644 package/src/sails/prediction/model.py create mode 100644 package/src/sails/prediction/predict.py create mode 100644 package/src/sails/prediction/save.py create mode 100644 package/src/sails/prediction/util.py diff --git a/package/_pyproject.toml b/package/_pyproject.toml index 70b8d02..5083cd5 100644 --- a/package/_pyproject.toml +++ b/package/_pyproject.toml @@ -18,7 +18,9 @@ dependencies=[ 'tqdm', 'gemmi', 'numpy<2.0.0', - 'typing-extensions' + 'typing-extensions', + 
'onnxruntime-gpu; platform_system != "Darwin"', + 'onnxruntime; platform_system == "Darwin"', ] [project.urls] @@ -32,6 +34,9 @@ sails-find = "sails.find:run" sails-test = "sails.test:run" sails-wurcs = "sails.wurcs:run" sails-morph = "sails.morph:run" +sails-predict = "sails.prediction.predict:run" +sails-install = "sails.install:run" +sails-clean = "sails.clean:run" [tool.scikit-build] # Protect the configuration against future changes in scikit-build-core diff --git a/package/pyproject.toml b/package/pyproject.toml index bd544d5..06c89c6 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -18,7 +18,9 @@ dependencies=[ 'tqdm', 'gemmi', 'numpy<2.0.0', - 'typing-extensions' + 'typing-extensions', + 'onnxruntime-gpu; platform_system != "Darwin"', + 'onnxruntime; platform_system == "Darwin"', ] [tool.setuptools] @@ -40,6 +42,9 @@ sails-find = "sails.find:run" sails-test = "sails.test:run" sails-wurcs = "sails.wurcs:run" sails-morph = "sails.morph:run" +sails-predict = "sails.prediction.predict:run" +sails-install = "sails.install:run" +sails-clean = "sails.clean:run" [tool.scikit-build] # Protect the configuration against future changes in scikit-build-core diff --git a/package/src/sails/clean.py b/package/src/sails/clean.py new file mode 100644 index 0000000..27a4ad4 --- /dev/null +++ b/package/src/sails/clean.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024 Jordan Dialpuri, Jon Agirre, Kathryn Cowtan, Paul Bond and University of York. 
All rights reserved + +import site +import os + + +def clean_models(): + site_packages_dir = site.getsitepackages() + + found_models = [] + + for folder in site_packages_dir: + sails_model_dir = os.path.join(folder, "sails_models") + if os.path.exists(sails_model_dir): + for model in os.scandir(sails_model_dir): + found_models.append(model) + + if not found_models: + print("No models were found in site-packages.") + return + + print("Pick an option to remove: ") + + for index, model in enumerate(found_models): + print(f"{index + 1}) {model.name.removesuffix('.onnx')}") + + if len(found_models) > 1: + print(f"{len(found_models) + 1}) All") + + option_selected = False + while not option_selected: + option = input("Number: ") + + try: + choice = int(option) + if choice <= 0 or choice > len(found_models) + 1: + raise ValueError() + + if choice == len(found_models) + 1: + print("Do you want to remove all the models?") + else: + model_to_remove = found_models[choice - 1] + print(f"Confirm you want to remove {model_to_remove.name}?") + + y_no_selected = False + confirm = False + while not y_no_selected: + y_or_n = input("Y/N ").lower() + if y_or_n not in ["y", "yes", "n", "no"]: + continue + + y_no_selected = True + + if y_or_n == "y" or y_or_n == "yes": + confirm = True + + if confirm: + if choice == len(found_models) + 1: + for model in found_models: + os.remove(model.path) + print("Removed", model.name) + else: + model_to_remove = found_models[choice - 1] + os.remove(model_to_remove.path) + print("Removed", model_to_remove.name) + + option_selected = True + except ValueError: + print("Invalid choice") + + +def run(): + clean_models() diff --git a/package/src/sails/install.py b/package/src/sails/install.py new file mode 100644 index 0000000..36246ed --- /dev/null +++ b/package/src/sails/install.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024 Jordan Dialpuri, Jon Agirre, Kathryn Cowtan, Paul Bond and University of York. 
All rights reserved + +import site +import os +import argparse +import enum +from pathlib import Path +from .__version__ import __version__ +from .logs import setup_logging +from .prediction import model +import logging + + +class InstallLocation(enum.Enum): + site_packages = 0 + ccp4 = 1 + + +def clibd_error_msg(): + print("""CCP4 Environment Variable - CLIBD is not found. + You can try sourcing it: + Ubuntu - source /opt/xtal/ccp4-X.X/bin/ccp4.setup-sh + MacOS - source /Applications/ccp4-X.X/bin/ccp4.setup-sh + """) + + +def install_model(type: model.ModelType, location: str, reinstall: bool) -> bool: + logging.info(f"Installing {type.name} model to {location}") + if InstallLocation[location] == InstallLocation.ccp4: + clibd = os.environ.get("CLIBD", "") + if not os.path.exists(clibd): + clibd_error_msg() + return False + + model.download_model(type=type, folder=clibd, reinstall=reinstall) + return True + + if InstallLocation[location] == InstallLocation.site_packages: + site_packages_dir = site.getsitepackages() + if not site_packages_dir: + raise RuntimeError( + "Failed to get site packages directory, ensure you in a virtual environment" + ) + first_sitepackages = Path(site_packages_dir[0]) + model.download_model(type=type, folder=first_sitepackages, reinstall=reinstall) + # download_database(folder=first_sitepackages, reinstall=reinstall) + return True + return False + + +def run(): + setup_logging() + output_list = ["ccp4", "site_packages"] + + parser = argparse.ArgumentParser(description="nucleofind Install") + parser.add_argument( + "-m", "--model", choices=[type.name for type in model.ModelType], required=False + ) + parser.add_argument( + "-o", + "--output", + choices=[location.name for location in InstallLocation], + required=False, + default=output_list[1], + ) + parser.add_argument("--update", required=False, action="store_true") + parser.add_argument("-v", "--version", action="version", version=__version__) + + args = parser.parse_args() + + if not 
args.model: + print("Please specify a model you wish to download") + return + + install_model( + type=model.ModelType[args.model], location=args.output, reinstall=args.update + ) diff --git a/package/src/sails/logs.py b/package/src/sails/logs.py new file mode 100644 index 0000000..612af11 --- /dev/null +++ b/package/src/sails/logs.py @@ -0,0 +1,9 @@ +import logging +import logging.config + + +def setup_logging(): + """Setup basic logging configuration""" + logging.basicConfig( + level=logging.INFO, format="%(asctime)s %(levelname)s - %(message)s" + ) diff --git a/package/src/sails/prediction/__init__.py b/package/src/sails/prediction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/package/src/sails/prediction/arguments.py b/package/src/sails/prediction/arguments.py new file mode 100644 index 0000000..5268917 --- /dev/null +++ b/package/src/sails/prediction/arguments.py @@ -0,0 +1,77 @@ +import argparse +from types import SimpleNamespace + +from sails.__version__ import __version__ +from .model import ModelType + + +def parse_arguments() -> SimpleNamespace: + """Parse command line arguments""" + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", + "--model", + help="Model selection", + choices=[type.name for type in ModelType], + required=False, + ) + parser.add_argument("-i", "--input", help="Input mtz", required=True) + parser.add_argument( + "-o", + "--output", + help="Output directory, if does not exist it will be created model", + default="sails-output", + required=False, + ) + parser.add_argument("-r", "--resolution", nargs="?", help="Resolution cutoff") + parser.add_argument( + "-n", + "--nthreads", + nargs="?", + default=None, + type=int, + help="Number of threads to use", + ) + parser.add_argument( + "--amplitude", "-f", nargs="?", help="Name of amplitude column in MTZ, e.g. FWT" + ) + parser.add_argument( + "--phase", "-phi", nargs="?", help="Name of phase column in MTZ, e.g. 
PHWT" + ) + parser.add_argument( + "--overlap", + nargs="?", + help="Amount of overlap to use", + default=None, + type=int, + ) + parser.add_argument( + "--use-symmetry", + action=argparse.BooleanOptionalAction, + default=False, + help="Compute predictions for the entire unit cell", + ) + parser.add_argument( + "--variance", action=argparse.BooleanOptionalAction, help="Output variance map" + ) + parser.add_argument( + "--raw", + action=argparse.BooleanOptionalAction, + help="Output raw map (no argmax)", + ) + parser.add_argument( + "--gpu", action=argparse.BooleanOptionalAction, help="Use GPU (experimental)" + ) + parser.add_argument( + "--debug", action=argparse.BooleanOptionalAction, help="Turn on debug logging" + ) + parser.add_argument( + "--silent", + action=argparse.BooleanOptionalAction, + default=False, + help="Turn off progress bar", + ) + parser.add_argument("--model_path", nargs="?", help="Path to model (development)") + parser.add_argument("-v", "--version", action="version", version=__version__) + args = vars(parser.parse_args()) + return SimpleNamespace(**args) diff --git a/package/src/sails/prediction/config.py b/package/src/sails/prediction/config.py new file mode 100644 index 0000000..3356070 --- /dev/null +++ b/package/src/sails/prediction/config.py @@ -0,0 +1,25 @@ +import dataclasses +import enum + + +@dataclasses.dataclass +class Configuration: + """Configuration for Sails""" + + use_gpu: bool = False + n_threads: int | None = None + disable_progress_bar: bool = True + compute_entire_unit_cell: bool = True + compute_variance: bool = False + use_raw_values: bool = False + spacing: float = 0.7 + box_size: int = 128 + channels: int = 2 + overlap: int = 64 + + +class MapType(enum.Enum): + """Map types for sails, i.e. 
model will output 1 for protein...""" + + glycan: int = 1 + protein: int = 2 diff --git a/package/src/sails/prediction/errors.py b/package/src/sails/prediction/errors.py new file mode 100644 index 0000000..d6691f7 --- /dev/null +++ b/package/src/sails/prediction/errors.py @@ -0,0 +1,29 @@ +import logging +from typing import List + + +def show_missing_model_error(): + """Show error when no models are found""" + logging.critical(""" + No models have been found in either site_packages or CCP4/lib/data. + You can install models using the command: + sails-install -m {binary,multiclass} + """) + + +def show_missing_specified_model_error(model_name: str): + """Show error when model with specified name is not found""" + logging.critical(f""" + No model with the name {model_name} has been found in either site_packages or CCP4/lib/data.""") + + +def show_multiple_model_error(model_names: List[str]): + """Show warning when multiple models are found""" + multiple_model_names = "" + for model_name in model_names: + multiple_model_names += f"\t-model {model_name}\n" + + logging.warning(f""" + Multiple models have been found in either site_packages or CCP4/lib/data. 
+ Please specify either: + {multiple_model_names}""") diff --git a/package/src/sails/prediction/grid_tools.py b/package/src/sails/prediction/grid_tools.py new file mode 100644 index 0000000..7347503 --- /dev/null +++ b/package/src/sails/prediction/grid_tools.py @@ -0,0 +1,87 @@ +import gemmi +import numpy as np +from typing import Tuple, List + +from .config import Configuration + + +def interpolate_grid( + grid: gemmi.FloatGrid, configuration: Configuration +) -> Tuple[np.ndarray, gemmi.Transform]: + """Interpolate grid to 0.7A grid spacing surrounding the unit cell and return interpolated grid and transform.""" + if configuration.compute_entire_unit_cell: + extent = gemmi.FractionalBox() + extent.extend(gemmi.Fractional(0, 0, 0)) + extent.extend(gemmi.Fractional(1, 1, 1)) + else: + extent = gemmi.find_asu_brick(grid.spacegroup).get_extent() + + box = grid.unit_cell.orthogonalize_box(extent) + margin = configuration.spacing * (configuration.box_size // 2) + box.add_margin(margin) + size = box.get_size() + numx = -( + -int(size.x / configuration.spacing) + // configuration.overlap + * configuration.overlap + ) + numy = -( + -int(size.y / configuration.spacing) + // configuration.overlap + * configuration.overlap + ) + numz = -( + -int(size.z / configuration.spacing) + // configuration.overlap + * configuration.overlap + ) + array = np.zeros((numx, numy, numz), dtype=np.float32) + scale = gemmi.Mat33(configuration.spacing * np.eye(3)) + transform: gemmi.Transform = gemmi.Transform(scale, box.minimum) + grid.interpolate_values(array, transform) + return array, transform + + +def precompute_slices(grid_shape: np.ndarray, overlap: int = 16) -> List[List[int]]: + """Precompute indices of slices to run inference on.""" + slices = [] + + for i in range(0, grid_shape[0] - overlap, overlap): + for j in range(0, grid_shape[1] - overlap, overlap): + for k in range(0, grid_shape[2] - overlap, overlap): + slices.append([i, j, k]) + return slices + + +def reinterpolate_grid( + 
work_array: np.ndarray, + transform: gemmi.Transform, + template_grid: gemmi.FloatGrid, + compute_entire_unit_cell: bool = True, +) -> gemmi.FloatGrid: + """Reinterpolate grid to original unit cell.""" + + output_grid = gemmi.FloatGrid() + output_grid.spacegroup = template_grid.spacegroup + output_grid.set_unit_cell(template_grid.unit_cell) + + grid_spacing = 0.7 + output_grid.set_size(*template_grid.shape) + size_x = work_array.shape[0] * grid_spacing + size_y = work_array.shape[1] * grid_spacing + size_z = work_array.shape[2] * grid_spacing + + array_cell = gemmi.UnitCell(size_x, size_y, size_z, 90, 90, 90) + array_grid = gemmi.FloatGrid(work_array, array_cell) + + if compute_entire_unit_cell: + grid_iterable = output_grid + else: + grid_iterable = output_grid.masked_asu() + + for point in grid_iterable: + position = output_grid.point_to_position(point) - gemmi.Position(transform.vec) + point.value = array_grid.interpolate_value(position) + + output_grid.symmetrize_max() + return output_grid diff --git a/package/src/sails/prediction/load.py b/package/src/sails/prediction/load.py new file mode 100644 index 0000000..e9e4791 --- /dev/null +++ b/package/src/sails/prediction/load.py @@ -0,0 +1,75 @@ +from pathlib import Path +import gemmi +import numpy as np +from typing import List +from .util import find_map_coefficients, check_density_path +import onnxruntime as rt +import sys +import logging + + +def load_mtz( + path: Path | str, + column_names: List[str] | None, + resolution_cutoff: float | None, +) -> gemmi.FloatGrid: + """Load MTZ file and transform to map with 0.7A grid spacing and with resolution cutoff if specified.""" + mtz = gemmi.read_mtz_file(str(path)) + if None in column_names: + logging.warning( + "No map coefficients were specified, NucleoFind will try and find some but they may be wrong." 
+ ) + column_names = find_map_coefficients(mtz) + + res = mtz.resolution_high() + spacing = 0.7 + sample_rate = res / spacing + grid = mtz.transform_f_phi_to_map(*column_names, sample_rate=sample_rate) + grid.normalize() + if resolution_cutoff: + data = np.array(mtz, copy=False) + mtz.set_data(data[mtz.make_d_array() >= resolution_cutoff]) + return grid + + +def load_map(path: Path | str) -> gemmi.FloatGrid: + """Load map file and normalize""" + map = gemmi.read_ccp4_map(str(path)) + grid = map.grid + grid.normalize() + return grid + + +def load_density( + density_path: Path | str, + column_names: List[str] | None, + resolution_cutoff: float | None, +) -> gemmi.FloatGrid: + """Load density from MTZ file, or map file""" + density_path = check_density_path(density_path) + + if density_path.suffix == ".mtz": + return load_mtz(density_path, column_names, resolution_cutoff) + else: + return load_map(density_path) + + +def load_onnx_model( + model_path: Path | str, use_gpu: bool = True +) -> rt.InferenceSession: + """Load ONNX model from model_path""" + providers = ["CPUExecutionProvider"] + if use_gpu: + providers.insert(0, "CUDAExecutionProvider") + sess_options = rt.SessionOptions() + sess_options.intra_op_num_threads = 1 + try: + return rt.InferenceSession( + str(model_path), providers=providers, sess_options=sess_options + ) + except OSError: + logging.critical( + "This model is corrupted, perhaps due to an incomplete download. 
Try downloading it again with " + "nucleofind-install -m TYPE --reinstall" + ) + sys.exit(1) diff --git a/package/src/sails/prediction/model.py b/package/src/sails/prediction/model.py new file mode 100644 index 0000000..2da573f --- /dev/null +++ b/package/src/sails/prediction/model.py @@ -0,0 +1,225 @@ +import os +import site +import sys +import urllib +from pathlib import Path +import logging +from types import SimpleNamespace +from typing import Tuple, List + +import requests +from enum import Enum +import re +import hashlib + +from .errors import ( + show_missing_model_error, + show_multiple_model_error, + show_missing_specified_model_error, +) + + +class ModelType(Enum): + """Types of sails Model available""" + + binary = 1 + multiclass = 2 + + +def calculate_sha256(file_path: Path): + """Calculate SHA256 hash of file""" + logging.debug("Calculating SHA256 hash for %s", file_path) + with open(file_path, "rb") as f: + file_hash = hashlib.sha256() + while chunk := f.read(4096): + file_hash.update(chunk) + return file_hash.hexdigest() + + +def calculate_size(file_path: Path) -> int: + """Calculate size of file""" + logging.debug("Calculating size for %s", file_path) + return file_path.stat().st_size + + +def get_latest_model_metadata(type: ModelType, latest_model: str) -> Tuple[str, str]: + """Get latest model metadata from HuggingFace""" + url = f"https://huggingface.co/dialpuri/sails-{type.name}/raw/main/{latest_model}" + response = requests.get(url) + text = response.text + sha_match = re.search(r"sha256:([a-f0-9]+)", text) + if not sha_match: + raise RuntimeError("Failed to get SHA256 hash from model metadata") + sha256 = sha_match.group(1) + + size_match = re.search(r"size ([0-9]+)", text) + if not size_match: + raise RuntimeError("Failed to get size from model metadata") + size = size_match.group(1) + return sha256, size + + +def is_model_valid(type: ModelType, model_path: Path, latest_model: str) -> bool: + """Compare current model hash with latest model 
hash""" + current_model_hash = calculate_sha256(model_path) + + latest_model_hash, latest_model_size = get_latest_model_metadata(type, latest_model) + if latest_model_hash != current_model_hash: + logging.info("Latest model and current modal checksum failed") + return False + return True + + +def get_latest_model(type: ModelType) -> str: + """Query the HuggingFace API to get URL for latest model""" + base_url = "https://huggingface.co/api/models/Dialpuri/Sails" + url = f"{base_url}-{type.name}" + logging.debug("Getting latest model for %s from %s", type.name, url) + response = requests.get(url) + json = response.json() + if not json: + raise RuntimeError("Failed to get model URL") + + siblings = json.get("siblings", None) + if not siblings: + raise RuntimeError("Failed to get siblings from model") + + possible_models = [] + for filename in siblings: + file = filename.get("rfilename", "") + if file.endswith(".onnx"): + possible_models.append(file) + + # Get latest model out of list based on date + possible_models = sorted(possible_models, reverse=True) + latest_model = possible_models[0] + logging.debug("Latest model for %s is %s", type.name, latest_model) + return latest_model + + +def download_model( + type: ModelType, folder: Path, reinstall: bool = False, dry_run: bool = False +): + """Download model from HuggingFace""" + latest_model = get_latest_model(type) + sails_model_dir = folder / "sails_models" + sails_model_dir.mkdir(exist_ok=True) + model_path = sails_model_dir / f"sails-{type.name}.onnx" + + # Check if model already exists and is the latest version. + if model_path.exists() and not reinstall: + status = is_model_valid(type, model_path, latest_model) + if not status: + logging.warning( + "A model was found but did not pass the latest validation checks, it may be corrupted, or a newer version available. 
" + "To update the model, run `sails-install --update`" + ) + else: + logging.warning( + "Model already exists at %s, skipping download.", model_path + ) + return + + url = f"https://huggingface.co/dialpuri/Sails-{type.name}/resolve/main/{latest_model}?download=true" + logging.debug("Downloading model from %s", url) + if not dry_run: + urllib.request.urlretrieve(url, model_path) + + if not is_model_valid(type, model_path, latest_model): + logging.error("Model verification failed, model may be corrupted.") + + +def find_all_potential_models(): + """Find all potential models in site-packages and CCP4""" + model_extension = "*.onnx" + + potential_models = [] + + for pkg in site.getsitepackages(): + model_directory = Path(pkg) / "sails_models" + models = list(model_directory.glob(model_extension)) + potential_models += models + + clibd = Path(os.environ.get("CLIBD", "")) + if not clibd.exists() and not potential_models: + logging.warning( + """CCP4 Environment Variable - CLIBD is not found. + You can try sourcing it: + Ubuntu - source /opt/xtal/ccp4-X.X/bin/ccp4.setup-sh + MacOS - source /Applications/ccp4-X.X/bin/ccp4.setup-sh + """ + ) + return + + ccp4_model_path = clibd / "sails_models" + if not ccp4_model_path.exists() and not potential_models: + show_missing_model_error() + return + + potential_models += list(ccp4_model_path.glob(model_extension)) + + if not potential_models: + show_missing_model_error() + sys.exit(1) + + return [Path(x) for x in potential_models] + + +def extract_model_names(models: List[Path]) -> List[str]: + """Extract model names from model paths""" + model_names = [] + for model in models: + match = re.search(r"sails-(\w+).onnx", model.name) + if not match: + raise RuntimeError( + "Failed to extract model name from model path, have the models been renamed? Please report this issue on GitHub." 
+ ) + name = match.group(1) + model_names.append(name) + return model_names + + +def find_model(model: ModelType | str | None) -> Path | None: + """Search through site-packages and CCP4/lib/data for a potential model""" + potential_models = find_all_potential_models() + if not potential_models: + sys.exit(1) + + if not model and len(potential_models) == 1: + return Path(potential_models[0]) + + model_names = extract_model_names(potential_models) + if not model: + show_multiple_model_error(model_names) + sys.exit(1) + + if isinstance(model, ModelType): + specified_model_name = model.name + else: + specified_model_name = model + + for name in model_names: + if name == specified_model_name: + return Path(potential_models[model_names.index(name)]) + + show_missing_specified_model_error(specified_model_name) + sys.exit(1) + + +def get_model_config(model_path: Path, overlap: int | None) -> SimpleNamespace: + """Get model configuration from model type""" + model_type = model_path.stem.removeprefix("sails-") + if model_type not in ModelType.__members__: + raise RuntimeError(f"Invalid model type - {model_type}") + model_type = ModelType[model_type] + match model_type: + case ModelType.binary: + return SimpleNamespace( + box_size=128, overlap=64 if overlap is None else overlap + ) + case ModelType.multiclass: + return SimpleNamespace( + box_size=128, overlap=64 if overlap is None else overlap + ) + case _: + raise RuntimeError(f"Invalid model type - {model_type}") diff --git a/package/src/sails/prediction/predict.py b/package/src/sails/prediction/predict.py new file mode 100644 index 0000000..2838273 --- /dev/null +++ b/package/src/sails/prediction/predict.py @@ -0,0 +1,219 @@ +import functools +import logging +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import List, Tuple +import numpy as np +from tqdm import tqdm + +from .load import load_density, load_onnx_model +from .grid_tools import interpolate_grid, reinterpolate_grid, 
precompute_slices +from .arguments import parse_arguments +from .model import find_model, get_model_config +from .save import save_grid +from ..logs import setup_logging +from .config import Configuration, MapType + + +class Sails: + def __init__(self, model_path: Path | str, configuration: Configuration): + self.model_path = model_path + self.configuration = configuration + + self.model = None + self.predicted_grids = {} + + def _process_sample( + self, + input_name: str, + output_shape: Tuple[int], + box_size: int, + array: np.ndarray[np.float32], + translation: Tuple[int, int, int], + ) -> Tuple[np.ndarray, Tuple[int, int, int]]: + """Perform inference on a single sample of shape (1, box_size, box_size, box_size, 1) and return an array of shape + (box_size, box_size, box_size, output_channels) and the translation (for putting back into an array). + """ + i, j, k = translation + input_sub = array[i : i + box_size, j : j + box_size, k : k + box_size] + input_sub = input_sub[np.newaxis, ..., np.newaxis].astype(np.float32) + + return np.array(self.model.run(None, {input_name: input_sub})).reshape( + output_shape + ), translation + + def _run_prediction(self, work_grid: np.ndarray) -> np.ndarray: + """Run prediction on work_grid and calculate the average predicted grid""" + work_grid_shape = np.array(work_grid.shape) + slices = precompute_slices(work_grid_shape, overlap=self.configuration.overlap) + box_size = self.configuration.box_size + + total_array = np.zeros((*work_grid_shape, 2), dtype=np.float32) + count_array = np.zeros_like(total_array, dtype=np.float32) + + # Variance arrays for Welch's one pass variance method + variance_mean = np.zeros_like(total_array, dtype=np.float32) + variance_m2 = np.zeros_like(total_array, dtype=np.float32) + + channels = self.configuration.channels + input_name = self.model.get_inputs()[0].name + output_shape = (box_size, box_size, box_size, channels) + process_sample_worker = functools.partial( + self._process_sample, + 
input_name, + output_shape, + box_size, + work_grid, + ) + + miniters = 1_000 if len(slices) > 10_000 else 1 + max_workers = self.configuration.n_threads + if max_workers == 1: + results = list( + tqdm( + map(process_sample_worker, slices), + total=len(slices), + desc="Predicting", + miniters=miniters, + disable=self.configuration.disable_progress_bar, + ) + ) + else: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + results = list( + tqdm( + executor.map(process_sample_worker, slices), + total=len(slices), + desc="Predicting", + miniters=miniters, + disable=self.configuration.disable_progress_bar, + ) + ) + + ones = np.ones(channels) + for result in tqdm(results, desc="Processing results"): + predicted_sub, (i, j, k) = result + box_slice = ( + slice(i, i + box_size), + slice(j, j + box_size), + slice(k, k + box_size), + slice(None), + ) + total_array[box_slice] += predicted_sub + count_array[box_slice] += ones + + if self.configuration.compute_variance: + delta_variance = total_array[box_slice] - variance_mean[box_slice] + variance_mean[box_slice] += delta_variance / count_array[box_slice] + variance_m2[box_slice] += delta_variance * ( + total_array[box_slice] - variance_mean[box_slice] + ) + + predicted_array = total_array / count_array + if self.configuration.use_raw_values: + return predicted_array.astype(np.float32) + + if self.configuration.compute_variance: + variance_array = variance_m2 / (np.subtract(count_array, 1)) + return variance_array.astype(np.float32) + + argmax_array = np.argmax(predicted_array, axis=-1).squeeze() + return argmax_array.astype(np.float32) + + def predict( + self, + density_path: Path | str, + column_names: List[str] | List[None], + resolution_cutoff: float | None = None, + ): + """Run a sails prediction on specified density file. If density file is an MTZ, supply column names and an + optional resolution cutoff. 
If density file is a MAP, these will be ignored.""" + self.model = load_onnx_model(self.model_path, self.configuration.use_gpu) + input_grid = load_density(density_path, column_names, resolution_cutoff) + + work_grid, transform = interpolate_grid(input_grid, self.configuration) + predicted_array = self._run_prediction(work_grid) + + rounded_array = np.round(predicted_array) + for i in range(1, self.configuration.channels): + if self.configuration.use_raw_values or self.configuration.compute_variance: + index_array = predicted_array[:, :, :, i].astype(np.float32) + else: + index_array = (rounded_array == i).astype(np.float32) + + interpolated_index_array = reinterpolate_grid( + index_array, + transform, + input_grid, + self.configuration.compute_entire_unit_cell, + ) + self.predicted_grids[MapType(i)] = interpolated_index_array + + def save_grid(self, type: MapType, output_path: Path | str): + """Save the predicted grid to directory specified by output_path, with filename sails-{type}.map.""" + output_path = Path(output_path) + output_path.mkdir(exist_ok=True, parents=True) + logging.info(f"Saving grid of type {type} to {output_path}") + + suffix = ".map" + suffix = ".variance.map" if self.configuration.compute_variance else suffix + suffix = ".raw.map" if self.configuration.use_raw_values else suffix + + save_grid( + self.predicted_grids[type], + output_path / f"sails-{type.name}{suffix}", + ) + + +def run(): + """Run prediction from command line arguments""" + setup_logging() + args = parse_arguments() + model_path = find_model(args.model) + model_configuration = get_model_config(model_path, args.overlap) + configuration = Configuration( + use_gpu=args.gpu, + disable_progress_bar=args.silent, + compute_entire_unit_cell=False, + use_raw_values=args.raw, + compute_variance=args.variance, + n_threads=args.nthreads, + **vars(model_configuration), + ) + sails = Sails(model_path, configuration) + sails.predict( + args.input, + [args.amplitude, args.phase], + ) + 
def find_map_coefficients(mtz: "gemmi.Mtz") -> Tuple[str, str]:
    """Choose the amplitude (F) and phase (P) column labels of an MTZ file.

    The ``FWT``/``PHWT`` pair is preferred whenever both labels are present;
    otherwise the first column of each type is used.

    :param mtz: object exposing ``columns_with_type(type)`` whose items carry
        a ``label`` attribute.
    :return: ``(amplitude_label, phase_label)`` as plain strings.

    Logs a critical message and exits with status 1 when the file has no
    F-type column or no P-type column.
    """
    amplitude_labels = [column.label for column in mtz.columns_with_type("F")]
    phase_labels = [column.label for column in mtz.columns_with_type("P")]

    if not amplitude_labels or not phase_labels:
        logging.critical("No F and P columns found in MTZ file.")
        sys.exit(1)

    if "FWT" in amplitude_labels and "PHWT" in phase_labels:
        logging.warning("FWT and PHWT found, using them.")
        return "FWT", "PHWT"

    # F and P are already label strings here (the Column objects were
    # unwrapped above), so they are returned directly.
    F, P = amplitude_labels[0], phase_labels[0]
    if len(amplitude_labels) != 1 or len(phase_labels) != 1:
        logging.warning(f"Multiple F and P columns found. Using first set. {F=}, {P=}")
    return F, P


def check_density_path(density_path):
    """Check that the density path exists and has a supported extension.

    Only the trailing extension is inspected (plus the one before it when the
    file is gzip-compressed), so dots elsewhere in the file name, such as
    ``1abc.final.mtz``, do not cause a rejection. A file without any
    extension is rejected.

    :param density_path: path to the density file, as ``str`` or ``Path``.
    :return: the validated path as a ``Path``.

    Logs a critical message and exits with status 1 when the file is missing
    or its extension is not supported.
    """
    allowed_extensions = [".mtz", ".map", ".ccp4", ".mrc", ".gz"]
    density_path = Path(density_path)
    if not density_path.exists():
        logging.critical(f"Density file {density_path} does not exist.")
        sys.exit(1)

    suffixes = density_path.suffixes
    # For "name.map.gz" validate both ".map" and ".gz"; otherwise only the
    # final suffix decides the file type.
    relevant = suffixes[-2:] if suffixes[-1:] == [".gz"] else suffixes[-1:]
    if not relevant or any(suffix not in allowed_extensions for suffix in relevant):
        logging.critical(f"Density file must be one of {allowed_extensions}")
        sys.exit(1)
    return density_path
a/package/gemmi/CMakeLists.txt +++ b/package/gemmi/CMakeLists.txt @@ -39,6 +39,7 @@ add_library(gemmi_cpp STATIC ${gemmi_src}/src/sprintf.cpp ${gemmi_src}/src/to_mmcif.cpp ${gemmi_src}/src/to_pdb.cpp + ${gemmi_src}/src/pdb.cpp ${gemmi_src}/src/monlib.cpp ${gemmi_src}/src/topo.cpp ${gemmi_src}/src/xds_ascii.cpp @@ -63,7 +64,7 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/ccp4.hpp ${gemmi_src}/include/gemmi/cellred.hpp ${gemmi_src}/include/gemmi/chemcomp.hpp - ${gemmi_src}/include/gemmi/chemcomp_xyz.hpp + # ${gemmi_src}/include/gemmi/chemcomp_xyz.hpp ${gemmi_src}/include/gemmi/cif.hpp ${gemmi_src}/include/gemmi/cif2mtz.hpp ${gemmi_src}/include/gemmi/cifdoc.hpp @@ -93,7 +94,7 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/levmar.hpp ${gemmi_src}/include/gemmi/linkhunt.hpp ${gemmi_src}/include/gemmi/math.hpp -# ${gemmi_src}/include/gemmi/merge.hpp + # ${gemmi_src}/include/gemmi/merge.hpp ${gemmi_src}/include/gemmi/metadata.hpp ${gemmi_src}/include/gemmi/mmcif.hpp ${gemmi_src}/include/gemmi/mmcif_impl.hpp @@ -118,7 +119,7 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/recgrid.hpp ${gemmi_src}/include/gemmi/reciproc.hpp ${gemmi_src}/include/gemmi/refln.hpp - ${gemmi_src}/include/gemmi/remarks.hpp + # ${gemmi_src}/include/gemmi/remarks.hpp ${gemmi_src}/include/gemmi/resinfo.hpp ${gemmi_src}/include/gemmi/riding_h.hpp ${gemmi_src}/include/gemmi/scaling.hpp @@ -153,7 +154,7 @@ set(gemmi_HEADERS set(gemmi_third_party-headers_HEADERS ${gemmi_src}/include/gemmi/third_party/fast_float.h ${gemmi_src}/include/gemmi/third_party/pocketfft_hdronly.h -# ${gemmi_src}/include/gemmi/third_party/sajson.h + # ${gemmi_src}/include/gemmi/third_party/sajson.h ${gemmi_src}/include/gemmi/third_party/tinydir.h ) @@ -198,7 +199,7 @@ set(gemmi_third_party_tao_pegtl_analysis-headers_HEADERS ${gemmi_src}/include/gemmi/third_party/tao/pegtl/analysis/generic.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/analysis/grammar_info.hpp 
${gemmi_src}/include/gemmi/third_party/tao/pegtl/analysis/insert_guard.hpp - ${gemmi_src}/include/gemmi/third_party/tao/pegtl/analysis/insert_rules.hpp + # ${gemmi_src}/include/gemmi/third_party/tao/pegtl/analysis/insert_rules.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/analysis/rule_info.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/analysis/rule_type.hpp ) @@ -217,7 +218,7 @@ set(gemmi_third_party_tao_pegtl_internal-headers_HEADERS ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/bof.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/bol.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/bump_help.hpp - ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/bump_impl.hpp + # ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/bump_impl.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/bytes.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/control.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/cr_crlf_eol.hpp @@ -240,7 +241,7 @@ set(gemmi_third_party_tao_pegtl_internal-headers_HEADERS ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/eof.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/eol.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/eolf.hpp - ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/file_mapper.hpp + # ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/file_mapper.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/file_opener.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/file_reader.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/has_apply.hpp @@ -262,7 +263,7 @@ set(gemmi_third_party_tao_pegtl_internal-headers_HEADERS ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/list_tail.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/list_tail_pad.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/marker.hpp - 
${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/minus.hpp + # ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/minus.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/must.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/not_at.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/one.hpp @@ -284,7 +285,7 @@ set(gemmi_third_party_tao_pegtl_internal-headers_HEADERS ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/rep_opt.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/require.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/result_on_found.hpp - ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/rule_conjunction.hpp + # ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/rule_conjunction.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/rules.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/seq.hpp ${gemmi_src}/include/gemmi/third_party/tao/pegtl/internal/skip_control.hpp diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index f09bf77..8d25276 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -54,7 +54,7 @@ NB_MODULE(sails_module, m) { nb::class_(m, "Model") .def(nb::init<>()) - .def_rw("name", &gemmi::Model::name) + .def_rw("num", &gemmi::Model::num) .def_rw("chains", &gemmi::Model::chains); nb::bind_vector >(m, "Chains"); diff --git a/package/src/cpp/sails-utils.cpp b/package/src/cpp/sails-utils.cpp index 2c9c33d..d4f3673 100644 --- a/package/src/cpp/sails-utils.cpp +++ b/package/src/cpp/sails-utils.cpp @@ -81,7 +81,7 @@ std::string Sails::Utils::linkage_to_id(const Sails::LinkageData &data) { void Sails::Utils::save_residues_to_file(std::vector residues, const std::string &path) { gemmi::Structure structure; - gemmi::Model model = gemmi::Model("A"); + gemmi::Model model = gemmi::Model(0); gemmi::Chain chain = gemmi::Chain("A"); for (auto& residue : 
residues) { chain.residues.push_back(residue); diff --git a/package/src/cpp/sails-wurcs.cpp b/package/src/cpp/sails-wurcs.cpp index 3faf9a2..70ecaa9 100644 --- a/package/src/cpp/sails-wurcs.cpp +++ b/package/src/cpp/sails-wurcs.cpp @@ -238,7 +238,7 @@ std::vector Sails::WURCS::form_residue_name_order(ResidueDatabase & gemmi::Structure Sails::WURCS::generate_pseudo_structure() { gemmi::Structure pseudo_structure; - gemmi::Model pseudo_model = gemmi::Model(""); + gemmi::Model pseudo_model = gemmi::Model(0); gemmi::Chain chain = gemmi::Chain("A"); pseudo_model.chains.emplace_back(chain); From 91a412410416a2bec80ecbb5ae14b8b7ffa93ca8 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 7 Oct 2025 13:36:31 +0100 Subject: [PATCH 03/56] Added sails-find to predicted density and updated to gemmi 0.7.3 --- package/CMakeLists.txt | 5 +- package/gemmi/CMakeLists.txt | 26 +-- package/src/bindings/python_sails.cpp | 5 + package/src/cpp/sails-predictions.cpp | 88 ++++++++++ package/src/cpp/sails.cpp | 13 ++ package/src/include/sails-model.h | 22 ++- package/src/include/sails-predictions.h | 34 ++++ package/src/sails/__init__.py | 2 + package/src/sails/find.py | 203 +++++++++++++++++++++--- package/src/sails/glycosylate.py | 49 ++++-- package/src/sails/interface.py | 2 +- package/src/sails/prediction/predict.py | 14 +- 12 files changed, 412 insertions(+), 51 deletions(-) create mode 100644 package/src/cpp/sails-predictions.cpp create mode 100644 package/src/include/sails-predictions.h diff --git a/package/CMakeLists.txt b/package/CMakeLists.txt index 18248fd..59387da 100644 --- a/package/CMakeLists.txt +++ b/package/CMakeLists.txt @@ -50,7 +50,7 @@ FetchContent_Declare( # Download Gemmi FetchContent_Declare( gemmi-dependencies - URL http://www.ysbl.york.ac.uk/jsd523/gemmi-v0.7.0.tar.gz + URL https://github.com/Dialpuri/gemmi-bundles/raw/refs/heads/main/gemmi-0.7.3.tar.gz ) FetchContent_MakeAvailable(clipper-dependencies 
mmdb2-dependencies fftw-dependencies ccp4-dependencies gemmi-dependencies) @@ -101,8 +101,9 @@ add_library( ${WRK_DIR}/src/cpp/sails-telemetry.cpp ${WRK_DIR}/src/cpp/sails-solvent.cpp ${WRK_DIR}/src/cpp/sails-wurcs.cpp - + ${WRK_DIR}/src/cpp/sails-predictions.cpp ${WRK_DIR}/src/cpp/sails-morph.cpp + # Density ${WRK_DIR}/src/cpp/density/sails-density.cpp ${WRK_DIR}/src/cpp/density/sails-xtal-density.cpp diff --git a/package/gemmi/CMakeLists.txt b/package/gemmi/CMakeLists.txt index 8c8a539..9451544 100644 --- a/package/gemmi/CMakeLists.txt +++ b/package/gemmi/CMakeLists.txt @@ -22,25 +22,31 @@ add_library(gemmi_cpp STATIC ${gemmi_src}/src/align.cpp ${gemmi_src}/src/assembly.cpp ${gemmi_src}/src/calculate.cpp + ${gemmi_src}/src/ccp4.cpp ${gemmi_src}/src/crd.cpp ${gemmi_src}/src/ddl.cpp + ${gemmi_src}/src/dssp.cpp ${gemmi_src}/src/eig3.cpp + ${gemmi_src}/src/fprime.cpp ${gemmi_src}/src/gz.cpp ${gemmi_src}/src/intensit.cpp ${gemmi_src}/src/json.cpp ${gemmi_src}/src/mmcif.cpp ${gemmi_src}/src/mmread_gz.cpp + ${gemmi_src}/src/monlib.cpp ${gemmi_src}/src/mtz.cpp ${gemmi_src}/src/mtz2cif.cpp ${gemmi_src}/src/polyheur.cpp + ${gemmi_src}/src/pdb.cpp ${gemmi_src}/src/read_cif.cpp ${gemmi_src}/src/resinfo.cpp ${gemmi_src}/src/riding_h.cpp + ${gemmi_src}/src/select.cpp ${gemmi_src}/src/sprintf.cpp + ${gemmi_src}/src/symmetry.cpp + ${gemmi_src}/src/to_json.cpp ${gemmi_src}/src/to_mmcif.cpp ${gemmi_src}/src/to_pdb.cpp - ${gemmi_src}/src/pdb.cpp - ${gemmi_src}/src/monlib.cpp ${gemmi_src}/src/topo.cpp ${gemmi_src}/src/xds_ascii.cpp ) @@ -64,7 +70,6 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/ccp4.hpp ${gemmi_src}/include/gemmi/cellred.hpp ${gemmi_src}/include/gemmi/chemcomp.hpp - # ${gemmi_src}/include/gemmi/chemcomp_xyz.hpp ${gemmi_src}/include/gemmi/cif.hpp ${gemmi_src}/include/gemmi/cif2mtz.hpp ${gemmi_src}/include/gemmi/cifdoc.hpp @@ -73,6 +78,7 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/ddl.hpp ${gemmi_src}/include/gemmi/dencalc.hpp 
${gemmi_src}/include/gemmi/dirwalk.hpp + ${gemmi_src}/include/gemmi/dssp.hpp ${gemmi_src}/include/gemmi/ecalc.hpp ${gemmi_src}/include/gemmi/eig3.hpp ${gemmi_src}/include/gemmi/elem.hpp @@ -87,20 +93,21 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/grid.hpp ${gemmi_src}/include/gemmi/gz.hpp ${gemmi_src}/include/gemmi/input.hpp + ${gemmi_src}/include/gemmi/intensit.hpp ${gemmi_src}/include/gemmi/interop.hpp ${gemmi_src}/include/gemmi/it92.hpp ${gemmi_src}/include/gemmi/iterator.hpp ${gemmi_src}/include/gemmi/json.hpp ${gemmi_src}/include/gemmi/levmar.hpp ${gemmi_src}/include/gemmi/linkhunt.hpp + ${gemmi_src}/include/gemmi/logger.hpp ${gemmi_src}/include/gemmi/math.hpp - # ${gemmi_src}/include/gemmi/merge.hpp ${gemmi_src}/include/gemmi/metadata.hpp - ${gemmi_src}/include/gemmi/mmcif.hpp ${gemmi_src}/include/gemmi/mmcif_impl.hpp + ${gemmi_src}/include/gemmi/mmcif.hpp ${gemmi_src}/include/gemmi/mmdb.hpp - ${gemmi_src}/include/gemmi/mmread.hpp ${gemmi_src}/include/gemmi/mmread_gz.hpp + ${gemmi_src}/include/gemmi/mmread.hpp ${gemmi_src}/include/gemmi/model.hpp ${gemmi_src}/include/gemmi/modify.hpp ${gemmi_src}/include/gemmi/monlib.hpp @@ -109,17 +116,15 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/neighbor.hpp ${gemmi_src}/include/gemmi/neutron92.hpp ${gemmi_src}/include/gemmi/numb.hpp - ${gemmi_src}/include/gemmi/pdb.hpp ${gemmi_src}/include/gemmi/pdb_id.hpp + ${gemmi_src}/include/gemmi/pdb.hpp ${gemmi_src}/include/gemmi/pirfasta.hpp ${gemmi_src}/include/gemmi/polyheur.hpp ${gemmi_src}/include/gemmi/qcp.hpp ${gemmi_src}/include/gemmi/read_cif.hpp - ${gemmi_src}/include/gemmi/read_map.hpp ${gemmi_src}/include/gemmi/recgrid.hpp ${gemmi_src}/include/gemmi/reciproc.hpp ${gemmi_src}/include/gemmi/refln.hpp - # ${gemmi_src}/include/gemmi/remarks.hpp ${gemmi_src}/include/gemmi/resinfo.hpp ${gemmi_src}/include/gemmi/riding_h.hpp ${gemmi_src}/include/gemmi/scaling.hpp @@ -127,6 +132,7 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/seqalign.hpp 
${gemmi_src}/include/gemmi/seqid.hpp ${gemmi_src}/include/gemmi/seqtools.hpp + ${gemmi_src}/include/gemmi/serialize.hpp ${gemmi_src}/include/gemmi/sfcalc.hpp ${gemmi_src}/include/gemmi/small.hpp ${gemmi_src}/include/gemmi/smcif.hpp @@ -147,7 +153,7 @@ set(gemmi_HEADERS ${gemmi_src}/include/gemmi/util.hpp ${gemmi_src}/include/gemmi/version.hpp ${gemmi_src}/include/gemmi/xds_ascii.hpp - + ${gemmi_src}/include/gemmi/xds2mtz.hpp ) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index 8d25276..70e0826 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -139,6 +139,7 @@ NB_MODULE(sails_module, m) { .def_rw("chain_idx", &Sails::Glycosite::chain_idx) .def_rw("residue_idx", &Sails::Glycosite::residue_idx) .def_rw("atom_idx", &Sails::Glycosite::atom_idx); + nb::bind_vector >(m, "GlycoSites"); nb::class_(m, "Dot") .def(nb::init()) @@ -197,6 +198,10 @@ NB_MODULE(sails_module, m) { nb::overload_cast &, int, std::string &, bool>(&o_mannosylate), "structure"_a, "grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("identify_predicted_sites", nb::overload_cast&, std::string &>(&identify_predicted_sites), + "structure"_a, "grid"_a, "resource_dir"_a); + + m.def("find_all_wurcs", &find_all_wurcs, "structure"_a, "resource_dir"_a); m.def("find_wurcs", &find_wurcs, "structure"_a, "chain"_a, "seqid"_a, "resource_dir"_a); m.def("model_wurcs", &model_wurcs, "structure"_a, "wurcs"_a, "chain"_a, "seqid"_a, "resource_dir"_a); diff --git a/package/src/cpp/sails-predictions.cpp b/package/src/cpp/sails-predictions.cpp new file mode 100644 index 0000000..097fda7 --- /dev/null +++ b/package/src/cpp/sails-predictions.cpp @@ -0,0 +1,88 @@ +// +// Created by Jordan Dialpuri on 07/10/2025. 
+// + +#include "../include/sails-predictions.h" + + +gemmi::NeighborSearch Sails::Predictions::create_neighbour_search(float threshold, gemmi::UnitCell unit_cell) { + + gemmi::Model model = gemmi::Model(0); + gemmi::Chain chain = gemmi::Chain("A"); + + int seqid = 0; + for (int u = 0; u < m_glycan_map.nu; u++) { + for (int v = 0; v < m_glycan_map.nv; v++) { + for (int w = 0; w < m_glycan_map.nw; w++) { + + gemmi::Grid<>::Point point = m_glycan_map.get_point(u, v, w); + if (*point.value < threshold) { + continue; + } + gemmi::Position position = m_glycan_map.point_to_position(point); + gemmi::Atom atom; + atom.name = "X"; + atom.element = gemmi::Element("C"); + atom.pos = position; + gemmi::Residue residue = gemmi::Residue(); + residue.name = "PRD"; + residue.seqid = gemmi::SeqId(++seqid, '0'); + residue.atoms.emplace_back(atom); + chain.residues.emplace_back(residue); + } + } + } + + model.chains = {chain}; + + gemmi::Structure s; + s.cell = m_glycan_map.unit_cell; + s.spacegroup_hm = m_glycan_map.spacegroup->hm; + s.models = {model}; + Utils::save_structure_to_file(s, "points.cif"); + + std::cout << m_glycan_map.unit_cell.a << " " << m_glycan_map.unit_cell.b << " " << m_glycan_map.unit_cell.c << " " + << m_glycan_map.unit_cell.alpha << " " << m_glycan_map.unit_cell.beta << " " << m_glycan_map.unit_cell.gamma << std::endl; + gemmi::NeighborSearch ns = {model, unit_cell, 2}; + ns.populate(); + return ns; +} + +Sails::Glycosites Sails::Predictions::find_potential_sites(gemmi::Structure &structure) { + + Glycosites potential_sites = {}; + + gemmi::NeighborSearch ns = create_neighbour_search(0.1, structure.cell); + + for (int m = 0; m < structure.models.size(); m++) { + for (int c = 0; c < structure.models[m].chains.size(); c++) { + for (int r = 0; r < structure.models[m].chains[c].residues.size(); r++) { + + Glycosite site = {m, c, r}; + gemmi::Residue residue = structure.models[m].chains[c].residues[r]; + std::string residue_name = residue.name; + + if 
(protein_donors.find(residue_name) == protein_donors.end()) { + continue; + } + + std::vector donor_sets = m_residue_database[residue_name].donors; + + for (const auto& donor_set : donor_sets) { + std::string last_donor_atom_name = donor_set.atom3; + gemmi::Atom* last_donor_atom = residue.find_atom(last_donor_atom_name, '*'); + auto nearby_points = ns.find_atoms(last_donor_atom->pos, '*', 0.1, 2); + + if (nearby_points.empty()) { + continue; + } + + potential_sites.emplace_back(site); + break; + } + } + } + } + + return potential_sites; +} diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 3c1e1c7..7228425 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -26,6 +26,8 @@ #include #include +#include "src/include/sails-predictions.h" + void print_rejection_dds(const Sails::Glycosite& s1, const Sails::Glycosite& s2, gemmi::Structure* structure, float score) { std::cout << "Removing " << Sails::Utils::format_residue_from_site(s1, structure) << "--" @@ -294,6 +296,17 @@ Sails::Output o_mannosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, true, verbose); } +Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& grid, std::string &resource_dir) { + std::string data_file = resource_dir + "/data.json"; + Sails::JSONLoader loader = {data_file}; + Sails::ResidueDatabase residue_database = loader.load_residue_database(); + Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); + auto predictions = Sails::Predictions(grid, linkage_database, residue_database); + + Sails::Glycosites potential_sites = predictions.find_potential_sites(structure); + return potential_sites; +} + // EM FUNCTIONS Sails::Output n_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, int cycles, std::string &resource_dir, diff --git a/package/src/include/sails-model.h b/package/src/include/sails-model.h index 
9260b81..e13062d 100644 --- a/package/src/include/sails-model.h +++ b/package/src/include/sails-model.h @@ -7,7 +7,8 @@ #include #include - +#include +#include #include namespace Sails { @@ -219,6 +220,25 @@ namespace Sails { typedef std::map > LinkageDatabase; + /** @brief Find protein donors in LinkageDatabase + * + */ + inline std::set find_protein_donors(LinkageDatabase &linkage_database) { + std::set acceptor_names = {}; + std::set donor_names = {}; + for (const auto& [donor_name, linkages]: linkage_database) { + for (auto& linkage: linkages) { + acceptor_names.insert(linkage.acceptor); + } + donor_names.insert(donor_name); + } + std::set difference = {}; + std::set_difference(donor_names.begin(), donor_names.end(), acceptor_names.begin(), + acceptor_names.end(), std::inserter(difference, difference.begin())); + return difference; + } + + /** * @class Glycosite * @brief A class representing a glycosite. diff --git a/package/src/include/sails-predictions.h b/package/src/include/sails-predictions.h new file mode 100644 index 0000000..e76d6de --- /dev/null +++ b/package/src/include/sails-predictions.h @@ -0,0 +1,34 @@ +// +// Created by Jordan Dialpuri on 07/10/2025. 
+// + +#ifndef SAILS_PREDICTIONS_H +#define SAILS_PREDICTIONS_H + +#include +#include +#include "sails-model.h" +#include "sails-utils.h" + +namespace Sails { + + class Predictions { + public: + explicit Predictions(gemmi::Grid<>& glycan_map, LinkageDatabase& linkage_database, ResidueDatabase& residue_database): m_glycan_map(glycan_map), m_residue_database(residue_database) { + protein_donors = find_protein_donors(linkage_database); + }; + + gemmi::NeighborSearch create_neighbour_search(float threshold, gemmi::UnitCell unit_cell); + + Glycosites find_potential_sites(gemmi::Structure &structure); + + private: + gemmi::Grid<>& m_glycan_map; + std::set protein_donors; + Sails::ResidueDatabase& m_residue_database; + }; + +} + + +#endif //SAILS_PREDICTIONS_H diff --git a/package/src/sails/__init__.py b/package/src/sails/__init__.py index c21863f..78d8a12 100644 --- a/package/src/sails/__init__.py +++ b/package/src/sails/__init__.py @@ -26,6 +26,7 @@ find_wurcs, model_wurcs, morph, + identify_predicted_sites, ) from .__version__ import __version__ from .glycosylate import glycosylate_xtal, glycosylate_em, Type @@ -79,4 +80,5 @@ "find_wurcs", "model_wurcs", "morph", + "identify_predicted_sites", ] diff --git a/package/src/sails/find.py b/package/src/sails/find.py index 0b40fbe..4194ef4 100644 --- a/package/src/sails/find.py +++ b/package/src/sails/find.py @@ -1,9 +1,16 @@ +import importlib +import time +from argparse import ArgumentError from collections import defaultdict from pathlib import Path from typing import List, Tuple import gemmi import argparse import json +from sails import identify_predicted_sites, GlycoSite +from .interface import get_sails_structure, get_sails_map +from .glycosylate import read_prediction_dir, save_log +from .prediction.predict import predict_map def find_n_glycosylation_sites(structure: gemmi.Structure): @@ -94,27 +101,7 @@ def format_sites( return d -def run(): - """ - Parse command-line arguments, read PDB model, find glycosylation 
sites, - and write the results to an output file in JSON format. - - :return: None - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-modelin", required=True, type=str, help="Path to a model in PDB or CIF format" - ) - parser.add_argument( - "-logout", - required=False, - default="sites.json", - type=str, - help="Path to output file", - ) - - args = parser.parse_args() - +def sequence_find(args: argparse.Namespace): pdb_path = Path(args.modelin) if not pdb_path.exists(): raise FileNotFoundError("Could not find specified file") @@ -132,3 +119,177 @@ def run(): with open(args.logout, "w") as f: json.dump(data, f, indent=4) + + +def convert_residue_name_to_type(residue_name: str) -> str: + n_glycans = ["ASN"] + o_glycans = ["SER", "THR"] + c_glycans = ["TRP"] + + if residue_name in n_glycans: + return "n-glycan" + elif residue_name in o_glycans: + return "o-glycan" + elif residue_name in c_glycans: + return "c-glycan" + return "x-glycan" + + +def convert_glycosites_to_log( + glycosites: List[GlycoSite], structure: gemmi.Structure | Path | str +): + if isinstance(structure, str) or isinstance(structure, Path): + structure = gemmi.read_structure(str(structure)) + + keys = defaultdict(list) + for glycosite in glycosites: + model = structure[glycosite.model_idx] + chain = model[glycosite.chain_idx] + residue = chain[glycosite.residue_idx] + key = f"{chain.name}-{residue.name}-{residue.seqid.num}" + keys[convert_residue_name_to_type(residue.name)].append(key) + + return keys + + +def get_amplitude_phase(args): + if "," not in args.colin_fwt: + raise ArgumentError("FWT column should be comma separated") + return args.colin_fwt.split(",") + + +def xray(args): + sails_structure = get_sails_structure(args.modelin) + resource = importlib.resources.files("sails").joinpath("data") + + if args.preddirin: + predicted_map = read_prediction_dir(args.preddirin) + else: + amplitude, phase = get_amplitude_phase(args) + predicted_map = predict_map( + "binary", + 
def run():
    """
    Parse command-line arguments and dispatch to the requested search mode.

    ``seq`` finds glycosylation sites from the model sequence; ``density``
    (with an ``xray`` or ``em`` source) identifies sites from predicted
    density. Results are written to the ``--logout`` file.

    :return: None
    """

    parser = argparse.ArgumentParser()

    subparsers = parser.add_subparsers(dest="mode", required=True)

    seq_parser = subparsers.add_parser("seq")
    seq_parser.add_argument(
        "--modelin",
        required=True,
        type=str,
        help="Path to a model in PDB or CIF format",
    )
    seq_parser.add_argument(
        "--logout",
        required=False,
        default="sites.json",
        type=str,
        help="Path to output file",
    )

    density_parser = subparsers.add_parser("density")
    density_subparser = density_parser.add_subparsers(dest="source", required=True)
    xray_parser = density_subparser.add_parser("xray")
    xray_parser.add_argument(
        "--mtzin", required=True, type=str, help="Path to mtz file"
    )
    xray_parser.add_argument(
        "--modelin",
        required=True,
        type=str,
        help="Path to a model in PDB or CIF format",
    )
    xray_parser.add_argument(
        "--preddirin",
        required=False,
        type=str,
        help="Path to a directory containing a previously predicted sails-glycan.map",
    )
    xray_parser.add_argument(
        "--logout",
        required=False,
        default="sites.json",
        type=str,
        help="Path to output file",
    )
    xray_parser.add_argument("--colin-fo", type=str, required=False, default="FP,SIGFP")
    xray_parser.add_argument(
        "--colin-fwt", type=str, required=False, default="FWT,PHWT"
    )

    em_parser = density_subparser.add_parser("em")
    em_parser.add_argument("--mapin", type=str, required=True)
    em_parser.add_argument(
        "--modelin",
        required=True,
        type=str,
        help="Path to a model in PDB or CIF format",
    )
    # The EM code path reads args.preddirin, so the option must exist on this
    # sub-parser too; it defaults to None when not supplied.
    em_parser.add_argument(
        "--preddirin",
        required=False,
        type=str,
        help="Path to a directory containing a previously predicted sails-glycan.map",
    )
    em_parser.add_argument(
        "--logout",
        required=False,
        default="sites.json",
        type=str,
        help="Path to output file",
    )

    args = parser.parse_args()
    if args.mode == "seq":
        sequence_find(args)
    elif args.mode == "density":
        density_find(args)
def read_prediction_dir(path: Path | str) -> gemmi.FloatGrid:
    """Load the predicted glycan map stored in a prediction directory.

    :param path: directory expected to contain ``sails-glycan.map``.
    :return: the grid of the CCP4 map read from that file.
    :raises FileNotFoundError: if ``sails-glycan.map`` is not present in
        ``path``.
    """
    glycan_map_file = Path(path) / "sails-glycan.map"
    if glycan_map_file.exists():
        return gemmi.read_ccp4_map(str(glycan_map_file)).grid
    raise FileNotFoundError(glycan_map_file)
required=True) + group.add_argument("--preddirin", type=str, required=False) group.add_argument( - "-modelout", type=str, required=False, default="sails-model-out.cif" + "--modelout", type=str, required=False, default="sails-model-out.cif" ) - group.add_argument("-logout", type=str, default="sails-log.json") - group.add_argument("-snfgout", type=str) - group.add_argument("-cycles", type=int, required=False, default=2) + group.add_argument("--logout", type=str, default="sails-log.json") + group.add_argument("--snfgout", type=str) + group.add_argument("--cycles", type=int, required=False, default=2) group.add_argument( - "-type", type=Type.from_string, choices=list(Type), default=Type.n_glycosylate + "--type", type=Type.from_string, choices=list(Type), default=Type.n_glycosylate ) formatter = argparse.ArgumentDefaultsHelpFormatter @@ -225,17 +250,17 @@ def parse_args(): xray_parser_group = xray_parser.add_argument_group( "Required arguments in X-ray mode" ) - xray_parser_group.add_argument("-mtzin", type=str, required=True) + xray_parser_group.add_argument("--mtzin", type=str, required=True) xray_parser_group.add_argument( - "-mtzout", type=str, required=False, default="sails-refln-out.mtz" + "--mtzout", type=str, required=False, default="sails-refln-out.mtz" ) xray_parser_group.add_argument( - "-colin-fo", type=str, required=False, default="FP,SIGFP" + "--colin-fo", type=str, required=False, default="FP,SIGFP" ) - xray_parser_group.add_argument("-colin-fwt", type=str, required=False, default="") + xray_parser_group.add_argument("--colin-fwt", type=str, required=False, default="") em_parser = subparsers.add_parser("em", parents=[parent], formatter_class=formatter) em_parser_group = em_parser.add_argument_group("Required arguments in EM mode") - em_parser_group.add_argument("-mapin", type=str, required=True) + em_parser_group.add_argument("--mapin", type=str, required=True) return parser.parse_args() diff --git a/package/src/sails/interface.py 
b/package/src/sails/interface.py index 3763fcb..b8caf4d 100644 --- a/package/src/sails/interface.py +++ b/package/src/sails/interface.py @@ -122,7 +122,7 @@ def extract_gemmi_structure(structure: gemmi.Structure) -> sails.Structure: ) ) om = sails.Model() - om.name = structure[0].name + om.num = structure[0].num for chain in structure[0]: oc = sails.Chain() oc.name = chain.name diff --git a/package/src/sails/prediction/predict.py b/package/src/sails/prediction/predict.py index 2838273..ccd8406 100644 --- a/package/src/sails/prediction/predict.py +++ b/package/src/sails/prediction/predict.py @@ -3,6 +3,8 @@ from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple + +import gemmi import numpy as np from tqdm import tqdm @@ -164,6 +166,9 @@ def save_grid(self, type: MapType, output_path: Path | str): output_path / f"sails-{type.name}{suffix}", ) + def get_grid(self, type: MapType): + return self.predicted_grids[type] + def run(): """Run prediction from command line arguments""" @@ -198,7 +203,8 @@ def predict_map( phase: str = "PHWT", overlap: int = None, nthreads: int = 1, -): + save_map: bool = False, +) -> gemmi.FloatGrid: """Run prediction from Python""" logging.info( f"Running prediction with model {model}, input {input}, output {output}, resolution {resolution}, amplitude {amplitude}, phase {phase}, overlap {overlap}" @@ -214,6 +220,6 @@ def predict_map( ) prediction = Sails(model_path, configuration=configuration) prediction.predict(input, [amplitude, phase], resolution_cutoff=resolution) - prediction.save_grid(MapType.phosphate, output) - prediction.save_grid(MapType.sugar, output) - prediction.save_grid(MapType.base, output) + if save_map: + prediction.save_grid(MapType.glycan, output) + return prediction.get_grid(MapType.glycan) From bcd67d8060f7649205eac6e5e10db5d2d2cb2d97 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 9 Oct 2025 10:40:40 +0100 Subject: 
[PATCH 04/56] Added sails-find multiclass capability --- package/src/bindings/python_sails.cpp | 4 +- package/src/cpp/sails-predictions.cpp | 70 +++++++++++++++++++------ package/src/cpp/sails.cpp | 16 +++++- package/src/include/sails-predictions.h | 19 +++++-- package/src/sails/find.py | 35 ++++++++++--- package/src/sails/glycosylate.py | 19 +++++-- package/src/sails/prediction/model.py | 3 +- package/src/sails/prediction/predict.py | 20 +++++-- 8 files changed, 150 insertions(+), 36 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index 70e0826..b81b3ed 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -199,7 +199,9 @@ NB_MODULE(sails_module, m) { "structure"_a, "grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); m.def("identify_predicted_sites", nb::overload_cast&, std::string &>(&identify_predicted_sites), - "structure"_a, "grid"_a, "resource_dir"_a); + "structure"_a, "glycan_grid"_a, "resource_dir"_a); + m.def("identify_predicted_sites", nb::overload_cast&, gemmi::Grid<>&, std::string &>(&identify_predicted_sites), + "structure"_a, "glycan_grid"_a, "protein_grid"_a, "resource_dir"_a); m.def("find_all_wurcs", &find_all_wurcs, "structure"_a, "resource_dir"_a); diff --git a/package/src/cpp/sails-predictions.cpp b/package/src/cpp/sails-predictions.cpp index 097fda7..e172fad 100644 --- a/package/src/cpp/sails-predictions.cpp +++ b/package/src/cpp/sails-predictions.cpp @@ -5,21 +5,21 @@ #include "../include/sails-predictions.h" -gemmi::NeighborSearch Sails::Predictions::create_neighbour_search(float threshold, gemmi::UnitCell unit_cell) { +gemmi::NeighborSearch Sails::Predictions::create_neighbour_search(gemmi::Grid<> *grid, float threshold, const gemmi::UnitCell& unit_cell) { gemmi::Model model = gemmi::Model(0); gemmi::Chain chain = gemmi::Chain("A"); int seqid = 0; - for (int u = 0; u < m_glycan_map.nu; u++) { - for (int v = 0; v < m_glycan_map.nv; v++) { - 
for (int w = 0; w < m_glycan_map.nw; w++) { + for (int u = 0; u < grid->nu; u++) { + for (int v = 0; v < grid->nv; v++) { + for (int w = 0; w < grid->nw; w++) { - gemmi::Grid<>::Point point = m_glycan_map.get_point(u, v, w); + gemmi::Grid<>::Point point = grid->get_point(u, v, w); if (*point.value < threshold) { continue; } - gemmi::Position position = m_glycan_map.point_to_position(point); + gemmi::Position position = grid->point_to_position(point); gemmi::Atom atom; atom.name = "X"; atom.element = gemmi::Element("C"); @@ -35,24 +35,26 @@ gemmi::NeighborSearch Sails::Predictions::create_neighbour_search(float threshol model.chains = {chain}; - gemmi::Structure s; - s.cell = m_glycan_map.unit_cell; - s.spacegroup_hm = m_glycan_map.spacegroup->hm; - s.models = {model}; - Utils::save_structure_to_file(s, "points.cif"); - - std::cout << m_glycan_map.unit_cell.a << " " << m_glycan_map.unit_cell.b << " " << m_glycan_map.unit_cell.c << " " - << m_glycan_map.unit_cell.alpha << " " << m_glycan_map.unit_cell.beta << " " << m_glycan_map.unit_cell.gamma << std::endl; gemmi::NeighborSearch ns = {model, unit_cell, 2}; ns.populate(); return ns; } Sails::Glycosites Sails::Predictions::find_potential_sites(gemmi::Structure &structure) { + if (m_glycan_map == nullptr) { + throw std::invalid_argument("Glycan map is null"); + } + if (m_protein_map == nullptr) { + return find_potential_sites_using_glycan(structure); + } + return find_potential_sites_using_protein_glycan(structure); +} + +Sails::Glycosites Sails::Predictions::find_potential_sites_using_glycan(gemmi::Structure &structure) { Glycosites potential_sites = {}; - gemmi::NeighborSearch ns = create_neighbour_search(0.1, structure.cell); + gemmi::NeighborSearch ns = create_neighbour_search(m_glycan_map, 0.1, structure.cell); for (int m = 0; m < structure.models.size(); m++) { for (int c = 0; c < structure.models[m].chains.size(); c++) { @@ -86,3 +88,41 @@ Sails::Glycosites 
Sails::Predictions::find_potential_sites(gemmi::Structure &str return potential_sites; } + +Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein_glycan(gemmi::Structure &structure) { + Glycosites potential_sites = {}; + + gemmi::NeighborSearch ns = create_neighbour_search(m_protein_map, 0.1, structure.cell); + + for (int m = 0; m < structure.models.size(); m++) { + for (int c = 0; c < structure.models[m].chains.size(); c++) { + for (int r = 0; r < structure.models[m].chains[c].residues.size(); r++) { + + Glycosite site = {m, c, r}; + gemmi::Residue residue = structure.models[m].chains[c].residues[r]; + std::string residue_name = residue.name; + + if (protein_donors.find(residue_name) == protein_donors.end()) { + continue; + } + + std::vector donor_sets = m_residue_database[residue_name].donors; + + for (const auto& donor_set : donor_sets) { + std::string last_donor_atom_name = donor_set.atom3; + gemmi::Atom* last_donor_atom = residue.find_atom(last_donor_atom_name, '*'); + auto nearby_points = ns.find_atoms(last_donor_atom->pos, '*', 0.1, 1); + + if (nearby_points.empty()) { + continue; + } + + potential_sites.emplace_back(site); + break; + } + } + } + } + + return potential_sites; +} diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 7228425..c1aaf8d 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -296,17 +296,29 @@ Sails::Output o_mannosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, true, verbose); } -Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& grid, std::string &resource_dir) { +Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, std::string &resource_dir) { std::string data_file = resource_dir + "/data.json"; Sails::JSONLoader loader = {data_file}; Sails::ResidueDatabase residue_database = loader.load_residue_database(); 
Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); - auto predictions = Sails::Predictions(grid, linkage_database, residue_database); + auto predictions = Sails::Predictions(&glycan_grid, linkage_database, residue_database); Sails::Glycosites potential_sites = predictions.find_potential_sites(structure); return potential_sites; } +Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, std::string &resource_dir) { + std::string data_file = resource_dir + "/data.json"; + Sails::JSONLoader loader = {data_file}; + Sails::ResidueDatabase residue_database = loader.load_residue_database(); + Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); + auto predictions = Sails::Predictions(&glycan_grid, &protein_grid, linkage_database, residue_database); + + Sails::Glycosites potential_sites = predictions.find_potential_sites(structure); + return potential_sites; +} + + // EM FUNCTIONS Sails::Output n_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, int cycles, std::string &resource_dir, diff --git a/package/src/include/sails-predictions.h b/package/src/include/sails-predictions.h index e76d6de..073c329 100644 --- a/package/src/include/sails-predictions.h +++ b/package/src/include/sails-predictions.h @@ -14,16 +14,29 @@ namespace Sails { class Predictions { public: - explicit Predictions(gemmi::Grid<>& glycan_map, LinkageDatabase& linkage_database, ResidueDatabase& residue_database): m_glycan_map(glycan_map), m_residue_database(residue_database) { + explicit Predictions(gemmi::Grid<>* glycan_map, LinkageDatabase& linkage_database, ResidueDatabase& residue_database): m_residue_database(residue_database) { protein_donors = find_protein_donors(linkage_database); + m_glycan_map = glycan_map; }; - gemmi::NeighborSearch create_neighbour_search(float threshold, gemmi::UnitCell unit_cell); + explicit Predictions(gemmi::Grid<>* glycan_map, gemmi::Grid<>* 
protein_map, LinkageDatabase& linkage_database, ResidueDatabase& residue_database): m_residue_database(residue_database) { + protein_donors = find_protein_donors(linkage_database); + m_glycan_map = glycan_map; + m_protein_map = protein_map; + }; Glycosites find_potential_sites(gemmi::Structure &structure); private: - gemmi::Grid<>& m_glycan_map; + gemmi::NeighborSearch create_neighbour_search(gemmi::Grid<> *grid, float threshold, const gemmi::UnitCell& unit_cell); + + Glycosites find_potential_sites_using_glycan(gemmi::Structure &structure); + + Glycosites find_potential_sites_using_protein_glycan(gemmi::Structure &structure); + + + gemmi::Grid<>* m_glycan_map = nullptr; + gemmi::Grid<>* m_protein_map = nullptr; std::set protein_donors; Sails::ResidueDatabase& m_residue_database; }; diff --git a/package/src/sails/find.py b/package/src/sails/find.py index 4194ef4..9747fea 100644 --- a/package/src/sails/find.py +++ b/package/src/sails/find.py @@ -10,6 +10,7 @@ from sails import identify_predicted_sites, GlycoSite from .interface import get_sails_structure, get_sails_map from .glycosylate import read_prediction_dir, save_log +from .prediction.model import ModelType from .prediction.predict import predict_map @@ -161,13 +162,13 @@ def get_amplitude_phase(args): def xray(args): sails_structure = get_sails_structure(args.modelin) resource = importlib.resources.files("sails").joinpath("data") - + model = ModelType[args.modeltype] if args.preddirin: - predicted_map = read_prediction_dir(args.preddirin) + predictions = read_prediction_dir(args.preddirin, model) else: amplitude, phase = get_amplitude_phase(args) - predicted_map = predict_map( - "binary", + predictions = predict_map( + model.name, args.mtzin, "output", nthreads=8, @@ -176,8 +177,18 @@ def xray(args): save_map=True, ) - sails_grid = get_sails_map(predicted_map) - result = identify_predicted_sites(sails_structure, sails_grid, str(resource)) + if model == ModelType.binary: + glycan_predicted_map = predictions + 
sails_grid = get_sails_map(glycan_predicted_map) + result = identify_predicted_sites(sails_structure, sails_grid, str(resource)) + else: + glycan_predicted_map, protein_predicted_map = predictions + sails_glycan_grid = get_sails_map(glycan_predicted_map) + sails_protein_grid = get_sails_map(protein_predicted_map) + result = identify_predicted_sites( + sails_structure, sails_glycan_grid, sails_protein_grid, str(resource) + ) + log = convert_glycosites_to_log(result, args.modelin) save_log(log, args) @@ -267,6 +278,12 @@ def run(): type=str, help="Path to output file", ) + xray_parser.add_argument( + "--modeltype", + required=True, + choices=[type.name for type in ModelType], + help="Binary or Multiclass model", + ) xray_parser.add_argument("--colin-fo", type=str, required=False, default="FP,SIGFP") xray_parser.add_argument( "--colin-fwt", type=str, required=False, default="FWT,PHWT" @@ -287,6 +304,12 @@ def run(): type=str, help="Path to output file", ) + em_parser.add_argument( + "--modeltype", + required=True, + choices=[type.name for type in ModelType], + help="Binary or Multiclass model", + ) args = parser.parse_args() if args.mode == "seq": diff --git a/package/src/sails/glycosylate.py b/package/src/sails/glycosylate.py index ee1a7ae..fba5721 100644 --- a/package/src/sails/glycosylate.py +++ b/package/src/sails/glycosylate.py @@ -9,6 +9,7 @@ import gemmi from sails import interface, n_glycosylate, c_glycosylate, o_mannosylate, __version__ +from .prediction.model import ModelType class Type(enum.IntEnum): @@ -44,14 +45,26 @@ def map_type_to_function(type: Type): raise TypeError("Type not found") -def read_prediction_dir(path: Path | str) -> gemmi.FloatGrid: +def read_prediction_dir( + path: Path | str, model_type: ModelType +) -> gemmi.FloatGrid | Tuple[gemmi.FloatGrid, gemmi.FloatGrid]: path = Path(path) glycan_path = path / "sails-glycan.map" + protein_path = path / "sails-protein.map" + if not glycan_path.exists(): raise FileNotFoundError(glycan_path) - map_ 
= gemmi.read_ccp4_map(str(glycan_path)) - return map_.grid + if model_type == ModelType.multiclass: + if not protein_path.exists(): + raise FileNotFoundError(protein_path) + + glycan_map = gemmi.read_ccp4_map(str(glycan_path)) + + if model_type == ModelType.multiclass: + protein_map = gemmi.read_ccp4_map(str(protein_path)) + return glycan_map.grid, protein_map.grid + return glycan_map.grid def glycosylate_xtal( diff --git a/package/src/sails/prediction/model.py b/package/src/sails/prediction/model.py index 2da573f..788be8c 100644 --- a/package/src/sails/prediction/model.py +++ b/package/src/sails/prediction/model.py @@ -89,7 +89,6 @@ def get_latest_model(type: ModelType) -> str: file = filename.get("rfilename", "") if file.endswith(".onnx"): possible_models.append(file) - # Get latest model out of list based on date possible_models = sorted(possible_models, reverse=True) latest_model = possible_models[0] @@ -219,7 +218,7 @@ def get_model_config(model_path: Path, overlap: int | None) -> SimpleNamespace: ) case ModelType.multiclass: return SimpleNamespace( - box_size=128, overlap=64 if overlap is None else overlap + box_size=128, overlap=64 if overlap is None else overlap, channels=3 ) case _: raise RuntimeError(f"Invalid model type - {model_type}") diff --git a/package/src/sails/prediction/predict.py b/package/src/sails/prediction/predict.py index ccd8406..1724e2a 100644 --- a/package/src/sails/prediction/predict.py +++ b/package/src/sails/prediction/predict.py @@ -11,7 +11,7 @@ from .load import load_density, load_onnx_model from .grid_tools import interpolate_grid, reinterpolate_grid, precompute_slices from .arguments import parse_arguments -from .model import find_model, get_model_config +from .model import find_model, get_model_config, ModelType from .save import save_grid from ..logs import setup_logging from .config import Configuration, MapType @@ -50,7 +50,9 @@ def _run_prediction(self, work_grid: np.ndarray) -> np.ndarray: slices = 
precompute_slices(work_grid_shape, overlap=self.configuration.overlap) box_size = self.configuration.box_size - total_array = np.zeros((*work_grid_shape, 2), dtype=np.float32) + total_array = np.zeros( + (*work_grid_shape, self.configuration.channels), dtype=np.float32 + ) count_array = np.zeros_like(total_array, dtype=np.float32) # Variance arrays for Welch's one pass variance method @@ -174,7 +176,8 @@ def run(): """Run prediction from command line arguments""" setup_logging() args = parse_arguments() - model_path = find_model(args.model) + model = ModelType[args.model] + model_path = find_model(model) model_configuration = get_model_config(model_path, args.overlap) configuration = Configuration( use_gpu=args.gpu, @@ -192,6 +195,8 @@ def run(): ) output_dir = Path(args.output) sails.save_grid(MapType.glycan, output_dir) + if model == ModelType.multiclass: + sails.save_grid(MapType.protein, output_dir) def predict_map( @@ -204,11 +209,13 @@ def predict_map( overlap: int = None, nthreads: int = 1, save_map: bool = False, -) -> gemmi.FloatGrid: +) -> gemmi.FloatGrid | Tuple[gemmi.FloatGrid, gemmi.FloatGrid]: """Run prediction from Python""" logging.info( f"Running prediction with model {model}, input {input}, output {output}, resolution {resolution}, amplitude {amplitude}, phase {phase}, overlap {overlap}" ) + + model = ModelType[model] model_path = find_model(model) model_configuration = get_model_config(model_path, overlap) configuration = Configuration( @@ -222,4 +229,9 @@ def predict_map( prediction.predict(input, [amplitude, phase], resolution_cutoff=resolution) if save_map: prediction.save_grid(MapType.glycan, output) + if model == ModelType.multiclass: + prediction.save_grid(MapType.protein, output) + + if model == ModelType.multiclass: + return prediction.get_grid(MapType.glycan), prediction.get_grid(MapType.protein) return prediction.get_grid(MapType.glycan) From a77f694302bb342ffd1435296e909befdb63d85f Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri 
<44945647+Dialpuri@users.noreply.github.com> Date: Fri, 10 Oct 2025 10:07:19 +0100 Subject: [PATCH 05/56] Added requests dependency --- package/_pyproject.toml | 1 + package/pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/package/_pyproject.toml b/package/_pyproject.toml index 5083cd5..2184be0 100644 --- a/package/_pyproject.toml +++ b/package/_pyproject.toml @@ -21,6 +21,7 @@ dependencies=[ 'typing-extensions', 'onnxruntime-gpu; platform_system != "Darwin"', 'onnxruntime; platform_system == "Darwin"', + 'requests' ] [project.urls] diff --git a/package/pyproject.toml b/package/pyproject.toml index 06c89c6..d723e64 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -21,6 +21,7 @@ dependencies=[ 'typing-extensions', 'onnxruntime-gpu; platform_system != "Darwin"', 'onnxruntime; platform_system == "Darwin"', + 'requests' ] [tool.setuptools] From ada2a1d4ddd738ddb61592c2ae828a076177b96c Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Fri, 10 Oct 2025 17:11:43 +0100 Subject: [PATCH 06/56] Added early exit when no positive predictions found --- package/src/cpp/sails-predictions.cpp | 20 ++++++++++++++++---- package/src/include/sails-predictions.h | 3 ++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/package/src/cpp/sails-predictions.cpp b/package/src/cpp/sails-predictions.cpp index e172fad..7a457a5 100644 --- a/package/src/cpp/sails-predictions.cpp +++ b/package/src/cpp/sails-predictions.cpp @@ -5,7 +5,8 @@ #include "../include/sails-predictions.h" -gemmi::NeighborSearch Sails::Predictions::create_neighbour_search(gemmi::Grid<> *grid, float threshold, const gemmi::UnitCell& unit_cell) { +std::optional Sails::Predictions::create_neighbour_search( + gemmi::Grid<> *grid, float threshold, const gemmi::UnitCell &unit_cell) { gemmi::Model model = gemmi::Model(0); gemmi::Chain chain = gemmi::Chain("A"); @@ -33,6 +34,10 @@ gemmi::NeighborSearch 
Sails::Predictions::create_neighbour_search(gemmi::Grid<> } } + if (seqid == 0) { + return std::nullopt; + } + model.chains = {chain}; gemmi::NeighborSearch ns = {model, unit_cell, 2}; @@ -54,7 +59,11 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_glycan(gemmi::S Glycosites potential_sites = {}; - gemmi::NeighborSearch ns = create_neighbour_search(m_glycan_map, 0.1, structure.cell); + std::optional ns_optional = create_neighbour_search(m_glycan_map, 0.1, structure.cell); + if (!ns_optional.has_value()) { + return potential_sites; + } + gemmi::NeighborSearch ns = ns_optional.value(); for (int m = 0; m < structure.models.size(); m++) { for (int c = 0; c < structure.models[m].chains.size(); c++) { @@ -92,7 +101,11 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_glycan(gemmi::S Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein_glycan(gemmi::Structure &structure) { Glycosites potential_sites = {}; - gemmi::NeighborSearch ns = create_neighbour_search(m_protein_map, 0.1, structure.cell); + std::optional ns_optional = create_neighbour_search(m_protein_map, 0.1, structure.cell); + if (!ns_optional.has_value()) { + return potential_sites; + } + gemmi::NeighborSearch ns = ns_optional.value(); for (int m = 0; m < structure.models.size(); m++) { for (int c = 0; c < structure.models[m].chains.size(); c++) { @@ -123,6 +136,5 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein_glycan( } } } - return potential_sites; } diff --git a/package/src/include/sails-predictions.h b/package/src/include/sails-predictions.h index 073c329..4a56063 100644 --- a/package/src/include/sails-predictions.h +++ b/package/src/include/sails-predictions.h @@ -28,7 +28,8 @@ namespace Sails { Glycosites find_potential_sites(gemmi::Structure &structure); private: - gemmi::NeighborSearch create_neighbour_search(gemmi::Grid<> *grid, float threshold, const gemmi::UnitCell& unit_cell); + std::optional 
create_neighbour_search(gemmi::Grid<> *grid, float threshold, + const gemmi::UnitCell &unit_cell); Glycosites find_potential_sites_using_glycan(gemmi::Structure &structure); From 340bf8e58eb311b88e025ab6c99788e64f62282a Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Fri, 10 Oct 2025 17:13:47 +0100 Subject: [PATCH 07/56] Added sails prediction submodule to pyproject.toml --- package/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/pyproject.toml b/package/pyproject.toml index d723e64..6578b4b 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -25,7 +25,7 @@ dependencies=[ ] [tool.setuptools] -packages = ["sails"] +packages = ["sails", "sails.prediction"] package-dir = {"" = "src"} include-package-data = true From 7ae1e2faf42644c75e54fff42a58fb1b8d511bae Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sat, 11 Oct 2025 14:12:02 +0100 Subject: [PATCH 08/56] Added nullptr check and nanobind dependency --- package/pyproject.toml | 3 ++- package/src/cpp/sails-predictions.cpp | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/package/pyproject.toml b/package/pyproject.toml index 6578b4b..7c4fbd2 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -21,7 +21,8 @@ dependencies=[ 'typing-extensions', 'onnxruntime-gpu; platform_system != "Darwin"', 'onnxruntime; platform_system == "Darwin"', - 'requests' + 'requests', + 'nanobind==2.4.0' ] [tool.setuptools] diff --git a/package/src/cpp/sails-predictions.cpp b/package/src/cpp/sails-predictions.cpp index 7a457a5..90e2d79 100644 --- a/package/src/cpp/sails-predictions.cpp +++ b/package/src/cpp/sails-predictions.cpp @@ -124,6 +124,9 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein_glycan( for (const auto& donor_set : donor_sets) { std::string last_donor_atom_name = donor_set.atom3; gemmi::Atom* last_donor_atom = 
residue.find_atom(last_donor_atom_name, '*'); + if (last_donor_atom == nullptr) { + continue; + } auto nearby_points = ns.find_atoms(last_donor_atom->pos, '*', 0.1, 1); if (nearby_points.empty()) { From 23c8a4112ac5e81af910b4fcce8a5e26cb720b3a Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sun, 12 Oct 2025 16:22:26 +0100 Subject: [PATCH 09/56] Added nullptr check and nanobind dependency --- package/src/cpp/sails-predictions.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/package/src/cpp/sails-predictions.cpp b/package/src/cpp/sails-predictions.cpp index 90e2d79..ee7e84a 100644 --- a/package/src/cpp/sails-predictions.cpp +++ b/package/src/cpp/sails-predictions.cpp @@ -82,6 +82,9 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_glycan(gemmi::S for (const auto& donor_set : donor_sets) { std::string last_donor_atom_name = donor_set.atom3; gemmi::Atom* last_donor_atom = residue.find_atom(last_donor_atom_name, '*'); + if (last_donor_atom == nullptr) { + continue; + } auto nearby_points = ns.find_atoms(last_donor_atom->pos, '*', 0.1, 2); if (nearby_points.empty()) { From 1c1731edf7a6016e318726757ca282f1a9fc5991 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:07:02 +0100 Subject: [PATCH 10/56] Added ability to use glycan finding in protein prediction --- package/src/bindings/python_sails.cpp | 4 ++-- package/src/cpp/sails-predictions.cpp | 14 +++++++------- package/src/cpp/sails.cpp | 6 +++--- package/src/include/sails-predictions.h | 4 ++-- package/src/sails/find.py | 13 ++++++++++++- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index b81b3ed..3431860 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -200,8 +200,8 @@ NB_MODULE(sails_module, m) { m.def("identify_predicted_sites", 
nb::overload_cast&, std::string &>(&identify_predicted_sites), "structure"_a, "glycan_grid"_a, "resource_dir"_a); - m.def("identify_predicted_sites", nb::overload_cast&, gemmi::Grid<>&, std::string &>(&identify_predicted_sites), - "structure"_a, "glycan_grid"_a, "protein_grid"_a, "resource_dir"_a); + m.def("identify_predicted_sites", nb::overload_cast&, gemmi::Grid<>&, bool, std::string &>(&identify_predicted_sites), + "structure"_a, "glycan_grid"_a, "protein_grid"_a, "use_glycan"_a, "resource_dir"_a); m.def("find_all_wurcs", &find_all_wurcs, "structure"_a, "resource_dir"_a); diff --git a/package/src/cpp/sails-predictions.cpp b/package/src/cpp/sails-predictions.cpp index ee7e84a..78e0825 100644 --- a/package/src/cpp/sails-predictions.cpp +++ b/package/src/cpp/sails-predictions.cpp @@ -45,14 +45,14 @@ std::optional Sails::Predictions::create_neighbour_search return ns; } -Sails::Glycosites Sails::Predictions::find_potential_sites(gemmi::Structure &structure) { - if (m_glycan_map == nullptr) { - throw std::invalid_argument("Glycan map is null"); - } - if (m_protein_map == nullptr) { +Sails::Glycosites Sails::Predictions::find_potential_sites(gemmi::Structure &structure, bool use_glycan) { + if (use_glycan && m_glycan_map != nullptr) { return find_potential_sites_using_glycan(structure); } - return find_potential_sites_using_protein_glycan(structure); + if (!use_glycan && m_protein_map != nullptr) { + return find_potential_sites_using_protein(structure); + } + throw std::invalid_argument("Glycan map is null"); } Sails::Glycosites Sails::Predictions::find_potential_sites_using_glycan(gemmi::Structure &structure) { @@ -101,7 +101,7 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_glycan(gemmi::S return potential_sites; } -Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein_glycan(gemmi::Structure &structure) { +Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein(gemmi::Structure &structure) { Glycosites 
potential_sites = {}; std::optional ns_optional = create_neighbour_search(m_protein_map, 0.1, structure.cell); diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index c1aaf8d..64b9ce3 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -303,18 +303,18 @@ Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::G Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); auto predictions = Sails::Predictions(&glycan_grid, linkage_database, residue_database); - Sails::Glycosites potential_sites = predictions.find_potential_sites(structure); + Sails::Glycosites potential_sites = predictions.find_potential_sites(structure, true); return potential_sites; } -Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, std::string &resource_dir) { +Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, bool use_glycan, std::string &resource_dir) { std::string data_file = resource_dir + "/data.json"; Sails::JSONLoader loader = {data_file}; Sails::ResidueDatabase residue_database = loader.load_residue_database(); Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); auto predictions = Sails::Predictions(&glycan_grid, &protein_grid, linkage_database, residue_database); - Sails::Glycosites potential_sites = predictions.find_potential_sites(structure); + Sails::Glycosites potential_sites = predictions.find_potential_sites(structure, use_glycan); return potential_sites; } diff --git a/package/src/include/sails-predictions.h b/package/src/include/sails-predictions.h index 4a56063..28d2561 100644 --- a/package/src/include/sails-predictions.h +++ b/package/src/include/sails-predictions.h @@ -25,7 +25,7 @@ namespace Sails { m_protein_map = protein_map; }; - Glycosites find_potential_sites(gemmi::Structure &structure); + Glycosites 
find_potential_sites(gemmi::Structure &structure, bool use_glycan); private: std::optional create_neighbour_search(gemmi::Grid<> *grid, float threshold, @@ -33,7 +33,7 @@ namespace Sails { Glycosites find_potential_sites_using_glycan(gemmi::Structure &structure); - Glycosites find_potential_sites_using_protein_glycan(gemmi::Structure &structure); + Glycosites find_potential_sites_using_protein(gemmi::Structure &structure); gemmi::Grid<>* m_glycan_map = nullptr; diff --git a/package/src/sails/find.py b/package/src/sails/find.py index 9747fea..3b77303 100644 --- a/package/src/sails/find.py +++ b/package/src/sails/find.py @@ -185,8 +185,13 @@ def xray(args): glycan_predicted_map, protein_predicted_map = predictions sails_glycan_grid = get_sails_map(glycan_predicted_map) sails_protein_grid = get_sails_map(protein_predicted_map) + searchtype = args.searchtype result = identify_predicted_sites( - sails_structure, sails_glycan_grid, sails_protein_grid, str(resource) + sails_structure, + sails_glycan_grid, + sails_protein_grid, + searchtype == "glycan", + str(resource), ) log = convert_glycosites_to_log(result, args.modelin) @@ -284,6 +289,12 @@ def run(): choices=[type.name for type in ModelType], help="Binary or Multiclass model", ) + xray_parser.add_argument( + "--searchtype", + required=True, + choices=["protein", "glycan"], + help="Search for protein or glycan, only used if modeltype is multiclass", + ) xray_parser.add_argument("--colin-fo", type=str, required=False, default="FP,SIGFP") xray_parser.add_argument( "--colin-fwt", type=str, required=False, default="FWT,PHWT" From 7166e4b37169628281e7a490c6492c9e24425b95 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sat, 18 Oct 2025 08:28:11 +0100 Subject: [PATCH 11/56] Added autoglycoslate function to API --- package/src/bindings/python_sails.cpp | 4 ++ package/src/sails/__init__.py | 2 + package/src/sails/glycosylate.py | 56 +++++++++++++++++++-------- 3 files changed, 
46 insertions(+), 16 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index 3431860..5ab1124 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -198,6 +198,10 @@ NB_MODULE(sails_module, m) { nb::overload_cast &, int, std::string &, bool>(&o_mannosylate), "structure"_a, "grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("auto_glycosylate", + nb::overload_cast&, gemmi::Grid<>&, int, std::string &, bool>(&auto_glycosylate), "structure"_a, + "mtz"_a, "glycan_grid"_a, "protein_grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("identify_predicted_sites", nb::overload_cast&, std::string &>(&identify_predicted_sites), "structure"_a, "glycan_grid"_a, "resource_dir"_a); m.def("identify_predicted_sites", nb::overload_cast&, gemmi::Grid<>&, bool, std::string &>(&identify_predicted_sites), diff --git a/package/src/sails/__init__.py b/package/src/sails/__init__.py index 78d8a12..a865030 100644 --- a/package/src/sails/__init__.py +++ b/package/src/sails/__init__.py @@ -27,6 +27,7 @@ model_wurcs, morph, identify_predicted_sites, + auto_glycosylate, ) from .__version__ import __version__ from .glycosylate import glycosylate_xtal, glycosylate_em, Type @@ -81,4 +82,5 @@ "model_wurcs", "morph", "identify_predicted_sites", + "auto_glycosylate", ] diff --git a/package/src/sails/glycosylate.py b/package/src/sails/glycosylate.py index fba5721..41e3396 100644 --- a/package/src/sails/glycosylate.py +++ b/package/src/sails/glycosylate.py @@ -8,8 +8,16 @@ from typing import Tuple, List import gemmi -from sails import interface, n_glycosylate, c_glycosylate, o_mannosylate, __version__ +from sails import ( + interface, + n_glycosylate, + c_glycosylate, + o_mannosylate, + __version__, + auto_glycosylate, +) from .prediction.model import ModelType +from .prediction.predict import predict_map class Type(enum.IntEnum): @@ -39,8 +47,8 @@ def map_type_to_function(type: Type): if 
type == Type.o_mannosylate: return o_mannosylate - # if type == Type.auto: - # return auto_glycosylate + if type == Type.auto: + return auto_glycosylate raise TypeError("Type not found") @@ -96,17 +104,31 @@ def glycosylate_xtal( sails_mtz = interface.get_sails_mtz(mtz, f, sigf, fwt, phwt) resource = importlib.resources.files("sails").joinpath("data") - # if type == Type.auto: - # if preddirin: - # predicted_map = read_prediction_dir(preddirin) - # else: - # predicted_map = predict_map("binary", mtz, "output", nthreads=8, save_map=True) - # sails_grid = interface.get_sails_map(predicted_map) - # - # result = auto_glycosylate(sails_structure, sails_mtz, sails_grid, cycles, str(resource), verbose) - # else: - func = map_type_to_function(type) - result = func(sails_structure, sails_mtz, cycles, str(resource), verbose) + if type == Type.auto: + if preddirin: + predictions = read_prediction_dir( + preddirin, model_type=ModelType.multiclass + ) + else: + predictions = predict_map( + "multiclass", mtz, "output", nthreads=8, save_map=True + ) + glycan, protein = predictions + sails_glycan = interface.get_sails_map(glycan) + sails_protein = interface.get_sails_map(protein) + + result = auto_glycosylate( + sails_structure, + sails_mtz, + sails_glycan, + sails_protein, + cycles, + str(resource), + verbose, + ) + else: + func = map_type_to_function(type) + result = func(sails_structure, sails_mtz, cycles, str(resource), verbose) return ( interface.extract_sails_structure(result.structure), @@ -197,7 +219,9 @@ def save_snfgs(snfgs: dict, snfg_path: Path): def xray(args): labels = get_column_labels(args.colin_fo, args.colin_fwt) - cycles = args.cycles if args.type == Type.n_glycosylate else 1 + cycles = ( + args.cycles if args.type == Type.n_glycosylate or args.type == Type.auto else 1 + ) structure, mtz, log, snfgs = glycosylate_xtal( args.modelin, args.mtzin, args.preddirin, cycles, *labels, args.type, args.v ) @@ -253,7 +277,7 @@ def parse_args(): 
group.add_argument("--snfgout", type=str) group.add_argument("--cycles", type=int, required=False, default=2) group.add_argument( - "--type", type=Type.from_string, choices=list(Type), default=Type.n_glycosylate + "--type", type=Type.from_string, choices=list(Type), default=Type.auto ) formatter = argparse.ArgumentDefaultsHelpFormatter From 73f1c558782c177d59b6a2598dd9aea915edd952 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sat, 18 Oct 2025 08:28:39 +0100 Subject: [PATCH 12/56] Added renumber function to Glycan --- package/src/include/sails-glycan.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/package/src/include/sails-glycan.h b/package/src/include/sails-glycan.h index 76df3bd..e10369e 100644 --- a/package/src/include/sails-glycan.h +++ b/package/src/include/sails-glycan.h @@ -244,6 +244,26 @@ namespace Sails { return sites; } + /** + * @brief Returns the DFS order of the sugars sites. + * + * @return A vector of Glycosites in DFS order + */ + [[nodiscard]] std::vector get_sugar_site_dfs_order_without_root() { + std::vector sites; + dfs_sites(root_sugar, sites, 0); + sites.erase(sites.begin(), sites.begin() + 1); + return sites; + } + + void renumber() { + std::vector sites = get_sugar_site_dfs_order_without_root(); + for (int i = 0; i < sites.size(); i++) { + gemmi::Residue* residue_ptr = Utils::get_residue_ptr_from_glycosite(sites[i], m_structure); + residue_ptr->seqid.num.value = i; + } + } + /** * @brief Returns the order of the sugars. 
* From 18d5fed35e067dbec7ce5dd04862cd08e75656bf Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sat, 18 Oct 2025 08:28:59 +0100 Subject: [PATCH 13/56] Added standardise residue name function --- package/src/cpp/sails-linkage.cpp | 22 +++++++++++++++++++++- package/src/include/sails-linkage.h | 5 +++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index f28c97c..6dc3a54 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -29,6 +29,26 @@ void Sails::Model::print_successful_log(Sails::Density &density, std::optional names = { + { + "AMAN", "MAN" + } + }; + + for (int m = 0; m < structure->models.size(); m++) { + for (int c = 0; c < structure->models[m].chains.size(); c++) { + for (int r = 0; r < structure->models[m].chains[c].residues.size(); r++) { + gemmi::Residue* residue = &structure->models[m].chains[c].residues[r]; + if (names.count(residue->name) == 0) continue; + + std::string new_name = names.at(residue->name); + residue->name = new_name; + } + } + } +} + // UTILITY FUNCTIONS std::optional Sails::Model::get_monomer(const std::string &monomer, bool remove_h) { @@ -40,7 +60,6 @@ std::optional Sails::Model::get_monomer(const std::string &monom std::string path = monomer_library_path + "/" + char(std::tolower(monomer.front())) + "/" + monomer + ".cif"; if (!Utils::file_exists(path)) { - std::cerr << "File " << path << " does not exist" << std::endl; path = special_monomer_path + "/" + monomer + ".cif"; if (!Utils::file_exists(path)) { std::cout << path << " monomer does not exist" << std::endl; @@ -312,6 +331,7 @@ std::optional Sails::Model::add_residue( SuperpositionResult best_result; float best_rscc = INT_MIN; + int i = 0; for (auto &cluster: data.clusters) { std::vector torsions = cluster.torsions.get_means_in_order(); std::vector torsion_stddev = cluster.torsions.get_stddev_in_order(); 
diff --git a/package/src/include/sails-linkage.h b/package/src/include/sails-linkage.h index 91e16f5..9035f1a 100644 --- a/package/src/include/sails-linkage.h +++ b/package/src/include/sails-linkage.h @@ -142,6 +142,11 @@ namespace Sails { static gemmi::Residue replace_residue(gemmi::Residue *target_residue, const std::string &replacement_residue_name); + + + + void standardise_residue_names() const; + private: typedef std::map > PossibleAdditions; From 6d4be5e85595835688f1d0f6b8f8469b24f5fd87 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sat, 18 Oct 2025 08:29:22 +0100 Subject: [PATCH 14/56] Fixed bugs in density calculation --- .../src/cpp/density/sails-xtal-density.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/package/src/cpp/density/sails-xtal-density.cpp b/package/src/cpp/density/sails-xtal-density.cpp index 4f6b360..2d53912 100644 --- a/package/src/cpp/density/sails-xtal-density.cpp +++ b/package/src/cpp/density/sails-xtal-density.cpp @@ -60,7 +60,7 @@ void Sails::XtalDensity::load_hkl(const std::string &f, const std::string &sig_f gemmi::Grid<> Sails::XtalDensity::load_grid(const gemmi::Mtz &mtz, const std::string &f_col, const std::string &phi_col, bool normalise) { constexpr std::array null_size = {0, 0, 0}; - constexpr double sample_rate = 0; + constexpr double sample_rate = 3; constexpr auto order = gemmi::AxisOrder::XYZ; const gemmi::Mtz::Column &f = mtz.get_column_with_label(f_col); @@ -136,12 +136,19 @@ void Sails::XtalDensity::recalculate_map(gemmi::Structure &structure) { recalculated_data.emplace_back(hkl.h()); recalculated_data.emplace_back(hkl.k()); recalculated_data.emplace_back(hkl.l()); - recalculated_data.emplace_back(clipper::Util::rad2d(fobs_reflection.f())); - recalculated_data.emplace_back(clipper::Util::rad2d(fobs_reflection.sigf())); - recalculated_data.emplace_back(clipper::Util::rad2d(fbest_reflection.f())); + 
recalculated_data.emplace_back(fobs_reflection.f()); + recalculated_data.emplace_back(fobs_reflection.sigf()); + recalculated_data.emplace_back(fbest_reflection.f()); recalculated_data.emplace_back(clipper::Util::rad2d(fbest_reflection.phi())); - recalculated_data.emplace_back(clipper::Util::rad2d(fdiff_reflection.f())); + recalculated_data.emplace_back(fdiff_reflection.f()); recalculated_data.emplace_back(clipper::Util::rad2d(fdiff_reflection.phi())); + + // recalculated_data.emplace_back(clipper::Util::rad2d(fobs_reflection.f())); + // recalculated_data.emplace_back(clipper::Util::rad2d(fobs_reflection.sigf())); + // recalculated_data.emplace_back(clipper::Util::rad2d(fbest_reflection.f())); + // recalculated_data.emplace_back(clipper::Util::rad2d(fbest_reflection.phi())); + // recalculated_data.emplace_back(clipper::Util::rad2d(fdiff_reflection.f())); + // recalculated_data.emplace_back(clipper::Util::rad2d(fdiff_reflection.phi())); } gemmi::Mtz new_mtz; @@ -160,7 +167,7 @@ void Sails::XtalDensity::recalculate_map(gemmi::Structure &structure) { m_mtz = std::move(new_mtz); m_grid = load_grid(m_mtz, "FWT", "PHWT", false); - m_difference_grid = load_grid(m_mtz, "DELFWT", "PHDELWT", true); + m_difference_grid = load_grid(m_mtz, "DELFWT", "PHDELWT", false); } void Sails::XtalDensity::calculate_po_pc_map(gemmi::Structure &structure) { From f6f46d625dee936680719b8649729a8e6832f7be Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sun, 19 Oct 2025 11:05:57 +0100 Subject: [PATCH 15/56] Added validate based on RSCC Fixed bugs in building Changed DDS --- package/_pyproject.toml | 1 + package/pyproject.toml | 1 + package/src/bindings/python_sails.cpp | 1 + package/src/cpp/density/sails-density.cpp | 90 +++++---- .../src/cpp/density/sails-xtal-density.cpp | 4 + package/src/cpp/sails-glycan.cpp | 14 ++ package/src/cpp/sails-json.cpp | 5 +- package/src/cpp/sails-refine.cpp | 87 ++++++--- package/src/cpp/sails.cpp | 175 
++++++++++++++---- package/src/include/density/sails-density.h | 6 +- .../src/include/density/sails-xtal-density.h | 2 + package/src/include/sails-glycan.h | 12 +- package/src/include/sails-model.h | 7 +- package/src/sails/__init__.py | 2 + package/src/sails/data/data.json | 46 +++-- package/src/sails/validate.py | 55 ++++++ 16 files changed, 392 insertions(+), 116 deletions(-) create mode 100644 package/src/sails/validate.py diff --git a/package/_pyproject.toml b/package/_pyproject.toml index 2184be0..a509eb6 100644 --- a/package/_pyproject.toml +++ b/package/_pyproject.toml @@ -38,6 +38,7 @@ sails-morph = "sails.morph:run" sails-predict = "sails.prediction.predict:run" sails-install = "sails.install:run" sails-clean = "sails.clean:run" +sails-validate = "sails.validate:run" [tool.scikit-build] # Protect the configuration against future changes in scikit-build-core diff --git a/package/pyproject.toml b/package/pyproject.toml index 7c4fbd2..4e96959 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -47,6 +47,7 @@ sails-morph = "sails.morph:run" sails-predict = "sails.prediction.predict:run" sails-install = "sails.install:run" sails-clean = "sails.clean:run" +sails-validate = "sails.validate:run" [tool.scikit-build] # Protect the configuration against future changes in scikit-build-core diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index 5ab1124..9f05bab 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -213,6 +213,7 @@ NB_MODULE(sails_module, m) { m.def("model_wurcs", &model_wurcs, "structure"_a, "wurcs"_a, "chain"_a, "seqid"_a, "resource_dir"_a); m.def("morph", &morph, "structure"_a, "wurcs"_a, "chain"_a, "seqid"_a, "resource_dir"_a); + m.def("validate", &validate, "structure"_a, "mtz"_a, "remove"_a, "resource_dir"_a); m.def("test_snfg", &test); diff --git a/package/src/cpp/density/sails-density.cpp b/package/src/cpp/density/sails-density.cpp index 
ed5ee83..a6598bd 100644 --- a/package/src/cpp/density/sails-density.cpp +++ b/package/src/cpp/density/sails-density.cpp @@ -17,8 +17,8 @@ double Sails::Density::score_residue(gemmi::Residue &residue, const DensityScore return rscc_score(residue); case rsr: return rsr_score(residue); - case dds: - return difference_density_score(residue); + // case dds: + // return check_difference_density(residue, TODO); default: return -1; } @@ -32,8 +32,8 @@ double Sails::Density::score_result(SuperpositionResult& result) { return rscc_score(result); case rsr: return rsr_score(result); - case dds: - return difference_density_score(result.new_residue); + // case dds: + // return check_difference_density(result.new_residue, TODO); default: return -1; } @@ -72,10 +72,10 @@ gemmi::Grid<> Sails::Density::calculate_density_for_grid(gemmi::Residue &residue gemmi::DensityCalculator, float> density_calculator; - density_calculator.grid.copy_metadata_from(*get_work_grid()); - density_calculator.grid.spacing[0] = get_work_grid()->spacing[0]; - density_calculator.grid.spacing[1] = get_work_grid()->spacing[1]; - density_calculator.grid.spacing[2] = get_work_grid()->spacing[2]; + density_calculator.grid.copy_metadata_from(*get_best_grid()); + density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; + density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; + density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; density_calculator.d_min = get_resolution(); density_calculator.initialize_grid(); @@ -126,7 +126,7 @@ float Sails::Density::rscc_score(gemmi::Residue &residue) const { for (auto &atom: residue.atoms) { box.extend(atom.pos); } - box.add_margin(1); + // box.add_margin(); // gemmi::Grid<> calc = calculate_density_for_box(residue, box); gemmi::Grid<> calc = calculate_density_for_grid(residue); @@ -139,14 +139,13 @@ float Sails::Density::rscc_score(gemmi::Residue &residue) const { // std::vector rs = {residue}; // Utils::save_residues_to_file(rs, "res.pdb"); - 
const gemmi::Position max = box.maximum; const gemmi::Position min = box.minimum; std::vector obs_values = {}; std::vector calc_values = {}; - constexpr double step_size = 0.5; + const double step_size = get_best_grid()->spacing[0]; for (double x = min.x; x <= max.x; x += step_size) { for (double y = min.y; y <= max.y; y += step_size) { for (double z = min.z; z <= max.z; z += step_size) { @@ -270,30 +269,44 @@ float Sails::Density::rsr_score(SuperpositionResult &result) { return numerator / denominator; } -float Sails::Density::difference_density_score(gemmi::Residue &residue) const { - gemmi::Box box; - for (auto &atom: residue.atoms) { - box.extend(atom.pos); - } +int Sails::Density::check_difference_density(gemmi::Residue &residue, std::pair map_stats) const { - const gemmi::Position max = box.maximum; - const gemmi::Position min = box.minimum; + float threshold = map_stats.first - 2 * map_stats.second; - float sum = 0.0f; - int points = 0; - constexpr double step_size = 0.5; - for (double x = min.x; x <= max.x; x += step_size) { - for (double y = min.y; y <= max.y; y += step_size) { - for (double z = min.z; z <= max.z; z += step_size) { - gemmi::Position position = {x, y, z}; - float value = get_difference_grid()->interpolate_value(position); - sum += abs(value); - points++; - } + std::set ring_atoms = { + "C1", "C2", "C3", "C4", "C5", "O5" + }; + int i = 0; + for (auto & atom : residue.atoms) { + // if (ring_atoms.count(atom.name) == 0) continue; + if (get_difference_grid()->interpolate_value(atom.pos) < threshold) { + i++; } } - - return sum / points; + return i; + // gemmi::Box box; + // for (auto &atom: residue.atoms) { + // box.extend(atom.pos); + // } + // + // const gemmi::Position max = box.maximum; + // const gemmi::Position min = box.minimum; + // + // float sum = 0.0f; + // int points = 0; + // constexpr double step_size = 0.5; + // for (double x = min.x; x <= max.x; x += step_size) { + // for (double y = min.y; y <= max.y; y += step_size) { + // for 
(double z = min.z; z <= max.z; z += step_size) { + // gemmi::Position position = {x, y, z}; + // float value = get_difference_grid()->interpolate_value(position); + // sum += abs(value); + // points++; + // } + // } + // } + // + // return sum / points; } float Sails::Density::score_atomic_position(const gemmi::Atom &atom) const { @@ -304,3 +317,18 @@ float Sails::Density::score_atomic_position(const gemmi::Atom &atom) const { float Sails::Density::score_position(const gemmi::Position &pos) const { return get_work_grid()->interpolate_value(pos); } + +std::pair Sails::Density::calculate_map_statistics(const gemmi::Grid<> *grid) const { + const float sum = std::accumulate(grid->data.begin(), grid->data.end(), 0.0f); + float mean = sum / grid->data.size(); + + float sq_sum = std::accumulate(grid->data.begin(), grid->data.end(), 0.0, + [mean](const double acc, const double x) { + const double diff = x - mean; + return acc + diff * diff; + }); + + float stdev = std::sqrt(sq_sum / grid->data.size()); + + return std::make_pair(mean, stdev); +} diff --git a/package/src/cpp/density/sails-xtal-density.cpp b/package/src/cpp/density/sails-xtal-density.cpp index 2d53912..0dcafbd 100644 --- a/package/src/cpp/density/sails-xtal-density.cpp +++ b/package/src/cpp/density/sails-xtal-density.cpp @@ -17,6 +17,10 @@ Sails::XtalDensity::XtalDensity(gemmi::Mtz &mtz, const std::string& F, const std load_hkl(F, SIGF); } +void Sails::XtalDensity::load_map_coefficients(const std::string &fwt, const std::string &phwt) { + m_grid = load_grid(m_mtz, fwt, phwt, false); +} + void Sails::XtalDensity::initialise_hkl() { m_resolution = clipper::Resolution(m_mtz.resolution_high()); diff --git a/package/src/cpp/sails-glycan.cpp b/package/src/cpp/sails-glycan.cpp index f49f329..d96900b 100644 --- a/package/src/cpp/sails-glycan.cpp +++ b/package/src/cpp/sails-glycan.cpp @@ -134,5 +134,19 @@ std::vector Sails::Glycan::get_terminal_sugars(Glycosite &root_s } std::vector terminal_sugars; 
dfs(sugars[root_seq_id].get(), terminal_sugars); + + // FUC has no links, but would be the terminal sugar in order, so add the sugar before FUC in that case + std::vector additional_sugars; + for (auto& sugar: terminal_sugars) { + gemmi::Residue* residue_ptr = Utils::get_residue_ptr_from_glycosite(sugar->site, m_structure); + if (residue_ptr->name == "FUC") { + auto previous_sugar = find_previous_sugar(sugar); + if (previous_sugar.has_value()) { + additional_sugars.emplace_back(previous_sugar.value()); + } + } + } + + terminal_sugars.insert(terminal_sugars.end(), additional_sugars.begin(), additional_sugars.end()); return terminal_sugars; } diff --git a/package/src/cpp/sails-json.cpp b/package/src/cpp/sails-json.cpp index f557cc2..97eb19c 100644 --- a/package/src/cpp/sails-json.cpp +++ b/package/src/cpp/sails-json.cpp @@ -32,6 +32,7 @@ Sails::ResidueDatabase Sails::JSONLoader::load_residue_database() { const char *anomer_key = "anomer"; const char *wurcs_code_key = "wurcsCode"; const char *special_key = "special"; + const char *is_sugar_key = "isSugar"; ResidueDatabase database; @@ -54,7 +55,9 @@ Sails::ResidueDatabase Sails::JSONLoader::load_residue_database() { std::string wurcs_code = std::string(value[wurcs_code_key].get_string().value()); bool special = value[special_key].get_bool(); - ResidueData data = {acceptors_sets, donor_sets, snfg_shape, snfg_colour, preferred_depths, anomer, wurcs_code, special}; + bool is_sugar = value[is_sugar_key].get_bool(); + + ResidueData data = {acceptors_sets, donor_sets, snfg_shape, snfg_colour, preferred_depths, anomer, wurcs_code, special, is_sugar}; database.insert({name, data}); } diff --git a/package/src/cpp/sails-refine.cpp b/package/src/cpp/sails-refine.cpp index f680a4f..a247745 100644 --- a/package/src/cpp/sails-refine.cpp +++ b/package/src/cpp/sails-refine.cpp @@ -7,26 +7,34 @@ double Sails::TorsionAngleRefiner::calculate_penalty(double angle, double angle_mean, double angle_stddev, double penalty_factor) { - int 
std_deviations_allowed = 1; - double range = std_deviations_allowed * angle_stddev; - double lower_bound = angle_mean - range; - double upper_bound = angle_mean + range; - - double deviation = 0; - if (angle < lower_bound) { - deviation = lower_bound - angle; - } else { - deviation = angle - upper_bound; - } - - double penalty = penalty_factor * pow(deviation, 2); - return penalty; + // int std_deviations_allowed = 2; + // double range = std_deviations_allowed * angle_stddev; + // double lower_bound = angle_mean - range; + // double upper_bound = angle_mean + range; + // + // double deviation = 0; + // if (angle < lower_bound) { + // deviation = lower_bound - angle; + // } else { + // deviation = angle - upper_bound; + // } + // + // double penalty = penalty_factor * pow(deviation, 2); + // return penalty; + // + double angle_r = angle * M_PI / 180.0; + double angle_mean_r = angle_mean * M_PI / 180.0; + double angle_stddev_r = angle_stddev * M_PI / 180.0; + double diff = angle_r - angle_mean_r; + double delta = atan2(sin(diff), cos(diff)) ; + double penalty = pow(delta, 2) / pow(angle_stddev_r,2); + return penalty * penalty_factor; } double Sails::TorsionAngleRefiner::calculate_penalty_factor() const { switch (m_density->get_score_method()) { case atomwise: - return 1e-3; + return 1e-2; case rscc: return 1e-5; default: @@ -35,16 +43,16 @@ double Sails::TorsionAngleRefiner::calculate_penalty_factor() const { } double Sails::TorsionAngleRefiner::score_function(std::vector &all_angles) { - std::vector angles = {all_angles[0], all_angles[1], all_angles[2]}; - std::vector torsions = {all_angles[3], all_angles[4], all_angles[5]}; + std::vector angles = {all_angles[1], all_angles[2], all_angles[3]}; + std::vector torsions = {all_angles[4], all_angles[5], all_angles[6]}; gemmi::Residue residue = gemmi::Residue(m_reference_residue); - gemmi::Transform superpose_result = Model::superpose_atoms(m_all_atoms, m_reference_atoms, m_length, angles, + gemmi::Transform 
superpose_result = Model::superpose_atoms(m_all_atoms, m_reference_atoms, all_angles[0], angles, torsions); gemmi::transform_pos_and_adp(residue, superpose_result); SuperpositionResult result = {residue, superpose_result, m_reference_residue}; - const double score = m_density->score_result(result); + const double score = -m_density->score_result(result); double penalty = 0; double penalty_factor = calculate_penalty_factor(); @@ -53,15 +61,35 @@ double Sails::TorsionAngleRefiner::score_function(std::vector &all_angle penalty += calculate_penalty(torsions[i], m_torsion_mean[i], m_torsion_range[i], penalty_factor); } - return penalty-score; + double bond_length_delta = abs(all_angles[0] - m_length); + if (bond_length_delta > 0.3) { + penalty += 1e10; + } + // std::cout << penalty << " " << score << " " << penalty_factor << std::endl; + + return score + penalty; } Sails::SuperpositionResult Sails::TorsionAngleRefiner::refine() { std::vector initial_simplex = { + m_length, m_angle_mean[0], m_angle_mean[1], m_angle_mean[2], m_torsion_mean[0], m_torsion_mean[1], m_torsion_mean[2] }; + // gemmi::Residue reference_residue = gemmi::Residue(m_reference_residue); + // gemmi::Transform reference_superpose_result = Model::superpose_atoms(m_all_atoms, m_reference_atoms, m_length, m_angle_mean, + // m_torsion_mean); + // gemmi::transform_pos_and_adp(reference_residue, reference_superpose_result); + // SuperpositionResult reference_result = {reference_residue, reference_superpose_result, m_reference_residue}; + // + // const double initial_score = m_density->score_result(reference_result); + // double penalty = 0; + // for (int i = 0; i < 3; i++) { + // penalty += calculate_penalty(m_angle_mean[i], m_angle_mean[i], m_angle_range[i], calculate_penalty_factor()); + // penalty += calculate_penalty(m_torsion_mean[i], m_torsion_mean[i], m_torsion_range[i], calculate_penalty_factor()); + // } + auto lambda = [&](std::vector &x) -> double { return this->score_function(x); }; @@ -70,24 
+98,35 @@ Sails::SuperpositionResult Sails::TorsionAngleRefiner::refine() { 100000); std::vector final_angles = { - final_simplex[0], final_simplex[1], final_simplex[2] + final_simplex[1], final_simplex[2], final_simplex[3] }; std::vector final_torsions = { - final_simplex[3], final_simplex[4], final_simplex[5] + final_simplex[4], final_simplex[5], final_simplex[6] }; gemmi::Residue residue = gemmi::Residue(m_reference_residue); gemmi::Transform final_result = - Model::superpose_atoms(m_all_atoms, m_reference_atoms, m_length, final_angles, final_torsions); + Model::superpose_atoms(m_all_atoms, m_reference_atoms, final_simplex[0], final_angles, final_torsions); gemmi::transform_pos_and_adp(residue, final_result); SuperpositionResult result = {residue, final_result, m_reference_residue}; - // std::vector labels = {"alpha", "beta", "gamma", "psi", "phi", "omega"}; + // const double final_score = m_density->score_result(result); + // double final_penalty = 0; + // for (int i = 0; i < 3; i++) { + // final_penalty += calculate_penalty(final_angles[i], m_angle_mean[i], m_angle_range[i], calculate_penalty_factor()); + // final_penalty += calculate_penalty(final_angles[i], m_torsion_mean[i], m_torsion_range[i], calculate_penalty_factor()); + // } + // + // + // std::cout << std::endl << "Initial score: " << initial_score << " - penalty: " << penalty << std::endl; + // std::vector labels = {"length", "alpha", "beta", "gamma", "psi", "phi", "omega"}; // std::cout << "\nLabel\tOriginal\tNew" << std::endl; // for (int i = 0; i < final_simplex.size(); i++) { // std::cout << labels[i] << "\t" << initial_simplex[i] << "\t" << final_simplex[i] << std::endl; // } + // std::cout << "Final score: " << final_score << " - penalty: " << final_penalty << std::endl; + return result; } diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 64b9ce3..d84b8f9 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -29,13 +29,21 @@ #include 
"src/include/sails-predictions.h" -void print_rejection_dds(const Sails::Glycosite& s1, const Sails::Glycosite& s2, gemmi::Structure* structure, float score) { +void print_rejection_dds(const Sails::Glycosite& s1, const Sails::Glycosite& s2, gemmi::Structure* structure) { std::cout << "Removing " << Sails::Utils::format_residue_from_site(s1, structure) << "--" - << Sails::Utils::format_residue_from_site(s2, structure) << " because of high DDS = " << score <rscc_score(residue); rscc < rscc_threshold) { + const float rscc = density->rscc_score(residue); + print_rscc(snd->site, rscc, structure); + if (rscc < rscc_threshold) { to_remove.emplace_back(snd.get()); // add pointer to - if (debug) print_removal_rscc(residue, rscc); + if (debug) print_removal_rscc(snd->site, rscc, structure); continue; } // remove cases with high difference density score - if (const float diff_score = density->difference_density_score(residue); diff_score > dds_threshold) { - if (debug) print_rejection_dds(sugar_result.value()->site, fst, structure, diff_score); - to_remove.emplace_back(snd.get()); - } + // const int no_atoms_in_negative_density = density->check_difference_density(residue, difference_density_stats); + // // std::cout << Sails::Utils::format_residue_from_site(fst, structure) << " " << no_atoms_in_negative_density << std::endl; + // if (no_atoms_in_negative_density > 4) { + // if (debug) print_rejection_dds(sugar_result.value()->site, fst, structure); + // to_remove.emplace_back(snd.get()); + // } + // print_dds(snd->site, diff_score, structure); + // if ( diff_score > dds_threshold) { + + // } } // add linked sugars to removal list + std::set additional_sugars; for (auto &sugar: to_remove) { - std::vector additional_sugars; - for (auto &linked_sugar: glycan->adjacency_list[sugar]) { - // check that the linked sugar is not already in the removal list - if (std::find(to_remove.begin(), to_remove.end(), linked_sugar) != to_remove.end()) continue; + std::vector downstream_sugars = 
glycan->get_downstream_sugars(sugar); - additional_sugars.emplace_back(linked_sugar); + for (auto& downstream_sugar: downstream_sugars) { + if (std::find(to_remove.begin(), to_remove.end(), downstream_sugar) != to_remove.end()) continue; + additional_sugars.insert(downstream_sugar); } - to_remove.insert(to_remove.end(), additional_sugars.begin(), additional_sugars.end()); } + to_remove.insert(to_remove.end(), additional_sugars.begin(), additional_sugars.end()); // sort removal in decsending order so removed indices don't cause later array overflow std::sort(to_remove.begin(), to_remove.end(), [](const Sails::Sugar *a, const Sails::Sugar *b) { @@ -88,6 +106,8 @@ void remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *densit }); for (auto &sugar: to_remove) { + std::cout << "REMOVING: " << Sails::Utils::format_residue_from_site(sugar->site, structure) << std::endl; + glycan->remove_sugar(sugar); } } @@ -124,6 +144,13 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu density.recalculate_map(structure); density.calculate_po_pc_map(original_structure); + // + // gemmi::Grid<> x = *density.get_work_grid(); + // gemmi::Ccp4<> m; + // m.grid = x ; + // m.update_ccp4_header(); + // m.write_ccp4_map("wrk.map"); + structure.cell = density.get_mtz()->cell; structure.spacegroup_hm = density.get_mtz()->spacegroup_name; @@ -138,6 +165,11 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu if (verbose) std::cout << "\rCycle #" << i << std::endl; for (auto &glycosite: glycosites) { + // auto c = Sails::Utils::get_chain_from_glycosite(glycosite, &structure); + // auto r = Sails::Utils::get_residue_from_glycosite(glycosite, &structure); + // if (c.name != "D" || r.seqid.num.value != 483) continue; + // + // std::cout << "Checking " << Sails::Utils::format_residue_from_site(glycosite, &structure) << std::endl; Sails::Glycan glycan = topology.find_glycan_topology(glycosite); // if (glycan.empty()) { 
continue; } @@ -154,6 +186,13 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu density.recalculate_map(structure); density.calculate_po_pc_map(original_structure); + // const auto x = density.get_mtz(); + // std::string y = "wrk" + std::to_string(i) + ".mtz"; + // x->write_to_file(y); + // std::string z = "wrk" + std::to_string(i) + ".cif"; + // + // Sails::Utils::save_structure_to_file(structure, z); + // remove erroneous sugars for (auto &glycosite: glycosites) { Sails::Glycan glycan = topology.find_glycan_topology(glycosite); @@ -165,6 +204,7 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu topology.set_structure(&structure); // need to update neighbor search after removing n residues Sails::Glycan new_glycan = topology.find_glycan_topology(glycosite); + new_glycan.renumber(); std::set differences = old_glycan - new_glycan; telemetry >> differences; @@ -179,9 +219,11 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu std::cout << std::endl; + model.standardise_residue_names(); + + // add links and write files std::vector links = generate_link_records(&structure, &glycosites, &topology); - Sails::MTZ output_mtz = Sails::form_sails_mtz(*density.get_mtz(), "FP", "SIGFP"); std::string log_string = telemetry.format_log(&structure, &density, false).value(); @@ -274,12 +316,35 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru }; } +Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, std::string &resource_dir) { + std::string data_file = resource_dir + "/data.json"; + Sails::JSONLoader loader = {data_file}; + Sails::ResidueDatabase residue_database = loader.load_residue_database(); + Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); + auto predictions = Sails::Predictions(&glycan_grid, linkage_database, residue_database); + + Sails::Glycosites potential_sites = 
predictions.find_potential_sites(structure, true); + return potential_sites; +} + +Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, bool use_glycan, std::string &resource_dir) { + std::string data_file = resource_dir + "/data.json"; + Sails::JSONLoader loader = {data_file}; + Sails::ResidueDatabase residue_database = loader.load_residue_database(); + Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); + auto predictions = Sails::Predictions(&glycan_grid, &protein_grid, linkage_database, residue_database); + + Sails::Glycosites potential_sites = predictions.find_potential_sites(structure, use_glycan); + return potential_sites; +} + + // XRAY FUNCTIONS Sails::Output n_glycosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, int cycles, std::string &resource_dir, bool verbose) { auto glycosites = Sails::find_n_glycosylation_sites(structure); - return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, false, verbose); + return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, true, verbose); } Sails::Output c_glycosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, int cycles, std::string &resource_dir, @@ -296,27 +361,12 @@ Sails::Output o_mannosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, true, verbose); } -Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, std::string &resource_dir) { - std::string data_file = resource_dir + "/data.json"; - Sails::JSONLoader loader = {data_file}; - Sails::ResidueDatabase residue_database = loader.load_residue_database(); - Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); - auto predictions = Sails::Predictions(&glycan_grid, linkage_database, residue_database); - - Sails::Glycosites potential_sites = 
predictions.find_potential_sites(structure, true); - return potential_sites; +Sails::Output auto_glycosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, int cycles, std::string &resource_dir, + bool verbose) { + Sails::Glycosites glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, true, resource_dir); + return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, false, verbose); } -Sails::Glycosites identify_predicted_sites(gemmi::Structure &structure, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, bool use_glycan, std::string &resource_dir) { - std::string data_file = resource_dir + "/data.json"; - Sails::JSONLoader loader = {data_file}; - Sails::ResidueDatabase residue_database = loader.load_residue_database(); - Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); - auto predictions = Sails::Predictions(&glycan_grid, &protein_grid, linkage_database, residue_database); - - Sails::Glycosites potential_sites = predictions.find_potential_sites(structure, use_glycan); - return potential_sites; -} // EM FUNCTIONS @@ -487,6 +537,57 @@ gemmi::Structure morph(gemmi::Structure& structure, std::string& wurcs, std::str } +gemmi::Structure validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool remove, std::string& resource_dir) { + std::string data_file = resource_dir + "/data.json"; + Sails::JSONLoader loader = {data_file}; + Sails::ResidueDatabase residue_database = loader.load_residue_database(); + Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); + + gemmi::Mtz mtz = form_gemmi_mtz(sails_mtz); + check_spacegroup(&mtz, &structure); // check to ensure the MTZ has a spacegroup + + auto density = Sails::XtalDensity(mtz); + density.load_map_coefficients(); + + float threshold = 0.75; + + std::vector to_remove = {}; + + for (int m = 0; m < structure.models.size(); m++) { + for (int c = 0; c < 
structure.models[m].chains.size(); c++) { + for (int r = 0; r < structure.models[m].chains[c].residues.size(); r++) { + + gemmi::Residue* residue_ptr = &structure.models[m].chains[c].residues[r]; + if (residue_database.count(residue_ptr->name) == 0) continue; + const Sails::ResidueData& residue_data = residue_database.at(residue_ptr->name); + if (!residue_data.is_sugar) continue; + + Sails::Glycosite site = {m, c, r}; + float rscc = density.rscc_score(*residue_ptr); + std::cout << Sails::Utils::format_residue_from_site(site, &structure) << " with RSCC = " << rscc; + if (rscc > threshold) { + std::cout << std::endl; + continue; + } + std::cout << " - removing" << std::endl; + to_remove.emplace_back(site); + } + } + } + + std::sort(to_remove.begin(), to_remove.end(), [](const Sails::Glycosite& a, const Sails::Glycosite& b) { + return !(a < b); + }); + + for (const auto& site: to_remove) { + const auto residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues; + residue_ptr->erase(residue_ptr->begin() + site.residue_idx); + } + + return structure; +} + + // gemmi::Structure wurcs(gemmi::Structure& structure, std::string chain, int seqid, std::string& resource_dir) { // std::string data_file = resource_dir + "/data.json"; // Sails::JSONLoader loader = {data_file}; diff --git a/package/src/include/density/sails-density.h b/package/src/include/density/sails-density.h index 3568fda..891ea06 100644 --- a/package/src/include/density/sails-density.h +++ b/package/src/include/density/sails-density.h @@ -197,10 +197,11 @@ namespace Sails { * This method calculates the difference density score for the given residue using the difference_grid. * * @param residue The gemmi::Residue object for which the difference density score is to be calculated. + * @param map_stats * * @return The difference density score for the residue. 
*/ - float difference_density_score(gemmi::Residue &residue) const; + int check_difference_density(gemmi::Residue &residue, std::pair map_stats) const; /** * @brief Scores an atom @@ -224,6 +225,9 @@ namespace Sails { */ [[nodiscard]] float score_position(const gemmi::Position& pos) const; + + [[nodiscard]] std::pair calculate_map_statistics(const gemmi::Grid<> *grid) const; + }; } // namespace Sails diff --git a/package/src/include/density/sails-xtal-density.h b/package/src/include/density/sails-xtal-density.h index 2811299..38c5e61 100644 --- a/package/src/include/density/sails-xtal-density.h +++ b/package/src/include/density/sails-xtal-density.h @@ -14,6 +14,8 @@ namespace Sails { explicit XtalDensity(gemmi::Mtz &mtz, const std::string& F, const std::string& SIGF); + void load_map_coefficients(const std::string& fwt = "FWT", const std::string& phwt = "PHWT"); + [[nodiscard]] const gemmi::Mtz *get_mtz() const override { return &m_mtz; } [[nodiscard]] const gemmi::Grid<> *get_work_grid() const override { return &m_po_pc_grid; } diff --git a/package/src/include/sails-glycan.h b/package/src/include/sails-glycan.h index e10369e..726b288 100644 --- a/package/src/include/sails-glycan.h +++ b/package/src/include/sails-glycan.h @@ -252,7 +252,7 @@ namespace Sails { [[nodiscard]] std::vector get_sugar_site_dfs_order_without_root() { std::vector sites; dfs_sites(root_sugar, sites, 0); - sites.erase(sites.begin(), sites.begin() + 1); + sites.erase(sites.begin()); return sites; } @@ -260,7 +260,7 @@ namespace Sails { std::vector sites = get_sugar_site_dfs_order_without_root(); for (int i = 0; i < sites.size(); i++) { gemmi::Residue* residue_ptr = Utils::get_residue_ptr_from_glycosite(sites[i], m_structure); - residue_ptr->seqid.num.value = i; + residue_ptr->seqid.num.value = i+1; } } @@ -318,6 +318,14 @@ namespace Sails { return count-1; } + + [[nodiscard]] std::vector get_downstream_sugars(Sugar* sugar) { + std::vector downstream_sugars; + dfs(sugar, downstream_sugars, 0); 
+ // downstream_sugars.erase(downstream_sugars.begin()); + return downstream_sugars; + } + /** * @brief Returns internal adjacency list. * diff --git a/package/src/include/sails-model.h b/package/src/include/sails-model.h index e13062d..ece9b50 100644 --- a/package/src/include/sails-model.h +++ b/package/src/include/sails-model.h @@ -61,13 +61,15 @@ namespace Sails { ResidueData(const std::vector &acceptors, const std::vector &donors, std::string &snfg_shape, std::string &snfg_colour, std::vector &preferred_depths, std::string &anomer, - std::string &wurcs, bool special + std::string &wurcs, bool special, bool is_sugar ) : acceptors(acceptors), donors(donors), snfg_shape(std::move(snfg_shape)), snfg_colour(std::move(snfg_colour)), preferred_depths(preferred_depths), anomer(anomer), - special(special) { + special(special), + is_sugar(is_sugar) + { if (!wurcs.empty()) {wurcs_code = wurcs;} @@ -90,6 +92,7 @@ namespace Sails { std::vector preferred_depths; std::string anomer; bool special; + bool is_sugar; std::optional wurcs_code = std::nullopt; }; diff --git a/package/src/sails/__init__.py b/package/src/sails/__init__.py index a865030..61c1463 100644 --- a/package/src/sails/__init__.py +++ b/package/src/sails/__init__.py @@ -28,6 +28,7 @@ morph, identify_predicted_sites, auto_glycosylate, + validate, ) from .__version__ import __version__ from .glycosylate import glycosylate_xtal, glycosylate_em, Type @@ -83,4 +84,5 @@ "morph", "identify_predicted_sites", "auto_glycosylate", + "validate", ] diff --git a/package/src/sails/data/data.json b/package/src/sails/data/data.json index e120170..a5f7ee1 100644 --- a/package/src/sails/data/data.json +++ b/package/src/sails/data/data.json @@ -10,6 +10,7 @@ "anomer": "α", "wurcsCode": "", "special": false, + "isSugar": true, "donorSets": [], "acceptorSets": [ { @@ -30,6 +31,7 @@ "anomer": "β", "wurcsCode": "", "special": false, + "isSugar": true, "donorSets": [ { "atom3": "O6", @@ -57,6 +59,7 @@ "anomer": "α", "wurcsCode": 
"a1221m-1a_1-5", "special": true, + "isSugar": true, "donorSets": [], "acceptorSets": [ { @@ -81,6 +84,7 @@ "anomer": "α", "wurcsCode": "a1122h-1a_1-5", "special": false, + "isSugar": true, "donorSets": [ { "atom3": "O2", @@ -126,6 +130,7 @@ "anomer": "α", "wurcsCode": "a1122h-1a_1-5*", "special": false, + "isSugar": true, "donorSets": [], "acceptorSets": [ { @@ -146,6 +151,7 @@ "anomer": "β", "wurcsCode": "a1122h-1b_1-5", "special": false, + "isSugar": true, "donorSets": [ { "atom3": "O6", @@ -180,6 +186,7 @@ "anomer": "β", "wurcsCode": "a2122h-1b_1-5_2*NCC/3=O", "special": false, + "isSugar": true, "donorSets": [ { "atom1": "C5", @@ -219,6 +226,7 @@ "anomer": "", "wurcsCode": "", "special": false, + "isSugar": false, "donorSets": [ { "atom1": "CB", @@ -239,6 +247,7 @@ "anomer": "", "wurcsCode": "", "special": false, + "isSugar": false, "donorSets": [ { "atom1": "CD2", @@ -259,6 +268,7 @@ "anomer": "", "wurcsCode": "", "special": false, + "isSugar": false, "donorSets": [ { "atom1": "CA", @@ -279,6 +289,7 @@ "anomer": "", "wurcsCode": "", "special": false, + "isSugar": false, "donorSets": [ { "atom1": "CA", @@ -298,24 +309,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 127.765, - "alphaStdDev": 8.153, - "betaMean": 112.241, - "betaStdDev": 8.153, - "gammaMean": 113.442, - "gammaStdDev": 1.689 - }, - "torsions": { - "phiMean": 54.086, - "phiStdDev": 36.298, - "psiMean": -179.512, - "psiStdDev": 53.054, - "omegaMean": 167.981, - "omegaStdDev": 11.815 - } - }, { "angles": { "alphaMean": 122.28, @@ -333,6 +326,23 @@ "omegaMean": 176.266, "omegaStdDev": 7.108 } + },{ + "angles": { + "alphaMean": 127.765, + "alphaStdDev": 8.153, + "betaMean": 112.241, + "betaStdDev": 8.153, + "gammaMean": 113.442, + "gammaStdDev": 1.689 + }, + "torsions": { + "phiMean": 54.086, + "phiStdDev": 36.298, + "psiMean": -179.512, + "psiStdDev": 53.054, + "omegaMean": 167.981, + "omegaStdDev": 11.815 + } } ] }, diff --git a/package/src/sails/validate.py 
b/package/src/sails/validate.py new file mode 100644 index 0000000..4eb4f88 --- /dev/null +++ b/package/src/sails/validate.py @@ -0,0 +1,55 @@ +import argparse +from .__version__ import __version__ +import importlib +from sails import validate, interface + +from .glycosylate import get_column_labels + + +def parse_args(): + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest="mode", required=True) + + parser.add_argument("--version", action="version", version=__version__) + + parent = argparse.ArgumentParser(add_help=False) + group = parent.add_argument_group("Required arguments for all modes") + group.add_argument("-v", action=argparse.BooleanOptionalAction, default=False) + group.add_argument("--modelin", type=str, required=True) + group.add_argument("--modelout", type=str, default="sails-validated.cif") + group.add_argument("--remove", action=argparse.BooleanOptionalAction, default=False) + + formatter = argparse.ArgumentDefaultsHelpFormatter + xray_parser = subparsers.add_parser( + "xray", parents=[parent], formatter_class=formatter + ) + xray_parser_group = xray_parser.add_argument_group( + "Required arguments in X-ray mode" + ) + xray_parser_group.add_argument("--mtzin", type=str, required=True) + xray_parser_group.add_argument( + "--colin-fo", type=str, required=False, default="FP,SIGFP" + ) + xray_parser_group.add_argument( + "--colin-fwt", type=str, required=False, default="FWT,PHWT" + ) + + em_parser = subparsers.add_parser("em", parents=[parent], formatter_class=formatter) + em_parser_group = em_parser.add_argument_group("Required arguments in EM mode") + em_parser_group.add_argument("--mapin", type=str, required=True) + + return parser.parse_args() + + +def run(): + args = parse_args() + + sails_structure = interface.get_sails_structure(args.modelin) + resource = importlib.resources.files("sails").joinpath("data") + + labels = get_column_labels(args.colin_fo, args.colin_fwt) + sails_mtz = interface.get_sails_mtz(args.mtzin, 
*labels) + + morphed_structure = validate(sails_structure, sails_mtz, args.remove, str(resource)) + structure = interface.extract_sails_structure(morphed_structure) + structure.make_mmcif_block().write_file(args.modelout) From 910005c4996bbbd9bdc1f72f045c346494705564 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sun, 19 Oct 2025 14:13:44 +0100 Subject: [PATCH 16/56] Added chain and seqid glycoyslation options --- package/src/bindings/python_sails.cpp | 4 ++++ package/src/cpp/sails.cpp | 9 ++++++++ package/src/sails/__init__.py | 2 ++ package/src/sails/glycosylate.py | 32 ++++++++++++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index 9f05bab..ace5001 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -202,6 +202,10 @@ NB_MODULE(sails_module, m) { nb::overload_cast&, gemmi::Grid<>&, int, std::string &, bool>(&auto_glycosylate), "structure"_a, "mtz"_a, "glycan_grid"_a, "protein_grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("glycosylate_site", + nb::overload_cast(&glycosylate_site), "structure"_a, + "mtz"_a, "chain"_a, "seqid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("identify_predicted_sites", nb::overload_cast&, std::string &>(&identify_predicted_sites), "structure"_a, "glycan_grid"_a, "resource_dir"_a); m.def("identify_predicted_sites", nb::overload_cast&, gemmi::Grid<>&, bool, std::string &>(&identify_predicted_sites), diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index d84b8f9..af934bd 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -367,6 +367,15 @@ Sails::Output auto_glycosylate(gemmi::Structure &structure, Sails::MTZ &sails_mt return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, false, verbose); } +Sails::Output glycosylate_site(gemmi::Structure &structure, 
Sails::MTZ &sails_mtz, std::string& chain, int seqid, int cycles, std::string &resource_dir, + bool verbose) { + std::optional potential_site = Sails::find_site(structure, chain, seqid); + if (!potential_site.has_value()) { + throw std::runtime_error("Site could not be found"); + } + Sails::Glycosites glycosites = {potential_site.value()}; + return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, false, verbose); +} // EM FUNCTIONS diff --git a/package/src/sails/__init__.py b/package/src/sails/__init__.py index 61c1463..b7ea437 100644 --- a/package/src/sails/__init__.py +++ b/package/src/sails/__init__.py @@ -29,6 +29,7 @@ identify_predicted_sites, auto_glycosylate, validate, + glycosylate_site, ) from .__version__ import __version__ from .glycosylate import glycosylate_xtal, glycosylate_em, Type @@ -85,4 +86,5 @@ "identify_predicted_sites", "auto_glycosylate", "validate", + "glycosylate_site", ] diff --git a/package/src/sails/glycosylate.py b/package/src/sails/glycosylate.py index 41e3396..09a07aa 100644 --- a/package/src/sails/glycosylate.py +++ b/package/src/sails/glycosylate.py @@ -15,6 +15,7 @@ o_mannosylate, __version__, auto_glycosylate, + glycosylate_site, ) from .prediction.model import ModelType from .prediction.predict import predict_map @@ -79,6 +80,8 @@ def glycosylate_xtal( structure: gemmi.Structure | Path | str, mtz: gemmi.Mtz | Path | str, preddirin: Path | str, + chain: str, + seqid: int | str, cycles: int, f: str, sigf: str, @@ -104,6 +107,23 @@ def glycosylate_xtal( sails_mtz = interface.get_sails_mtz(mtz, f, sigf, fwt, phwt) resource = importlib.resources.files("sails").joinpath("data") + if chain and seqid: + result = glycosylate_site( + sails_structure, + sails_mtz, + chain, + int(seqid), + cycles, + str(resource), + verbose, + ) + return ( + interface.extract_sails_structure(result.structure), + interface.extract_sails_mtz(result.mtz), + json.loads(result.log), + result.snfgs, + ) + if type == Type.auto: if preddirin: 
predictions = read_prediction_dir( @@ -223,7 +243,15 @@ def xray(args): args.cycles if args.type == Type.n_glycosylate or args.type == Type.auto else 1 ) structure, mtz, log, snfgs = glycosylate_xtal( - args.modelin, args.mtzin, args.preddirin, cycles, *labels, args.type, args.v + args.modelin, + args.mtzin, + args.preddirin, + args.chain, + args.seqid, + cycles, + *labels, + args.type, + args.v, ) if args.snfgout: @@ -279,6 +307,8 @@ def parse_args(): group.add_argument( "--type", type=Type.from_string, choices=list(Type), default=Type.auto ) + group.add_argument("--chain", type=str, required=False) + group.add_argument("--seqid", type=str, required=False) formatter = argparse.ArgumentDefaultsHelpFormatter xray_parser = subparsers.add_parser( From 6663fed5133e6700b30fb3ee5d21b135a8edfd5a Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sun, 19 Oct 2025 14:56:37 +0100 Subject: [PATCH 17/56] Added logging to sails-validate --- package/src/cpp/sails-json.cpp | 2 +- package/src/cpp/sails-telemetry.cpp | 18 ++++++++++++++++++ package/src/cpp/sails.cpp | 14 +++++++++----- package/src/include/sails-gemmi-bindings.h | 6 ++++-- package/src/include/sails-telemetry.h | 6 ++++++ package/src/sails/validate.py | 13 ++++++++++--- 6 files changed, 48 insertions(+), 11 deletions(-) diff --git a/package/src/cpp/sails-json.cpp b/package/src/cpp/sails-json.cpp index 97eb19c..4f0dd79 100644 --- a/package/src/cpp/sails-json.cpp +++ b/package/src/cpp/sails-json.cpp @@ -167,7 +167,7 @@ void Sails::JSONWriter::write_json_file(TelemetryLog &log, std::ostream &stream) stream << "\n"; } stream << "\t\t\t}\n\t\t}"; - if (cycle < log.size()) stream << ","; + if (cycle < log.size() - 1) stream << ","; } stream << "]\n}"; } diff --git a/package/src/cpp/sails-telemetry.cpp b/package/src/cpp/sails-telemetry.cpp index aac4a29..dece0b8 100644 --- a/package/src/cpp/sails-telemetry.cpp +++ b/package/src/cpp/sails-telemetry.cpp @@ -66,3 +66,21 @@ 
std::optional Sails::Telemetry::format_log(gemmi::Structure *struct } return std::nullopt; } + + +std::optional Sails::Telemetry::format_log(std::vector &log, bool write, const std::string& filepath) { + JSONWriter writer; + TelemetryLog telemetry_log; + telemetry_log[0] = log; + + if (write) { + std::ofstream stream(filepath); + writer.write_json_file(telemetry_log, stream); + stream.close(); + } else { + std::stringstream stream; + writer.write_json_file(telemetry_log, stream); + return stream.str(); + } + return std::nullopt; +} diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index af934bd..2ba04b4 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -546,7 +546,7 @@ gemmi::Structure morph(gemmi::Structure& structure, std::string& wurcs, std::str } -gemmi::Structure validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool remove, std::string& resource_dir) { +Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool remove, std::string& resource_dir) { std::string data_file = resource_dir + "/data.json"; Sails::JSONLoader loader = {data_file}; Sails::ResidueDatabase residue_database = loader.load_residue_database(); @@ -561,6 +561,7 @@ gemmi::Structure validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bo float threshold = 0.75; std::vector to_remove = {}; + std::vector log = {}; for (int m = 0; m < structure.models.size(); m++) { for (int c = 0; c < structure.models[m].chains.size(); c++) { @@ -573,12 +574,11 @@ gemmi::Structure validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bo Sails::Glycosite site = {m, c, r}; float rscc = density.rscc_score(*residue_ptr); - std::cout << Sails::Utils::format_residue_from_site(site, &structure) << " with RSCC = " << rscc; + std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); + log.emplace_back(residue_key, rscc); if (rscc > threshold) { - std::cout << std::endl; continue; } - std::cout << " - removing" 
<< std::endl; to_remove.emplace_back(site); } } @@ -593,7 +593,11 @@ gemmi::Structure validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bo residue_ptr->erase(residue_ptr->begin() + site.residue_idx); } - return structure; + std::string log_string = Sails::Telemetry::format_log(log, false, "").value(); + return { + structure, + log_string + }; } diff --git a/package/src/include/sails-gemmi-bindings.h b/package/src/include/sails-gemmi-bindings.h index e7c6c84..5263f2f 100644 --- a/package/src/include/sails-gemmi-bindings.h +++ b/package/src/include/sails-gemmi-bindings.h @@ -152,12 +152,14 @@ namespace Sails { * log string. */ struct Output { - Output(gemmi::Structure& structure, MTZ& mtz, std::string log, std::map>& snfgs): + Output(gemmi::Structure& structure, MTZ& mtz, std::string& log, std::map>& snfgs): structure(structure), mtz(mtz), log(std::move(log)), snfgs(snfgs){}; - Output(gemmi::Structure& structure, std::string log, std::map>& snfgs): + Output(gemmi::Structure& structure, std::string& log, std::map>& snfgs): structure(structure), log(std::move(log)), snfgs(snfgs){}; + Output(gemmi::Structure& structure, std::string& log): structure(structure), log(std::move(log)) {}; + gemmi::Structure structure ; MTZ mtz{}; std::string log; diff --git a/package/src/include/sails-telemetry.h b/package/src/include/sails-telemetry.h index 99956bc..a9f302f 100644 --- a/package/src/include/sails-telemetry.h +++ b/package/src/include/sails-telemetry.h @@ -25,6 +25,11 @@ namespace Sails { dds_score(dds_score) { } + TelemetryFormat(const std::string &residue_id, double rscc_score) + : residue_id(residue_id), + rscc_score(rscc_score), rsr_score(0), dds_score(0) { + } + std::string residue_id; double rscc_score; double rsr_score; @@ -153,6 +158,7 @@ namespace Sails { */ void format_log(gemmi::Structure* structure); + static std::optional format_log(std::vector& log, bool write, const std::string& filepath); /** * @brief Calculates the telemetry log for Sails. 
diff --git a/package/src/sails/validate.py b/package/src/sails/validate.py index 4eb4f88..252ce22 100644 --- a/package/src/sails/validate.py +++ b/package/src/sails/validate.py @@ -1,4 +1,6 @@ import argparse +import json + from .__version__ import __version__ import importlib from sails import validate, interface @@ -16,7 +18,8 @@ def parse_args(): group = parent.add_argument_group("Required arguments for all modes") group.add_argument("-v", action=argparse.BooleanOptionalAction, default=False) group.add_argument("--modelin", type=str, required=True) - group.add_argument("--modelout", type=str, default="sails-validated.cif") + group.add_argument("--modelout", type=str, default="sails-validate.cif") + group.add_argument("--logout", type=str, default="sails-validate.log") group.add_argument("--remove", action=argparse.BooleanOptionalAction, default=False) formatter = argparse.ArgumentDefaultsHelpFormatter @@ -50,6 +53,10 @@ def run(): labels = get_column_labels(args.colin_fo, args.colin_fwt) sails_mtz = interface.get_sails_mtz(args.mtzin, *labels) - morphed_structure = validate(sails_structure, sails_mtz, args.remove, str(resource)) - structure = interface.extract_sails_structure(morphed_structure) + result = validate(sails_structure, sails_mtz, args.remove, str(resource)) + + structure = interface.extract_sails_structure(result.structure) structure.make_mmcif_block().write_file(args.modelout) + log = json.loads(result.log) + with open(args.logout, "w") as f: + json.dump(log, f) From c77488248a7d7fcee63f770eb3f75667aff95003 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Mon, 20 Oct 2025 11:31:28 +0100 Subject: [PATCH 18/56] Added masked RSCC function --- package/src/cpp/density/sails-density.cpp | 12 +++++++++--- package/src/cpp/sails-utils.cpp | 8 ++++++++ package/src/include/sails-utils.h | 2 ++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/package/src/cpp/density/sails-density.cpp 
b/package/src/cpp/density/sails-density.cpp index a6598bd..3554ab5 100644 --- a/package/src/cpp/density/sails-density.cpp +++ b/package/src/cpp/density/sails-density.cpp @@ -126,11 +126,14 @@ float Sails::Density::rscc_score(gemmi::Residue &residue) const { for (auto &atom: residue.atoms) { box.extend(atom.pos); } - // box.add_margin(); + box.add_margin(1); // gemmi::Grid<> calc = calculate_density_for_box(residue, box); gemmi::Grid<> calc = calculate_density_for_grid(residue); + gemmi::Model model = Utils::create_model(residue); + gemmi::NeighborSearch ns = {model, get_best_grid()->unit_cell, 1.5}; + ns.populate(); // gemmi::Ccp4<> m; // m.grid = calc; // m.update_ccp4_header(); @@ -150,8 +153,11 @@ float Sails::Density::rscc_score(gemmi::Residue &residue) const { for (double y = min.y; y <= max.y; y += step_size) { for (double z = min.z; z <= max.z; z += step_size) { gemmi::Position position = {x, y, z}; - obs_values.emplace_back(get_best_grid()->interpolate_value(position)); - calc_values.emplace_back(calc.interpolate_value(position)); + auto nearest_atom = ns.find_atoms(position, '*', 0, 1.5); + if (!nearest_atom.empty()) { + obs_values.emplace_back(get_best_grid()->interpolate_value(position)); + calc_values.emplace_back(calc.interpolate_value(position)); + } } } } diff --git a/package/src/cpp/sails-utils.cpp b/package/src/cpp/sails-utils.cpp index d4f3673..f460694 100644 --- a/package/src/cpp/sails-utils.cpp +++ b/package/src/cpp/sails-utils.cpp @@ -142,3 +142,11 @@ std::vector Sails::Utils::split(const std::string &string, char del } return tokens; } + +gemmi::Model Sails::Utils::create_model(gemmi::Residue &residue) { + auto model = gemmi::Model(0); + auto chain = gemmi::Chain("A"); + chain.residues.push_back(residue); + model.chains.push_back(chain); + return model; +} diff --git a/package/src/include/sails-utils.h b/package/src/include/sails-utils.h index ef323fd..b52a2cc 100644 --- a/package/src/include/sails-utils.h +++ 
b/package/src/include/sails-utils.h @@ -257,6 +257,8 @@ namespace Sails::Utils { * @return a vector of strings split by the delimiter */ std::vector split(const std::string &string, char delimiter); + + gemmi::Model create_model(gemmi::Residue& residue); } // namespace Sails::Utils From f886814c1aa76747001d40bb52cb247bf81aadb1 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Mon, 20 Oct 2025 12:39:19 +0100 Subject: [PATCH 19/56] Added links to return structure --- package/src/bindings/python_sails.cpp | 53 +++++++++++++++++++++++++++ package/src/cpp/sails-cif.cpp | 35 ++++++++++++++++++ package/src/cpp/sails-linkage.cpp | 2 +- package/src/cpp/sails.cpp | 2 +- package/src/include/sails-cif.h | 8 ++-- package/src/sails/__init__.py | 6 +++ package/src/sails/interface.py | 31 ++++++++++++++++ 7 files changed, 132 insertions(+), 5 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index ace5001..d5d318a 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -38,6 +38,58 @@ NB_MODULE(sails_module, m) { .def_rw("delfwt_phdelwt", &Sails::Reflection::delfwt_phdelwt); + nb::class_(m, "ResidueId") + .def_rw("seqid", &gemmi::ResidueId::seqid) + .def_rw("segment", &gemmi::ResidueId::segment) + .def_rw("name", &gemmi::ResidueId::name); + + nb::class_(m, "AtomAddress") + .def(nb::init<>()) + .def(nb::init(), + nb::arg("chain_name"), nb::arg("res_id"), nb::arg("atom_name"), nb::arg("altloc") = '\0') + .def(nb::init(), + nb::arg("chain_name"), nb::arg("seq_id"), nb::arg("res"), nb::arg("atom_name"), nb::arg("altloc") = '\0') + .def_rw("chain_name", &gemmi::AtomAddress::chain_name) + .def_rw("res_id", &gemmi::AtomAddress::res_id) + .def_rw("atom_name", &gemmi::AtomAddress::atom_name) + .def_rw("altloc", &gemmi::AtomAddress::altloc) + .def("__eq__", &gemmi::AtomAddress::operator==) + .def("__str__", &gemmi::AtomAddress::str); + + 
nb::enum_(m, "ConnectionType") + .value("Covale", gemmi::Connection::Type::Covale) + .value("Disulf", gemmi::Connection::Type::Disulf) + .value("Hydrog", gemmi::Connection::Type::Hydrog) + .value("MetalC", gemmi::Connection::Type::MetalC) + .value("Unknown", gemmi::Connection::Type::Unknown); + + nb::bind_vector >(m, "Connections"); + + nb::class_(m, "Connection") + .def(nb::init<>()) + .def_rw("name", &gemmi::Connection::name) + .def_rw("link_id", &gemmi::Connection::link_id) + .def_rw("type", &gemmi::Connection::type) + .def_rw("asu", &gemmi::Connection::asu) + .def_rw("partner1", &gemmi::Connection::partner1) + .def_rw("partner2", &gemmi::Connection::partner2) + .def_rw("reported_distance", &gemmi::Connection::reported_distance) + .def_prop_rw( + "reported_sym", + [](gemmi::Connection &self) { + return nb::cast(std::array{ + self.reported_sym[0], + self.reported_sym[1], + self.reported_sym[2], + self.reported_sym[3] + }); + }, + [](gemmi::Connection &self, const std::array &arr) { + for (size_t i = 0; i < 4; ++i) + self.reported_sym[i] = arr[i]; + }); + + // gemmi Structure nb::class_(m, "Structure") .def(nb::init<>()) @@ -45,6 +97,7 @@ NB_MODULE(sails_module, m) { .def("cell", [](const gemmi::Structure &structure) { return Sails::Cell(structure.cell); }) + .def_rw("connections", &gemmi::Structure::connections) .def("set_cell", [](gemmi::Structure &structure, const Sails::Cell &cell) { structure.cell = gemmi::UnitCell(cell.a, cell.b, cell.c, cell.alpha, cell.beta, cell.gamma); diff --git a/package/src/cpp/sails-cif.cpp b/package/src/cpp/sails-cif.cpp index abb4671..c3300b0 100644 --- a/package/src/cpp/sails-cif.cpp +++ b/package/src/cpp/sails-cif.cpp @@ -29,3 +29,38 @@ std::vector Sails::generate_link_records(gemmi::Structure *s } return links; } + +void Sails::add_links_to_structure(gemmi::Structure *structure, std::vector &link_records) { + + std::vector connections; + + for (auto& link_record : link_records) { + gemmi::Connection connection; + connection.type 
= gemmi::Connection::Covale; + gemmi::AtomAddress a1; + a1.chain_name = link_record.chain1.name; + gemmi::ResidueId resid1; + resid1.name = link_record.residue1.name; + resid1.seqid = link_record.residue1.seqid; + a1.res_id = resid1; + a1.atom_name = link_record.atom1.name; + + gemmi::AtomAddress a2; + a2.chain_name = link_record.chain2.name; + gemmi::ResidueId resid2; + resid2.name = link_record.residue2.name; + resid2.seqid = link_record.residue2.seqid; + a2.res_id = resid2; + a2.atom_name = link_record.atom2.name; + + double distance = (link_record.atom1.pos - link_record.atom2.pos).length(); + + connection.partner1 = a1; + connection.partner2 = a2; + connection.reported_distance = distance; + connections.push_back(connection); + + } + + structure->connections = connections; +} diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index 6dc3a54..75f1a68 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -103,7 +103,7 @@ void Sails::Model::save(const std::string &path, std::vector &links) std::ofstream os(path); gemmi::cif::Document document = make_mmcif_document(*structure); gemmi::cif::Block *block = &document.sole_block(); - auto struct_conn = block->find_or_add("_struct_conn", LinkRecord::tags()); + auto struct_conn = block->find_or_add("", LinkRecord::tags()); for (LinkRecord &link: links) { struct_conn.append_row(link.labels()); diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 2ba04b4..afcf3af 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -224,6 +224,7 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu // add links and write files std::vector links = generate_link_records(&structure, &glycosites, &topology); + Sails::add_links_to_structure(model.get_structure(), links); Sails::MTZ output_mtz = Sails::form_sails_mtz(*density.get_mtz(), "FP", "SIGFP"); std::string log_string = telemetry.format_log(&structure, 
&density, false).value(); @@ -305,7 +306,6 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru // add links and write files std::vector links = generate_link_records(&structure, &glycosites, &topology); - std::string log_string = telemetry.format_log(&structure, &density, false).value(); Sails::Telemetry::SNFGCycleData snfgs = telemetry.get_snfgs(); diff --git a/package/src/include/sails-cif.h b/package/src/include/sails-cif.h index c849b15..366bc82 100644 --- a/package/src/include/sails-cif.h +++ b/package/src/include/sails-cif.h @@ -88,8 +88,8 @@ namespace Sails { */ std::vector labels() { double distance = (atom1.pos - atom2.pos).length(); - std::string res1_seqid = residue1.seqid.str(); - std::string res2_seqid = residue2.seqid.str(); + std::string res1_seqid = residue1.seqid.num.str(); + std::string res2_seqid = residue2.seqid.num.str(); return { id, @@ -130,7 +130,6 @@ namespace Sails { }; } - private: gemmi::Chain chain1; gemmi::Chain chain2; gemmi::Residue residue1; @@ -153,6 +152,9 @@ namespace Sails { std::vector generate_link_records(gemmi::Structure *structure, Sails::Glycosites *glycosites, Sails::Topology *topology); + + void add_links_to_structure(gemmi::Structure *structure, std::vector& link_records); + } #endif //SAILS_CIF_H diff --git a/package/src/sails/__init__.py b/package/src/sails/__init__.py index b7ea437..08abcb6 100644 --- a/package/src/sails/__init__.py +++ b/package/src/sails/__init__.py @@ -30,6 +30,9 @@ auto_glycosylate, validate, glycosylate_site, + Connections, + AtomAddress, + ResidueId, ) from .__version__ import __version__ from .glycosylate import glycosylate_xtal, glycosylate_em, Type @@ -87,4 +90,7 @@ "auto_glycosylate", "validate", "glycosylate_site", + "Connections", + "AtomAddress", + "ResidueId", ] diff --git a/package/src/sails/interface.py b/package/src/sails/interface.py index b8caf4d..8c3cb03 100644 --- a/package/src/sails/interface.py +++ b/package/src/sails/interface.py @@ -150,6 
+150,36 @@ def extract_gemmi_structure(structure: gemmi.Structure) -> sails.Structure: return os +def extract_sails_atom_address(atom_address: sails.AtomAddress): + oa = gemmi.AtomAddress() + oa.chain_name = atom_address.chain_name + oa.atom_name = atom_address.atom_name + + oseqid = gemmi.SeqId( + atom_address.res_id.seqid.num(), atom_address.res_id.seqid.icode() + ) + + oresid = gemmi.ResidueId() + oresid.seqid = oseqid + oresid.name = atom_address.res_id.name + oa.res_id = oresid + + return oa + + +def extract_sails_connections(connections: sails.Connections): + connection_list = gemmi.ConnectionList() + + for connection in connections: + oconnection = gemmi.Connection() + oconnection.type = gemmi.ConnectionType[connection.type.__name__] + oconnection.partner1 = extract_sails_atom_address(connection.partner1) + oconnection.partner2 = extract_sails_atom_address(connection.partner2) + + connection_list.append(oconnection) + return connection_list + + def extract_sails_structure(structure: sails.Structure) -> gemmi.Structure: os = gemmi.Structure() om = gemmi.Model("1") @@ -178,6 +208,7 @@ def extract_sails_structure(structure: sails.Structure) -> gemmi.Structure: cell = structure.cell() os.cell = gemmi.UnitCell(cell.a, cell.b, cell.c, cell.alpha, cell.beta, cell.gamma) + os.connections = extract_sails_connections(structure.connections) return os From b327d5b95d41af34a3ca62d935ad851a1ea3d2a7 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Mon, 20 Oct 2025 13:53:53 +0100 Subject: [PATCH 20/56] Fixed bug with extra comma with 1 cycle jobs --- package/src/cpp/sails-json.cpp | 2 +- package/src/cpp/sails-telemetry.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package/src/cpp/sails-json.cpp b/package/src/cpp/sails-json.cpp index 4f0dd79..97eb19c 100644 --- a/package/src/cpp/sails-json.cpp +++ b/package/src/cpp/sails-json.cpp @@ -167,7 +167,7 @@ void 
Sails::JSONWriter::write_json_file(TelemetryLog &log, std::ostream &stream) stream << "\n"; } stream << "\t\t\t}\n\t\t}"; - if (cycle < log.size() - 1) stream << ","; + if (cycle < log.size()) stream << ","; } stream << "]\n}"; } diff --git a/package/src/cpp/sails-telemetry.cpp b/package/src/cpp/sails-telemetry.cpp index dece0b8..963fb3d 100644 --- a/package/src/cpp/sails-telemetry.cpp +++ b/package/src/cpp/sails-telemetry.cpp @@ -71,7 +71,7 @@ std::optional Sails::Telemetry::format_log(gemmi::Structure *struct std::optional Sails::Telemetry::format_log(std::vector &log, bool write, const std::string& filepath) { JSONWriter writer; TelemetryLog telemetry_log; - telemetry_log[0] = log; + telemetry_log[1] = log; if (write) { std::ofstream stream(filepath); From 491dc920b26a3503a798ee763070f7b4e961d4ee Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 21 Oct 2025 14:52:06 +0100 Subject: [PATCH 21/56] Added sails-validate global RSCC calculation Fixed downstream sugars bug in removal --- package/src/cpp/density/sails-density.cpp | 40 ++++++--- package/src/cpp/sails-glycan.cpp | 17 +++- package/src/cpp/sails-refine.cpp | 4 +- package/src/cpp/sails.cpp | 90 +++++++++++++++------ package/src/include/density/sails-density.h | 16 +++- package/src/include/sails-glycan.h | 25 +++++- package/src/include/sails-refine.h | 2 +- package/src/include/sails-utils.h | 14 ++++ package/src/sails/interface.py | 1 + package/src/sails/validate.py | 14 +++- 10 files changed, 175 insertions(+), 48 deletions(-) diff --git a/package/src/cpp/density/sails-density.cpp b/package/src/cpp/density/sails-density.cpp index 3554ab5..3e48782 100644 --- a/package/src/cpp/density/sails-density.cpp +++ b/package/src/cpp/density/sails-density.cpp @@ -87,7 +87,24 @@ gemmi::Grid<> Sails::Density::calculate_density_for_grid(gemmi::Residue &residue return std::move(x); } -float Sails::Density::calculate_rscc(std::vector obs_values, std::vector calc_values) { 
+gemmi::Grid<> Sails::Density::calculate_density_for_structure(gemmi::Structure &structure) const { + gemmi::DensityCalculator, float> density_calculator; + + density_calculator.grid.copy_metadata_from(*get_best_grid()); + density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; + density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; + density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; + + density_calculator.d_min = get_resolution(); + density_calculator.initialize_grid(); + density_calculator.add_model_density_to_grid(structure.models[0]); + density_calculator.grid.symmetrize_sum(); + auto x = density_calculator.grid; + return std::move(x); +} + +template +T Sails::Density::calculate_rscc(std::vector obs_values, std::vector calc_values) { if (obs_values.size() != calc_values.size()) throw std::runtime_error("RSCC obs and calc lists are different sizes"); @@ -117,6 +134,8 @@ float Sails::Density::calculate_rscc(std::vector obs_values, std::vector< if (denominator == 0.0f) throw std::runtime_error("RSCC Denominator is 0"); return numerator / denominator; } +template float Sails::Density::calculate_rscc(std::vector obs_values, std::vector calc_values); +template double Sails::Density::calculate_rscc(std::vector obs_values, std::vector calc_values); float Sails::Density::rscc_score(gemmi::Residue &residue) const { @@ -126,13 +145,13 @@ float Sails::Density::rscc_score(gemmi::Residue &residue) const { for (auto &atom: residue.atoms) { box.extend(atom.pos); } - box.add_margin(1); + box.add_margin(2); // gemmi::Grid<> calc = calculate_density_for_box(residue, box); gemmi::Grid<> calc = calculate_density_for_grid(residue); gemmi::Model model = Utils::create_model(residue); - gemmi::NeighborSearch ns = {model, get_best_grid()->unit_cell, 1.5}; + gemmi::NeighborSearch ns = {model, get_best_grid()->unit_cell, 2}; ns.populate(); // gemmi::Ccp4<> m; // m.grid = calc; @@ -142,18 +161,17 @@ float Sails::Density::rscc_score(gemmi::Residue 
&residue) const { // std::vector rs = {residue}; // Utils::save_residues_to_file(rs, "res.pdb"); - const gemmi::Position max = box.maximum; const gemmi::Position min = box.minimum; + const gemmi::Position max = box.maximum; std::vector obs_values = {}; std::vector calc_values = {}; - const double step_size = get_best_grid()->spacing[0]; - for (double x = min.x; x <= max.x; x += step_size) { - for (double y = min.y; y <= max.y; y += step_size) { - for (double z = min.z; z <= max.z; z += step_size) { + for (double x = min.x; x <= max.x; x += get_best_grid()->spacing[0]) { + for (double y = min.y; y <= max.y; y += get_best_grid()->spacing[1]) { + for (double z = min.z; z <= max.z; z += get_best_grid()->spacing[2]) { gemmi::Position position = {x, y, z}; - auto nearest_atom = ns.find_atoms(position, '*', 0, 1.5); + auto nearest_atom = ns.find_atoms(position, '*', 0, 2); if (!nearest_atom.empty()) { obs_values.emplace_back(get_best_grid()->interpolate_value(position)); calc_values.emplace_back(calc.interpolate_value(position)); @@ -162,7 +180,7 @@ float Sails::Density::rscc_score(gemmi::Residue &residue) const { } } - return calculate_rscc(obs_values, calc_values); + return calculate_rscc(obs_values, calc_values); } float Sails::Density::rscc_score(SuperpositionResult &result) { @@ -201,7 +219,7 @@ float Sails::Density::rscc_score(SuperpositionResult &result) { } } - return calculate_rscc(obs_values, calc_values); + return calculate_rscc(obs_values, calc_values); } float Sails::Density::rsr_score(gemmi::Residue &residue) { diff --git a/package/src/cpp/sails-glycan.cpp b/package/src/cpp/sails-glycan.cpp index d96900b..79974eb 100644 --- a/package/src/cpp/sails-glycan.cpp +++ b/package/src/cpp/sails-glycan.cpp @@ -89,7 +89,7 @@ void Sails::Glycan::bfs(Sails::Sugar *root) { } } -void Sails::Glycan::dfs(Sugar *current_sugar, std::vector &terminal_sugars, int depth = 0) { +void Sails::Glycan::dfs_terminal(Sugar *current_sugar, std::vector &terminal_sugars, int depth = 0) { 
std::set &sugar_set = adjacency_list[current_sugar]; if (sugar_set.empty()) { current_sugar->depth = depth; @@ -98,7 +98,7 @@ void Sails::Glycan::dfs(Sugar *current_sugar, std::vector &terminal_sug for (Sugar *sugar: sugar_set) { sugar->depth = depth + 1; - dfs(sugar, terminal_sugars, depth + 1); + dfs_terminal(sugar, terminal_sugars, depth + 1); } } @@ -113,6 +113,17 @@ void Sails::Glycan::dfs_sites(Sugar *current_sugar, std::vector &site } } +void Sails::Glycan::dfs_sugars(Sugar *current_sugar, std::vector &sugars, int depth) { + const std::set &sugar_set = adjacency_list[current_sugar]; + current_sugar->depth = depth; + sugars.push_back(current_sugar); + + for (Sugar *sugar: sugar_set) { + sugar->depth = depth + 1; + dfs_sugars(sugar, sugars, depth + 1); + } +} + std::set Sails::Glycan::operator-(const Glycan& glycan) { std::set this_keys; std::transform(this->sugars.begin(), this->sugars.end(), std::inserter(this_keys, this_keys.end()), @@ -133,7 +144,7 @@ std::vector Sails::Glycan::get_terminal_sugars(Glycosite &root_s throw std::runtime_error("Root SeqId is not valid"); } std::vector terminal_sugars; - dfs(sugars[root_seq_id].get(), terminal_sugars); + dfs_terminal(sugars[root_seq_id].get(), terminal_sugars); // FUC has no links, but would be the terminal sugar in order, so add the sugar before FUC in that case std::vector additional_sugars; diff --git a/package/src/cpp/sails-refine.cpp b/package/src/cpp/sails-refine.cpp index a247745..59ac0fe 100644 --- a/package/src/cpp/sails-refine.cpp +++ b/package/src/cpp/sails-refine.cpp @@ -61,9 +61,9 @@ double Sails::TorsionAngleRefiner::score_function(std::vector &all_angle penalty += calculate_penalty(torsions[i], m_torsion_mean[i], m_torsion_range[i], penalty_factor); } - double bond_length_delta = abs(all_angles[0] - m_length); + double bond_length_delta = std::abs(all_angles[0] - m_length); if (bond_length_delta > 0.3) { - penalty += 1e10; + penalty += bond_length_delta * 1e5; } // std::cout << penalty << " " << 
score << " " << penalty_factor << std::endl; diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index afcf3af..08fff3c 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -546,7 +546,7 @@ gemmi::Structure morph(gemmi::Structure& structure, std::string& wurcs, std::str } -Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool remove, std::string& resource_dir) { +Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool remove, float threshold, std::string& resource_dir) { std::string data_file = resource_dir + "/data.json"; Sails::JSONLoader loader = {data_file}; Sails::ResidueDatabase residue_database = loader.load_residue_database(); @@ -558,41 +558,79 @@ Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool auto density = Sails::XtalDensity(mtz); density.load_map_coefficients(); - float threshold = 0.75; + gemmi::Grid<> calculated_density = density.calculate_density_for_structure(structure); + + gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure.models[0], structure.cell, 2); + ns.populate(); + + gemmi::Grid<> best_grid = *density.get_best_grid(); + std::map>> residue_pairs; + + for (auto point: best_grid) { + gemmi::Position position = best_grid.point_to_position(point); + auto mark = ns.find_nearest_atom(position, 2); + if (mark == nullptr) continue; + + auto site = Sails::Glycosite(0, mark->chain_idx, mark->residue_idx, 0); + const gemmi::Residue* residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues[site.residue_idx]; + if (residue_database.count(residue_ptr->name) == 0) continue; + const Sails::ResidueData& residue = residue_database.at(residue_ptr->name); + if (!residue.is_sugar) continue; + + double obs = *point.value; + double calc = calculated_density.interpolate_value(position); + residue_pairs[site].emplace_back(obs, calc); + + } + std::map rsccs; + + for (const auto& [site, data]: residue_pairs) { + auto 
[obs_values, calc_values] = Sails::Utils::split_pairs(data); + if (obs_values.empty() || calc_values.empty()) continue; + + rsccs[site] = Sails::Density::calculate_rscc(obs_values, calc_values); + } std::vector to_remove = {}; std::vector log = {}; - for (int m = 0; m < structure.models.size(); m++) { - for (int c = 0; c < structure.models[m].chains.size(); c++) { - for (int r = 0; r < structure.models[m].chains[c].residues.size(); r++) { - - gemmi::Residue* residue_ptr = &structure.models[m].chains[c].residues[r]; - if (residue_database.count(residue_ptr->name) == 0) continue; - const Sails::ResidueData& residue_data = residue_database.at(residue_ptr->name); - if (!residue_data.is_sugar) continue; - - Sails::Glycosite site = {m, c, r}; - float rscc = density.rscc_score(*residue_ptr); - std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); - log.emplace_back(residue_key, rscc); - if (rscc > threshold) { - continue; - } - to_remove.emplace_back(site); - } + for (auto& [site, rscc]: rsccs) { + std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); + log.emplace_back(residue_key, rscc); + if (rscc > threshold) { + continue; } + to_remove.emplace_back(site); } - std::sort(to_remove.begin(), to_remove.end(), [](const Sails::Glycosite& a, const Sails::Glycosite& b) { - return !(a < b); - }); + if (remove) { + Sails::Topology topology = {&structure, residue_database}; + + std::set removal_set = {to_remove.begin(), to_remove.end()}; + + for (auto &site: to_remove) { + auto glycan = topology.find_glycan_topology(site); + std::vector downstream_sugars = glycan.get_downstream_sugars(site); + for (auto& downstream_sugar: downstream_sugars) { + if (std::find(removal_set.begin(), removal_set.end(), downstream_sugar->site) != removal_set.end()) continue; + downstream_sugar->site.atom_idx = 0; // remove atom site from site to allow sorting + removal_set.insert(downstream_sugar->site); + } + } - for (const auto& site: 
to_remove) { - const auto residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues; - residue_ptr->erase(residue_ptr->begin() + site.residue_idx); + std::vector removal_list = {removal_set.begin(), removal_set.end()}; + + std::sort(removal_list.begin(), removal_list.end(), [](const Sails::Glycosite& a, const Sails::Glycosite& b) { + return !(a < b); + }); + + for (auto &site: removal_list) { + const auto residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues; + residue_ptr->erase(residue_ptr->begin() + site.residue_idx); + } } + std::string log_string = Sails::Telemetry::format_log(log, false, "").value(); return { structure, diff --git a/package/src/include/density/sails-density.h b/package/src/include/density/sails-density.h index 891ea06..b93ecd3 100644 --- a/package/src/include/density/sails-density.h +++ b/package/src/include/density/sails-density.h @@ -110,6 +110,19 @@ namespace Sails { */ gemmi::Grid<> calculate_density_for_grid(gemmi::Residue &residue) const; + /** + * @brief Calculates the density for a gemmi::Residue object. + * + * This method takes a gemmi::Residue object and calculates the density + * using the gemmi::DensityCalculator class. The density calculation is performed using the + * density score method specified in the constructor of the gemmi::DensityCalculator. + * + * @param residue The gemmi::Residue object for which the density is calculated. + * + * @return The calculated density grid for the specified box. + */ + gemmi::Grid<> calculate_density_for_structure(gemmi::Structure &structure) const; + /** * @brief Calculates the RSCC (Real Space Correlation Coefficient) score for a given residue. * @@ -135,7 +148,8 @@ namespace Sails { * * @return The RSCC between the observed and calculated values. 
*/ - static float calculate_rscc(std::vector obs_values, std::vector calc_values) ; + template + static T calculate_rscc(std::vector obs_values, std::vector calc_values) ; /** * @brief Calculates the RSCC score for a given superposition result. diff --git a/package/src/include/sails-glycan.h b/package/src/include/sails-glycan.h index 726b288..cbc5ee3 100644 --- a/package/src/include/sails-glycan.h +++ b/package/src/include/sails-glycan.h @@ -321,7 +321,18 @@ namespace Sails { [[nodiscard]] std::vector get_downstream_sugars(Sugar* sugar) { std::vector downstream_sugars; - dfs(sugar, downstream_sugars, 0); + dfs_sugars(sugar, downstream_sugars, 0); + // downstream_sugars.erase(downstream_sugars.begin()); + return downstream_sugars; + } + + [[nodiscard]] std::vector get_downstream_sugars(Glycosite& site) { + std::vector downstream_sugars; + if (sugars.count(site) == 0) { + return {}; + } + Sugar* sugar = sugars.at(site).get(); + dfs_sugars(sugar, downstream_sugars, 0); // downstream_sugars.erase(downstream_sugars.begin()); return downstream_sugars; } @@ -569,7 +580,7 @@ namespace Sails { * @param terminal_sugars - A vector to store the terminal sugar molecules found. * @param depth - The depth of the current search */ - [[maybe_unused]] void dfs(Sugar *current_sugar, std::vector &terminal_sugars, int depth); + [[maybe_unused]] void dfs_terminal(Sugar *current_sugar, std::vector &terminal_sugars, int depth); /** * Performs a depth-first search (DFS) on a graph of sugar molecules, starting from @@ -582,6 +593,16 @@ namespace Sails { [[maybe_unused]] void dfs_sites(Sugar *current_sugar, std::vector &sites, int depth); + /** + * Performs a depth-first search (DFS) on a graph of sugar molecules, starting from + * a given sugar and collecting terminal sugars. + * + * @param current_sugar - The current sugar molecule being visited. 
+ * @param sites - A vector to store the sites + * @param depth - The depth of the current search + */ + [[maybe_unused]] void dfs_sugars(Sugar *current_sugar, std::vector &sugars, int depth); + /** * @brief Get the structure associated with the glycan. * diff --git a/package/src/include/sails-refine.h b/package/src/include/sails-refine.h index b4e7241..9ff86d7 100644 --- a/package/src/include/sails-refine.h +++ b/package/src/include/sails-refine.h @@ -12,7 +12,7 @@ #include #include -#include +#include namespace Sails { diff --git a/package/src/include/sails-utils.h b/package/src/include/sails-utils.h index b52a2cc..59a42cb 100644 --- a/package/src/include/sails-utils.h +++ b/package/src/include/sails-utils.h @@ -259,6 +259,20 @@ namespace Sails::Utils { std::vector split(const std::string &string, char delimiter); gemmi::Model create_model(gemmi::Residue& residue); + + + template + std::pair, std::vector> split_pairs(const std::vector> &pairs) { + std::vector firsts; + std::vector seconds; + firsts.reserve(pairs.size()); + seconds.reserve(pairs.size()); + for (const auto& p : pairs) { + firsts.push_back(p.first); + seconds.push_back(p.second); + } + return {std::move(firsts), std::move(seconds)}; + } } // namespace Sails::Utils diff --git a/package/src/sails/interface.py b/package/src/sails/interface.py index 8c3cb03..8c3000e 100644 --- a/package/src/sails/interface.py +++ b/package/src/sails/interface.py @@ -208,6 +208,7 @@ def extract_sails_structure(structure: sails.Structure) -> gemmi.Structure: cell = structure.cell() os.cell = gemmi.UnitCell(cell.a, cell.b, cell.c, cell.alpha, cell.beta, cell.gamma) + os.spacegroup_hm = structure.spacegroup_hm os.connections = extract_sails_connections(structure.connections) return os diff --git a/package/src/sails/validate.py b/package/src/sails/validate.py index 252ce22..469d39f 100644 --- a/package/src/sails/validate.py +++ b/package/src/sails/validate.py @@ -20,7 +20,11 @@ def parse_args(): 
group.add_argument("--modelin", type=str, required=True) group.add_argument("--modelout", type=str, default="sails-validate.cif") group.add_argument("--logout", type=str, default="sails-validate.log") + group.add_argument( + "--threshold", type=float, default=0.7, help="RSCC Threshold to use for removal" + ) group.add_argument("--remove", action=argparse.BooleanOptionalAction, default=False) + group.add_argument("--print", action=argparse.BooleanOptionalAction, default=False) formatter = argparse.ArgumentDefaultsHelpFormatter xray_parser = subparsers.add_parser( @@ -53,10 +57,16 @@ def run(): labels = get_column_labels(args.colin_fo, args.colin_fwt) sails_mtz = interface.get_sails_mtz(args.mtzin, *labels) - result = validate(sails_structure, sails_mtz, args.remove, str(resource)) + result = validate( + sails_structure, sails_mtz, args.remove, args.threshold, str(resource) + ) structure = interface.extract_sails_structure(result.structure) structure.make_mmcif_block().write_file(args.modelout) log = json.loads(result.log) + + if args.print: + print(json.dumps(log, indent=4)) + with open(args.logout, "w") as f: - json.dump(log, f) + json.dump(log, f, indent=4) From bcd56186238b0207efd997f19fc8a1396b70ca96 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 21 Oct 2025 14:52:29 +0100 Subject: [PATCH 22/56] Added space group to structure Changed snfg command line argument dashes --- package/src/bindings/python_sails.cpp | 3 ++- package/src/sails/snfg.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index d5d318a..b313770 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -98,6 +98,7 @@ NB_MODULE(sails_module, m) { return Sails::Cell(structure.cell); }) .def_rw("connections", &gemmi::Structure::connections) + .def_rw("spacegroup_hm", &gemmi::Structure::spacegroup_hm) 
.def("set_cell", [](gemmi::Structure &structure, const Sails::Cell &cell) { structure.cell = gemmi::UnitCell(cell.a, cell.b, cell.c, cell.alpha, cell.beta, cell.gamma); @@ -270,7 +271,7 @@ NB_MODULE(sails_module, m) { m.def("model_wurcs", &model_wurcs, "structure"_a, "wurcs"_a, "chain"_a, "seqid"_a, "resource_dir"_a); m.def("morph", &morph, "structure"_a, "wurcs"_a, "chain"_a, "seqid"_a, "resource_dir"_a); - m.def("validate", &validate, "structure"_a, "mtz"_a, "remove"_a, "resource_dir"_a); + m.def("validate", &validate, "structure"_a, "mtz"_a, "remove"_a, "threshold"_a, "resource_dir"_a); m.def("test_snfg", &test); diff --git a/package/src/sails/snfg.py b/package/src/sails/snfg.py index d4b5ef0..bd35586 100644 --- a/package/src/sails/snfg.py +++ b/package/src/sails/snfg.py @@ -110,10 +110,10 @@ def create_single_snfg(args): def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("-model", type=str, required=True) - parser.add_argument("-snfgout", type=str, required=True) - parser.add_argument("-chain", type=str, required=False) - parser.add_argument("-seqid", type=int, required=False) + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--snfgout", type=str, required=True) + parser.add_argument("--chain", type=str, required=False) + parser.add_argument("--seqid", type=int, required=False) parser.add_argument("--all", action=argparse.BooleanOptionalAction, required=False) parser.add_argument( "--overwrite", action=argparse.BooleanOptionalAction, required=False From 4d06dde2d4248f637c0edda104ab7a004b0066d1 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 21 Oct 2025 22:02:48 +0100 Subject: [PATCH 23/56] Refactored RSCC calculation to support EM formfactors Added resolution option to EM functions Added sails-validate em --- package/src/bindings/python_sails.cpp | 27 +++- package/src/cpp/density/sails-density.cpp | 112 ++++++++-------- 
package/src/cpp/density/sails-em-density.cpp | 59 ++++++++- .../src/cpp/density/sails-xtal-density.cpp | 56 ++++++++ package/src/cpp/sails.cpp | 123 ++++++++++++++++-- package/src/include/density/sails-density.h | 78 +++++------ .../src/include/density/sails-em-density.h | 15 ++- .../src/include/density/sails-xtal-density.h | 8 +- package/src/sails/glycosylate.py | 66 +++++++++- package/src/sails/validate.py | 40 +++++- 10 files changed, 462 insertions(+), 122 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index b313770..a6f7150 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -243,23 +243,31 @@ NB_MODULE(sails_module, m) { "mtz"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); m.def("n_glycosylate", - nb::overload_cast &, int, std::string &, bool>(&n_glycosylate), - "structure"_a, "grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + nb::overload_cast &, float, int, std::string &, bool>(&n_glycosylate), + "structure"_a, "grid"_a, "resolution"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); m.def("c_glycosylate", - nb::overload_cast &, int, std::string &, bool>(&c_glycosylate), - "structure"_a, "grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + nb::overload_cast &, float, int, std::string &, bool>(&c_glycosylate), + "structure"_a, "grid"_a, "resolution"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); m.def("o_mannosylate", - nb::overload_cast &, int, std::string &, bool>(&o_mannosylate), - "structure"_a, "grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + nb::overload_cast &, float, int, std::string &, bool>(&o_mannosylate), + "structure"_a, "grid"_a, "resolution"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); m.def("auto_glycosylate", nb::overload_cast&, gemmi::Grid<>&, int, std::string &, bool>(&auto_glycosylate), "structure"_a, "mtz"_a, "glycan_grid"_a, "protein_grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("auto_glycosylate", + 
nb::overload_cast &, float, gemmi::Grid<>&, gemmi::Grid<>&, int, std::string &, bool>(&auto_glycosylate), "structure"_a, + "grid"_a, "resolution"_a, "glycan_grid"_a, "protein_grid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("glycosylate_site", nb::overload_cast(&glycosylate_site), "structure"_a, "mtz"_a, "chain"_a, "seqid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("glycosylate_site", + nb::overload_cast &, float, std::string&, int, int, std::string &, bool>(&glycosylate_site), "structure"_a, + "grid"_a, "resolution"_a, "chain"_a, "seqid"_a, "cycles"_a, "resource_dir"_a, "verbose"_a); + m.def("identify_predicted_sites", nb::overload_cast&, std::string &>(&identify_predicted_sites), "structure"_a, "glycan_grid"_a, "resource_dir"_a); m.def("identify_predicted_sites", nb::overload_cast&, gemmi::Grid<>&, bool, std::string &>(&identify_predicted_sites), @@ -271,7 +279,12 @@ NB_MODULE(sails_module, m) { m.def("model_wurcs", &model_wurcs, "structure"_a, "wurcs"_a, "chain"_a, "seqid"_a, "resource_dir"_a); m.def("morph", &morph, "structure"_a, "wurcs"_a, "chain"_a, "seqid"_a, "resource_dir"_a); - m.def("validate", &validate, "structure"_a, "mtz"_a, "remove"_a, "threshold"_a, "resource_dir"_a); + + // XRAY + m.def("validate", nb::overload_cast(&validate), "structure"_a, "mtz"_a, "remove"_a, "threshold"_a, "resource_dir"_a); + + // EM + m.def("validate", nb::overload_cast &, float, bool, float, std::string &>(&validate), "structure"_a, "grid"_a, "resolution"_a, "remove"_a, "threshold"_a, "resource_dir"_a); m.def("test_snfg", &test); diff --git a/package/src/cpp/density/sails-density.cpp b/package/src/cpp/density/sails-density.cpp index 3e48782..b6ba2cf 100644 --- a/package/src/cpp/density/sails-density.cpp +++ b/package/src/cpp/density/sails-density.cpp @@ -46,62 +46,62 @@ float Sails::Density::atomwise_score(const gemmi::Residue &residue) const { }) / (residue.atoms.size()); } -gemmi::Grid<> Sails::Density::calculate_density_for_box(gemmi::Residue 
&residue, gemmi::Box &box) const { - - gemmi::DensityCalculator, float> density_calculator; - - gemmi::Position size = box.get_size(); - gemmi::UnitCell dummy_cell = {size.x, size.y, size.z, 90, 90, 90}; - density_calculator.grid.unit_cell = dummy_cell; - density_calculator.grid.nu = size.x; - density_calculator.grid.nv = size.y; - density_calculator.grid.nw = size.z; - density_calculator.grid.spacegroup = get_work_grid()->spacegroup; - density_calculator.grid.axis_order = get_work_grid()->axis_order; - - density_calculator.d_min = 1; - density_calculator.initialize_grid(); - for (auto &atom: residue.atoms) { - density_calculator.add_atom_density_to_grid(atom); - } - density_calculator.grid.symmetrize_sum(); - return density_calculator.grid; -} - -gemmi::Grid<> Sails::Density::calculate_density_for_grid(gemmi::Residue &residue) const { - - gemmi::DensityCalculator, float> density_calculator; - - density_calculator.grid.copy_metadata_from(*get_best_grid()); - density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; - density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; - density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; - - density_calculator.d_min = get_resolution(); - density_calculator.initialize_grid(); - for (auto &atom: residue.atoms) { - density_calculator.add_atom_density_to_grid(atom); - } - density_calculator.grid.symmetrize_sum(); - auto x = density_calculator.grid; - return std::move(x); -} - -gemmi::Grid<> Sails::Density::calculate_density_for_structure(gemmi::Structure &structure) const { - gemmi::DensityCalculator, float> density_calculator; - - density_calculator.grid.copy_metadata_from(*get_best_grid()); - density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; - density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; - density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; - - density_calculator.d_min = get_resolution(); - density_calculator.initialize_grid(); - 
density_calculator.add_model_density_to_grid(structure.models[0]); - density_calculator.grid.symmetrize_sum(); - auto x = density_calculator.grid; - return std::move(x); -} +// gemmi::Grid<> Sails::Density::calculate_density_for_box(gemmi::Residue &residue, gemmi::Box &box) const { +// +// gemmi::DensityCalculator, float> density_calculator; +// +// gemmi::Position size = box.get_size(); +// gemmi::UnitCell dummy_cell = {size.x, size.y, size.z, 90, 90, 90}; +// density_calculator.grid.unit_cell = dummy_cell; +// density_calculator.grid.nu = size.x; +// density_calculator.grid.nv = size.y; +// density_calculator.grid.nw = size.z; +// density_calculator.grid.spacegroup = get_work_grid()->spacegroup; +// density_calculator.grid.axis_order = get_work_grid()->axis_order; +// +// density_calculator.d_min = 1; +// density_calculator.initialize_grid(); +// for (auto &atom: residue.atoms) { +// density_calculator.add_atom_density_to_grid(atom); +// } +// density_calculator.grid.symmetrize_sum(); +// return density_calculator.grid; +// } +// +// gemmi::Grid<> Sails::Density::calculate_density_for_grid(gemmi::Residue &residue) const { +// +// gemmi::DensityCalculator, float> density_calculator; +// +// density_calculator.grid.copy_metadata_from(*get_best_grid()); +// density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; +// density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; +// density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; +// +// density_calculator.d_min = get_resolution(); +// density_calculator.initialize_grid(); +// for (auto &atom: residue.atoms) { +// density_calculator.add_atom_density_to_grid(atom); +// } +// density_calculator.grid.symmetrize_sum(); +// auto x = density_calculator.grid; +// return std::move(x); +// } +// +// gemmi::Grid<> Sails::Density::calculate_density_for_structure(gemmi::Structure &structure) const { +// gemmi::DensityCalculator, float> density_calculator; +// +// 
density_calculator.grid.copy_metadata_from(*get_best_grid()); +// density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; +// density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; +// density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; +// +// density_calculator.d_min = get_resolution(); +// density_calculator.initialize_grid(); +// density_calculator.add_model_density_to_grid(structure.models[0]); +// density_calculator.grid.symmetrize_sum(); +// auto x = density_calculator.grid; +// return std::move(x); +// } template T Sails::Density::calculate_rscc(std::vector obs_values, std::vector calc_values) { diff --git a/package/src/cpp/density/sails-em-density.cpp b/package/src/cpp/density/sails-em-density.cpp index 3b57af9..b968858 100644 --- a/package/src/cpp/density/sails-em-density.cpp +++ b/package/src/cpp/density/sails-em-density.cpp @@ -5,6 +5,63 @@ #include "../../include/density/sails-density.h" #include "../../include/density/sails-em-density.h" -Sails::EMDensity::EMDensity(gemmi::Grid<> &grid) { +Sails::EMDensity::EMDensity(gemmi::Grid<> &grid, float resolution) { m_grid = grid; + m_resolution = resolution; +} + +gemmi::Grid<> Sails::EMDensity::calculate_density_for_box(gemmi::Residue &residue, + gemmi::Box &box) const { + gemmi::DensityCalculator, float> density_calculator; + + gemmi::Position size = box.get_size(); + gemmi::UnitCell dummy_cell = {size.x, size.y, size.z, 90, 90, 90}; + density_calculator.grid.unit_cell = dummy_cell; + density_calculator.grid.nu = size.x; + density_calculator.grid.nv = size.y; + density_calculator.grid.nw = size.z; + density_calculator.grid.spacegroup = get_work_grid()->spacegroup; + density_calculator.grid.axis_order = get_work_grid()->axis_order; + + density_calculator.d_min = 1; + density_calculator.initialize_grid(); + for (auto &atom: residue.atoms) { + density_calculator.add_atom_density_to_grid(atom); + } + density_calculator.grid.symmetrize_sum(); + return density_calculator.grid; 
+} + +gemmi::Grid<> Sails::EMDensity::calculate_density_for_grid(gemmi::Residue &residue) const { + gemmi::DensityCalculator, float> density_calculator; + + density_calculator.grid.copy_metadata_from(*get_best_grid()); + density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; + density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; + density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; + + density_calculator.d_min = get_resolution(); + density_calculator.initialize_grid(); + for (auto &atom: residue.atoms) { + density_calculator.add_atom_density_to_grid(atom); + } + density_calculator.grid.symmetrize_sum(); + auto x = density_calculator.grid; + return std::move(x); +} + +gemmi::Grid<> Sails::EMDensity::calculate_density_for_structure(gemmi::Structure &structure) const { + gemmi::DensityCalculator, float> density_calculator; + + density_calculator.grid.copy_metadata_from(*get_best_grid()); + density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; + density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; + density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; + + density_calculator.d_min = get_resolution(); + density_calculator.initialize_grid(); + density_calculator.add_model_density_to_grid(structure.models[0]); + density_calculator.grid.symmetrize_sum(); + auto x = density_calculator.grid; + return std::move(x); } diff --git a/package/src/cpp/density/sails-xtal-density.cpp b/package/src/cpp/density/sails-xtal-density.cpp index 0dcafbd..dc06b8b 100644 --- a/package/src/cpp/density/sails-xtal-density.cpp +++ b/package/src/cpp/density/sails-xtal-density.cpp @@ -97,6 +97,62 @@ void Sails::XtalDensity::form_atom_list(const gemmi::Structure &structure, std:: } +gemmi::Grid<> Sails::XtalDensity::calculate_density_for_box(gemmi::Residue &residue, + gemmi::Box &box) const { + gemmi::DensityCalculator, float> density_calculator; + + gemmi::Position size = box.get_size(); + gemmi::UnitCell dummy_cell = 
{size.x, size.y, size.z, 90, 90, 90}; + density_calculator.grid.unit_cell = dummy_cell; + density_calculator.grid.nu = size.x; + density_calculator.grid.nv = size.y; + density_calculator.grid.nw = size.z; + density_calculator.grid.spacegroup = get_work_grid()->spacegroup; + density_calculator.grid.axis_order = get_work_grid()->axis_order; + + density_calculator.d_min = 1; + density_calculator.initialize_grid(); + for (auto &atom: residue.atoms) { + density_calculator.add_atom_density_to_grid(atom); + } + density_calculator.grid.symmetrize_sum(); + return density_calculator.grid; +} + +gemmi::Grid<> Sails::XtalDensity::calculate_density_for_grid(gemmi::Residue &residue) const { + gemmi::DensityCalculator, float> density_calculator; + + density_calculator.grid.copy_metadata_from(*get_best_grid()); + density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; + density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; + density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; + + density_calculator.d_min = get_resolution(); + density_calculator.initialize_grid(); + for (auto &atom: residue.atoms) { + density_calculator.add_atom_density_to_grid(atom); + } + density_calculator.grid.symmetrize_sum(); + auto x = density_calculator.grid; + return std::move(x); +} + +gemmi::Grid<> Sails::XtalDensity::calculate_density_for_structure(gemmi::Structure &structure) const { + gemmi::DensityCalculator, float> density_calculator; + + density_calculator.grid.copy_metadata_from(*get_best_grid()); + density_calculator.grid.spacing[0] = get_best_grid()->spacing[0]; + density_calculator.grid.spacing[1] = get_best_grid()->spacing[1]; + density_calculator.grid.spacing[2] = get_best_grid()->spacing[2]; + + density_calculator.d_min = get_resolution(); + density_calculator.initialize_grid(); + density_calculator.add_model_density_to_grid(structure.models[0]); + density_calculator.grid.symmetrize_sum(); + auto x = density_calculator.grid; + return std::move(x); +} + 
void Sails::XtalDensity::recalculate_map(gemmi::Structure &structure) { std::vector atoms; form_atom_list(structure, atoms); diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 08fff3c..f0dff81 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -237,7 +237,7 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu }; } -Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structure, gemmi::Grid<>& grid, int cycles, +Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structure, gemmi::Grid<>& grid, float resolution, int cycles, std::string &resource_dir, bool strict, bool verbose) { @@ -251,7 +251,7 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru Sails::Topology topology = {&structure, residue_database}; Sails::SNFG snfg = Sails::SNFG(&structure, &residue_database); - auto density = Sails::EMDensity(grid); + auto density = Sails::EMDensity(grid, resolution); structure.cell = density.get_mtz()->cell; structure.spacegroup_hm = density.get_mtz()->spacegroup_name; @@ -380,26 +380,43 @@ Sails::Output glycosylate_site(gemmi::Structure &structure, Sails::MTZ &sails_mt // EM FUNCTIONS -Sails::Output n_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, int cycles, std::string &resource_dir, +Sails::Output n_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, float resolution, int cycles, std::string &resource_dir, bool verbose) { auto glycosites = Sails::find_n_glycosylation_sites(structure); - return run_em_cycle(glycosites, structure, grid, cycles, resource_dir, false, verbose); + return run_em_cycle(glycosites, structure, grid, resolution, cycles, resource_dir, false, verbose); } -Sails::Output c_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, int cycles, std::string &resource_dir, +Sails::Output c_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, float resolution, int 
cycles, std::string &resource_dir, bool verbose) { auto glycosites = Sails::find_c_glycosylation_sites(structure); - return run_em_cycle(glycosites, structure, grid, cycles, resource_dir, false, verbose); + return run_em_cycle(glycosites, structure, grid, resolution, cycles, resource_dir, false, verbose); } -Sails::Output o_mannosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, int cycles, std::string &resource_dir, +Sails::Output o_mannosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, float resolution, int cycles, std::string &resource_dir, bool verbose) { Sails::SolventAccessibility sa = Sails::SolventAccessibility(&structure); Sails::SolventAccessibility::SolventAccessibilityMap sa_map = sa.calculate_solvent_accessibility(); auto glycosites = Sails::find_o_mannosylation_sites(structure, sa_map); - return run_em_cycle(glycosites, structure, grid, cycles, resource_dir, true, verbose); + return run_em_cycle(glycosites, structure, grid, resolution, cycles, resource_dir, true, verbose); } +Sails::Output auto_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, float resolution, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, int cycles, std::string &resource_dir, + bool verbose) { + Sails::Glycosites glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, true, resource_dir); + return run_em_cycle(glycosites, structure, grid, resolution, cycles, resource_dir, false, verbose); +} + +Sails::Output glycosylate_site(gemmi::Structure &structure, gemmi::Grid<>& grid, float resolution, std::string& chain, int seqid, int cycles, std::string &resource_dir, + bool verbose) { + std::optional potential_site = Sails::find_site(structure, chain, seqid); + if (!potential_site.has_value()) { + throw std::runtime_error("Site could not be found"); + } + Sails::Glycosites glycosites = {potential_site.value()}; + return run_em_cycle(glycosites, structure, grid, resolution, cycles, resource_dir, false, verbose); +} + + //SNFG FUNCTIONS 
@@ -638,6 +655,94 @@ Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool }; } +Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float resolution, bool remove, float threshold, std::string& resource_dir) { + std::string data_file = resource_dir + "/data.json"; + Sails::JSONLoader loader = {data_file}; + Sails::ResidueDatabase residue_database = loader.load_residue_database(); + Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); + + auto density = Sails::EMDensity(grid, resolution); + + gemmi::Grid<> calculated_density = density.calculate_density_for_structure(structure); + + gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure.models[0], structure.cell, 2); + ns.populate(); + + gemmi::Grid<> best_grid = *density.get_best_grid(); + std::map>> residue_pairs; + + for (auto point: best_grid) { + gemmi::Position position = best_grid.point_to_position(point); + auto mark = ns.find_nearest_atom(position, 2); + if (mark == nullptr) continue; + + auto site = Sails::Glycosite(0, mark->chain_idx, mark->residue_idx, 0); + const gemmi::Residue* residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues[site.residue_idx]; + if (residue_database.count(residue_ptr->name) == 0) continue; + const Sails::ResidueData& residue = residue_database.at(residue_ptr->name); + if (!residue.is_sugar) continue; + + double obs = *point.value; + double calc = calculated_density.interpolate_value(position); + residue_pairs[site].emplace_back(obs, calc); + + } + std::map rsccs; + + for (const auto& [site, data]: residue_pairs) { + auto [obs_values, calc_values] = Sails::Utils::split_pairs(data); + if (obs_values.empty() || calc_values.empty()) continue; + + rsccs[site] = Sails::Density::calculate_rscc(obs_values, calc_values); + } + + std::vector to_remove = {}; + std::vector log = {}; + + for (auto& [site, rscc]: rsccs) { + std::string residue_key = Sails::Utils::format_residue_from_site(site, 
&structure); + log.emplace_back(residue_key, rscc); + if (rscc > threshold) { + continue; + } + to_remove.emplace_back(site); + } + + if (remove) { + Sails::Topology topology = {&structure, residue_database}; + + std::set removal_set = {to_remove.begin(), to_remove.end()}; + + for (auto &site: to_remove) { + auto glycan = topology.find_glycan_topology(site); + std::vector downstream_sugars = glycan.get_downstream_sugars(site); + for (auto& downstream_sugar: downstream_sugars) { + if (std::find(removal_set.begin(), removal_set.end(), downstream_sugar->site) != removal_set.end()) continue; + downstream_sugar->site.atom_idx = 0; // remove atom site from site to allow sorting + removal_set.insert(downstream_sugar->site); + } + } + + std::vector removal_list = {removal_set.begin(), removal_set.end()}; + + std::sort(removal_list.begin(), removal_list.end(), [](const Sails::Glycosite& a, const Sails::Glycosite& b) { + return !(a < b); + }); + + for (auto &site: removal_list) { + const auto residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues; + residue_ptr->erase(residue_ptr->begin() + site.residue_idx); + } + } + + + std::string log_string = Sails::Telemetry::format_log(log, false, "").value(); + return { + structure, + log_string + }; +} + // gemmi::Structure wurcs(gemmi::Structure& structure, std::string chain, int seqid, std::string& resource_dir) { // std::string data_file = resource_dir + "/data.json"; @@ -699,5 +804,5 @@ int main() { std::string data_file = "package/src/sails/data/"; auto glycosites = Sails::find_n_glycosylation_sites(structure); - run_em_cycle(glycosites, structure, map.grid, 1, data_file, false, true); + // run_em_cycle(glycosites, structure, map.grid, 1, data_file, false, true); } diff --git a/package/src/include/density/sails-density.h b/package/src/include/density/sails-density.h index b93ecd3..3601a24 100644 --- a/package/src/include/density/sails-density.h +++ b/package/src/include/density/sails-density.h @@ -46,6 
+46,46 @@ namespace Sails { [[nodiscard]] virtual const DensityScoreMethod get_score_method() const = 0; + /** + * @brief Calculates the density for a given box based on a gemmi::Residue object. + * + * This method takes a gemmi::Residue object and calculates the density for the specified box + * using the gemmi::DensityCalculator class. The density calculation is performed using the + * density score method specified in the constructor of the gemmi::DensityCalculator. + * + * @param residue The gemmi::Residue object for which the density is calculated. + * @param box + * + * @return The calculated density grid for the specified box. + */ + virtual gemmi::Grid<> calculate_density_for_box(gemmi::Residue &residue, gemmi::Box &box) const = 0; + + /** + * @brief Calculates the density for a gemmi::Residue object. + * + * This method takes a gemmi::Residue object and calculates the density + * using the gemmi::DensityCalculator class. The density calculation is performed using the + * density score method specified in the constructor of the gemmi::DensityCalculator. + * + * @param residue The gemmi::Residue object for which the density is calculated. + * + * @return The calculated density grid for the specified box. + */ + virtual gemmi::Grid<> calculate_density_for_grid(gemmi::Residue &residue) const = 0; + + /** + * @brief Calculates the density for a gemmi::Residue object. + * + * This method takes a gemmi::Residue object and calculates the density + * using the gemmi::DensityCalculator class. The density calculation is performed using the + * density score method specified in the constructor of the gemmi::DensityCalculator. + * + * @param residue The gemmi::Residue object for which the density is calculated. + * + * @return The calculated density grid for the specified box. + */ + virtual gemmi::Grid<> calculate_density_for_structure(gemmi::Structure &structure) const = 0; + /** * @brief Scores a residue based on the specified density score method. 
* @@ -83,45 +123,7 @@ namespace Sails { */ [[nodiscard]] float atomwise_score(const gemmi::Residue &residue) const; - /** - * @brief Calculates the density for a given box based on a gemmi::Residue object. - * - * This method takes a gemmi::Residue object and calculates the density for the specified box - * using the gemmi::DensityCalculator class. The density calculation is performed using the - * density score method specified in the constructor of the gemmi::DensityCalculator. - * - * @param residue The gemmi::Residue object for which the density is calculated. - * @param box - * - * @return The calculated density grid for the specified box. - */ - gemmi::Grid<> calculate_density_for_box(gemmi::Residue &residue, gemmi::Box &box) const; - - /** - * @brief Calculates the density for a gemmi::Residue object. - * - * This method takes a gemmi::Residue object and calculates the density - * using the gemmi::DensityCalculator class. The density calculation is performed using the - * density score method specified in the constructor of the gemmi::DensityCalculator. - * - * @param residue The gemmi::Residue object for which the density is calculated. - * - * @return The calculated density grid for the specified box. - */ - gemmi::Grid<> calculate_density_for_grid(gemmi::Residue &residue) const; - /** - * @brief Calculates the density for a gemmi::Residue object. - * - * This method takes a gemmi::Residue object and calculates the density - * using the gemmi::DensityCalculator class. The density calculation is performed using the - * density score method specified in the constructor of the gemmi::DensityCalculator. - * - * @param residue The gemmi::Residue object for which the density is calculated. - * - * @return The calculated density grid for the specified box. - */ - gemmi::Grid<> calculate_density_for_structure(gemmi::Structure &structure) const; /** * @brief Calculates the RSCC (Real Space Correlation Coefficient) score for a given residue. 
diff --git a/package/src/include/density/sails-em-density.h b/package/src/include/density/sails-em-density.h index d51e1ce..ea354ba 100644 --- a/package/src/include/density/sails-em-density.h +++ b/package/src/include/density/sails-em-density.h @@ -5,9 +5,9 @@ #include "sails-density.h" namespace Sails { - class EMDensity : public Density { + class EMDensity : public Density{ public: - explicit EMDensity(gemmi::Grid<> &grid); + explicit EMDensity(gemmi::Grid<> &grid, float resolution); [[nodiscard]] const gemmi::Mtz *get_mtz() const override { return &m_mtz; } @@ -17,7 +17,7 @@ namespace Sails { [[nodiscard]] const gemmi::Grid<> *get_difference_grid() const override { return &m_grid; } - [[nodiscard]] const double get_resolution() const override { return 2.0; } + [[nodiscard]] const double get_resolution() const override { return m_resolution; } [[nodiscard]] const DensityScoreMethod get_score_method() const override { return score_method; } @@ -25,6 +25,13 @@ namespace Sails { return &calculated_maps; } + gemmi::Grid<> calculate_density_for_box(gemmi::Residue &residue, gemmi::Box &box) const override; + + gemmi::Grid<> calculate_density_for_grid(gemmi::Residue &residue) const override; + + gemmi::Grid<> calculate_density_for_structure(gemmi::Structure &structure) const override; + + private: /** * Best map @@ -46,6 +53,8 @@ namespace Sails { */ gemmi::Mtz m_mtz; + float m_resolution; + /** * Fc maps for residues in standard positions - used for fast RSCC calculations */ diff --git a/package/src/include/density/sails-xtal-density.h b/package/src/include/density/sails-xtal-density.h index 38c5e61..9a7ccd0 100644 --- a/package/src/include/density/sails-xtal-density.h +++ b/package/src/include/density/sails-xtal-density.h @@ -24,7 +24,7 @@ namespace Sails { [[nodiscard]] const gemmi::Grid<> *get_difference_grid() const override { return &m_difference_grid; } - [[nodiscard]] const double get_resolution() const override { return 2.0; } + [[nodiscard]] const double 
get_resolution() const override { return m_mtz.resolution_high(); } [[nodiscard]] const DensityScoreMethod get_score_method() const override { return score_method;} @@ -32,6 +32,12 @@ namespace Sails { return &calculated_maps; } + gemmi::Grid<> calculate_density_for_box(gemmi::Residue &residue, gemmi::Box &box) const override; + + gemmi::Grid<> calculate_density_for_grid(gemmi::Residue &residue) const override; + + gemmi::Grid<> calculate_density_for_structure(gemmi::Structure &structure) const override; + /** * @brief Recalculates the map based on the given structure. * diff --git a/package/src/sails/glycosylate.py b/package/src/sails/glycosylate.py index 09a07aa..768b580 100644 --- a/package/src/sails/glycosylate.py +++ b/package/src/sails/glycosylate.py @@ -161,6 +161,10 @@ def glycosylate_xtal( def glycosylate_em( structure: gemmi.Structure | Path | str, map: gemmi.Ccp4Map | gemmi.FloatGrid | Path | str, + preddirin: Path | str, + resolution: float, + chain: str, + seqid: int | str, cycles: int, type: Type = Type.n_glycosylate, verbose: bool = False, @@ -169,8 +173,51 @@ def glycosylate_em( sails_grid = interface.get_sails_map(map) resource = importlib.resources.files("sails").joinpath("data") - func = map_type_to_function(type) - result = func(sails_structure, sails_grid, cycles, str(resource), verbose) + if chain and seqid: + result = glycosylate_site( + sails_structure, + sails_grid, + resolution, + chain, + int(seqid), + cycles, + str(resource), + verbose, + ) + return ( + interface.extract_sails_structure(result.structure), + json.loads(result.log), + result.snfgs, + ) + + if type == Type.auto: + if preddirin: + predictions = read_prediction_dir( + preddirin, model_type=ModelType.multiclass + ) + else: + predictions = predict_map( + "multiclass", map, "output", nthreads=8, save_map=True + ) + glycan, protein = predictions + sails_glycan = interface.get_sails_map(glycan) + sails_protein = interface.get_sails_map(protein) + + result = auto_glycosylate( + 
sails_structure, + sails_grid, + resolution, + sails_glycan, + sails_protein, + cycles, + str(resource), + verbose, + ) + else: + func = map_type_to_function(type) + result = func( + sails_structure, sails_grid, resolution, cycles, str(resource), verbose + ) return ( interface.extract_sails_structure(result.structure), @@ -264,9 +311,19 @@ def xray(args): def em(args): - cycles = args.cycles if args.type == Type.n_glycosylate else 1 + cycles = ( + args.cycles if args.type == Type.n_glycosylate or args.type == Type.auto else 1 + ) structure, log, snfgs = glycosylate_em( - args.modelin, args.mapin, cycles, args.type, args.v + args.modelin, + args.mapin, + args.preddirin, + args.resolution, + args.chain, + args.seqid, + cycles, + args.type, + args.v, ) structure.make_mmcif_block().write_file(args.modelout) save_log(log, args) @@ -329,5 +386,6 @@ def parse_args(): em_parser = subparsers.add_parser("em", parents=[parent], formatter_class=formatter) em_parser_group = em_parser.add_argument_group("Required arguments in EM mode") em_parser_group.add_argument("--mapin", type=str, required=True) + em_parser_group.add_argument("--resolution", type=float, required=True) return parser.parse_args() diff --git a/package/src/sails/validate.py b/package/src/sails/validate.py index 469d39f..db918ec 100644 --- a/package/src/sails/validate.py +++ b/package/src/sails/validate.py @@ -1,5 +1,6 @@ import argparse import json +import time from .__version__ import __version__ import importlib @@ -48,9 +49,7 @@ def parse_args(): return parser.parse_args() -def run(): - args = parse_args() - +def xray(args): sails_structure = interface.get_sails_structure(args.modelin) resource = importlib.resources.files("sails").joinpath("data") @@ -70,3 +69,38 @@ def run(): with open(args.logout, "w") as f: json.dump(log, f, indent=4) + + +def em(args): + sails_structure = interface.get_sails_structure(args.modelin) + sails_grid = interface.get_sails_map(args.mapin) + resource = 
importlib.resources.files("sails").joinpath("data") + + result = validate( + sails_structure, sails_grid, args.remove, args.threshold, str(resource) + ) + + structure = interface.extract_sails_structure(result.structure) + structure.make_mmcif_block().write_file(args.modelout) + log = json.loads(result.log) + + if args.print: + print(json.dumps(log, indent=4)) + + with open(args.logout, "w") as f: + json.dump(log, f, indent=4) + + +def run(): + t0 = time.time() + args = parse_args() + + if args.mode == "xray": + xray(args) + elif args.mode == "em": + em(args) + else: + raise RuntimeError("Unknown mode") + + t1 = time.time() + print(f"Sails Validate - Time Taken = {(t1 - t0)} seconds") From d308346fc73dda934ac93e3ca7445dfdec406dd3 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Wed, 22 Oct 2025 08:50:08 +0100 Subject: [PATCH 24/56] Added global RSCC score in per cycle removal --- package/CMakeLists.txt | 1 + package/src/cpp/sails-score.cpp | 45 +++++++++++++++ package/src/cpp/sails.cpp | 96 +++++++------------------------ package/src/include/sails-score.h | 16 ++++++ package/src/sails/validate.py | 8 ++- 5 files changed, 89 insertions(+), 77 deletions(-) create mode 100644 package/src/cpp/sails-score.cpp create mode 100644 package/src/include/sails-score.h diff --git a/package/CMakeLists.txt b/package/CMakeLists.txt index 59387da..4597f1d 100644 --- a/package/CMakeLists.txt +++ b/package/CMakeLists.txt @@ -103,6 +103,7 @@ add_library( ${WRK_DIR}/src/cpp/sails-wurcs.cpp ${WRK_DIR}/src/cpp/sails-predictions.cpp ${WRK_DIR}/src/cpp/sails-morph.cpp + ${WRK_DIR}/src/cpp/sails-score.cpp # Density ${WRK_DIR}/src/cpp/density/sails-density.cpp diff --git a/package/src/cpp/sails-score.cpp b/package/src/cpp/sails-score.cpp new file mode 100644 index 0000000..8ffe3e5 --- /dev/null +++ b/package/src/cpp/sails-score.cpp @@ -0,0 +1,45 @@ +// +// Created by Jordan Dialpuri on 22/10/2025. 
+// + + +#include "../include/sails-score.h" + +#include "src/include/sails-utils.h" + +std::map Sails::Score::calculate_rsccs(Density *density, gemmi::Structure *structure, ResidueDatabase &residue_database) { + gemmi::Grid<> calculated_density = density->calculate_density_for_structure(*structure); + + constexpr double radius = 2; + auto ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius); + ns.populate(); + + gemmi::Grid<> best_grid = *density->get_best_grid(); + std::map>> residue_pairs; + + for (auto point: best_grid) { + gemmi::Position position = best_grid.point_to_position(point); + auto mark = ns.find_nearest_atom(position, radius); + if (mark == nullptr) continue; + + auto site = Glycosite(0, mark->chain_idx, mark->residue_idx, 0); + const gemmi::Residue* residue_ptr = &structure->models[site.model_idx].chains[site.chain_idx].residues[site.residue_idx]; + if (residue_database.count(residue_ptr->name) == 0) continue; + const ResidueData& residue = residue_database.at(residue_ptr->name); + if (!residue.is_sugar) continue; + + double obs = *point.value; + double calc = calculated_density.interpolate_value(position); + residue_pairs[site].emplace_back(obs, calc); + + } + std::map rsccs; + + for (const auto& [site, data]: residue_pairs) { + auto [obs_values, calc_values] = Sails::Utils::split_pairs(data); + if (obs_values.empty() || calc_values.empty()) continue; + + rsccs[site] = Sails::Density::calculate_rscc(obs_values, calc_values); + } + return rsccs; +} diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index f0dff81..08aafe5 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -27,6 +27,7 @@ #include #include "src/include/sails-predictions.h" +#include "src/include/sails-score.h" void print_rejection_dds(const Sails::Glycosite& s1, const Sails::Glycosite& s2, gemmi::Structure* structure) { @@ -47,11 +48,12 @@ void print_dds(const Sails::Glycosite &site, float dds, gemmi::Structure *struct } void 
remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *density, Sails::Glycan *glycan, bool strict, - bool debug) { + bool debug, Sails::ResidueDatabase &residue_database) { const float rscc_threshold = strict ? 0.65: 0.5; const float dds_threshold = strict ? 1.0: 1.1; - const std::pair difference_density_stats = density->calculate_map_statistics(density->get_difference_grid()); + // const std::pair difference_density_stats = density->calculate_map_statistics(density->get_difference_grid()); + std::map rsccs = Sails::Score::calculate_rsccs(density, structure, residue_database); std::vector to_remove; for (const auto &[fst, snd]: *glycan) { @@ -63,16 +65,19 @@ void remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *densit gemmi::Residue previous_residue = Sails::Utils::get_residue_from_glycosite( sugar_result.value()->site, structure); - // if (residue.name == "ASN") { continue; } // don't remove ASN - // if (residue.name == "TRP") { continue; } // don't remove TRP + snd->site.atom_idx = 0; // set atom index to 0 so can be used in comparisons on the residue level // remove cases with low rscc - const float rscc = density->rscc_score(residue); - print_rscc(snd->site, rscc, structure); - if (rscc < rscc_threshold) { - to_remove.emplace_back(snd.get()); // add pointer to - if (debug) print_removal_rscc(snd->site, rscc, structure); - continue; + if (rsccs.count(snd->site) != 0) { + const double rscc = rsccs.at(snd->site); + print_rscc(snd->site, rscc, structure); + if (rscc < rscc_threshold) { + to_remove.emplace_back(snd.get()); // add pointer to remove + if (debug) print_removal_rscc(snd->site, rscc, structure); + } + } else { + std::cout << Sails::Utils::format_site_key(fst) << " | " << Sails::Utils::format_site_key(snd->site) << std::endl; + throw std::runtime_error("Glycosite was not found in the RSCC calculation" + Sails::Utils::format_residue_from_site(snd->site, structure)); } // remove cases with high difference density score @@ 
-200,7 +205,7 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu // std::cout << "Attempting removal at " << Sails::Utils::format_residue_from_site(glycosite, &structure) << std::endl; Sails::Glycan old_glycan = glycan; - remove_erroneous_sugars(&structure, &density, &glycan, strict, verbose); + remove_erroneous_sugars(&structure, &density, &glycan, strict, verbose, residue_database); topology.set_structure(&structure); // need to update neighbor search after removing n residues Sails::Glycan new_glycan = topology.find_glycan_topology(glycosite); @@ -286,7 +291,7 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru // std::cout << "Attempting removal at " << Sails::Utils::format_residue_from_site(glycosite, &structure) << std::endl; Sails::Glycan old_glycan = glycan; - remove_erroneous_sugars(&structure, &density, &glycan, strict, verbose); + remove_erroneous_sugars(&structure, &density, &glycan, strict, verbose, residue_database); topology.set_structure(&structure); // need to update neighbor search after removing n residues Sails::Glycan new_glycan = topology.find_glycan_topology(glycosite); @@ -306,6 +311,7 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru // add links and write files std::vector links = generate_link_records(&structure, &glycosites, &topology); + Sails::add_links_to_structure(model.get_structure(), links); std::string log_string = telemetry.format_log(&structure, &density, false).value(); Sails::Telemetry::SNFGCycleData snfgs = telemetry.get_snfgs(); @@ -575,38 +581,7 @@ Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool auto density = Sails::XtalDensity(mtz); density.load_map_coefficients(); - gemmi::Grid<> calculated_density = density.calculate_density_for_structure(structure); - - gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure.models[0], structure.cell, 2); - ns.populate(); - - gemmi::Grid<> best_grid = 
*density.get_best_grid(); - std::map>> residue_pairs; - - for (auto point: best_grid) { - gemmi::Position position = best_grid.point_to_position(point); - auto mark = ns.find_nearest_atom(position, 2); - if (mark == nullptr) continue; - - auto site = Sails::Glycosite(0, mark->chain_idx, mark->residue_idx, 0); - const gemmi::Residue* residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues[site.residue_idx]; - if (residue_database.count(residue_ptr->name) == 0) continue; - const Sails::ResidueData& residue = residue_database.at(residue_ptr->name); - if (!residue.is_sugar) continue; - - double obs = *point.value; - double calc = calculated_density.interpolate_value(position); - residue_pairs[site].emplace_back(obs, calc); - - } - std::map rsccs; - - for (const auto& [site, data]: residue_pairs) { - auto [obs_values, calc_values] = Sails::Utils::split_pairs(data); - if (obs_values.empty() || calc_values.empty()) continue; - - rsccs[site] = Sails::Density::calculate_rscc(obs_values, calc_values); - } + std::map rsccs = Sails::Score::calculate_rsccs(&density, &structure, residue_database); std::vector to_remove = {}; std::vector log = {}; @@ -663,38 +638,7 @@ Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float r auto density = Sails::EMDensity(grid, resolution); - gemmi::Grid<> calculated_density = density.calculate_density_for_structure(structure); - - gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure.models[0], structure.cell, 2); - ns.populate(); - - gemmi::Grid<> best_grid = *density.get_best_grid(); - std::map>> residue_pairs; - - for (auto point: best_grid) { - gemmi::Position position = best_grid.point_to_position(point); - auto mark = ns.find_nearest_atom(position, 2); - if (mark == nullptr) continue; - - auto site = Sails::Glycosite(0, mark->chain_idx, mark->residue_idx, 0); - const gemmi::Residue* residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues[site.residue_idx]; - if 
(residue_database.count(residue_ptr->name) == 0) continue; - const Sails::ResidueData& residue = residue_database.at(residue_ptr->name); - if (!residue.is_sugar) continue; - - double obs = *point.value; - double calc = calculated_density.interpolate_value(position); - residue_pairs[site].emplace_back(obs, calc); - - } - std::map rsccs; - - for (const auto& [site, data]: residue_pairs) { - auto [obs_values, calc_values] = Sails::Utils::split_pairs(data); - if (obs_values.empty() || calc_values.empty()) continue; - - rsccs[site] = Sails::Density::calculate_rscc(obs_values, calc_values); - } + std::map rsccs = Sails::Score::calculate_rsccs(&density, &structure, residue_database); std::vector to_remove = {}; std::vector log = {}; diff --git a/package/src/include/sails-score.h b/package/src/include/sails-score.h new file mode 100644 index 0000000..39b8d42 --- /dev/null +++ b/package/src/include/sails-score.h @@ -0,0 +1,16 @@ +// +// Created by Jordan Dialpuri on 22/10/2025. +// + +#ifndef SAILS_SCORE_H +#define SAILS_SCORE_H +#include "sails-model.h" +#include "density/sails-density.h" + +namespace Sails::Score { + + std::map calculate_rsccs(Sails::Density* density, gemmi::Structure* structure, ResidueDatabase &residue_database); + +} + +#endif //SAILS_SCORE_H diff --git a/package/src/sails/validate.py b/package/src/sails/validate.py index db918ec..1b3eb95 100644 --- a/package/src/sails/validate.py +++ b/package/src/sails/validate.py @@ -45,6 +45,7 @@ def parse_args(): em_parser = subparsers.add_parser("em", parents=[parent], formatter_class=formatter) em_parser_group = em_parser.add_argument_group("Required arguments in EM mode") em_parser_group.add_argument("--mapin", type=str, required=True) + em_parser_group.add_argument("--resolution", type=float, required=True) return parser.parse_args() @@ -77,7 +78,12 @@ def em(args): resource = importlib.resources.files("sails").joinpath("data") result = validate( - sails_structure, sails_grid, args.remove, args.threshold, 
str(resource) + sails_structure, + sails_grid, + args.resolution, + args.remove, + args.threshold, + str(resource), ) structure = interface.extract_sails_structure(result.structure) From 72a7c95fa48a7abfc6cfbd8d801cadb5bc434604 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Wed, 22 Oct 2025 08:50:17 +0100 Subject: [PATCH 25/56] Updated compare script --- package/scripts/compare_structures.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/package/scripts/compare_structures.py b/package/scripts/compare_structures.py index 68e2382..8cc6cd6 100644 --- a/package/scripts/compare_structures.py +++ b/package/scripts/compare_structures.py @@ -2,6 +2,7 @@ import gemmi import json import numpy as np +from pprint import pprint def load_data_file(filename): @@ -20,11 +21,11 @@ def format_residue(chain: gemmi.Chain, residue: gemmi.Residue): def main(args): - data = load_data_file("package/data/data.json") + data = load_data_file("package/src/sails/data/data.json") structure = gemmi.read_structure(args.model) reference = gemmi.read_structure(args.reference) - ns = gemmi.NeighborSearch(structure, max_radius=1).populate() + ns = gemmi.NeighborSearch(structure, max_radius=1.5).populate() output = {} @@ -84,6 +85,8 @@ def main(args): percentage_modelled = 100 * modelled / total_sugars print(f"Percentage Modelled {percentage_modelled:.2f}") + pprint(output) + if __name__ == "__main__": parser = argparse.ArgumentParser() From a06f03f9edb4d1af2563698ef121d14e9f4a5601 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 23 Oct 2025 09:43:50 +0100 Subject: [PATCH 26/56] Updated protein finding to use glycan prediction too --- package/src/cpp/sails-predictions.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/package/src/cpp/sails-predictions.cpp b/package/src/cpp/sails-predictions.cpp index 78e0825..88f5cde 100644 --- 
a/package/src/cpp/sails-predictions.cpp +++ b/package/src/cpp/sails-predictions.cpp @@ -108,7 +108,14 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein(gemmi:: if (!ns_optional.has_value()) { return potential_sites; } + + std::optional glycan_ns_optional = create_neighbour_search(m_glycan_map, 0.1, structure.cell); + if (!glycan_ns_optional.has_value()) { + return potential_sites; + } + gemmi::NeighborSearch ns = ns_optional.value(); + gemmi::NeighborSearch glycan_ns = glycan_ns_optional.value(); for (int m = 0; m < structure.models.size(); m++) { for (int c = 0; c < structure.models[m].chains.size(); c++) { @@ -131,8 +138,9 @@ Sails::Glycosites Sails::Predictions::find_potential_sites_using_protein(gemmi:: continue; } auto nearby_points = ns.find_atoms(last_donor_atom->pos, '*', 0.1, 1); + auto nearby_glycan_points = glycan_ns.find_atoms(last_donor_atom->pos, '*', 0, 2); - if (nearby_points.empty()) { + if (nearby_points.empty() || nearby_glycan_points.empty()) { continue; } From bb1846d6e7004fab8c936bb17e259f22dae27012 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 23 Oct 2025 12:12:16 +0100 Subject: [PATCH 27/56] Added better clash score which counts all but donor atom clash --- package/src/cpp/sails-linkage.cpp | 33 ++++++++++++++++++++++++----- package/src/include/sails-linkage.h | 3 ++- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index 75f1a68..af2f327 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -271,12 +271,34 @@ Sails::Model::ChainType Sails::Model::find_chain_type(std::vector sugar return result ? 
non_protein : protein; } -double Sails::Model::calculate_clash_score(const SuperpositionResult &result) const { - constexpr double radius = 1; - gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius).populate(); +double Sails::Model::calculate_clash_score(const SuperpositionResult &result, gemmi::Atom *donor_atom) const { + constexpr double radius = 1.5; + gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius); + + for (auto & model : structure->models) { + for (int c = 0; c < model.chains.size(); c++) { + for (int r = 0; r < model.chains[c].residues.size(); r++) { + const gemmi::Residue* residue_ptr = &model.chains[c].residues[r]; + gemmi::ResidueInfo residue_info = gemmi::find_tabulated_residue(residue_ptr->name); + if (residue_info.is_amino_acid() || residue_database.count(residue_ptr->name) > 0 ) { + for (int a = 0; a < model.chains[c].residues[r].atoms.size(); a++) { + gemmi::Atom* current_atom_ptr = &model.chains[c].residues[r].atoms[a]; + if (donor_atom != current_atom_ptr) ns.add_atom(*current_atom_ptr, c, r, a); + } + } + } + } + } + + // ns.populate(); + + double clash_score = 0; for (auto &atom: result.new_residue.atoms) { auto nearest_atoms = ns.find_atoms(atom.pos, '\0', 0, radius); + // for (auto& x: nearest_atoms) { + // std::cout << "Clash between atom " << atom.name << " " << Utils::format_residue_from_site(Glycosite(*x), structure) << x->to_cra(structure->models[0]).atom->name << std::endl; + // } clash_score += static_cast(nearest_atoms.size()); } return clash_score; @@ -357,7 +379,8 @@ std::optional Sails::Model::add_residue( } // calculate clash score - double clash_score = calculate_clash_score(result); + double clash_score = calculate_clash_score(result, atoms[2]); + // std::cout << std::endl << clash_score << std::endl; if (clash_score > 1) { continue; } @@ -433,7 +456,7 @@ std::optional Sails::Model::add_residue(gemmi::Resid SuperpositionResult result = 
{new_monomer, superpose_result, reference_library_monomer}; // calculate clash score - double clash_score = calculate_clash_score(result); + double clash_score = calculate_clash_score(result, &atoms[2]); if (clash_score < best_clash) { best_clash = clash_score; best_result = std::move(result); diff --git a/package/src/include/sails-linkage.h b/package/src/include/sails-linkage.h index 9035f1a..84e6c50 100644 --- a/package/src/include/sails-linkage.h +++ b/package/src/include/sails-linkage.h @@ -222,9 +222,10 @@ namespace Sails { * SuperpositionResult. Nearby atoms are found using a NeighborSearch with a given radius. * * @param result The SuperpositionResult from which to calculate the clash score. + * @param donor_atom * @return The calculated clash score. */ - [[nodiscard]] double calculate_clash_score(const SuperpositionResult &result) const; + [[nodiscard]] double calculate_clash_score(const SuperpositionResult &result, gemmi::Atom *donor_atom) const; /** From 10ce8f07d9f511e737e68193fba9cb02cace02d0 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 23 Oct 2025 13:47:50 +0100 Subject: [PATCH 28/56] Added sequential sites to autoglycosylate Updated chain names selection function --- package/src/cpp/sails-linkage.cpp | 32 ++++++++++++++++++++++++------- package/src/cpp/sails.cpp | 13 ++++++++++++- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index af2f327..4b2fd0b 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -207,15 +207,33 @@ void Sails::Model::add_sugar_to_structure(const Sugar *terminal_sugar, Superposi int chain_idx = terminal_sugar->site.chain_idx; if (chain_type == protein) { - const size_t last_chain_idx = structure->models[terminal_sugar->site.model_idx].chains.size(); - chain_idx = static_cast(last_chain_idx); - gemmi::Chain chain = gemmi::Chain(""); - chain.name = 
Utils::get_next_string( - structure->models[terminal_sugar->site.model_idx].chains[last_chain_idx - 1].name); - structure->models[terminal_sugar->site.model_idx].chains.emplace_back(chain); + gemmi::Model* model = &structure->models[terminal_sugar->site.model_idx]; + const std::vector* chains = &model->chains; + + if (chains->empty()) { + throw std::runtime_error("No existing chains found in the model. Is it empty?"); + } + + const auto max_it = std::max_element(chains->begin(), chains->end(), + [](const gemmi::Chain& a, const gemmi::Chain& b) { + return a.name < b.name; + }); + + auto new_chain = gemmi::Chain(""); + new_chain.name = Utils::get_next_string(max_it->name); + + model->chains.emplace_back(std::move(new_chain)); + chain_idx = static_cast(model->chains.size() - 1); + + // const size_t last_chain_idx = structure->models[terminal_sugar->site.model_idx].chains.size(); + // chain_idx = static_cast(last_chain_idx); + // gemmi::Chain chain = gemmi::Chain(""); + // chain.name = Utils::get_next_string( + // structure->models[terminal_sugar->site.model_idx].chains[last_chain_idx - 1].name); + // structure->models[terminal_sugar->site.model_idx].chains.emplace_back(chain); } - auto all_residues = &structure->models[terminal_sugar->site.model_idx].chains[chain_idx].residues; + const auto all_residues = &structure->models[terminal_sugar->site.model_idx].chains[chain_idx].residues; favoured_addition.new_residue.seqid = gemmi::SeqId(static_cast(all_residues->size()) + 1, '?'); all_residues->insert(all_residues->end(), std::move(favoured_addition.new_residue)); } diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 08aafe5..0d4305c 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -369,7 +369,18 @@ Sails::Output o_mannosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, Sails::Output auto_glycosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, int cycles, 
std::string &resource_dir, bool verbose) { - Sails::Glycosites glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, true, resource_dir); + Sails::Glycosites predicted_glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, true, resource_dir); + std::cout << "Found " << predicted_glycosites.size() << " potential sites using deep learning models" << std::endl; + Sails::Glycosites n_glycosites = Sails::find_n_glycosylation_sites(structure); + Sails::Glycosites c_glycosites = Sails::find_c_glycosylation_sites(structure); + + std::set glycosites_set = {predicted_glycosites.begin(), predicted_glycosites.end()}; + glycosites_set.insert(n_glycosites.begin(), n_glycosites.end()); + glycosites_set.insert(c_glycosites.begin(), c_glycosites.end()); + Sails::Glycosites glycosites = {glycosites_set.begin(), glycosites_set.end()}; + int diff = static_cast(glycosites.size()) - static_cast(predicted_glycosites.size()); + std::cout << "Supplemented with " << diff << " sites from the sequence" << std::endl; + return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, false, verbose); } From 051a5d743e799b56779ee83ef3d6d3401b3feee3 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 23 Oct 2025 16:32:49 +0100 Subject: [PATCH 29/56] Fixed bug with chain names not going past Z --- package/src/cpp/sails-linkage.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index 4b2fd0b..7c33062 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -216,11 +216,15 @@ void Sails::Model::add_sugar_to_structure(const Sugar *terminal_sugar, Superposi const auto max_it = std::max_element(chains->begin(), chains->end(), [](const gemmi::Chain& a, const gemmi::Chain& b) { + if (a.name.length() != b.name.length()) { + return a.name.length() < b.name.length(); + } return a.name < b.name; }); 
auto new_chain = gemmi::Chain(""); new_chain.name = Utils::get_next_string(max_it->name); + std::cout << "Last chain name is " << max_it->name << " next is " << new_chain.name << std::endl; model->chains.emplace_back(std::move(new_chain)); chain_idx = static_cast(model->chains.size() - 1); From acb981bf0cefaea6fbd586b4086c0dd61213a35c Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 23 Oct 2025 16:33:01 +0100 Subject: [PATCH 30/56] Fixed bug with chain names not going past Z --- package/src/cpp/sails-linkage.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index 7c33062..be14abd 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -224,7 +224,6 @@ void Sails::Model::add_sugar_to_structure(const Sugar *terminal_sugar, Superposi auto new_chain = gemmi::Chain(""); new_chain.name = Utils::get_next_string(max_it->name); - std::cout << "Last chain name is " << max_it->name << " next is " << new_chain.name << std::endl; model->chains.emplace_back(std::move(new_chain)); chain_idx = static_cast(model->chains.size() - 1); From 9ccb3ea963a683b4babfbab8f03719c59e6e0221 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 23 Oct 2025 19:08:17 +0100 Subject: [PATCH 31/56] Added functions to remove free sugars --- package/src/cpp/sails-linkage.cpp | 35 +++++++++++++++++++++++++++++ package/src/cpp/sails.cpp | 11 +++++++++ package/src/include/sails-glycan.h | 16 +++++++++++++ package/src/include/sails-linkage.h | 6 +++++ 4 files changed, 68 insertions(+) diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index be14abd..6539b3c 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -49,6 +49,41 @@ void Sails::Model::standardise_residue_names() const { } } +std::set Sails::Model::get_all_glycosites() const 
{ + std::set sites = {}; + for (auto & model : structure->models) { + for (int c = 0; c < model.chains.size(); c++) { + for (int r = 0; r < model.chains[c].residues.size(); r++) { + const gemmi::Residue* residue_ptr = &model.chains[c].residues[r]; + if (residue_database.count(residue_ptr->name) > 0) { + ResidueData residue_data = residue_database.at(residue_ptr->name); + if (!residue_data.is_sugar) continue; + Glycosite site = {0, c, r, 0}; + sites.insert(site); + } + } + } + } + return sites; +} + +void Sails::Model::remove_free_sites(std::set &all_sites) const { + std::set all_sites_in_model = get_all_glycosites(); + std::vector free_sites; + std::set_difference(all_sites_in_model.begin(), all_sites_in_model.end(), + all_sites.begin(), all_sites.end(), + std::back_inserter(free_sites)); + + std::sort(free_sites.begin(), free_sites.end(), [](const Sails::Glycosite& a, const Sails::Glycosite& b) { + return !(a < b); + }); + + for (const auto& site: free_sites) { + const auto residues = &structure->models[site.model_idx].chains[site.chain_idx].residues; + residues->erase(residues->begin() + site.residue_idx); + } +} + // UTILITY FUNCTIONS std::optional Sails::Model::get_monomer(const std::string &monomer, bool remove_h) { diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 0d4305c..3abc484 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -226,6 +226,17 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu model.standardise_residue_names(); + // find and remove any free sugars (likely due to something going wrong) + std::set all_sites = {}; + for (auto &glycosite: glycosites) { + Sails::Glycan glycan = topology.find_glycan_topology(glycosite); + auto sites = glycan.get_sites(); + all_sites.insert(sites.begin(), sites.end()); + } + + model.remove_free_sites(all_sites); + topology.set_structure(model.get_structure()); + // add links and write files std::vector links = 
generate_link_records(&structure, &glycosites, &topology); diff --git a/package/src/include/sails-glycan.h b/package/src/include/sails-glycan.h index cbc5ee3..7d53e70 100644 --- a/package/src/include/sails-glycan.h +++ b/package/src/include/sails-glycan.h @@ -364,6 +364,22 @@ namespace Sails { return &sugars; } + + /** + * @brief Returns the sites in this glycan. + * + * @return A ptr to all sugars in this glycan. + */ + [[nodiscard]] std::vector get_sites() const { + std::vector sites; + sites.reserve(sugars.size()); + for(const auto&[fst, snd]: sugars) { + sites.emplace_back(fst); + } + return sites; + } + + /** * @brief Adds linkage between two sugars. * diff --git a/package/src/include/sails-linkage.h b/package/src/include/sails-linkage.h index 84e6c50..75d16cf 100644 --- a/package/src/include/sails-linkage.h +++ b/package/src/include/sails-linkage.h @@ -147,6 +147,8 @@ namespace Sails { void standardise_residue_names() const; + void remove_free_sites(std::set& all_sites) const; + private: typedef std::map > PossibleAdditions; @@ -258,6 +260,10 @@ namespace Sails { static void move_acceptor_atomic_positions(std::vector &atoms, double length, std::vector &angles, std::vector &torsions); + + [[nodiscard]] std::set get_all_glycosites() const; + + // /** // * @brief Move the positions of acceptor atoms based on given parameters. 
// * From 1439b86812842cae161b142d035f5a5440e49ad0 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Thu, 23 Oct 2025 19:08:49 +0100 Subject: [PATCH 32/56] Added functions to remove free sugars --- package/src/include/sails-linkage.h | 1 + 1 file changed, 1 insertion(+) diff --git a/package/src/include/sails-linkage.h b/package/src/include/sails-linkage.h index 75d16cf..83a71cf 100644 --- a/package/src/include/sails-linkage.h +++ b/package/src/include/sails-linkage.h @@ -14,6 +14,7 @@ #include #include +#include #include #include From 58e196a15dc3578130494bcbe573bc471a19bd95 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Fri, 24 Oct 2025 22:28:09 +0100 Subject: [PATCH 33/56] Added searchtype to sails-find em --- package/src/sails/find.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/package/src/sails/find.py b/package/src/sails/find.py index 3b77303..090fd57 100644 --- a/package/src/sails/find.py +++ b/package/src/sails/find.py @@ -321,6 +321,12 @@ def run(): choices=[type.name for type in ModelType], help="Binary or Multiclass model", ) + em_parser.add_argument( + "--searchtype", + required=True, + choices=["protein", "glycan"], + help="Search for protein or glycan, only used if modeltype is multiclass", + ) args = parser.parse_args() if args.mode == "seq": From 5f8bf31f186a93b687f3b733877e0dc366fafa3f Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Fri, 24 Oct 2025 22:30:20 +0100 Subject: [PATCH 34/56] Added preddirin to sails-find em --- package/src/sails/find.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/package/src/sails/find.py b/package/src/sails/find.py index 090fd57..1d39445 100644 --- a/package/src/sails/find.py +++ b/package/src/sails/find.py @@ -315,6 +315,12 @@ def run(): type=str, help="Path to output file", ) + em_parser.add_argument( + "--preddirin", + required=False, + type=str, 
+ help="Path to a model in PDB or CIF format", + ) em_parser.add_argument( "--modeltype", required=True, From c8c4cdeac2aa9b6919619f7d9e64a00a4b9935c9 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Fri, 24 Oct 2025 22:31:13 +0100 Subject: [PATCH 35/56] Added preddirin to sails-find em --- package/src/sails/find.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package/src/sails/find.py b/package/src/sails/find.py index 1d39445..2de56fd 100644 --- a/package/src/sails/find.py +++ b/package/src/sails/find.py @@ -201,9 +201,10 @@ def xray(args): def em(args): sails_structure = get_sails_structure(args.modelin) resource = importlib.resources.files("sails").joinpath("data") + model = ModelType[args.modeltype] if args.preddirin: - predicted_map = read_prediction_dir(args.preddirin) + predicted_map = read_prediction_dir(args.preddirin, model) else: predicted_map = predict_map( "binary", args.mapin, "output", nthreads=8, save_map=True From 04a7ec1577bd28e2d22e6c08699cbca5a2019e84 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Fri, 24 Oct 2025 22:33:03 +0100 Subject: [PATCH 36/56] Added preddirin to sails-find em --- package/src/sails/find.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/package/src/sails/find.py b/package/src/sails/find.py index 2de56fd..90f4220 100644 --- a/package/src/sails/find.py +++ b/package/src/sails/find.py @@ -204,16 +204,33 @@ def em(args): model = ModelType[args.modeltype] if args.preddirin: - predicted_map = read_prediction_dir(args.preddirin, model) + predictions = read_prediction_dir(args.preddirin, model) else: - predicted_map = predict_map( - "binary", args.mapin, "output", nthreads=8, save_map=True + predictions = predict_map( + model.name, + args.mapin, + "output", + nthreads=8, + save_map=True, + ) + + if model == ModelType.binary: + glycan_predicted_map = 
predictions + sails_grid = get_sails_map(glycan_predicted_map) + result = identify_predicted_sites(sails_structure, sails_grid, str(resource)) + else: + glycan_predicted_map, protein_predicted_map = predictions + sails_glycan_grid = get_sails_map(glycan_predicted_map) + sails_protein_grid = get_sails_map(protein_predicted_map) + searchtype = args.searchtype + result = identify_predicted_sites( + sails_structure, + sails_glycan_grid, + sails_protein_grid, + searchtype == "glycan", + str(resource), ) - sails_predicted_grid = get_sails_map(predicted_map) - result = identify_predicted_sites( - sails_structure, sails_predicted_grid, str(resource) - ) log = convert_glycosites_to_log(result, args.modelin) save_log(log, args) From 58725189ffa0b36f77c29eb3de558fc0fad473df Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sat, 25 Oct 2025 13:52:24 +0100 Subject: [PATCH 37/56] Added skipping of glycosites in EM mode --- package/src/cpp/sails.cpp | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 3abc484..8ef6362 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -110,9 +110,7 @@ void remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *densit return !(a->site < b->site); }); - for (auto &sugar: to_remove) { - std::cout << "REMOVING: " << Sails::Utils::format_residue_from_site(sugar->site, structure) << std::endl; - + for (const auto &sugar: to_remove) { glycan->remove_sugar(sugar); } } @@ -282,9 +280,9 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru std::cout << std::flush; if (verbose) std::cout << "\rCycle #" << i << std::endl; + std::cout << "Attempting to model at " << glycosites.size() << " sites." 
<< std::endl; for (auto &glycosite: glycosites) { Sails::Glycan glycan = topology.find_glycan_topology(glycosite); - // if (glycan.empty()) { continue; } // find terminal sugars Sails::Glycan new_glycan = model.extend(glycan, glycosite, density, verbose); @@ -296,16 +294,21 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru } // remove erroneous sugars + std::set unmodellable_sites = {}; for (auto &glycosite: glycosites) { Sails::Glycan glycan = topology.find_glycan_topology(glycosite); - if (glycan.empty()) { continue; } // std::cout << "Attempting removal at " << Sails::Utils::format_residue_from_site(glycosite, &structure) << std::endl; Sails::Glycan old_glycan = glycan; remove_erroneous_sugars(&structure, &density, &glycan, strict, verbose, residue_database); topology.set_structure(&structure); // need to update neighbor search after removing n residues + Sails::Glycan new_glycan = topology.find_glycan_topology(glycosite); + if (new_glycan.empty()) { + unmodellable_sites.insert(glycosite); + continue; + } std::set differences = old_glycan - new_glycan; telemetry >> differences; @@ -315,10 +318,29 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru telemetry.save_snfg(i, glycosite_key, snfg_string); } + // sort removal in decsending order so removed indices don't cause later array overflow + glycosites.erase( + std::remove_if(glycosites.begin(), glycosites.end(),[&](const Sails::Glycosite &site) { + return unmodellable_sites.count(site) > 0; + }),glycosites.end() + ); + telemetry.save_state(i); } std::cout << std::endl; + model.standardise_residue_names(); + + // find and remove any free sugars (likely due to something going wrong) + std::set all_sites = {}; + for (auto &glycosite: glycosites) { + Sails::Glycan glycan = topology.find_glycan_topology(glycosite); + auto sites = glycan.get_sites(); + all_sites.insert(sites.begin(), sites.end()); + } + + model.remove_free_sites(all_sites); + 
topology.set_structure(model.get_structure()); // add links and write files std::vector links = generate_link_records(&structure, &glycosites, &topology); From 528f60c9656b5817f2351e21e957689019665f52 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sat, 25 Oct 2025 14:38:58 +0100 Subject: [PATCH 38/56] Added Q Score to Sails --- package/src/cpp/sails-json.cpp | 2 +- package/src/cpp/sails-score.cpp | 151 ++++++++++++++++++++++++++ package/src/cpp/sails.cpp | 3 +- package/src/include/sails-score.h | 12 ++ package/src/include/sails-telemetry.h | 13 ++- 5 files changed, 175 insertions(+), 6 deletions(-) diff --git a/package/src/cpp/sails-json.cpp b/package/src/cpp/sails-json.cpp index 97eb19c..9272e92 100644 --- a/package/src/cpp/sails-json.cpp +++ b/package/src/cpp/sails-json.cpp @@ -160,7 +160,7 @@ void Sails::JSONWriter::write_json_file(TelemetryLog &log, std::ostream &stream) for (int i = 0; i < entries.size(); ++i) { stream << "\t\t\t\t\"" << entries[i].residue_id << "\": {\"rscc\": " << entries[i].rscc_score << ", \"rsr\": " << entries[i].rsr_score << - ", \"dds\": " << entries[i].dds_score << "}"; + ", \"qscore\": " << entries[i].q_score << "}"; if (i < entries.size() - 1) { stream << ","; } diff --git a/package/src/cpp/sails-score.cpp b/package/src/cpp/sails-score.cpp index 8ffe3e5..e88672c 100644 --- a/package/src/cpp/sails-score.cpp +++ b/package/src/cpp/sails-score.cpp @@ -43,3 +43,154 @@ std::map Sails::Score::calculate_rsccs(Density *densit } return rsccs; } + +std::map Sails::Score::calculate_qscores(Sails::Density *density, gemmi::Structure *structure, + ResidueDatabase &residue_database) { + + constexpr double radius = 3; + auto ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius); + ns.populate(); + + auto [mean, stddev] = density->calculate_map_statistics(density->get_best_grid()); + + const float A = mean + (10 * stddev); + const float B = mean - stddev; + const float sigma = 0.6; 
+ constexpr int N = 8; + + std::map qscores; + for (auto & model : structure->models) { + for (int c = 0; c < model.chains.size(); c++) { + for (int r = 0; r < model.chains[c].residues.size(); r++) { + const gemmi::Residue* residue_ptr = &model.chains[c].residues[r]; + if (residue_database.count(residue_ptr->name) > 0) { + const ResidueData& residue_data = residue_database.at(residue_ptr->name); + if (!residue_data.is_sugar) continue; + Glycosite site = {0, c, r, 0}; + + std::vector residue_q_scores = {}; + for (int a = 0; a < residue_ptr->atoms.size(); a++) { + Glycosite atom_site = {0, c, r, a}; + + double atom_q = Sails::Score::QScore::calculate_q_score(residue_ptr->atoms[a].pos, atom_site, + density->get_best_grid(), ns, A, B, sigma, N); + residue_q_scores.emplace_back(atom_q); + } + double mean_residue_q_score = std::accumulate(residue_q_scores.begin(), residue_q_scores.end(), + 0.0) / static_cast(residue_ptr->atoms.size()); + qscores[site] = mean_residue_q_score; + + } + } + } + } + + return qscores; +} + +std::vector Sails::Score::QScore::fibonacci_sphere(int samples, float radius, const gemmi::Position ¢er) { + std::vector positions; + const double offset = 2.0 / samples; + const double increment = M_PI * (3.0 - sqrt(5.0)); + + for (int i = 0 ; i < samples; i++) { + const double y = ((i * offset) - 1) + (offset / 2); + const double r = sqrt(1 - pow(y,2)); + + const double phi = i * increment; + + const double x = cos(phi) * r; + const double z = sin(phi) * r; + + gemmi::Position position = {x, y, z}; + position *= radius; + position += center; + positions.emplace_back(position); + } + return positions; +} + +std::vector Sails::Score::QScore::get_radial_points(const gemmi::Position &position, float radius, int N, + Glycosite &site, gemmi::NeighborSearch &ns) { + + std::vector positions; + constexpr int max_iter = 200; + + for (int i = 0 ; i < max_iter ; i++) { + std::vector sampled_sphere = fibonacci_sphere(N+i, radius, position); + for (const auto& 
sampled_position: sampled_sphere) { + const gemmi::NeighborSearch::Mark* nearest_atom = ns.find_nearest_atom(sampled_position); + auto nearest_site = Glycosite(*nearest_atom); + if (nearest_site == site) { + positions.emplace_back(sampled_position); + } + + if (positions.size() >= N) { + break; + } + } + if (positions.size() >= N) { + break; + } + } + return positions; +} + +std::vector Sails::Score::QScore::sample_density(const gemmi::Grid<> *grid, std::vector &positions) { + std::vector values; + for (auto& position: positions) { + double value = grid->tricubic_interpolation(position); + values.emplace_back(value); + } + return values; +} + +double Sails::Score::QScore::calculate_q_score(const gemmi::Position & position, Glycosite &site, const gemmi::Grid<> *grid, + gemmi::NeighborSearch &ns, float A, float B, float sigma, int N) { + + const int M = 21; + std::vector sample_space(M); + for (int i = 0; i < M; i++) + sample_space[i] = (2.0f / (M - 1)) * i; + + std::vector u(N, std::vector(M, 0)); + std::vector v(N, std::vector(M, 0)); + + for (int i = 0; i < M; i++) { + const double radius = sample_space[i]; + const double gaussian_sample = A * exp(-0.5 * pow(radius / sigma, 2)) + B; + + auto radial_pts = get_radial_points(position, radius, N, site, ns); + if (radial_pts.size() != static_cast(N)) + continue; + + auto u_samples = sample_density(grid, radial_pts); + + for (int j = 0; j < N; j++) { + u[j][i] = u_samples[j]; + v[j][i] = gaussian_sample; + } + } + + std::vector u_flat, v_flat; + for (int j = 0; j < N; j++) { + const double mean_u = std::accumulate(u[j].begin(), u[j].end(), 0.0) / M; + const double mean_v = std::accumulate(v[j].begin(), v[j].end(), 0.0) / M; + for (int i = 0; i < M; i++) { + u_flat.push_back(u[j][i] - mean_u); + v_flat.push_back(v[j][i] - mean_v); + } + } + + double numerator = 0; + double sum_u2 = 0; + double sum_v2 = 0; + + for (size_t i = 0; i < u_flat.size(); i++) { + numerator += u_flat[i] * v_flat[i]; + sum_u2 += u_flat[i] * 
u_flat[i]; + sum_v2 += v_flat[i] * v_flat[i]; + } + + return numerator / (std::sqrt(sum_u2) * std::sqrt(sum_v2)); +} diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 8ef6362..f56d3e7 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -683,13 +683,14 @@ Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float r auto density = Sails::EMDensity(grid, resolution); std::map rsccs = Sails::Score::calculate_rsccs(&density, &structure, residue_database); + std::map qscores = Sails::Score::calculate_qscores(&density, &structure, residue_database); std::vector to_remove = {}; std::vector log = {}; for (auto& [site, rscc]: rsccs) { std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); - log.emplace_back(residue_key, rscc); + log.emplace_back(residue_key, rscc, qscores.at(site)); if (rscc > threshold) { continue; } diff --git a/package/src/include/sails-score.h b/package/src/include/sails-score.h index 39b8d42..d5cba0f 100644 --- a/package/src/include/sails-score.h +++ b/package/src/include/sails-score.h @@ -11,6 +11,18 @@ namespace Sails::Score { std::map calculate_rsccs(Sails::Density* density, gemmi::Structure* structure, ResidueDatabase &residue_database); + std::map calculate_qscores(Sails::Density* density, gemmi::Structure* structure, ResidueDatabase &residue_database); + + namespace QScore { + std::vector fibonacci_sphere(int samples, float radius, const gemmi::Position ¢er); + + std::vector get_radial_points(const gemmi::Position &position, float radius, int N, Glycosite& site, gemmi::NeighborSearch& ns); + + std::vector sample_density(const gemmi::Grid<> *grid, std::vector& positions); + + double calculate_q_score(const gemmi::Position & position, Glycosite &site, const gemmi::Grid<> *grid, + gemmi::NeighborSearch &ns, float A, float B, float sigma, int N); + } } #endif //SAILS_SCORE_H diff --git a/package/src/include/sails-telemetry.h 
b/package/src/include/sails-telemetry.h index a9f302f..d828fbf 100644 --- a/package/src/include/sails-telemetry.h +++ b/package/src/include/sails-telemetry.h @@ -18,22 +18,27 @@ namespace Sails { struct TelemetryFormat { TelemetryFormat() = default; - TelemetryFormat(const std::string &residue_id, double rscc_score, double rsr_score, double dds_score) + TelemetryFormat(const std::string &residue_id, double rscc_score, double rsr_score, double q_score) : residue_id(residue_id), rscc_score(rscc_score), rsr_score(rsr_score), - dds_score(dds_score) { + q_score(q_score) { } TelemetryFormat(const std::string &residue_id, double rscc_score) : residue_id(residue_id), - rscc_score(rscc_score), rsr_score(0), dds_score(0) { + rscc_score(rscc_score), rsr_score(0), q_score(0) { + } + + TelemetryFormat(const std::string &residue_id, double rscc_score, double q_score) + : residue_id(residue_id), + rscc_score(rscc_score), rsr_score(0), q_score(q_score) { } std::string residue_id; double rscc_score; double rsr_score; - double dds_score; + double q_score; }; typedef std::map> TelemetryLog; From 79214f81dd3b5646e4c13e35b4976d8f8d7cbed0 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sun, 26 Oct 2025 10:42:43 +0000 Subject: [PATCH 39/56] Added FUC clash removal Added q score to density --- package/src/cpp/density/sails-density.cpp | 33 +++++++++++++++++-- package/src/cpp/sails-linkage.cpp | 16 ++++----- package/src/cpp/sails-refine.cpp | 3 +- package/src/cpp/sails-score.cpp | 25 +++++++++++--- package/src/cpp/sails-telemetry.cpp | 4 +-- package/src/cpp/sails-topology.cpp | 5 ++- package/src/cpp/sails.cpp | 18 ++++++++++ package/src/include/density/sails-density.h | 6 +++- .../src/include/density/sails-em-density.h | 15 ++++++++- .../src/include/density/sails-xtal-density.h | 14 ++++++++ package/src/include/sails-linkage.h | 11 +++++++ package/src/include/sails-score.h | 2 ++ package/src/include/sails-utils.h | 14 ++++++++ 13 files 
changed, 145 insertions(+), 21 deletions(-) diff --git a/package/src/cpp/density/sails-density.cpp b/package/src/cpp/density/sails-density.cpp index b6ba2cf..12b04e3 100644 --- a/package/src/cpp/density/sails-density.cpp +++ b/package/src/cpp/density/sails-density.cpp @@ -8,6 +8,8 @@ #include #include +#include "src/include/sails-score.h" + double Sails::Density::score_residue(gemmi::Residue &residue, const DensityScoreMethod &method) { switch (method) { @@ -32,8 +34,8 @@ double Sails::Density::score_result(SuperpositionResult& result) { return rscc_score(result); case rsr: return rsr_score(result); - // case dds: - // return check_difference_density(result.new_residue, TODO); + case q: + return q_score(result.new_residue); default: return -1; } @@ -356,3 +358,30 @@ std::pair Sails::Density::calculate_map_statistics(const gemmi::Gr return std::make_pair(mean, stdev); } + +double Sails::Density::q_score(gemmi::Residue &residue) { + auto [mean, stddev] = get_map_stats(); + + const float A = mean + (10 * stddev); + const float B = mean - stddev; + constexpr float sigma = 0.6; + constexpr int N = 8; + + gemmi::Model model = Utils::create_model(residue); + gemmi::NeighborSearch ns = {model, get_best_grid()->unit_cell, 2}; + ns.populate(); + + std::vector residue_q_scores = {}; + + for (int a = 0; a < residue.atoms.size(); a++) { + Glycosite atom_site = {0, 0, 0, a}; + double atom_q = Score::QScore::calculate_q_score(residue.atoms[a].pos, atom_site, get_work_grid(), + ns, A, B, sigma, N); + residue_q_scores.emplace_back(atom_q); + } + + const double mean_residue_q_score = std::accumulate(residue_q_scores.begin(), residue_q_scores.end(), 0.0) + / static_cast(residue.atoms.size()); + + return mean_residue_q_score; +} diff --git a/package/src/cpp/sails-linkage.cpp b/package/src/cpp/sails-linkage.cpp index 6539b3c..1c23ecd 100644 --- a/package/src/cpp/sails-linkage.cpp +++ b/package/src/cpp/sails-linkage.cpp @@ -240,7 +240,6 @@ void 
Sails::Model::remove_leaving_atom(Sails::LinkageData &data, gemmi::Residue void Sails::Model::add_sugar_to_structure(const Sugar *terminal_sugar, SuperpositionResult &favoured_addition, ChainType &chain_type) { int chain_idx = terminal_sugar->site.chain_idx; - if (chain_type == protein) { gemmi::Model* model = &structure->models[terminal_sugar->site.model_idx]; const std::vector* chains = &model->chains; @@ -271,6 +270,9 @@ void Sails::Model::add_sugar_to_structure(const Sugar *terminal_sugar, Superposi // structure->models[terminal_sugar->site.model_idx].chains.emplace_back(chain); } + double average_donor_bfactor = Utils::calculate_average_bfactor(terminal_sugar->site, structure); + Utils::set_all_bfactors(&favoured_addition.new_residue, average_donor_bfactor); + const auto all_residues = &structure->models[terminal_sugar->site.model_idx].chains[chain_idx].residues; favoured_addition.new_residue.seqid = gemmi::SeqId(static_cast(all_residues->size()) + 1, '?'); all_residues->insert(all_residues->end(), std::move(favoured_addition.new_residue)); @@ -328,6 +330,10 @@ Sails::Model::ChainType Sails::Model::find_chain_type(std::vector sugar } double Sails::Model::calculate_clash_score(const SuperpositionResult &result, gemmi::Atom *donor_atom) const { + return calculate_clash_score(result.new_residue, donor_atom); +} + +double Sails::Model::calculate_clash_score(const gemmi::Residue &residue, gemmi::Atom *donor_atom) const { constexpr double radius = 1.5; gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius); @@ -346,15 +352,9 @@ double Sails::Model::calculate_clash_score(const SuperpositionResult &result, ge } } - // ns.populate(); - - double clash_score = 0; - for (auto &atom: result.new_residue.atoms) { + for (auto &atom: residue.atoms) { auto nearest_atoms = ns.find_atoms(atom.pos, '\0', 0, radius); - // for (auto& x: nearest_atoms) { - // std::cout << "Clash between atom " << atom.name << " " << 
Utils::format_residue_from_site(Glycosite(*x), structure) << x->to_cra(structure->models[0]).atom->name << std::endl; - // } clash_score += static_cast(nearest_atoms.size()); } return clash_score; diff --git a/package/src/cpp/sails-refine.cpp b/package/src/cpp/sails-refine.cpp index 59ac0fe..c7e69e6 100644 --- a/package/src/cpp/sails-refine.cpp +++ b/package/src/cpp/sails-refine.cpp @@ -37,6 +37,8 @@ double Sails::TorsionAngleRefiner::calculate_penalty_factor() const { return 1e-2; case rscc: return 1e-5; + case q: + return 1e-5; default: return 0; } @@ -65,7 +67,6 @@ double Sails::TorsionAngleRefiner::score_function(std::vector &all_angle if (bond_length_delta > 0.3) { penalty += bond_length_delta * 1e5; } - // std::cout << penalty << " " << score << " " << penalty_factor << std::endl; return score + penalty; } diff --git a/package/src/cpp/sails-score.cpp b/package/src/cpp/sails-score.cpp index e88672c..dfc66de 100644 --- a/package/src/cpp/sails-score.cpp +++ b/package/src/cpp/sails-score.cpp @@ -5,6 +5,8 @@ #include "../include/sails-score.h" +#include + #include "src/include/sails-utils.h" std::map Sails::Score::calculate_rsccs(Density *density, gemmi::Structure *structure, ResidueDatabase &residue_database) { @@ -88,6 +90,19 @@ std::map Sails::Score::calculate_qscores(Sails::Densit return qscores; } +double Sails::Score::calculate_clash_score(gemmi::Residue *residue, gemmi::Structure *structure) { + constexpr double radius = 1; + gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius); + ns.populate(); + + double clash_score = 0; + for (auto &atom: residue->atoms) { + auto nearest_atoms = ns.find_atoms(atom.pos, '\0', 0, radius); + clash_score += static_cast(nearest_atoms.size()); + } + return clash_score; +} + std::vector Sails::Score::QScore::fibonacci_sphere(int samples, float radius, const gemmi::Position ¢er) { std::vector positions; const double offset = 2.0 / samples; @@ -119,11 +134,11 @@ std::vector 
Sails::Score::QScore::get_radial_points(const gemmi for (int i = 0 ; i < max_iter ; i++) { std::vector sampled_sphere = fibonacci_sphere(N+i, radius, position); for (const auto& sampled_position: sampled_sphere) { - const gemmi::NeighborSearch::Mark* nearest_atom = ns.find_nearest_atom(sampled_position); - auto nearest_site = Glycosite(*nearest_atom); - if (nearest_site == site) { - positions.emplace_back(sampled_position); - } + // const gemmi::NeighborSearch::Mark* nearest_atom = ns.find_nearest_atom(sampled_position); + // auto nearest_site = Glycosite(*nearest_atom); + // if (nearest_site == site) { + positions.emplace_back(sampled_position); + // } if (positions.size() >= N) { break; diff --git a/package/src/cpp/sails-telemetry.cpp b/package/src/cpp/sails-telemetry.cpp index 963fb3d..b2de6b9 100644 --- a/package/src/cpp/sails-telemetry.cpp +++ b/package/src/cpp/sails-telemetry.cpp @@ -39,12 +39,12 @@ Sails::TelemetryLog Sails::Telemetry::calculate_log(gemmi::Structure *structure, if (residue.atoms.empty()) {continue;} const double rscc_score = density->score_residue(residue, rscc); const double rsr_score = density->score_residue(residue, rsr); - const double dds_score = density->score_residue(residue, dds); + const double q_score = density->score_residue(residue, q); log[cycle].emplace_back( Utils::format_residue_from_site(site, structure), rscc_score, rsr_score, - dds_score); + q_score); } } return log; diff --git a/package/src/cpp/sails-topology.cpp b/package/src/cpp/sails-topology.cpp index 07618d7..656e9ff 100644 --- a/package/src/cpp/sails-topology.cpp +++ b/package/src/cpp/sails-topology.cpp @@ -25,7 +25,10 @@ void Sails::Topology::find_residue_near_donor(Glycosite &glycosite, Glycan &glyc gemmi::Residue residue = Utils::get_residue_from_glycosite(glycosite, m_structure); // std::cout << "Searching near " << Utils::get_chain_from_glycosite(glycosite, m_structure).name << "-" << Utils::format_residue_key(&residue) << std::endl; - if 
(m_database.find(residue.name) == m_database.end()) { throw std::runtime_error("Glycosite is not in database"); } + if (m_database.find(residue.name) == m_database.end()) { + std::cout << residue.name << std::endl; + throw std::runtime_error("Glycosite is not in database"); + } auto database_entry = m_database[residue.name]; for (const auto &donor: database_entry.donors) { diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index f56d3e7..0781e48 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -43,6 +43,10 @@ void print_rscc(const Sails::Glycosite &site, float rscc, gemmi::Structure *stru std::cout << Sails::Utils::format_residue_from_site(site, structure) << " - RSCC = " << rscc << std::endl; } +void print_removal_clash(const Sails::Glycosite &site, float rscc, gemmi::Structure *structure) { + std::cout << "Removing " << Sails::Utils::format_residue_from_site(site, structure) << " because of clashes (Clash score = " << rscc << ")" << std::endl; +} + void print_dds(const Sails::Glycosite &site, float dds, gemmi::Structure *structure) { std::cout << Sails::Utils::format_residue_from_site(site, structure) << " - DDS = " << dds << std::endl; } @@ -54,6 +58,7 @@ void remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *densit // const std::pair difference_density_stats = density->calculate_map_statistics(density->get_difference_grid()); std::map rsccs = Sails::Score::calculate_rsccs(density, structure, residue_database); + std::map qscores = Sails::Score::calculate_qscores(density, structure, residue_database); std::vector to_remove; for (const auto &[fst, snd]: *glycan) { @@ -62,6 +67,15 @@ void remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *densit std::optional sugar_result = glycan->find_previous_sugar(snd.get()); if (!sugar_result.has_value()) continue; // if there is nothing previous, it must be a protein residue + if (residue.name == "FUC") { + double clash_score = 
Sails::Score::calculate_clash_score(&residue, structure); + if (clash_score > 2) { + print_removal_clash(snd->site, clash_score, structure) ; + to_remove.push_back(snd.get()); + continue; + } + } + gemmi::Residue previous_residue = Sails::Utils::get_residue_from_glycosite( sugar_result.value()->site, structure); @@ -319,6 +333,10 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru } // sort removal in decsending order so removed indices don't cause later array overflow + if (verbose && !unmodellable_sites.empty()) { + std::cout << "Stopping trials at " << unmodellable_sites.size() << " sites." << std::endl; + } + glycosites.erase( std::remove_if(glycosites.begin(), glycosites.end(),[&](const Sails::Glycosite &site) { return unmodellable_sites.count(site) > 0; diff --git a/package/src/include/density/sails-density.h b/package/src/include/density/sails-density.h index 3601a24..6ad907e 100644 --- a/package/src/include/density/sails-density.h +++ b/package/src/include/density/sails-density.h @@ -25,7 +25,7 @@ namespace Sails { typedef clipper::HKL_info::HKL_reference_index HRI; enum DensityScoreMethod { - atomwise, rscc, rsr, dds + atomwise, rscc, rsr, q }; class Density { @@ -46,6 +46,8 @@ namespace Sails { [[nodiscard]] virtual const DensityScoreMethod get_score_method() const = 0; + [[nodiscard]] virtual std::pair get_map_stats() = 0; + /** * @brief Calculates the density for a given box based on a gemmi::Residue object. 
* @@ -244,6 +246,8 @@ namespace Sails { [[nodiscard]] std::pair calculate_map_statistics(const gemmi::Grid<> *grid) const; + [[nodiscard]] double q_score(gemmi::Residue &residue); + }; } // namespace Sails diff --git a/package/src/include/density/sails-em-density.h b/package/src/include/density/sails-em-density.h index ea354ba..998f26e 100644 --- a/package/src/include/density/sails-em-density.h +++ b/package/src/include/density/sails-em-density.h @@ -25,6 +25,16 @@ namespace Sails { return &calculated_maps; } + [[nodiscard]] std::pair get_map_stats() override { + if (map_mean == INT_MIN || map_stddev == INT_MIN ) { + auto [mean, stddev] = calculate_map_statistics(get_work_grid()); + map_mean = mean; + map_stddev = stddev; + return std::make_pair(map_mean, map_stddev); + } + return std::make_pair(map_mean, map_stddev); + } + gemmi::Grid<> calculate_density_for_box(gemmi::Residue &residue, gemmi::Box &box) const override; gemmi::Grid<> calculate_density_for_grid(gemmi::Residue &residue) const override; @@ -65,6 +75,9 @@ namespace Sails { * * The DensityScoreMethod class is used to represent the score method for scoring residues to density */ - DensityScoreMethod score_method = rscc; + DensityScoreMethod score_method = atomwise; + + float map_mean = INT_MIN; + float map_stddev = INT_MIN; }; } diff --git a/package/src/include/density/sails-xtal-density.h b/package/src/include/density/sails-xtal-density.h index 9a7ccd0..60d7d52 100644 --- a/package/src/include/density/sails-xtal-density.h +++ b/package/src/include/density/sails-xtal-density.h @@ -32,6 +32,16 @@ namespace Sails { return &calculated_maps; } + [[nodiscard]] std::pair get_map_stats() override { + if (map_mean == INT_MIN || map_stddev == INT_MIN ) { + auto [mean, stddev] = calculate_map_statistics(get_best_grid()); + map_mean = mean; + map_stddev = stddev; + return std::make_pair(map_mean, map_stddev); + } + return std::make_pair(map_mean, map_stddev); + } + gemmi::Grid<> 
calculate_density_for_box(gemmi::Residue &residue, gemmi::Box &box) const override; gemmi::Grid<> calculate_density_for_grid(gemmi::Residue &residue) const override; @@ -173,5 +183,9 @@ namespace Sails { * Clipper best map */ clipper::Xmap m_best_map; + + float map_mean = INT_MIN; + float map_stddev = INT_MIN; + }; } diff --git a/package/src/include/sails-linkage.h b/package/src/include/sails-linkage.h index 83a71cf..636371f 100644 --- a/package/src/include/sails-linkage.h +++ b/package/src/include/sails-linkage.h @@ -230,6 +230,17 @@ namespace Sails { */ [[nodiscard]] double calculate_clash_score(const SuperpositionResult &result, gemmi::Atom *donor_atom) const; + /** @brief Calculates the clash score for the given SuperpositionResult. + * + * The clash score is calculated by finding the number of nearby atoms for each atom in the + * SuperpositionResult. Nearby atoms are found using a NeighborSearch with a given radius. + * + * @param result The SuperpositionResult from which to calculate the clash score. + * @param donor_atom + * @return The calculated clash score. + */ + [[nodiscard]] double calculate_clash_score(const gemmi::Residue &residue, gemmi::Atom *donor_atom) const; + /** * @brief Removes the leaving atom from the given residue objects. 
diff --git a/package/src/include/sails-score.h b/package/src/include/sails-score.h index d5cba0f..99a95f9 100644 --- a/package/src/include/sails-score.h +++ b/package/src/include/sails-score.h @@ -13,6 +13,8 @@ namespace Sails::Score { std::map calculate_qscores(Sails::Density* density, gemmi::Structure* structure, ResidueDatabase &residue_database); + double calculate_clash_score(gemmi::Residue* residue, gemmi::Structure* structure); + namespace QScore { std::vector fibonacci_sphere(int samples, float radius, const gemmi::Position ¢er); diff --git a/package/src/include/sails-utils.h b/package/src/include/sails-utils.h index 59a42cb..39b620f 100644 --- a/package/src/include/sails-utils.h +++ b/package/src/include/sails-utils.h @@ -273,6 +273,20 @@ namespace Sails::Utils { } return {std::move(firsts), std::move(seconds)}; } + + double calculate_average_bfactor(const Glycosite &site, gemmi::Structure * structure) { + gemmi::Residue* residue_ptr = get_residue_ptr_from_glycosite(site, structure); + const double sum = std::accumulate(residue_ptr->atoms.begin(), residue_ptr->atoms.end(), 0.0, [](const double current, gemmi::Atom& atom) { + return current + atom.b_iso; + }); + return sum / residue_ptr->atoms.size(); + } + + void set_all_bfactors(gemmi::Residue * residue, double b_factor) { + for (auto & atom : residue->atoms) { + atom.b_iso = b_factor; + } + } } // namespace Sails::Utils From 58989124baf79c763cc145c6cf4597f2e746a2bf Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Sun, 26 Oct 2025 18:35:52 +0000 Subject: [PATCH 40/56] Added q score option to validate --- package/src/bindings/python_sails.cpp | 2 +- package/src/cpp/sails.cpp | 13 ++++++++----- package/src/sails/validate.py | 4 ++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index a6f7150..21afd66 100644 --- a/package/src/bindings/python_sails.cpp +++ 
b/package/src/bindings/python_sails.cpp @@ -284,7 +284,7 @@ NB_MODULE(sails_module, m) { m.def("validate", nb::overload_cast(&validate), "structure"_a, "mtz"_a, "remove"_a, "threshold"_a, "resource_dir"_a); // EM - m.def("validate", nb::overload_cast &, float, bool, float, std::string &>(&validate), "structure"_a, "grid"_a, "resolution"_a, "remove"_a, "threshold"_a, "resource_dir"_a); + m.def("validate", nb::overload_cast &, float, bool, float, bool, std::string &>(&validate), "structure"_a, "grid"_a, "resolution"_a, "remove"_a, "threshold"_a, "use_q"_a, "resource_dir"_a); m.def("test_snfg", &test); diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 0781e48..7d400fc 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -692,7 +692,7 @@ Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool }; } -Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float resolution, bool remove, float threshold, std::string& resource_dir) { +Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float resolution, bool remove, float threshold, bool use_q, std::string& resource_dir) { std::string data_file = resource_dir + "/data.json"; Sails::JSONLoader loader = {data_file}; Sails::ResidueDatabase residue_database = loader.load_residue_database(); @@ -700,16 +700,19 @@ Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float r auto density = Sails::EMDensity(grid, resolution); + std::map rsccs = Sails::Score::calculate_rsccs(&density, &structure, residue_database); std::map qscores = Sails::Score::calculate_qscores(&density, &structure, residue_database); + std::map scores = use_q ? 
qscores : rsccs; + std::vector to_remove = {}; std::vector log = {}; - for (auto& [site, rscc]: rsccs) { + for (auto& [site, score]: scores) { std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); - log.emplace_back(residue_key, rscc, qscores.at(site)); - if (rscc > threshold) { + log.emplace_back(residue_key, rsccs.at(site), qscores.at(site)); + if (score > threshold) { continue; } to_remove.emplace_back(site); @@ -724,7 +727,7 @@ Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float r auto glycan = topology.find_glycan_topology(site); std::vector downstream_sugars = glycan.get_downstream_sugars(site); for (auto& downstream_sugar: downstream_sugars) { - if (std::find(removal_set.begin(), removal_set.end(), downstream_sugar->site) != removal_set.end()) continue; + if (removal_set.count(downstream_sugar->site) > 0) continue; downstream_sugar->site.atom_idx = 0; // remove atom site from site to allow sorting removal_set.insert(downstream_sugar->site); } diff --git a/package/src/sails/validate.py b/package/src/sails/validate.py index 1b3eb95..083ebc5 100644 --- a/package/src/sails/validate.py +++ b/package/src/sails/validate.py @@ -46,6 +46,9 @@ def parse_args(): em_parser_group = em_parser.add_argument_group("Required arguments in EM mode") em_parser_group.add_argument("--mapin", type=str, required=True) em_parser_group.add_argument("--resolution", type=float, required=True) + em_parser_group.add_argument( + "--score", choices=["q", "rscc"], required=False, default="q" + ) return parser.parse_args() @@ -83,6 +86,7 @@ def em(args): args.resolution, args.remove, args.threshold, + args.score == "q", str(resource), ) From a5b9cf340a1758191e17c5b01becf3b07ffe2f2b Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Mon, 27 Oct 2025 09:33:10 +0000 Subject: [PATCH 41/56] Added Q Score threshold calculation --- package/src/cpp/sails.cpp | 9 ++++++++- 
package/src/sails/validate.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 7d400fc..e800d32 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -705,14 +705,21 @@ Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float r std::map qscores = Sails::Score::calculate_qscores(&density, &structure, residue_database); std::map scores = use_q ? qscores : rsccs; + // equation from https://doi.org/10.1107/S2059798325005923 + double q_score_threshold = -0.0016*pow(resolution,2) + 0.0434*pow(resolution,2)-0.3956*resolution + 1.3366; + double applied_threshold = use_q ? q_score_threshold : threshold ; + + if (remove) { + std::cout << "Enforcing score limit of " << applied_threshold << std::endl; + } std::vector to_remove = {}; std::vector log = {}; for (auto& [site, score]: scores) { std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); log.emplace_back(residue_key, rsccs.at(site), qscores.at(site)); - if (score > threshold) { + if (score > applied_threshold) { continue; } to_remove.emplace_back(site); diff --git a/package/src/sails/validate.py b/package/src/sails/validate.py index 083ebc5..83d3b41 100644 --- a/package/src/sails/validate.py +++ b/package/src/sails/validate.py @@ -22,7 +22,7 @@ def parse_args(): group.add_argument("--modelout", type=str, default="sails-validate.cif") group.add_argument("--logout", type=str, default="sails-validate.log") group.add_argument( - "--threshold", type=float, default=0.7, help="RSCC Threshold to use for removal" + "--threshold", type=float, default=0.8, help="RSCC Threshold to use for removal" ) group.add_argument("--remove", action=argparse.BooleanOptionalAction, default=False) group.add_argument("--print", action=argparse.BooleanOptionalAction, default=False) From 606e34d3e9245f789802ff81b38b8143f8316a02 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri 
<44945647+Dialpuri@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:03:20 +0000 Subject: [PATCH 42/56] Added early stopping to X-ray cycles --- package/src/cpp/sails.cpp | 40 ++++++++++++++++++++++++++++-- package/src/include/sails-glycan.h | 4 ++- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index e800d32..5512ca0 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -176,11 +176,16 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu Sails::Telemetry telemetry = Sails::Telemetry(""); + Sails::Glycosites original_glycosites = glycosites; + for (int i = 1; i <= cycles; i++) { if (!verbose) std::cout << "\rCycle #" << i; std::cout << std::flush; if (verbose) std::cout << "\rCycle #" << i << std::endl; + if (glycosites.empty()) break; + std::set unmodellable_sites = {}; + for (auto &glycosite: glycosites) { // auto c = Sails::Utils::get_chain_from_glycosite(glycosite, &structure); // auto r = Sails::Utils::get_residue_from_glycosite(glycosite, &structure); @@ -193,6 +198,12 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu // find terminal sugars Sails::Glycan new_glycan = model.extend(glycan, glycosite, density, verbose); + // if nothing was added, add site to unmodellable list + if (new_glycan.size() == glycan.size()) { + std::cout << "Nothing new modelled at site:" << Sails::Utils::format_residue_from_site(glycosite, &structure) << std::endl; + unmodellable_sites.insert(glycosite); + } + std::set differences = new_glycan - glycan; telemetry << differences; @@ -221,6 +232,12 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu topology.set_structure(&structure); // need to update neighbor search after removing n residues Sails::Glycan new_glycan = topology.find_glycan_topology(glycosite); + + if (new_glycan.empty()) { + unmodellable_sites.insert(glycosite); + continue; + } + 
new_glycan.renumber(); std::set differences = old_glycan - new_glycan; @@ -231,6 +248,19 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu telemetry.save_snfg(i, glycosite_key, snfg_string); } + if (verbose && !unmodellable_sites.empty()) { + std::cout << "Stopping trials at " << unmodellable_sites.size() << " sites." << std::endl; + for (const auto& site: unmodellable_sites) { + std::cout << "\tSite:" << Sails::Utils::format_residue_from_site(site, &structure) << std::endl; + } + } + + glycosites.erase( + std::remove_if(glycosites.begin(), glycosites.end(),[&](const Sails::Glycosite &site) { + return unmodellable_sites.count(site) > 0; + }),glycosites.end() + ); + telemetry.save_state(i); } @@ -240,16 +270,16 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu // find and remove any free sugars (likely due to something going wrong) std::set all_sites = {}; - for (auto &glycosite: glycosites) { + for (auto &glycosite: original_glycosites) { Sails::Glycan glycan = topology.find_glycan_topology(glycosite); auto sites = glycan.get_sites(); all_sites.insert(sites.begin(), sites.end()); } + model.remove_free_sites(all_sites); topology.set_structure(model.get_structure()); - // add links and write files std::vector links = generate_link_records(&structure, &glycosites, &topology); Sails::add_links_to_structure(model.get_structure(), links); @@ -319,11 +349,14 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru topology.set_structure(&structure); // need to update neighbor search after removing n residues Sails::Glycan new_glycan = topology.find_glycan_topology(glycosite); + if (new_glycan.empty()) { unmodellable_sites.insert(glycosite); continue; } + new_glycan.renumber(); + std::set differences = old_glycan - new_glycan; telemetry >> differences; @@ -335,6 +368,9 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru // sort removal in 
decsending order so removed indices don't cause later array overflow if (verbose && !unmodellable_sites.empty()) { std::cout << "Stopping trials at " << unmodellable_sites.size() << " sites." << std::endl; + for (const auto& site: unmodellable_sites) { + std::cout << "\tSITE:" << Sails::Utils::format_residue_from_site(site, &structure) << std::endl; + } } glycosites.erase( diff --git a/package/src/include/sails-glycan.h b/package/src/include/sails-glycan.h index 7d53e70..76c60e9 100644 --- a/package/src/include/sails-glycan.h +++ b/package/src/include/sails-glycan.h @@ -374,7 +374,9 @@ namespace Sails { std::vector sites; sites.reserve(sugars.size()); for(const auto&[fst, snd]: sugars) { - sites.emplace_back(fst); + Glycosite site = fst; + site.atom_idx = 0; // set to 0 for later comparisons + sites.emplace_back(site); } return sites; } From d74c2679fb27cfc2bd150f7011819308b4207f7d Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:17:08 +0000 Subject: [PATCH 43/56] Fixed bug in FUC clash score --- package/src/cpp/sails-score.cpp | 17 +++++++++++++---- package/src/cpp/sails.cpp | 2 +- package/src/include/sails-score.h | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/package/src/cpp/sails-score.cpp b/package/src/cpp/sails-score.cpp index dfc66de..bcacab0 100644 --- a/package/src/cpp/sails-score.cpp +++ b/package/src/cpp/sails-score.cpp @@ -90,15 +90,24 @@ std::map Sails::Score::calculate_qscores(Sails::Densit return qscores; } -double Sails::Score::calculate_clash_score(gemmi::Residue *residue, gemmi::Structure *structure) { +double Sails::Score::calculate_clash_score(Sails::Glycosite &site, gemmi::Structure *structure) { constexpr double radius = 1; - gemmi::NeighborSearch ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius); + auto ns = gemmi::NeighborSearch(structure->models[0], structure->cell, radius); ns.populate(); + gemmi::Residue residue = 
Sails::Utils::get_residue_from_glycosite(site, structure); + site.atom_idx = 0; + double clash_score = 0; - for (auto &atom: residue->atoms) { + for (auto &atom: residue.atoms) { auto nearest_atoms = ns.find_atoms(atom.pos, '\0', 0, radius); - clash_score += static_cast(nearest_atoms.size()); + for (const auto& nearest_atom: nearest_atoms) { + Glycosite atom_site = {0, nearest_atom->chain_idx, nearest_atom->residue_idx, 0}; + if (atom_site == site) continue; + clash_score += 1; + + } + // clash_score += static_cast(nearest_atoms.size()); } return clash_score; } diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 5512ca0..0e1ae95 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -68,7 +68,7 @@ void remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *densit if (!sugar_result.has_value()) continue; // if there is nothing previous, it must be a protein residue if (residue.name == "FUC") { - double clash_score = Sails::Score::calculate_clash_score(&residue, structure); + double clash_score = Sails::Score::calculate_clash_score(snd->site, structure); if (clash_score > 2) { print_removal_clash(snd->site, clash_score, structure) ; to_remove.push_back(snd.get()); diff --git a/package/src/include/sails-score.h b/package/src/include/sails-score.h index 99a95f9..94b3194 100644 --- a/package/src/include/sails-score.h +++ b/package/src/include/sails-score.h @@ -13,7 +13,7 @@ namespace Sails::Score { std::map calculate_qscores(Sails::Density* density, gemmi::Structure* structure, ResidueDatabase &residue_database); - double calculate_clash_score(gemmi::Residue* residue, gemmi::Structure* structure); + double calculate_clash_score(Sails::Glycosite &site, gemmi::Structure* structure); namespace QScore { std::vector fibonacci_sphere(int samples, float radius, const gemmi::Position ¢er); From 838f52dd71a7650c22c7ad218e04199b2864dcca Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> 
Date: Mon, 27 Oct 2025 12:53:13 +0000 Subject: [PATCH 44/56] Inlined util methods --- package/src/include/sails-utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/src/include/sails-utils.h b/package/src/include/sails-utils.h index 39b620f..4245b35 100644 --- a/package/src/include/sails-utils.h +++ b/package/src/include/sails-utils.h @@ -274,7 +274,7 @@ namespace Sails::Utils { return {std::move(firsts), std::move(seconds)}; } - double calculate_average_bfactor(const Glycosite &site, gemmi::Structure * structure) { + inline double calculate_average_bfactor(const Glycosite &site, gemmi::Structure * structure) { gemmi::Residue* residue_ptr = get_residue_ptr_from_glycosite(site, structure); const double sum = std::accumulate(residue_ptr->atoms.begin(), residue_ptr->atoms.end(), 0.0, [](const double current, gemmi::Atom& atom) { return current + atom.b_iso; @@ -282,7 +282,7 @@ namespace Sails::Utils { return sum / residue_ptr->atoms.size(); } - void set_all_bfactors(gemmi::Residue * residue, double b_factor) { + inline void set_all_bfactors(gemmi::Residue * residue, double b_factor) { for (auto & atom : residue->atoms) { atom.b_iso = b_factor; } From 4be2df5127004b229430cc00b05bb89c9b1c2d78 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Mon, 27 Oct 2025 16:25:15 +0000 Subject: [PATCH 45/56] Added back sites --- package/src/cpp/sails-score.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/package/src/cpp/sails-score.cpp b/package/src/cpp/sails-score.cpp index bcacab0..0e9509f 100644 --- a/package/src/cpp/sails-score.cpp +++ b/package/src/cpp/sails-score.cpp @@ -143,11 +143,11 @@ std::vector Sails::Score::QScore::get_radial_points(const gemmi for (int i = 0 ; i < max_iter ; i++) { std::vector sampled_sphere = fibonacci_sphere(N+i, radius, position); for (const auto& sampled_position: sampled_sphere) { - // const gemmi::NeighborSearch::Mark* nearest_atom = 
ns.find_nearest_atom(sampled_position); - // auto nearest_site = Glycosite(*nearest_atom); - // if (nearest_site == site) { - positions.emplace_back(sampled_position); - // } + const gemmi::NeighborSearch::Mark* nearest_atom = ns.find_nearest_atom(sampled_position); + auto nearest_site = Glycosite(*nearest_atom); + if (nearest_site == site) { + positions.emplace_back(sampled_position); + } if (positions.size() >= N) { break; From d884dd0f1b93a941283aac38687b3bc99013894a Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 28 Oct 2025 09:21:51 +0000 Subject: [PATCH 46/56] Added protein-glycan finding and ordered preference of N,C,O --- package/src/cpp/sails.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 0e1ae95..d2d6657 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -456,7 +456,7 @@ Sails::Output o_mannosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, Sails::Output auto_glycosylate(gemmi::Structure &structure, Sails::MTZ &sails_mtz, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, int cycles, std::string &resource_dir, bool verbose) { - Sails::Glycosites predicted_glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, true, resource_dir); + Sails::Glycosites predicted_glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, false, resource_dir); std::cout << "Found " << predicted_glycosites.size() << " potential sites using deep learning models" << std::endl; Sails::Glycosites n_glycosites = Sails::find_n_glycosylation_sites(structure); Sails::Glycosites c_glycosites = Sails::find_c_glycosylation_sites(structure); @@ -468,6 +468,20 @@ Sails::Output auto_glycosylate(gemmi::Structure &structure, Sails::MTZ &sails_mt int diff = static_cast(glycosites.size()) - static_cast(predicted_glycosites.size()); std::cout << "Supplemented with " << 
diff << " sites from the sequence" << std::endl; + // prefer to glycosylate N first, then C, then O. + std::sort(glycosites.begin(), glycosites.end(), + [&](const Sails::Glycosite& a, const Sails::Glycosite& b) { + auto rank = [&](const Sails::Glycosite& s) { + gemmi::Residue* residue = Sails::Utils::get_residue_ptr_from_glycosite(s, &structure); + if (residue->name == "ASN") return 0; + if (residue->name == "TRP") return 1; + if (residue->name == "SER" || residue->name == "THR") return 2; + return 3; + }; + return rank(a) < rank(b); + }); + + return run_cycle(glycosites, structure, sails_mtz, cycles, resource_dir, false, verbose); } From 7c9b6f0a0fc15fe3c2ada06489305891fef8ef8e Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 28 Oct 2025 09:33:36 +0000 Subject: [PATCH 47/56] Fixed bug in link creation --- package/src/cpp/sails.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index d2d6657..5fc84fa 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -281,7 +281,7 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu topology.set_structure(model.get_structure()); // add links and write files - std::vector links = generate_link_records(&structure, &glycosites, &topology); + std::vector links = generate_link_records(&structure, &original_glycosites, &topology); Sails::add_links_to_structure(model.get_structure(), links); Sails::MTZ output_mtz = Sails::form_sails_mtz(*density.get_mtz(), "FP", "SIGFP"); std::string log_string = telemetry.format_log(&structure, &density, false).value(); From 0019a593ea1b08a7180a52d72d83474355d852aa Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 28 Oct 2025 12:31:32 +0000 Subject: [PATCH 48/56] Added EM removal with Q score rather than RSCC --- package/src/cpp/density/sails-density.cpp | 4 +- 
package/src/cpp/sails-json.cpp | 3 +- package/src/cpp/sails.cpp | 75 ++++++++++++++++++++++- 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/package/src/cpp/density/sails-density.cpp b/package/src/cpp/density/sails-density.cpp index 12b04e3..8b4481c 100644 --- a/package/src/cpp/density/sails-density.cpp +++ b/package/src/cpp/density/sails-density.cpp @@ -19,8 +19,8 @@ double Sails::Density::score_residue(gemmi::Residue &residue, const DensityScore return rscc_score(residue); case rsr: return rsr_score(residue); - // case dds: - // return check_difference_density(residue, TODO); + case q: + return q_score(residue); default: return -1; } diff --git a/package/src/cpp/sails-json.cpp b/package/src/cpp/sails-json.cpp index 9272e92..bfd9a62 100644 --- a/package/src/cpp/sails-json.cpp +++ b/package/src/cpp/sails-json.cpp @@ -153,6 +153,7 @@ void Sails::JSONWriter::write_json_file(TelemetryLog &log, std::ostream &stream) stream << "{\n"; stream << "\t\"date\": \"" << strtok(ctime(&t_c), "\n") << "\",\n"; stream << "\t\"cycles\":[\n\t\t"; + int cycle_index = 0; for (const auto &[cycle, entries]: log) { stream << "{\n"; stream << "\t\t\t\"cycle\": " << cycle << ",\n"; @@ -167,7 +168,7 @@ void Sails::JSONWriter::write_json_file(TelemetryLog &log, std::ostream &stream) stream << "\n"; } stream << "\t\t\t}\n\t\t}"; - if (cycle < log.size()) stream << ","; + if (++cycle_index < log.size()) stream << ","; } stream << "]\n}"; } diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 5fc84fa..11e903a 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -39,10 +39,18 @@ void print_removal_rscc(const Sails::Glycosite &site, float rscc, gemmi::Structu std::cout << "Removing " << Sails::Utils::format_residue_from_site(site, structure) << " because of low RSCC =" << rscc << std::endl; } +void print_removal_qscore(const Sails::Glycosite &site, float qscore, gemmi::Structure *structure) { + std::cout << "Removing " << 
Sails::Utils::format_residue_from_site(site, structure) << " because of low Q score =" << qscore << std::endl; +} + void print_rscc(const Sails::Glycosite &site, float rscc, gemmi::Structure *structure) { std::cout << Sails::Utils::format_residue_from_site(site, structure) << " - RSCC = " << rscc << std::endl; } +void print_qscore(const Sails::Glycosite &site, float qscore, gemmi::Structure *structure) { + std::cout << Sails::Utils::format_residue_from_site(site, structure) << " - Q score = " << qscore << std::endl; +} + void print_removal_clash(const Sails::Glycosite &site, float rscc, gemmi::Structure *structure) { std::cout << "Removing " << Sails::Utils::format_residue_from_site(site, structure) << " because of clashes (Clash score = " << rscc << ")" << std::endl; } @@ -129,6 +137,70 @@ void remove_erroneous_sugars(gemmi::Structure *structure, Sails::Density *densit } } +void remove_erroneous_sugars_em(gemmi::Structure *structure, Sails::Density *density, Sails::Glycan *glycan, float resolution, + bool debug, Sails::ResidueDatabase &residue_database) { + + std::map qscores = Sails::Score::calculate_qscores(density, structure, residue_database); + double qscore_threshold = -0.0016*pow(resolution,2) + 0.0434*pow(resolution,2)-0.3956*resolution + 1.3366; + + std::vector to_remove; + for (const auto &[fst, snd]: *glycan) { + gemmi::Residue residue = Sails::Utils::get_residue_from_glycosite(snd->site, structure); + + std::optional sugar_result = glycan->find_previous_sugar(snd.get()); + if (!sugar_result.has_value()) continue; // if there is nothing previous, it must be a protein residue + + if (residue.name == "FUC") { + double clash_score = Sails::Score::calculate_clash_score(snd->site, structure); + if (clash_score > 2) { + print_removal_clash(snd->site, clash_score, structure) ; + to_remove.push_back(snd.get()); + continue; + } + } + + gemmi::Residue previous_residue = Sails::Utils::get_residue_from_glycosite( + sugar_result.value()->site, structure); + + 
snd->site.atom_idx = 0; // set atom index to 0 so can be used in comparisons on the residue level + + // remove cases with low rscc + if (qscores.count(snd->site) != 0) { + const double qscore = qscores.at(snd->site); + print_qscore(snd->site, qscore, structure); + if (qscore < qscore_threshold) { + to_remove.emplace_back(snd.get()); // add pointer to remove + if (debug) print_removal_rscc(snd->site, qscore, structure); + } + } else { + std::cout << Sails::Utils::format_site_key(fst) << " | " << Sails::Utils::format_site_key(snd->site) << std::endl; + throw std::runtime_error("Glycosite was not found in the RSCC calculation" + Sails::Utils::format_residue_from_site(snd->site, structure)); + } + } + + // add linked sugars to removal list + std::set additional_sugars; + for (auto &sugar: to_remove) { + std::vector downstream_sugars = glycan->get_downstream_sugars(sugar); + + for (auto& downstream_sugar: downstream_sugars) { + if (std::find(to_remove.begin(), to_remove.end(), downstream_sugar) != to_remove.end()) continue; + additional_sugars.insert(downstream_sugar); + } + } + to_remove.insert(to_remove.end(), additional_sugars.begin(), additional_sugars.end()); + + // sort removal in decsending order so removed indices don't cause later array overflow + std::sort(to_remove.begin(), to_remove.end(), [](const Sails::Sugar *a, const Sails::Sugar *b) { + return !(a->site < b->site); + }); + + for (const auto &sugar: to_remove) { + glycan->remove_sugar(sugar); + } +} + + Sails::Glycan get_glycan_topology(gemmi::Structure &structure, Sails::Glycosite &glycosite) { Sails::JSONLoader loader = {"package/data/data.json"}; Sails::ResidueDatabase residue_database = loader.load_residue_database(); @@ -285,7 +357,6 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu Sails::add_links_to_structure(model.get_structure(), links); Sails::MTZ output_mtz = Sails::form_sails_mtz(*density.get_mtz(), "FP", "SIGFP"); std::string log_string = 
telemetry.format_log(&structure, &density, false).value(); - Sails::Telemetry::SNFGCycleData snfgs = telemetry.get_snfgs(); return { *model.get_structure(), @@ -344,7 +415,7 @@ Sails::Output run_em_cycle(Sails::Glycosites &glycosites, gemmi::Structure &stru // std::cout << "Attempting removal at " << Sails::Utils::format_residue_from_site(glycosite, &structure) << std::endl; Sails::Glycan old_glycan = glycan; - remove_erroneous_sugars(&structure, &density, &glycan, strict, verbose, residue_database); + remove_erroneous_sugars_em(&structure, &density, &glycan, resolution, verbose, residue_database); topology.set_structure(&structure); // need to update neighbor search after removing n residues From 3c09843c9e0caabe2346f9a854ec0c2f0a4db083 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:42:37 +0000 Subject: [PATCH 49/56] Added supplementation to em mode --- package/src/cpp/sails.cpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 11e903a..39d1462 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -591,7 +591,31 @@ Sails::Output o_mannosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, fl Sails::Output auto_glycosylate(gemmi::Structure &structure, gemmi::Grid<>& grid, float resolution, gemmi::Grid<>& glycan_grid, gemmi::Grid<>& protein_grid, int cycles, std::string &resource_dir, bool verbose) { - Sails::Glycosites glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, true, resource_dir); + Sails::Glycosites predicted_glycosites = identify_predicted_sites(structure, glycan_grid, protein_grid, false, resource_dir); + std::cout << "Found " << predicted_glycosites.size() << " potential sites using deep learning models" << std::endl; + Sails::Glycosites n_glycosites = Sails::find_n_glycosylation_sites(structure); + Sails::Glycosites c_glycosites = 
Sails::find_c_glycosylation_sites(structure); + + std::set glycosites_set = {predicted_glycosites.begin(), predicted_glycosites.end()}; + glycosites_set.insert(n_glycosites.begin(), n_glycosites.end()); + glycosites_set.insert(c_glycosites.begin(), c_glycosites.end()); + Sails::Glycosites glycosites = {glycosites_set.begin(), glycosites_set.end()}; + int diff = static_cast(glycosites.size()) - static_cast(predicted_glycosites.size()); + std::cout << "Supplemented with " << diff << " sites from the sequence" << std::endl; + + // prefer to glycosylate N first, then C, then O. + std::sort(glycosites.begin(), glycosites.end(), + [&](const Sails::Glycosite& a, const Sails::Glycosite& b) { + auto rank = [&](const Sails::Glycosite& s) { + gemmi::Residue* residue = Sails::Utils::get_residue_ptr_from_glycosite(s, &structure); + if (residue->name == "ASN") return 0; + if (residue->name == "TRP") return 1; + if (residue->name == "SER" || residue->name == "THR") return 2; + return 3; + }; + return rank(a) < rank(b); + }); + return run_em_cycle(glycosites, structure, grid, resolution, cycles, resource_dir, false, verbose); } From 18217ddab28d20eb0e5e3cf0be2dd0436bd8b6e1 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 28 Oct 2025 16:11:38 +0000 Subject: [PATCH 50/56] Added chain and seqid options to validate --- package/src/bindings/python_sails.cpp | 1 + package/src/cpp/sails.cpp | 75 +++++++++++++++++++++++++++ package/src/sails/__init__.py | 2 + package/src/sails/validate.py | 21 ++++++-- 4 files changed, 95 insertions(+), 4 deletions(-) diff --git a/package/src/bindings/python_sails.cpp b/package/src/bindings/python_sails.cpp index 21afd66..7b14782 100644 --- a/package/src/bindings/python_sails.cpp +++ b/package/src/bindings/python_sails.cpp @@ -282,6 +282,7 @@ NB_MODULE(sails_module, m) { // XRAY m.def("validate", nb::overload_cast(&validate), "structure"_a, "mtz"_a, "remove"_a, "threshold"_a, "resource_dir"_a); + 
m.def("validate_site", nb::overload_cast(&validate_site), "structure"_a, "mtz"_a, "chain"_a, "seqid"_a, "remove"_a, "threshold"_a, "resource_dir"_a); // EM m.def("validate", nb::overload_cast &, float, bool, float, bool, std::string &>(&validate), "structure"_a, "grid"_a, "resolution"_a, "remove"_a, "threshold"_a, "use_q"_a, "resource_dir"_a); diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 39d1462..7f5ae45 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -837,6 +837,81 @@ Sails::Output validate(gemmi::Structure& structure, Sails::MTZ &sails_mtz, bool }; } +Sails::Output validate_site(gemmi::Structure& structure, Sails::MTZ &sails_mtz, std::string& chain, int seqid, bool remove, float threshold, std::string& resource_dir) { + std::string data_file = resource_dir + "/data.json"; + Sails::JSONLoader loader = {data_file}; + Sails::ResidueDatabase residue_database = loader.load_residue_database(); + Sails::LinkageDatabase linkage_database = loader.load_linkage_database(); + + gemmi::Mtz mtz = form_gemmi_mtz(sails_mtz); + check_spacegroup(&mtz, &structure); // check to ensure the MTZ has a spacegroup + + auto density = Sails::XtalDensity(mtz); + density.load_map_coefficients(); + + std::vector to_remove = {}; + std::vector log = {}; + + std::optional potential_site = Sails::find_site(structure, chain, seqid); + if (!potential_site.has_value()) { + throw std::runtime_error("Could not find potential site"); + } + std::cout << "Validating glycans at " << Sails::Utils::format_residue_from_site(potential_site.value(), &structure) << std::endl; + + Sails::Topology topology = {&structure, residue_database}; + auto glycan = topology.find_glycan_topology(potential_site.value()); + auto glycan_sites = glycan.get_sites(); + + std::cout << "Found " << glycan_sites.size() << " sites" << std::endl; + + std::map rsccs = Sails::Score::calculate_rsccs(&density, &structure, residue_database); + + for (auto& [site, rscc]: rsccs) { + if 
(std::find(glycan_sites.begin(), glycan_sites.end(), site) == glycan_sites.end()) continue; + + std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); + log.emplace_back(residue_key, rscc); + if (rscc > threshold) { + continue; + } + to_remove.emplace_back(site); + } + + if (remove) { + Sails::Topology topology = {&structure, residue_database}; + + std::set removal_set = {to_remove.begin(), to_remove.end()}; + + for (auto &site: to_remove) { + auto glycan = topology.find_glycan_topology(site); + std::vector downstream_sugars = glycan.get_downstream_sugars(site); + for (auto& downstream_sugar: downstream_sugars) { + if (std::find(removal_set.begin(), removal_set.end(), downstream_sugar->site) != removal_set.end()) continue; + downstream_sugar->site.atom_idx = 0; // remove atom site from site to allow sorting + removal_set.insert(downstream_sugar->site); + } + } + + std::vector removal_list = {removal_set.begin(), removal_set.end()}; + + std::sort(removal_list.begin(), removal_list.end(), [](const Sails::Glycosite& a, const Sails::Glycosite& b) { + return !(a < b); + }); + + for (auto &site: removal_list) { + const auto residue_ptr = &structure.models[site.model_idx].chains[site.chain_idx].residues; + residue_ptr->erase(residue_ptr->begin() + site.residue_idx); + } + } + + + std::string log_string = Sails::Telemetry::format_log(log, false, "").value(); + return { + structure, + log_string + }; +} + Sails::Output validate(gemmi::Structure& structure, gemmi::Grid<>& grid, float resolution, bool remove, float threshold, bool use_q, std::string& resource_dir) { std::string data_file = resource_dir + "/data.json"; Sails::JSONLoader loader = {data_file}; diff --git a/package/src/sails/__init__.py b/package/src/sails/__init__.py index 08abcb6..2b5b677 100644 --- a/package/src/sails/__init__.py +++ b/package/src/sails/__init__.py @@ -33,6 +33,7 @@ Connections, AtomAddress, ResidueId, + validate_site, ) from .__version__ import __version__ from 
.glycosylate import glycosylate_xtal, glycosylate_em, Type @@ -93,4 +94,5 @@ "Connections", "AtomAddress", "ResidueId", + "validate_site", ] diff --git a/package/src/sails/validate.py b/package/src/sails/validate.py index 83d3b41..15e90c2 100644 --- a/package/src/sails/validate.py +++ b/package/src/sails/validate.py @@ -4,7 +4,7 @@ from .__version__ import __version__ import importlib -from sails import validate, interface +from sails import validate, validate_site, interface from .glycosylate import get_column_labels @@ -41,6 +41,8 @@ def parse_args(): xray_parser_group.add_argument( "--colin-fwt", type=str, required=False, default="FWT,PHWT" ) + xray_parser_group.add_argument("--chain", type=str, required=False) + xray_parser_group.add_argument("--seqid", type=str, required=False) em_parser = subparsers.add_parser("em", parents=[parent], formatter_class=formatter) em_parser_group = em_parser.add_argument_group("Required arguments in EM mode") @@ -60,9 +62,20 @@ def xray(args): labels = get_column_labels(args.colin_fo, args.colin_fwt) sails_mtz = interface.get_sails_mtz(args.mtzin, *labels) - result = validate( - sails_structure, sails_mtz, args.remove, args.threshold, str(resource) - ) + if args.chain and args.seqid: + result = validate_site( + sails_structure, + sails_mtz, + args.chain, + args.seqid, + args.remove, + args.threshold, + str(resource), + ) + else: + result = validate( + sails_structure, sails_mtz, args.remove, args.threshold, str(resource) + ) structure = interface.extract_sails_structure(result.structure) structure.make_mmcif_block().write_file(args.modelout) From dcac4ba95f4b8b2c833e290bb329de0752805f47 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:41:41 +0000 Subject: [PATCH 51/56] Fixed call to logging in EM mode --- package/src/cpp/sails.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 
7f5ae45..dc982f7 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -170,7 +170,7 @@ void remove_erroneous_sugars_em(gemmi::Structure *structure, Sails::Density *den print_qscore(snd->site, qscore, structure); if (qscore < qscore_threshold) { to_remove.emplace_back(snd.get()); // add pointer to remove - if (debug) print_removal_rscc(snd->site, qscore, structure); + if (debug) print_removal_qscore(snd->site, qscore, structure); } } else { std::cout << Sails::Utils::format_site_key(fst) << " | " << Sails::Utils::format_site_key(snd->site) << std::endl; @@ -872,8 +872,9 @@ Sails::Output validate_site(gemmi::Structure& structure, Sails::MTZ &sails_mtz, std::string residue_key = Sails::Utils::format_residue_from_site(site, &structure); log.emplace_back(residue_key, rscc); if (rscc > threshold) { - continue; + continue; } + std::cout << "Scheduling " << Sails::Utils::format_residue_from_site(site, &structure) << " for removal because RSCC " << rscc << "<" << threshold << std::endl; to_remove.emplace_back(site); } From 23c552318e15e9e005be50938a94e9545d31d461 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:29:56 +0000 Subject: [PATCH 52/56] Fixed issue with missing donor atoms --- package/src/cpp/sails-glycan.cpp | 3 ++- package/src/cpp/sails-topology.cpp | 6 ++++++ package/src/cpp/sails.cpp | 4 ++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/package/src/cpp/sails-glycan.cpp b/package/src/cpp/sails-glycan.cpp index 79974eb..d246de8 100644 --- a/package/src/cpp/sails-glycan.cpp +++ b/package/src/cpp/sails-glycan.cpp @@ -141,7 +141,8 @@ std::set Sails::Glycan::operator-(const Glycan& glycan) { std::vector Sails::Glycan::get_terminal_sugars(Glycosite &root_seq_id) { if (sugars.find(root_seq_id) == sugars.end()) { - throw std::runtime_error("Root SeqId is not valid"); + // throw std::runtime_error("Root SeqId is not valid"); + return {}; } std::vector 
terminal_sugars; dfs_terminal(sugars[root_seq_id].get(), terminal_sugars); diff --git a/package/src/cpp/sails-topology.cpp b/package/src/cpp/sails-topology.cpp index 656e9ff..bbdc8a3 100644 --- a/package/src/cpp/sails-topology.cpp +++ b/package/src/cpp/sails-topology.cpp @@ -32,6 +32,12 @@ void Sails::Topology::find_residue_near_donor(Glycosite &glycosite, Glycan &glyc auto database_entry = m_database[residue.name]; for (const auto &donor: database_entry.donors) { + // check if at least one atom, if not add the root but no further sugars + gemmi::Atom* atom = residue.find_atom(donor.atom3, '*'); + if (atom == nullptr) { + continue; + } + // get donor atoms with that name, could return > 1 with altconfs gemmi::AtomGroup donor_atoms = residue.get(donor.atom3); for (const auto &donor_atom: donor_atoms) { diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index dc982f7..3e0fcab 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -315,9 +315,9 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu std::set differences = old_glycan - new_glycan; telemetry >> differences; - std::string snfg_string = snfg.create_snfg(new_glycan, glycosite); + // std::string snfg_string = snfg.create_snfg(new_glycan, glycosite); std::string glycosite_key = Sails::Utils::format_residue_from_site(glycosite, &structure); - telemetry.save_snfg(i, glycosite_key, snfg_string); + // telemetry.save_snfg(i, glycosite_key, snfg_string); } if (verbose && !unmodellable_sites.empty()) { From 8aa0aee4e6a4aa3f4bed391768945091d0c0b9fb Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:30:23 +0000 Subject: [PATCH 53/56] Updated data.json --- package/src/sails/data/data.json | 875 ++++++++++++++++++++----------- 1 file changed, 568 insertions(+), 307 deletions(-) diff --git a/package/src/sails/data/data.json b/package/src/sails/data/data.json index a5f7ee1..dba1b60 100644 --- 
a/package/src/sails/data/data.json +++ b/package/src/sails/data/data.json @@ -181,7 +181,8 @@ "snfgColour": "#0090bc", "preferredDepths": [ 1, - 2 + 2, + 7 ], "anomer": "β", "wurcsCode": "a2122h-1b_1-5_2*NCC/3=O", @@ -311,38 +312,79 @@ "clusters": [ { "angles": { - "alphaMean": 122.28, - "alphaStdDev": 5.253, - "betaMean": 109.788, - "betaStdDev": 5.253, - "gammaMean": 113.963, - "gammaStdDev": 1.54 + "alphaMean": 129.837, + "alphaStdDev": 13.616, + "betaMean": 110.011, + "betaStdDev": 13.616, + "gammaMean": 113.524, + "gammaStdDev": 2.266 }, "torsions": { - "phiMean": -106.706, - "phiStdDev": 30.219, - "psiMean": 179.446, - "psiStdDev": 28.245, - "omegaMean": 176.266, - "omegaStdDev": 7.108 - } - },{ - "angles": { - "alphaMean": 127.765, - "alphaStdDev": 8.153, - "betaMean": 112.241, - "betaStdDev": 8.153, - "gammaMean": 113.442, - "gammaStdDev": 1.689 - }, - "torsions": { - "phiMean": 54.086, - "phiStdDev": 36.298, - "psiMean": -179.512, - "psiStdDev": 53.054, - "omegaMean": 167.981, - "omegaStdDev": 11.815 - } + "phiMean": 66.501, + "phiStdDev": 101.828, + "psiMean": 180.67, + "psiStdDev": 73.142, + "omegaMean": 161.983, + "omegaStdDev": 18.148 + }, + "priority": false + }, + { + "angles": { + "alphaMean": 127.742, + "alphaStdDev": 5.725, + "betaMean": 109.106, + "betaStdDev": 5.725, + "gammaMean": 114.671, + "gammaStdDev": 1.942 + }, + "torsions": { + "phiMean": -97.7, + "phiStdDev": 27.286, + "psiMean": 359.451, + "psiStdDev": 6.833, + "omegaMean": -179.573, + "omegaStdDev": 8.716 + }, + "priority": false + }, + { + "angles": { + "alphaMean": 121.376, + "alphaStdDev": 4.178, + "betaMean": 109.598, + "betaStdDev": 4.178, + "gammaMean": 113.848, + "gammaStdDev": 1.686 + }, + "torsions": { + "phiMean": -102.636, + "phiStdDev": 25.777, + "psiMean": 179.124, + "psiStdDev": 6.129, + "omegaMean": 176.271, + "omegaStdDev": 6.38 + }, + "priority": true + }, + { + "angles": { + "alphaMean": 125.118, + "alphaStdDev": 4.63, + "betaMean": 112.176, + "betaStdDev": 4.63, + 
"gammaMean": 113.306, + "gammaStdDev": 2.018 + }, + "torsions": { + "phiMean": 57.633, + "phiStdDev": 16.731, + "psiMean": 180.055, + "psiStdDev": 4.686, + "omegaMean": 168.839, + "omegaStdDev": 8.195 + }, + "priority": false } ] }, @@ -355,21 +397,22 @@ "clusters": [ { "angles": { - "alphaMean": 127.838, - "alphaStdDev": 5.582, - "betaMean": 109.84, - "betaStdDev": 5.582, - "gammaMean": 116.699, - "gammaStdDev": 2.541 + "alphaMean": 125.332, + "alphaStdDev": 3.684, + "betaMean": 109.285, + "betaStdDev": 3.684, + "gammaMean": 117.773, + "gammaStdDev": 1.929 }, "torsions": { - "phiMean": 116.884, - "phiStdDev": 25.889, - "psiMean": -176.651, - "psiStdDev": 11.025, - "omegaMean": 162.145, - "omegaStdDev": 35.401 - } + "phiMean": 120.063, + "phiStdDev": 13.997, + "psiMean": 183.105, + "psiStdDev": 5.923, + "omegaMean": 166.487, + "omegaStdDev": 27.968 + }, + "priority": true } ] }, @@ -382,21 +425,22 @@ "clusters": [ { "angles": { - "alphaMean": 113.548, - "alphaStdDev": 5.876, - "betaMean": 111.95, - "betaStdDev": 5.876, - "gammaMean": 114.725, - "gammaStdDev": 1.774 + "alphaMean": 113.152, + "alphaStdDev": 4.541, + "betaMean": 112.019, + "betaStdDev": 4.541, + "gammaMean": 114.806, + "gammaStdDev": 1.708 }, "torsions": { - "phiMean": 72.866, - "phiStdDev": 28.094, - "psiMean": 175.3, - "psiStdDev": 33.92, - "omegaMean": 60.137, - "omegaStdDev": 10.249 - } + "phiMean": 69.173, + "phiStdDev": 21.872, + "psiMean": 172.894, + "psiStdDev": 30.103, + "omegaMean": 60.268, + "omegaStdDev": 9.139 + }, + "priority": true } ] }, @@ -409,21 +453,22 @@ "clusters": [ { "angles": { - "alphaMean": 113.367, - "alphaStdDev": 6.634, - "betaMean": 111.89, - "betaStdDev": 6.634, - "gammaMean": 114.592, - "gammaStdDev": 2.038 + "alphaMean": 112.707, + "alphaStdDev": 5.556, + "betaMean": 111.711, + "betaStdDev": 5.556, + "gammaMean": 114.497, + "gammaStdDev": 1.981 }, "torsions": { - "phiMean": 86.251, - "phiStdDev": 27.802, - "psiMean": 130.215, - "psiStdDev": 25.722, - "omegaMean": 
61.975, - "omegaStdDev": 11.282 - } + "phiMean": 88.006, + "phiStdDev": 28.579, + "psiMean": 129.928, + "psiStdDev": 25.827, + "omegaMean": 62.706, + "omegaStdDev": 10.134 + }, + "priority": true } ] }, @@ -436,39 +481,79 @@ "clusters": [ { "angles": { - "alphaMean": 117.289, - "alphaStdDev": 7.737, - "betaMean": 110.997, - "betaStdDev": 7.737, - "gammaMean": 113.811, - "gammaStdDev": 1.418 + "alphaMean": 124.537, + "alphaStdDev": 9.917, + "betaMean": 109.82, + "betaStdDev": 9.917, + "gammaMean": 113.801, + "gammaStdDev": 2.256 }, "torsions": { - "phiMean": 30.089, - "phiStdDev": 51.985, - "psiMean": -124.86, - "psiStdDev": 27.34, - "omegaMean": 175.963, - "omegaStdDev": 7.299 - } + "phiMean": 66.225, + "phiStdDev": 110.625, + "psiMean": -138.585, + "psiStdDev": 60.914, + "omegaMean": 172.277, + "omegaStdDev": 11.969 + }, + "priority": false }, { "angles": { - "alphaMean": 112.665, - "alphaStdDev": 4.676, - "betaMean": 111.191, - "betaStdDev": 4.676, - "gammaMean": 113.81, - "gammaStdDev": 1.316 + "alphaMean": 118.5, + "alphaStdDev": 4.218, + "betaMean": 110.849, + "betaStdDev": 4.218, + "gammaMean": 114.828, + "gammaStdDev": 1.339 }, "torsions": { - "phiMean": -80.049, - "phiStdDev": 17.934, - "psiMean": -127.513, - "psiStdDev": 24.467, - "omegaMean": 179.044, - "omegaStdDev": 4.33 - } + "phiMean": -92.052, + "phiStdDev": 13.796, + "psiMean": 61.167, + "psiStdDev": 15.518, + "omegaMean": -178.618, + "omegaStdDev": 6.176 + }, + "priority": false + }, + { + "angles": { + "alphaMean": 113.04, + "alphaStdDev": 3.917, + "betaMean": 111.017, + "betaStdDev": 3.917, + "gammaMean": 113.571, + "gammaStdDev": 1.718 + }, + "torsions": { + "phiMean": -79.724, + "phiStdDev": 16.764, + "psiMean": -127.925, + "psiStdDev": 15.296, + "omegaMean": 178.681, + "omegaStdDev": 4.643 + }, + "priority": true + }, + { + "angles": { + "alphaMean": 120.232, + "alphaStdDev": 3.25, + "betaMean": 109.47, + "betaStdDev": 3.25, + "gammaMean": 114.125, + "gammaStdDev": 2.327 + }, + "torsions": { + 
"phiMean": 85.744, + "phiStdDev": 12.098, + "psiMean": -109.446, + "psiStdDev": 7.651, + "omegaMean": 172.163, + "omegaStdDev": 6.743 + }, + "priority": false } ] }, @@ -481,39 +566,60 @@ "clusters": [ { "angles": { - "alphaMean": 112.045, - "alphaStdDev": 5.979, - "betaMean": 111.561, - "betaStdDev": 5.979, - "gammaMean": 114.921, - "gammaStdDev": 1.979 + "alphaMean": 112.17, + "alphaStdDev": 6.577, + "betaMean": 110.888, + "betaStdDev": 6.577, + "gammaMean": 115.115, + "gammaStdDev": 2.084 }, "torsions": { - "phiMean": -83.971, - "phiStdDev": 30.056, - "psiMean": -159.668, - "psiStdDev": 27.959, - "omegaMean": -63.8, - "omegaStdDev": 8.514 - } + "phiMean": -84.596, + "phiStdDev": 30.324, + "psiMean": 169.536, + "psiStdDev": 49.974, + "omegaMean": -64.177, + "omegaStdDev": 10.422 + }, + "priority": true }, { "angles": { - "alphaMean": 111.376, - "alphaStdDev": 3.955, - "betaMean": 112.227, - "betaStdDev": 3.955, - "gammaMean": 115.159, - "gammaStdDev": 2.054 + "alphaMean": 111.943, + "alphaStdDev": 3.334, + "betaMean": 111.523, + "betaStdDev": 3.334, + "gammaMean": 115.16, + "gammaStdDev": 1.963 }, "torsions": { - "phiMean": -81.232, - "phiStdDev": 30.292, - "psiMean": 125.964, - "psiStdDev": 23.346, - "omegaMean": -65.062, - "omegaStdDev": 8.861 - } + "phiMean": -73.943, + "phiStdDev": 8.372, + "psiMean": -174.486, + "psiStdDev": 11.471, + "omegaMean": -62.223, + "omegaStdDev": 4.214 + }, + "priority": false + }, + { + "angles": { + "alphaMean": 111.501, + "alphaStdDev": 2.805, + "betaMean": 112.175, + "betaStdDev": 2.805, + "gammaMean": 115.277, + "gammaStdDev": 1.588 + }, + "torsions": { + "phiMean": -72.707, + "phiStdDev": 5.713, + "psiMean": 125.347, + "psiStdDev": 17.233, + "omegaMean": -62.404, + "omegaStdDev": 3.752 + }, + "priority": false } ] }, @@ -526,39 +632,22 @@ "clusters": [ { "angles": { - "alphaMean": 114.468, - "alphaStdDev": 3.421, - "betaMean": 113.049, - "betaStdDev": 3.421, - "gammaMean": 115.227, - "gammaStdDev": 2.164 + "alphaMean": 
114.315, + "alphaStdDev": 4.321, + "betaMean": 113.357, + "betaStdDev": 4.321, + "gammaMean": 114.783, + "gammaStdDev": 2.161 }, "torsions": { - "phiMean": -67.902, - "phiStdDev": 13.039, - "psiMean": 139.754, - "psiStdDev": 8.787, - "omegaMean": -65.706, - "omegaStdDev": 7.385 - } - }, - { - "angles": { - "alphaMean": 120.567, - "alphaStdDev": 5.481, - "betaMean": 110.511, - "betaStdDev": 5.481, - "gammaMean": 115.25, - "gammaStdDev": 2.348 + "phiMean": -70.877, + "phiStdDev": 12.529, + "psiMean": 137.662, + "psiStdDev": 14.607, + "omegaMean": -63.846, + "omegaStdDev": 6.328 }, - "torsions": { - "phiMean": -108.059, - "phiStdDev": 44.452, - "psiMean": 10.613, - "psiStdDev": 44.212, - "omegaMean": -75.286, - "omegaStdDev": 13.871 - } + "priority": true } ] }, @@ -571,39 +660,79 @@ "clusters": [ { "angles": { - "alphaMean": 112.963, - "alphaStdDev": 4.496, - "betaMean": 111.259, - "betaStdDev": 4.496, - "gammaMean": 113.771, - "gammaStdDev": 1.62 + "alphaMean": 120.117, + "alphaStdDev": 6.809, + "betaMean": 114.207, + "betaStdDev": 6.809, + "gammaMean": 113.329, + "gammaStdDev": 2.424 }, "torsions": { - "phiMean": -83.669, - "phiStdDev": 38.257, - "psiMean": -131.478, - "psiStdDev": 14.602, - "omegaMean": 178.472, - "omegaStdDev": 6.132 - } + "phiMean": 117.475, + "phiStdDev": 112.234, + "psiMean": -152.707, + "psiStdDev": 45.381, + "omegaMean": 169.41, + "omegaStdDev": 19.467 + }, + "priority": false }, { "angles": { - "alphaMean": 118.402, - "alphaStdDev": 5.23, - "betaMean": 111.058, - "betaStdDev": 5.23, - "gammaMean": 114.405, - "gammaStdDev": 1.96 + "alphaMean": 118.887, + "alphaStdDev": 3.828, + "betaMean": 110.238, + "betaStdDev": 3.828, + "gammaMean": 115.762, + "gammaStdDev": 2.336 }, "torsions": { - "phiMean": -110.737, - "phiStdDev": 41.555, - "psiMean": 78.36, - "psiStdDev": 25.132, - "omegaMean": 179.908, - "omegaStdDev": 8.293 - } + "phiMean": -117.12, + "phiStdDev": 17.269, + "psiMean": 69.913, + "psiStdDev": 15.995, + "omegaMean": -177.385, + 
"omegaStdDev": 10.382 + }, + "priority": false + }, + { + "angles": { + "alphaMean": 112.92, + "alphaStdDev": 3.717, + "betaMean": 110.727, + "betaStdDev": 3.717, + "gammaMean": 113.734, + "gammaStdDev": 1.855 + }, + "torsions": { + "phiMean": -84.638, + "phiStdDev": 16.181, + "psiMean": -131.408, + "psiStdDev": 14.169, + "omegaMean": 179.148, + "omegaStdDev": 5.325 + }, + "priority": true + }, + { + "angles": { + "alphaMean": 117.773, + "alphaStdDev": 3.189, + "betaMean": 113.615, + "betaStdDev": 3.189, + "gammaMean": 111.687, + "gammaStdDev": 1.326 + }, + "torsions": { + "phiMean": 51.356, + "phiStdDev": 9.507, + "psiMean": -121.891, + "psiStdDev": 7.708, + "omegaMean": 171.466, + "omegaStdDev": 3.746 + }, + "priority": false } ] }, @@ -616,39 +745,60 @@ "clusters": [ { "angles": { - "alphaMean": 112.696, - "alphaStdDev": 4.802, - "betaMean": 111.185, - "betaStdDev": 4.802, - "gammaMean": 114.511, - "gammaStdDev": 1.534 + "alphaMean": 122.303, + "alphaStdDev": 8.282, + "betaMean": 112.715, + "betaStdDev": 8.282, + "gammaMean": 114.869, + "gammaStdDev": 3.104 }, "torsions": { - "phiMean": 124.141, - "phiStdDev": 27.455, - "psiMean": 167.284, - "psiStdDev": 52.376, - "omegaMean": 62.53, - "omegaStdDev": 6.109 - } + "phiMean": -103.923, + "phiStdDev": 133.36, + "psiMean": -178.019, + "psiStdDev": 111.488, + "omegaMean": 72.846, + "omegaStdDev": 16.927 + }, + "priority": false }, { "angles": { - "alphaMean": 111.777, - "alphaStdDev": 4.176, - "betaMean": 112.14, - "betaStdDev": 4.176, - "gammaMean": 114.921, - "gammaStdDev": 1.929 + "alphaMean": 111.884, + "alphaStdDev": 3.674, + "betaMean": 112.374, + "betaStdDev": 3.674, + "gammaMean": 115.079, + "gammaStdDev": 2.103 }, "torsions": { - "phiMean": 76.691, - "phiStdDev": 17.032, - "psiMean": 120.879, - "psiStdDev": 18.587, - "omegaMean": 63.32, - "omegaStdDev": 6.55 - } + "phiMean": 79.668, + "phiStdDev": 22.254, + "psiMean": 122.875, + "psiStdDev": 21.525, + "omegaMean": 63.173, + "omegaStdDev": 6.833 + }, + 
"priority": true + }, + { + "angles": { + "alphaMean": 116.704, + "alphaStdDev": 2.744, + "betaMean": 110.516, + "betaStdDev": 2.744, + "gammaMean": 114.326, + "gammaStdDev": 1.492 + }, + "torsions": { + "phiMean": 115.031, + "phiStdDev": 15.957, + "psiMean": -45.776, + "psiStdDev": 18.655, + "omegaMean": 60.43, + "omegaStdDev": 5.04 + }, + "priority": false } ] }, @@ -661,39 +811,60 @@ "clusters": [ { "angles": { - "alphaMean": 111.245, - "alphaStdDev": 5.137, - "betaMean": 111.618, - "betaStdDev": 5.137, - "gammaMean": 114.578, - "gammaStdDev": 2.196 + "alphaMean": 111.97, + "alphaStdDev": 6.422, + "betaMean": 112.206, + "betaStdDev": 6.422, + "gammaMean": 114.618, + "gammaStdDev": 2.526 }, "torsions": { - "phiMean": 91.041, - "phiStdDev": 36.251, - "psiMean": -165.287, - "psiStdDev": 29.928, - "omegaMean": 62.662, - "omegaStdDev": 6.365 - } + "phiMean": 121.737, + "phiStdDev": 59.772, + "psiMean": -161.178, + "psiStdDev": 59.274, + "omegaMean": 67.007, + "omegaStdDev": 11.904 + }, + "priority": false }, { "angles": { - "alphaMean": 111.215, - "alphaStdDev": 5.15, - "betaMean": 111.891, - "betaStdDev": 5.15, - "gammaMean": 114.514, - "gammaStdDev": 1.803 + "alphaMean": 111.083, + "alphaStdDev": 3.186, + "betaMean": 111.653, + "betaStdDev": 3.186, + "gammaMean": 114.791, + "gammaStdDev": 1.861 }, "torsions": { - "phiMean": 99.177, - "phiStdDev": 38.324, - "psiMean": 122.388, - "psiStdDev": 28.196, - "omegaMean": 62.79, - "omegaStdDev": 6.586 - } + "phiMean": 75.669, + "phiStdDev": 23.342, + "psiMean": 178.455, + "psiStdDev": 13.232, + "omegaMean": 61.6, + "omegaStdDev": 5.504 + }, + "priority": true + }, + { + "angles": { + "alphaMean": 112.288, + "alphaStdDev": 3.591, + "betaMean": 113.29, + "betaStdDev": 3.591, + "gammaMean": 114.578, + "gammaStdDev": 1.955 + }, + "torsions": { + "phiMean": 84.192, + "phiStdDev": 19.648, + "psiMean": 87.973, + "psiStdDev": 8.041, + "omegaMean": 59.971, + "omegaStdDev": 5.414 + }, + "priority": false } ] }, @@ -706,39 +877,60 @@ 
"clusters": [ { "angles": { - "alphaMean": 113.427, - "alphaStdDev": 3.654, - "betaMean": 110.13, - "betaStdDev": 3.654, - "gammaMean": 114.421, - "gammaStdDev": 1.585 + "alphaMean": 115.429, + "alphaStdDev": 6.779, + "betaMean": 112.096, + "betaStdDev": 6.779, + "gammaMean": 114.219, + "gammaStdDev": 2.552 }, "torsions": { - "phiMean": 149.427, - "phiStdDev": 29.129, - "psiMean": 166.702, - "psiStdDev": 28.999, - "omegaMean": 62.546, - "omegaStdDev": 7.578 - } + "phiMean": 106.363, + "phiStdDev": 45.966, + "psiMean": 139.652, + "psiStdDev": 51.135, + "omegaMean": 65.193, + "omegaStdDev": 9.841 + }, + "priority": false }, { "angles": { - "alphaMean": 111.736, - "alphaStdDev": 3.807, - "betaMean": 111.726, - "betaStdDev": 3.807, - "gammaMean": 114.934, - "gammaStdDev": 1.849 + "alphaMean": 111.999, + "alphaStdDev": 3.669, + "betaMean": 111.888, + "betaStdDev": 3.669, + "gammaMean": 114.916, + "gammaStdDev": 1.883 }, "torsions": { - "phiMean": 81.341, - "phiStdDev": 15.082, - "psiMean": 127.531, - "psiStdDev": 21.67, - "omegaMean": 62.113, - "omegaStdDev": 5.308 - } + "phiMean": 81.869, + "phiStdDev": 11.331, + "psiMean": 135.629, + "psiStdDev": 18.959, + "omegaMean": 61.031, + "omegaStdDev": 4.842 + }, + "priority": true + }, + { + "angles": { + "alphaMean": 116.877, + "alphaStdDev": 2.86, + "betaMean": 114.58, + "betaStdDev": 2.86, + "gammaMean": 114.957, + "gammaStdDev": 2.177 + }, + "torsions": { + "phiMean": 61.352, + "phiStdDev": 3.213, + "psiMean": 68.128, + "psiStdDev": 4.275, + "omegaMean": 61.197, + "omegaStdDev": 3.494 + }, + "priority": false } ] }, @@ -751,39 +943,60 @@ "clusters": [ { "angles": { - "alphaMean": 111.267, - "alphaStdDev": 4.451, - "betaMean": 112.338, - "betaStdDev": 4.451, - "gammaMean": 115.005, - "gammaStdDev": 1.642 + "alphaMean": 157.741, + "alphaStdDev": -0.0, + "betaMean": 84.781, + "betaStdDev": -0.0, + "gammaMean": 115.292, + "gammaStdDev": -0.0 }, "torsions": { - "phiMean": 87.585, - "phiStdDev": 26.284, - "psiMean": 127.542, - 
"psiStdDev": 21.99, - "omegaMean": 63.792, - "omegaStdDev": 6.819 - } + "phiMean": -162.431, + "phiStdDev": -0.0, + "psiMean": -45.792, + "psiStdDev": -0.0, + "omegaMean": 85.172, + "omegaStdDev": -0.0 + }, + "priority": false }, { "angles": { - "alphaMean": 117.959, - "alphaStdDev": 8.512, - "betaMean": 110.22, - "betaStdDev": 8.512, - "gammaMean": 114.023, - "gammaStdDev": 1.649 + "alphaMean": 112.28, + "alphaStdDev": 4.978, + "betaMean": 112.174, + "betaStdDev": 4.978, + "gammaMean": 114.722, + "gammaStdDev": 1.941 }, "torsions": { - "phiMean": 109.582, - "phiStdDev": 28.224, - "psiMean": -37.08, - "psiStdDev": 30.971, - "omegaMean": 60.351, - "omegaStdDev": 8.854 - } + "phiMean": 81.551, + "phiStdDev": 19.6, + "psiMean": 130.234, + "psiStdDev": 22.768, + "omegaMean": 61.843, + "omegaStdDev": 6.198 + }, + "priority": true + }, + { + "angles": { + "alphaMean": 116.969, + "alphaStdDev": 2.361, + "betaMean": 110.661, + "betaStdDev": 2.361, + "gammaMean": 114.473, + "gammaStdDev": 1.862 + }, + "torsions": { + "phiMean": 108.755, + "phiStdDev": 22.812, + "psiMean": -41.195, + "psiStdDev": 22.631, + "omegaMean": 59.388, + "omegaStdDev": 7.295 + }, + "priority": false } ] }, @@ -796,38 +1009,86 @@ "clusters": [ { "angles": { - "alphaMean": 112.164, - "alphaStdDev": 5.285, - "betaMean": 111.948, - "betaStdDev": 5.285, - "gammaMean": 114.89, - "gammaStdDev": 1.67 + "alphaMean": 112.833, + "alphaStdDev": 8.327, + "betaMean": 112.546, + "betaStdDev": 8.327, + "gammaMean": 114.25, + "gammaStdDev": 2.213 }, "torsions": { - "phiMean": 69.439, - "phiStdDev": 16.826, - "psiMean": 178.823, - "psiStdDev": 36.578, - "omegaMean": 63.355, - "omegaStdDev": 7.64 - } + "phiMean": 99.941, + "phiStdDev": 58.35, + "psiMean": -179.936, + "psiStdDev": 55.528, + "omegaMean": 67.324, + "omegaStdDev": 12.981 + }, + "priority": false + }, + { + "angles": { + "alphaMean": 111.914, + "alphaStdDev": 3.471, + "betaMean": 111.639, + "betaStdDev": 3.471, + "gammaMean": 114.69, + "gammaStdDev": 2.042 
+ }, + "torsions": { + "phiMean": 69.712, + "phiStdDev": 11.052, + "psiMean": -174.851, + "psiStdDev": 12.285, + "omegaMean": 60.498, + "omegaStdDev": 5.127 + }, + "priority": true }, { "angles": { - "alphaMean": 110.925, - "alphaStdDev": 3.37, - "betaMean": 110.781, - "betaStdDev": 3.37, - "gammaMean": 114.455, - "gammaStdDev": 1.614 + "alphaMean": 107.91, + "alphaStdDev": 1.048, + "betaMean": 110.222, + "betaStdDev": 1.048, + "gammaMean": 115.455, + "gammaStdDev": 0.482 + }, + "torsions": { + "phiMean": 120.209, + "phiStdDev": 10.104, + "psiMean": -143.923, + "psiStdDev": 5.034, + "omegaMean": 67.549, + "omegaStdDev": 2.056 + }, + "priority": false + } + ] + }, + { + "donorResidue": "MAN", + "acceptorResidue": "NAG", + "donorNumber": 2, + "acceptorNumber": 1, + "length": 1.4, + "clusters": [ + { + "angles": { + "alphaMean": 112.374, + "alphaStdDev": 4.559, + "betaMean": 110.391, + "betaStdDev": 4.559, + "gammaMean": 113.604, + "gammaStdDev": 1.857 }, "torsions": { - "phiMean": 145.024, - "phiStdDev": 29.365, - "psiMean": -169.354, - "psiStdDev": 54.477, - "omegaMean": 63.742, - "omegaStdDev": 7.224 + "phiMean": -85.243, + "phiStdDev": 20.485, + "psiMean": 148.418, + "psiStdDev": 20.213, + "omegaMean": 178.521, + "omegaStdDev": 5.397 } } ] From 4a258a16333f3f66bd539c45ae0e12dc087b3d61 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:21:16 +0000 Subject: [PATCH 54/56] Removed bad clusters in data.json --- package/src/sails/data/data.json | 173 +------------------------------ 1 file changed, 1 insertion(+), 172 deletions(-) diff --git a/package/src/sails/data/data.json b/package/src/sails/data/data.json index dba1b60..f124461 100644 --- a/package/src/sails/data/data.json +++ b/package/src/sails/data/data.json @@ -310,25 +310,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 129.837, - "alphaStdDev": 13.616, - "betaMean": 110.011, - "betaStdDev": 13.616, - 
"gammaMean": 113.524, - "gammaStdDev": 2.266 - }, - "torsions": { - "phiMean": 66.501, - "phiStdDev": 101.828, - "psiMean": 180.67, - "psiStdDev": 73.142, - "omegaMean": 161.983, - "omegaStdDev": 18.148 - }, - "priority": false - }, { "angles": { "alphaMean": 127.742, @@ -479,25 +460,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 124.537, - "alphaStdDev": 9.917, - "betaMean": 109.82, - "betaStdDev": 9.917, - "gammaMean": 113.801, - "gammaStdDev": 2.256 - }, - "torsions": { - "phiMean": 66.225, - "phiStdDev": 110.625, - "psiMean": -138.585, - "psiStdDev": 60.914, - "omegaMean": 172.277, - "omegaStdDev": 11.969 - }, - "priority": false - }, { "angles": { "alphaMean": 118.5, @@ -564,25 +526,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 112.17, - "alphaStdDev": 6.577, - "betaMean": 110.888, - "betaStdDev": 6.577, - "gammaMean": 115.115, - "gammaStdDev": 2.084 - }, - "torsions": { - "phiMean": -84.596, - "phiStdDev": 30.324, - "psiMean": 169.536, - "psiStdDev": 49.974, - "omegaMean": -64.177, - "omegaStdDev": 10.422 - }, - "priority": true - }, { "angles": { "alphaMean": 111.943, @@ -600,7 +543,7 @@ "omegaMean": -62.223, "omegaStdDev": 4.214 }, - "priority": false + "priority": true }, { "angles": { @@ -658,25 +601,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 120.117, - "alphaStdDev": 6.809, - "betaMean": 114.207, - "betaStdDev": 6.809, - "gammaMean": 113.329, - "gammaStdDev": 2.424 - }, - "torsions": { - "phiMean": 117.475, - "phiStdDev": 112.234, - "psiMean": -152.707, - "psiStdDev": 45.381, - "omegaMean": 169.41, - "omegaStdDev": 19.467 - }, - "priority": false - }, { "angles": { "alphaMean": 118.887, @@ -743,25 +667,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 122.303, - "alphaStdDev": 8.282, - "betaMean": 112.715, - "betaStdDev": 8.282, - "gammaMean": 114.869, - "gammaStdDev": 3.104 - }, - 
"torsions": { - "phiMean": -103.923, - "phiStdDev": 133.36, - "psiMean": -178.019, - "psiStdDev": 111.488, - "omegaMean": 72.846, - "omegaStdDev": 16.927 - }, - "priority": false - }, { "angles": { "alphaMean": 111.884, @@ -809,25 +714,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 111.97, - "alphaStdDev": 6.422, - "betaMean": 112.206, - "betaStdDev": 6.422, - "gammaMean": 114.618, - "gammaStdDev": 2.526 - }, - "torsions": { - "phiMean": 121.737, - "phiStdDev": 59.772, - "psiMean": -161.178, - "psiStdDev": 59.274, - "omegaMean": 67.007, - "omegaStdDev": 11.904 - }, - "priority": false - }, { "angles": { "alphaMean": 111.083, @@ -875,25 +761,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 115.429, - "alphaStdDev": 6.779, - "betaMean": 112.096, - "betaStdDev": 6.779, - "gammaMean": 114.219, - "gammaStdDev": 2.552 - }, - "torsions": { - "phiMean": 106.363, - "phiStdDev": 45.966, - "psiMean": 139.652, - "psiStdDev": 51.135, - "omegaMean": 65.193, - "omegaStdDev": 9.841 - }, - "priority": false - }, { "angles": { "alphaMean": 111.999, @@ -941,25 +808,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 157.741, - "alphaStdDev": -0.0, - "betaMean": 84.781, - "betaStdDev": -0.0, - "gammaMean": 115.292, - "gammaStdDev": -0.0 - }, - "torsions": { - "phiMean": -162.431, - "phiStdDev": -0.0, - "psiMean": -45.792, - "psiStdDev": -0.0, - "omegaMean": 85.172, - "omegaStdDev": -0.0 - }, - "priority": false - }, { "angles": { "alphaMean": 112.28, @@ -1007,25 +855,6 @@ "acceptorNumber": 1, "length": 1.4, "clusters": [ - { - "angles": { - "alphaMean": 112.833, - "alphaStdDev": 8.327, - "betaMean": 112.546, - "betaStdDev": 8.327, - "gammaMean": 114.25, - "gammaStdDev": 2.213 - }, - "torsions": { - "phiMean": 99.941, - "phiStdDev": 58.35, - "psiMean": -179.936, - "psiStdDev": 55.528, - "omegaMean": 67.324, - "omegaStdDev": 12.981 - }, - "priority": false - }, { "angles": 
{ "alphaMean": 111.914, From 5b974b6b7361af972dc4816462ea392f11a32dde Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Wed, 10 Dec 2025 14:20:18 +0000 Subject: [PATCH 55/56] Updated prediction commandline to remove -m requirement --- package/src/sails/prediction/errors.py | 2 +- package/src/sails/prediction/model.py | 8 ++++---- package/src/sails/prediction/predict.py | 5 ++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/package/src/sails/prediction/errors.py b/package/src/sails/prediction/errors.py index d6691f7..e5b849f 100644 --- a/package/src/sails/prediction/errors.py +++ b/package/src/sails/prediction/errors.py @@ -21,7 +21,7 @@ def show_multiple_model_error(model_names: List[str]): """Show warning when multiple models are found""" multiple_model_names = "" for model_name in model_names: - multiple_model_names += f"\t-model {model_name}\n" + multiple_model_names += f"\t-m {model_name}\n" logging.warning(f""" Multiple models have been found in either site_packages or CCP4/lib/data. 
diff --git a/package/src/sails/prediction/model.py b/package/src/sails/prediction/model.py index 788be8c..76fe6e8 100644 --- a/package/src/sails/prediction/model.py +++ b/package/src/sails/prediction/model.py @@ -178,16 +178,16 @@ def extract_model_names(models: List[Path]) -> List[str]: return model_names -def find_model(model: ModelType | str | None) -> Path | None: +def find_model(model: ModelType | str | None) -> Tuple[Path, ModelType] | None: """Search through site-packages and CCP4/lib/data for a potential model""" potential_models = find_all_potential_models() if not potential_models: sys.exit(1) + model_names = extract_model_names(potential_models) if not model and len(potential_models) == 1: - return Path(potential_models[0]) + return Path(potential_models[0]), ModelType[model_names[0]] - model_names = extract_model_names(potential_models) if not model: show_multiple_model_error(model_names) sys.exit(1) @@ -199,7 +199,7 @@ def find_model(model: ModelType | str | None) -> Path | None: for name in model_names: if name == specified_model_name: - return Path(potential_models[model_names.index(name)]) + return Path(potential_models[model_names.index(name)]), ModelType[name] show_missing_specified_model_error(specified_model_name) sys.exit(1) diff --git a/package/src/sails/prediction/predict.py b/package/src/sails/prediction/predict.py index 1724e2a..60c2900 100644 --- a/package/src/sails/prediction/predict.py +++ b/package/src/sails/prediction/predict.py @@ -176,8 +176,7 @@ def run(): """Run prediction from command line arguments""" setup_logging() args = parse_arguments() - model = ModelType[args.model] - model_path = find_model(model) + model_path, model = find_model(args.model) model_configuration = get_model_config(model_path, args.overlap) configuration = Configuration( use_gpu=args.gpu, @@ -216,7 +215,7 @@ def predict_map( ) model = ModelType[model] - model_path = find_model(model) + model_path, _ = find_model(model) model_configuration = 
get_model_config(model_path, overlap) configuration = Configuration( use_gpu=False, From a477b9473a0a2722128e7cd0428e2ec2f4da15c9 Mon Sep 17 00:00:00 2001 From: Jordan Dialpuri <44945647+Dialpuri@users.noreply.github.com> Date: Wed, 10 Dec 2025 16:16:28 +0000 Subject: [PATCH 56/56] Updated basic testing --- package/src/cpp/sails.cpp | 4 ++-- package/tests/test_glycosylation.py | 8 ++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/package/src/cpp/sails.cpp b/package/src/cpp/sails.cpp index 3e0fcab..dc982f7 100644 --- a/package/src/cpp/sails.cpp +++ b/package/src/cpp/sails.cpp @@ -315,9 +315,9 @@ Sails::Output run_cycle(Sails::Glycosites &glycosites, gemmi::Structure &structu std::set differences = old_glycan - new_glycan; telemetry >> differences; - // std::string snfg_string = snfg.create_snfg(new_glycan, glycosite); + std::string snfg_string = snfg.create_snfg(new_glycan, glycosite); std::string glycosite_key = Sails::Utils::format_residue_from_site(glycosite, &structure); - // telemetry.save_snfg(i, glycosite_key, snfg_string); + telemetry.save_snfg(i, glycosite_key, snfg_string); } if (verbose && !unmodellable_sites.empty()) { diff --git a/package/tests/test_glycosylation.py b/package/tests/test_glycosylation.py index e139782..13000a0 100644 --- a/package/tests/test_glycosylation.py +++ b/package/tests/test_glycosylation.py @@ -18,7 +18,7 @@ def cglycan(data_base_path): s = gemmi.read_structure(str(s_path)) m = gemmi.read_mtz_file(str(m_path)) - return s, m, 1, "FP", "SIGFP", "", "", sails.Type.c_glycosylate + return s, m, "", "", "", 1, "FP", "SIGFP", "", "", sails.Type.c_glycosylate def test_xtal_cglycosylation(cglycan): @@ -43,26 +43,22 @@ def test_xtal_cglycosylation(cglycan): assert "entries" in cycle entries = cycle["entries"] - expected_key = "D-AMAN-1" + expected_key = "D-MAN-1" assert expected_key in entries assert len(entries.keys()) == 1 sugar = entries[expected_key] rscc_key = "rscc" rsr_key = "rsr" - dds_key = "dds" assert 
rscc_key in sugar assert rsr_key in sugar - assert dds_key in sugar rscc_score = sugar[rscc_key] rsr_score = sugar[rsr_key] - dds_score = sugar[dds_key] assert rscc_score > 0.7 assert rsr_score > 0.9 - assert dds_score < 0.75 # test snfg output assert 1 in snfgs