diff --git a/README.md b/README.md index 392372a..d731444 100644 --- a/README.md +++ b/README.md @@ -425,6 +425,7 @@ files_mix: # PDB files for each crosslink type (required if mix_bool=true) replace_bool: false # Enable crosslink replacement ratio_replace: 30 # Percentage of crosslinks to replace (0-100) +ratio_replace_scope: "enzymatic" # Crosslinks to replace: "enzymatic" (default), "non_enzymatic" (AGEs), or "all" replace_file: null # Input file, or null to use geometry output # ================================================================================ @@ -620,10 +621,13 @@ n_term_combination: "9.C - 947.A" c_term_combination: "1047.C - 104.C" replace_bool: true ratio_replace: 30 # Replace 30% of crosslinks +ratio_replace_scope: "enzymatic" # "enzymatic" (default), "non_enzymatic" (AGEs), or "all" fibril_length: 40.0 contact_distance: 20 ``` +> **Note:** When you supply an input `pdb_file` together with `n_term_type`/`c_term_type`, ColBuilder verifies that the crosslinks in the PDB match the specified types. Mismatches (e.g. a trivalent PDB with `n_term_type: "HLKNL"`) stop generation with error `GEO_ERR_008`. + ```bash colbuilder --config_file config_replace_crosslinks.yaml ``` diff --git a/config.yaml b/config.yaml index 867ed3f..fc1dcd2 100644 --- a/config.yaml +++ b/config.yaml @@ -1,141 +1,204 @@ # ================================================================================ -# ColBuilder Configuration File +# ColBuilder Configuration File — FULL REFERENCE # ================================================================================ -# This configuration file controls the ColBuilder pipeline for generating collagen -# structures. The pipeline consists of three main modules that can be run -# independently or in combination: +# This file documents EVERY user-facing option in the ColBuilder pipeline, with +# defaults, valid values, and notes. Copy it and delete what you don't need. # -# 1. SEQUENCE GENERATOR: Creates collagen triple helix structures from sequences -# 2. GEOMETRY GENERATOR: Builds collagen microfibrils from triple helices -# 3. TOPOLOGY GENERATOR: Generates force field topologies for simulations +# The pipeline has three modules that run independently or chained together: +# 1. SEQUENCE GENERATOR : collagen triple helix from a sequence (homology model) +# 2. GEOMETRY GENERATOR : collagen microfibril from triple helices +# 3. TOPOLOGY GENERATOR : MD force-field topologies (Amber99 or Martini3) # -# IMPORTANT: When using additional crosslinks with a pre-mutated PDB, -# e.g. to add AGE crosslinks: -# - Run sequence generation first to apply and optimize additional crosslinks -# - Then run geometry generation in a separate execution using the output PDB +# Each module is toggled by its *_generator flag below. Any combination is valid: +# - sequence only, geometry only, topology only, or all three in one run. +# - When chained, the output of one stage feeds the next automatically. +# +# IMPORTANT (mutated-PDB / additional-crosslink workflow, e.g. adding AGEs): +# - Run sequence generation FIRST to apply and optimize additional crosslinks. +# - Then run geometry generation in a SEPARATE execution using the output PDB. +# ================================================================================ + + +# ================================================================================ +# OPERATION MODE # ================================================================================ +mode: null # Leave null — set automatically from the flags below. +config_file: null # Path to another YAML config to load (nested configs). +sequence_generator: true # Generate triple helix structure from sequence. +geometry_generator: true # Build microfibril from triple helix. +topology_generator: true # Generate simulation topology files. +debug: false # Keep intermediate files + verbose logs for debugging. +working_directory: "./" # Directory for output files. -# Operation Mode -mode: null -config_file: null # Path to another config file (for nested configs) -sequence_generator: true # Generate triple helix structure from sequence -geometry_generator: true # Build microfibril from triple helix -topology_generator: true # Generate simulation topology files -debug: false # Keep intermediate files for debugging -working_directory: "./" # Directory for output files # ================================================================================ # SEQUENCE GENERATION SETTINGS # ================================================================================ -# These settings control the generation of collagen triple helix structures. -# Two main workflows are supported: -# 1. Standard: Generate from FASTA sequence (leave mutated_pdb as null) -# 2. Mutated PDB: Add crosslinks to existing structure (must provide mutated_pdb) +# Two workflows: +# 1. Standard : generate from a FASTA sequence (leave mutated_pdb null). +# 2. Mutated PDB : add crosslinks to an existing structure (set mutated_pdb). -# Species selection - determines the collagen sequence if no custom FASTA provided +# Species selection — determines the collagen sequence if no custom FASTA is given. species: "rattus_norvegicus" # Available options: # Mammals (Primates): homo_sapiens, pan_troglodytes, pongo_abelii, callithrix_jacchus, otolemur_garnettii # Mammals (Rodents): mus_musculus, rattus_norvegicus # Mammals (Other): bos_taurus, canis_lupus, ailuropoda_melanoleuca, mustela_putorius, myotis_lucifugus, loxodonta_africana # Fish: danio_rerio, oreochromis_niloticus, oryzias_latipes, tetraodon_nigroviridis, xiphophorus_maculatus # Reptiles: pelodiscus_sinensis + # NOTE: for a species NOT in this list you MUST provide a fasta_file + # (and, if sequence_generator is false, a pdb_file). -# Mutated PDB workflow - provide pre-optimized structure to add additional crosslinks -mutated_pdb: null # Path to pre-mutated PDB, or null for standard workflow +# Mutated-PDB workflow — provide a pre-optimized structure to add crosslinks onto. +mutated_pdb: null # Path to pre-mutated PDB, or null for the standard workflow. # Example: "rattusnorvegicus_N_PYD_C_PYD+ADD1_Glucosepane.pdb" -# Custom sequence input - overrides species selection if provided -fasta_file: null # Path to custom FASTA file, or null to use available species sequence +# Custom sequence input — overrides species selection if provided. +fasta_file: null # Path to a custom FASTA file, or null to use the species sequence. # Example: "path/to/custom_sequence.fasta" + # ================================================================================ # CROSSLINK CONFIGURATION # ================================================================================ -# Terminal crosslinks are applied at N and C termini during standard generation. -# Additional crosslinks can be added anywhere (e.g., AGE products). +# Terminal crosslinks are applied at the N and C termini during standard +# generation. Additional crosslinks (e.g. AGE products) can be added on top. -crosslink: true # Enable crosslink application and optimization +crosslink: true # Enable crosslink application and optimization. -# Terminal crosslinks (applied during standard sequence generation) -n_term_type: "HLKNL" # N-terminal crosslink type -c_term_type: "HLKNL" # C-terminal crosslink type -n_term_combination: "9.C - 946.A" # Format: "residue.chain - residue.chain" +# Terminal crosslinks (applied during standard sequence generation). +n_term_type: "HLKNL" # N-terminal crosslink type (or "NONE"). +c_term_type: "HLKNL" # C-terminal crosslink type (or "NONE"). +n_term_combination: "9.C - 946.A" # Format: "residue.chain - residue.chain". c_term_combination: "1046.C - 103.C" -# Available crosslink types: DPD, DPL, HLKNL, LKNL, PYD, PYL, deHHLNL, deHLNL -# Check available crosslinks and respective combinations at [src/colbuilder/data/sequence/crosslinks.csv](https://github.com/graeter-group/colbuilder/blob/main/src/colbuilder/data/sequence/crosslinks.csv) - -# Additional crosslinks (uncomment for mutated PDB workflow) -# These are applied on top of existing crosslinks in the structure -# additional_1_type: "Pentosidine" # First additional crosslink type -# additional_2_type: null # Second additional crosslink type (optional) -# additional_1_combination: "1008.A - 767.B" # Position for first crosslink -# additional_2_combination: null # Position for second crosslink +# +# Available crosslink types: +# Enzymatic DIVALENT (2 residues): HLKNL, LKNL, deHLNL, deHHLNL +# Enzymatic TRIVALENT (3 residues): PYD, DPD, PYL, DPL +# Non-enzymatic / AGE (2 residues): Pentosidine, Glucosepane, MOLD +# None: NONE +# Combination format: +# Divalent -> "resid.chain - resid.chain" (e.g. "9.C - 947.A") +# Trivalent -> "resid.chain - resid.chain - resid.chain" (e.g. "6.B - 9.C - 946.A") +# Check valid types + combinations per species in: +# https://github.com/graeter-group/colbuilder/blob/main/src/colbuilder/data/sequence/crosslinks.csv +# +# CROSSLINK TYPE VALIDATION (input PDBs): +# If you also supply an input pdb_file together with n_term_type/c_term_type, +# ColBuilder checks that the crosslinks present in the PDB match the specified +# types (divalent vs trivalent). A mismatch — e.g. a trivalent PYD structure +# while n_term_type: "HLKNL" — stops generation with error GEO_ERR_008 and +# lists the detected crosslink residues. Fix the types to match the PDB, or +# supply a PDB whose crosslinks match the types. + +# Additional crosslinks (uncomment for the mutated-PDB workflow). +# Applied on top of existing crosslinks in the structure. +# additional_1_type: "Pentosidine" # First additional crosslink type. +# additional_2_type: null # Second additional crosslink type (optional). +# additional_1_combination: "1008.A - 767.B" # Position for the first crosslink. +# additional_2_combination: null # Position for the second crosslink. + +# Crosslink optimization settings. +# crosslink_copies: ["D0", "D5"] # Periodic copies used for distance optimization. +# Must be exactly 2 elements from D0..D5 (e.g. ["D0","D5"], ["D2","D3"]). +# Default is ["D0", "D5"] if not specified. -# Crosslink optimization settings -# crosslink_copies: ["D0", "D5"] # Periodic copies for optimization, must be exactly 2 elements -# Options: D0-D5, D0-D1, D1-D2, D2-D3, D3-D4. If not specified, default is ["D0", "D5"] # ================================================================================ # GEOMETRY GENERATION SETTINGS # ================================================================================ -# These settings control microfibril assembly from triple helices. -# The geometry generator arranges multiple copies of the input structure -# into a collagen microfibril based on crystal contacts. +# Arranges copies of the input triple helix into a microfibril via crystal contacts. -# Input structure for geometry generation -pdb_file: null # Path to input PDB, or null if sequence_generator is true +# Input structure for geometry generation. +pdb_file: null # Path to input PDB, or null when sequence_generator is true. # Example: "rattusnorvegicus_N_PYD_C_PYD.pdb" -# IMPORTANT: When using additional crosslinks, run sequence generation -# separately first, then use its output here +# IMPORTANT: when using additional crosslinks, run sequence generation separately +# first, then use its output here. + +# Microfibril parameters. +contact_distance: 20 # Maximum distance (Å) for crystal contacts. Larger = thicker fibril. +fibril_length: 70.0 # Target length of the microfibril (nm). -# Microfibril parameters -contact_distance: 20 # Maximum distance (Å) for crystal contacts -fibril_length: 70.0 # Target length of microfibril (nm) +# Advanced geometry options (usually left as null/false). +crystalcontacts_file: null # Custom crystal-contacts file (skip contact_distance). +connect_file: null # Custom connectivity information. +crystalcontacts_optimize: false # Optimize crystal-contact packing (slower, better packed). +# Note: optimization is performed automatically when generating from contact_distance; +# set this true to force it when using a crystalcontacts_file. + +# Lattice search space — three positive multipliers [dx, dy, dz] for the +# solution space explored during contact generation. Default (1, 1, 1). +# solution_space: [1, 1, 1] + +# CRYST1 record used to seed crystal contacts when none is present in the input. +# Override only if you know your unit cell differs from the collagen default. +# pdb_first_line: "CRYST1 39.970 26.950 677.900 89.24 94.59 105.58 P 1 2" -# Advanced geometry options (usually left as null/false) -crystalcontacts_file: null # Custom crystal contacts file -connect_file: null # Custom connectivity information -crystalcontacts_optimize: false # Optimize crystal contacts -# Note: crystalcontacts_optimize is automatically true when using contact_distance # ================================================================================ # MIXING OPTIONS # ================================================================================ -# Create heterogeneous microfibrils with different crosslink types. +# Create heterogeneous microfibrils that mix different crosslink types. -mix_bool: false # Enable mixing of different crosslink types -ratio_mix: "D:80 T:20" # Ratio of each type (must sum to 100) +mix_bool: false # Enable mixing of different crosslink types. +ratio_mix: "D:80 T:20" # Ratio of each type; percentages MUST sum to 100. # Format: "TypeLabel:percentage TypeLabel:percentage" -# Example: "D:80 T:20" = 80% divalent, 20% trivalent -files_mix: # PDB files for each crosslink type (required if mix_bool=true) - - "rattusnorvegicus_N_HLKNL_C_HLKNL.pdb" # File for type "D" (divalent) - - "rattusnorvegicus_N_PYD_C_PYD.pdb" # File for type "T" (trivalent) -# Note: Labels (D, T) are arbitrary but must match ratio_mix +# Example: "D:80 T:20" = 80% divalent, 20% trivalent. +files_mix: # One PDB per type label (required when mix_bool=true). + - "rattusnorvegicus_N_HLKNL_C_HLKNL.pdb" # File for type "D" (divalent). + - "rattusnorvegicus_N_PYD_C_PYD.pdb" # File for type "T" (trivalent). +# Note: labels (D, T) are arbitrary but must match those used in ratio_mix, +# and the number of files must equal the number of types in ratio_mix. + # ================================================================================ # REPLACEMENT OPTIONS # ================================================================================ -# Reduce crosslink density by replacing some with standard amino acids. -# Useful for modeling aged or diseased tissue with fewer crosslinks. +# Reduce crosslink density by replacing some crosslinks with standard lysines. +# Useful for modeling aged/diseased tissue or partial crosslinking. + +replace_bool: false # Enable crosslink replacement. +ratio_replace: 30 # Percentage of crosslinks to remove (0-100). + +# Which crosslinks are eligible for ratio-based replacement. +# "enzymatic" (DEFAULT) : enzymatic crosslinks (HLKNL/PYD-derived residues). +# "non_enzymatic" : AGE crosslinks only (Glucosepane, Pentosidine, MOLD). +# "all" : both enzymatic and non-enzymatic. +ratio_replace_scope: "enzymatic" + +replace_file: "rattus_norvegicus.pdb" # PDB containing the crosslinks to replace. +# If geometry_generator is true, this can be omitted (set to null) and the geometry +# output is used in its place. + +# Auto-fix unpaired enzymatic crosslinks: detect crosslink residues left without a +# partner after assembly and replace them, instead of erroring. Default false. +# auto_fix_unpaired: false + +# Manual, explicit replacement directives (advanced). Each entry targets one residue: +# " " +# When set, these take precedence over ratio-based replacement. +# manual_replacements: +# - "collagen_fibril.pdb LYS 947 A" -replace_bool: false # Enable crosslink replacement -ratio_replace: 30 # Percentage of crosslinks to remove (0-100) -replace_file: "rattus_norvegicus.pdb" # Path to a file containing specific crosslinks to replace -# If running with geometry step set to true, this file can be ommited (set to null) -# and the output of geometry will be used in its place. # ================================================================================ # TOPOLOGY GENERATION SETTINGS # ================================================================================ -# Generate molecular dynamics topology files for the final structure. +# Generate molecular-dynamics topology files for the final structure. -force_field: "amber99" # Force field to use -# Options: "amber99" (all-atom), "martini3" (coarse-grained) +force_field: "amber99" # "amber99" (all-atom) or "martini3" (coarse-grained). +topology_debug: false # Save all intermediate topology files. -topology_debug: false # Save intermediate topology files +# --- Martini3-specific (ignored for amber99) --- +# go_epsilon: 9.414 # Go-model epsilon for Martini3 CG parametrization (default 9.414). +# use_conda_run: false # Invoke Martinize2 via `conda run` (set true if it lives in another env). +# martinize2_command: null # Usually auto-detected from PATH; override only if needed. +# martinize2_env: null # Conda env name for Martinize2; usually auto-detected. -# Topology-Only Mode -# When topology_generator: true AND both sequence_generator: false and -# geometry_generator: false, ColBuilder operates in topology-only mode, -# reading an existing fibril PDB and generating topology files without -# running geometry generation. Crosslink parameters must match the input PDB. \ No newline at end of file +# ================================================================================ +# TOPOLOGY-ONLY MODE +# ================================================================================ +# When topology_generator: true AND both sequence_generator: false and +# geometry_generator: false, ColBuilder runs in topology-only mode: it reads an +# existing fibril PDB (pdb_file) and generates topology without geometry generation. +# The crosslink type detected in the PDB must match any crosslink types you specify +# (see the GEO_ERR_008 validation note above). diff --git a/docs/configuration.md b/docs/configuration.md index 337a906..cf9f884 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -69,6 +69,7 @@ files_mix: # Replacement Options replace_bool: false ratio_replace: 30 +ratio_replace_scope: "enzymatic" # enzymatic | non_enzymatic | all replace_file: null # Topology Options @@ -182,6 +183,10 @@ These parameters control the sequence generation stage (homology modeling). - A complete list of available species and combinations can be found at [src/colbuilder/data/sequence/crosslinks.csv](https://github.com/graeter-group/colbuilder/blob/main/src/colbuilder/data/sequence/crosslinks.csv). - When using the mutated PDB workflow, run sequence generation separately first, then use the output in geometry generation. +**Crosslink type validation**: +- When you provide an input `pdb_file` together with `n_term_type`/`c_term_type`, ColBuilder checks that the crosslink residues present in the PDB are consistent with the specified types. Divalent types (HLKNL, LKNL, deHLNL, deHHLNL) expect a divalent structure, and trivalent types (PYD, DPD, PYL, DPL) expect a trivalent structure. +- If the PDB contains crosslinks of a category that was not requested (for example, a trivalent PDB while `n_term_type: "HLKNL"` is set), generation stops with error `GEO_ERR_008` and a message listing the detected crosslink residues. Update `n_term_type`/`c_term_type` to match the structure, or supply a PDB whose crosslinks match the specified types. + ## Geometry Generation Parameters These parameters control the generation of the microfibril structure. @@ -213,6 +218,7 @@ These parameters control advanced features for creating mixed crosslinked microf | `files_mix` | list of strings | | Required if mix_bool is true, paths to PDB files with different crosslink types | | `replace_bool` | boolean | false | Enable crosslink replacement (with lysines) | | `ratio_replace` | integer | 30 | Percentage of crosslinks to replace (0-100) | +| `ratio_replace_scope` | string | "enzymatic" | Which crosslinks are eligible for ratio-based replacement: `enzymatic`, `non_enzymatic`, or `all` | | `replace_file` | string, null | null | File with crosslinks to be replaced | **Notes**: @@ -221,6 +227,7 @@ These parameters control advanced features for creating mixed crosslinked microf - The `files_mix` parameter specifies paths to PDB files of collagen molecules, each with a different crosslink type. - The `replace_bool` feature simulates partial crosslinking or aged collagen by replacing some crosslinks with unmodified lysine residues. - The `ratio_replace` parameter controls what percentage of crosslinks should be replaced. +- The `ratio_replace_scope` parameter selects which crosslinks may be replaced. **The default is `enzymatic`**, so ratio-based replacement targets enzymatic crosslinks (e.g. HLKNL/PYD-derived residues) unless you choose otherwise. Use `non_enzymatic` to target advanced glycation end-product (AGE) crosslinks (Glucosepane, Pentosidine, MOLD), or `all` to consider both. - The `replace_file` parameter specifies the path to a PDB file of a previously generated collagen microfibril. Set to null to use the geometry generation output. ## Topology Generation Parameters @@ -399,6 +406,7 @@ Understanding how parameters interact is important for successful use of ColBuil - Ratios in `ratio_mix` must sum to 100. - If `replace_bool` is true, you must specify a valid percentage in `ratio_replace` (0-100). - If `replace_bool` is true and `geometry_generator` is false, you must provide a `replace_file`. + - `ratio_replace_scope` must be one of `enzymatic` (default), `non_enzymatic`, or `all`. 5. **Geometry Generation Dependencies**: - If `crystalcontacts_optimize` is true, the geometry generation will take longer but may produce better-packed microfibrils. diff --git a/docs/data_dictionary.md b/docs/data_dictionary.md index 2e133d2..ccba8de 100644 --- a/docs/data_dictionary.md +++ b/docs/data_dictionary.md @@ -174,6 +174,7 @@ The most commonly used parameters for ColBuilder configuration: | files_mix | List of Paths | PDB files with different crosslink types | Valid PDB file paths (≥2 files) | [] | | replace_bool | boolean | Replace crosslinks with lysines | true/false | false | | ratio_replace | float | Percentage of crosslinks to replace | 0-100 | None | +| ratio_replace_scope | string | Which crosslinks are eligible for ratio-based replacement | "enzymatic", "non_enzymatic", "all" | "enzymatic" | | replace_file | Path/null | Input PDB file of fibril with crosslinks | Valid file path or null | null | **Validation Rules**: @@ -186,10 +187,12 @@ The most commonly used parameters for ColBuilder configuration: - `ratio_replace` must be between 0 and 100 - Either `geometry_generator=true` OR `replace_file` must be provided - If `geometry_generator=false`, must provide `replace_file` + - `ratio_replace_scope` must be one of `enzymatic`, `non_enzymatic`, or `all` **Notes**: - **Mixing** creates heterogeneous microfibrils with different crosslink types (e.g., 80% divalent + 20% trivalent) - **Replacement** simulates partial crosslinking or aged collagen by replacing some crosslinks with unmodified lysine residues +- `ratio_replace_scope` selects which crosslinks may be replaced. The default `enzymatic` targets enzymatic crosslinks (HLKNL/PYD-derived residues); `non_enzymatic` targets AGE crosslinks (Glucosepane, Pentosidine, MOLD); `all` considers both - Set `replace_file: null` to use geometry generation output for replacement ### Topology Generation Parameters diff --git a/docs/user_guide.md b/docs/user_guide.md index 6ec83d4..58c4e5d 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -272,9 +272,14 @@ files_mix: # Required if mix_bool is true ```yaml replace_bool: false # Enable replacement of crosslinks with lysines ratio_replace: 30 # Percentage of crosslinks to replace +ratio_replace_scope: "enzymatic" # Crosslinks to replace: "enzymatic" (default), "non_enzymatic" (AGEs), or "all" replace_file: null # File with crosslinks to be replaced ``` +The `ratio_replace_scope` parameter controls which crosslinks are eligible for ratio-based +replacement. The default `enzymatic` targets enzymatic crosslinks (HLKNL/PYD-derived residues); +use `non_enzymatic` to target AGE crosslinks (Glucosepane, Pentosidine, MOLD), or `all` for both. + ### Topology Options ```yaml diff --git a/pyproject.toml b/pyproject.toml index 2cff7ca..812ae9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,21 +20,19 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies = [ - "numpy==1.25", - "vermouth==0.9.1", + # <2 keeps the numpy 1.x ABI (vermouth 0.9.1); >=1.25 lets pip pick a wheel + # that exists for the running Python (e.g. 1.26 on 3.12, where 1.25 has none). + "numpy>=1.25,<2", + "vermouth==0.9.1", "pandas==2.2.2", "biopython==1.84", "scikit-learn", - "h5py", - "libnetcdf", - "threadpoolctl", "PyYAML", - "click", "pydantic>=2.0", "tqdm", - "asyncio", "colorama>=0.4.4", "click>=8.0.0", ] diff --git a/src/colbuilder/colbuilder.py b/src/colbuilder/colbuilder.py index 21d03ff..ae5ad8b 100644 --- a/src/colbuilder/colbuilder.py +++ b/src/colbuilder/colbuilder.py @@ -511,6 +511,21 @@ async def run_pipeline(config: ColbuilderConfig) -> Dict[str, Path]: f"Using generated sequence PDB for further processing: {sequence_pdb}" ) + # Validate that the crosslinks present in an input PDB are consistent + # with the crosslink types requested in the configuration. This catches + # the common mistake of, e.g., specifying a divalent type (HLKNL) for a + # trivalent (PYD) structure. Generated PDBs match by construction, so + # this only meaningfully guards user-provided structures. + if config.pdb_file and (config.n_term_type or config.c_term_type): + from colbuilder.core.utils.crosslink_detector import CrosslinkDetector + + pdb_to_check = Path(config.pdb_file).resolve() + if pdb_to_check.exists(): + CrosslinkDetector.validate_against_specified_types( + pdb_to_check, + [config.n_term_type, config.c_term_type], + ) + # Topology-only mode if (config.topology_generator and not config.geometry_generator and diff --git a/src/colbuilder/core/geometry/chimera.py b/src/colbuilder/core/geometry/chimera.py index 31e1e87..04c4487 100644 --- a/src/colbuilder/core/geometry/chimera.py +++ b/src/colbuilder/core/geometry/chimera.py @@ -165,12 +165,15 @@ def swapaa(self, replace: str, system_type: str) -> subprocess.CompletedProcess: LOG.debug( f"Chimera command completed with return code: {result.returncode}" ) - if result.stdout: - # LOG.debug(f"Chimera stdout: {result.stdout}") - pass - if result.stderr: - # LOG.debug(f"Chimera stderr: {result.stderr}") - pass + if result.returncode != 0: + # Surface failures: a silent swapaa failure leaves unpaired crosslink + # markers (non-standard residues) in the structure, which breaks + # downstream topology generation / simulation. + LOG.error( + f"Chimera swapaa failed (return code {result.returncode}); " + f"unpaired crosslink markers may remain unmutated. " + f"stderr: {result.stderr}" + ) return result except Exception as e: LOG.error(f"Error executing Chimera command: {str(e)}") diff --git a/src/colbuilder/core/geometry/connect.py b/src/colbuilder/core/geometry/connect.py index 9b07ed6..bea8037 100644 --- a/src/colbuilder/core/geometry/connect.py +++ b/src/colbuilder/core/geometry/connect.py @@ -142,12 +142,16 @@ def get_external_connect_file( Any: Updated system object. """ if connect_file: - self.external_connect = [ - float(l.split(" ")[0].replace(".caps.pdb", "")) - for l in open(connect_file.with_suffix(".txt"), "r") - ] - if np.min(self.external_connect) > 0: - self.external_connect = [i - 1 for i in self.external_connect] + with open(connect_file.with_suffix(".txt"), "r") as fh: + self.external_connect = [ + float(l.split(" ")[0].replace(".caps.pdb", "")) + for l in fh + ] + # Connect files are written by write_connect() using the system's own + # (0-based) model ids, so they are already aligned with system.get_connect(). + # The previous "if min > 0: subtract 1" heuristic wrongly shifted every id + # whenever model 0 happened to be absent from the file, mis-mapping + # connectivity. No shift is applied. for model_id in system.get_connect().keys(): if model_id not in self.external_connect: diff --git a/src/colbuilder/core/geometry/crosslink.py b/src/colbuilder/core/geometry/crosslink.py index afb7c5d..31e1ff1 100644 --- a/src/colbuilder/core/geometry/crosslink.py +++ b/src/colbuilder/core/geometry/crosslink.py @@ -97,9 +97,9 @@ def read_crosslink(pdb_file: Union[str, Path]) -> List[Crosslink]: resname=line[17:20], chain=line[21], position=[ - float(line[29:38]), + float(line[30:38]), float(line[38:46]), - float(line[46:56]), + float(line[46:54]), ], type="T", ) @@ -113,9 +113,9 @@ def read_crosslink(pdb_file: Union[str, Path]) -> List[Crosslink]: resname=line[17:20], chain=line[21], position=[ - float(line[29:38]), + float(line[30:38]), float(line[38:46]), - float(line[46:56]), + float(line[46:54]), ], type="D", ) @@ -129,9 +129,9 @@ def read_crosslink(pdb_file: Union[str, Path]) -> List[Crosslink]: resname=line[17:20], chain=line[21], position=[ - float(line[29:38]), + float(line[30:38]), float(line[38:46]), - float(line[46:56]), + float(line[46:54]), ], type="D", ) @@ -145,9 +145,9 @@ def read_crosslink(pdb_file: Union[str, Path]) -> List[Crosslink]: resname=line[17:20], chain=line[21], position=[ - float(line[29:38]), + float(line[30:38]), float(line[38:46]), - float(line[46:56]), + float(line[46:54]), ], type="D", ) diff --git a/src/colbuilder/core/geometry/geometry_replacer.py b/src/colbuilder/core/geometry/geometry_replacer.py index 314549a..101195c 100644 --- a/src/colbuilder/core/geometry/geometry_replacer.py +++ b/src/colbuilder/core/geometry/geometry_replacer.py @@ -41,7 +41,7 @@ (["LZS"], ["LZD"]), # MOLD # Enzymatic divalent crosslinks (["L5Y"], ["L4Y"]), # HLKNL - (["L5X"], ["L4Y"]), # LKNL + (["L5X"], ["L4X"]), # LKNL (["LY5"], ["LY4"]), # deH-HLNL (["LX5"], ["LX4"]), # deH-LNL ] @@ -65,6 +65,7 @@ "L5Y": "LYS", "L4Y": "LYS", "L5X": "LYS", + "L4X": "LYS", "LY5": "LYS", "LX5": "LYS", "LY4": "LYS", @@ -77,7 +78,7 @@ DEFAULT_REPLACEMENT = "LYS" PAIR_DISTANCE_CUTOFF = 5.0 -PAIRED_RESIDUES: Set[str] = {"AGS", "APD", "LGX", "LPS", "LZS", "LZD", "L5Y", "L4Y", "L5X", "LY5", "LX5", "LY4", "LX4"} +PAIRED_RESIDUES: Set[str] = {"AGS", "APD", "LGX", "LPS", "LZS", "LZD", "L5Y", "L4Y", "L5X", "L4X", "LY5", "LX5", "LY4", "LX4"} NON_ENZYMATIC_PAIRED: Set[str] = {"AGS", "APD", "LGX", "LPS", "LZS", "LZD"} ENZYMATIC_SINGLETONS: Set[str] = { "LYX", @@ -95,6 +96,7 @@ "L5Y", "L4Y", "L5X", + "L4X", "LY5", "LX5", "LY4", @@ -269,11 +271,14 @@ async def replace_in_system( source_dir = Path.cwd() # Determine system type and setup type directory - type_dir_candidates = [source_dir] + # Prefer canonical typed caps in subdirs; check the source root LAST so a + # stale root copy cannot shadow the correct typed file (first-match-wins). + type_dir_candidates = [] for sub in ["DT", "TD", "D", "T", "M", "NC", "DY"]: cand = source_dir / sub if cand.exists(): type_dir_candidates.append(cand) + type_dir_candidates.append(source_dir) model_zero = system.get_model(model_id=0.0) system_type = model_zero.type if hasattr(model_zero, "type") else "D" @@ -356,7 +361,7 @@ async def replace_in_system( records=records, connect_groups=connect_groups, ratio_replace=float(config.ratio_replace), - scope=getattr(config, "ratio_replace_scope", "non_enzymatic"), + scope=getattr(config, "ratio_replace_scope", "enzymatic"), config=config, ) generated_from_ratio = bool(manual_list) @@ -367,7 +372,7 @@ async def replace_in_system( system=system, ratio_replace=float(config.ratio_replace), fibril_length=config.fibril_length, - scope=getattr(config, "ratio_replace_scope", "non_enzymatic"), + scope=getattr(config, "ratio_replace_scope", "enzymatic"), ) generated_from_ratio = bool(manual_list) LOG.debug(f"Generated {len(manual_list)} ratio-based instructions from system") @@ -379,7 +384,7 @@ async def replace_in_system( "No replacement instructions generated (ratio=%s%%, scope=%s). " "No matching markers found.", config.ratio_replace, - getattr(config, "ratio_replace_scope", "non_enzymatic"), + getattr(config, "ratio_replace_scope", "enzymatic"), ) else: LOG.debug("No replacement instructions provided") @@ -605,7 +610,7 @@ async def replace_direct( records=records, ratio_replace=float(config.ratio_replace), fibril_length=getattr(config, "fibril_length", 0.0), - scope=getattr(config, "ratio_replace_scope", "non_enzymatic"), + scope=getattr(config, "ratio_replace_scope", "enzymatic"), ) generated_from_ratio = bool(manual_list) @@ -735,8 +740,8 @@ def _calculate_fibril_bounds( fibril_length_angstroms = fibril_length * 10.0 - z_min = z_center - fibril_length_angstroms * 5 - z_max = z_center + fibril_length_angstroms * 5 + z_min = z_center - fibril_length_angstroms / 2 + z_max = z_center + fibril_length_angstroms / 2 else: z_min = min_z z_max = max_z @@ -879,7 +884,7 @@ async def _run_chimera_command( # ================================================================================== def _build_ratio_replacements( - self, system: Any, ratio_replace: float, fibril_length: float, scope: str = "non_enzymatic" + self, system: Any, ratio_replace: float, fibril_length: float, scope: str = "enzymatic" ) -> List[str]: """ Select crosslinks to mutate based on a density (ratio_replace) value. @@ -894,11 +899,11 @@ def _build_ratio_replacements( return [] try: - scope = scope or "non_enzymatic" + scope = scope or "enzymatic" scope = str(scope).strip().lower() if scope not in {"enzymatic", "non_enzymatic", "all"}: - LOG.warning("Unknown ratio_replace scope '%s'; defaulting to 'non_enzymatic'", scope) - scope = "non_enzymatic" + LOG.warning("Unknown ratio_replace scope '%s'; defaulting to 'enzymatic'", scope) + scope = "enzymatic" z_bounds = self._calculate_fibril_bounds(system, fibril_length) except Exception as e: @@ -1254,11 +1259,14 @@ async def _apply_manual_replacements_to_dir( except Exception: pass - type_dir_candidates = [source_dir] + # Prefer canonical typed caps in subdirs; check the source root LAST so a + # stale root copy cannot shadow the correct typed file (first-match-wins). + type_dir_candidates = [] for sub in ["DT", "TD", "D", "T", "M", "NC", "DY"]: cand = source_dir / sub if cand.exists(): type_dir_candidates.append(cand) + type_dir_candidates.append(source_dir) try: model_zero = system.get_model(model_id=0.0) @@ -1331,19 +1339,19 @@ def _build_ratio_replacements_from_connect( records: List[Dict[str, Any]], connect_groups: List[List[int]], ratio_replace: float, - scope: str = "non_enzymatic", + scope: str = "enzymatic", config: Optional[ColbuilderConfig] = None, ) -> List[str]: """Generate replacement instructions using connect groups and parsed caps records.""" if not records or not connect_groups or ratio_replace <= 0: return [] - scope = (scope or "non_enzymatic").strip().lower() + scope = (scope or "enzymatic").strip().lower() if scope not in {"enzymatic", "non_enzymatic", "all"}: LOG.warning( - "Unknown ratio_replace scope '%s'; defaulting to 'non_enzymatic'", scope + "Unknown ratio_replace scope '%s'; defaulting to 'enzymatic'", scope ) - scope = "non_enzymatic" + scope = "enzymatic" records_by_model: Dict[int, List[Dict[str, Any]]] = {} for rec in records: @@ -1534,18 +1542,18 @@ def _build_ratio_replacements_from_records( records: List[Dict[str, Any]], ratio_replace: float, fibril_length: float, - scope: str = "non_enzymatic", + scope: str = "enzymatic", ) -> List[str]: """Generate replacement instructions using parsed crosslink records.""" if not records or ratio_replace <= 0: return [] - scope = (scope or "non_enzymatic").strip().lower() + scope = (scope or "enzymatic").strip().lower() if scope not in {"enzymatic", "non_enzymatic", "all"}: LOG.warning( - "Unknown ratio_replace scope '%s'; defaulting to 'non_enzymatic'", scope + "Unknown ratio_replace scope '%s'; defaulting to 'enzymatic'", scope ) - scope = "non_enzymatic" + scope = "enzymatic" if scope == "enzymatic": target_resnames = ENZYMATIC_SINGLETONS.copy() @@ -1734,6 +1742,25 @@ def _calculate_bounds_from_records( return (z_min, z_max) + @staticmethod + def _extract_position(item: Any) -> Any: + """ + Return the 'position' of a crosslink entry regardless of how it is stored. + + Entries may be a plain dict ({'position': ...}), a dict whose 'crosslink' + value is itself a dict, or a dict whose 'crosslink' value is a Crosslink + object (the system-based path). Using getattr for objects avoids the + AttributeError that previously caused PYD trios to be silently dropped. + """ + if item is None: + return None + if isinstance(item, dict): + if "crosslink" in item: + cl = item["crosslink"] + return cl.get("position") if isinstance(cl, dict) else getattr(cl, "position", None) + return item.get("position") + return getattr(item, "position", None) + def _build_pyd_trios( self, lyx_list: List[Dict[str, Any]], @@ -1754,8 +1781,8 @@ def _build_pyd_trios( if idx in used_ly2: continue try: - lyx_pos = lyx.get("crosslink", {}).get("position") if "crosslink" in lyx else lyx.get("position") - ly2_pos = ly2.get("crosslink", {}).get("position") if "crosslink" in ly2 else ly2.get("position") + lyx_pos = self._extract_position(lyx) + ly2_pos = self._extract_position(ly2) if not lyx_pos or not ly2_pos: continue dist = self._calculate_distance(lyx_pos, ly2_pos) @@ -1771,8 +1798,8 @@ def _build_pyd_trios( if jdx in used_ly3: continue try: - lyx_pos = lyx.get("crosslink", {}).get("position") if "crosslink" in lyx else lyx.get("position") - ly3_pos = ly3.get("crosslink", {}).get("position") if "crosslink" in ly3 else ly3.get("position") + lyx_pos = self._extract_position(lyx) + ly3_pos = self._extract_position(ly3) if not lyx_pos or not ly3_pos: continue dist = self._calculate_distance(lyx_pos, ly3_pos) @@ -1951,17 +1978,29 @@ def _split_pdb_into_models(self, input_pdb: Path, type_dir: Path) -> int: atom_lines = [ln for ln in all_lines if ln.startswith(("ATOM", "HETATM", "TER"))] models: List[List[str]] = [] current: List[str] = [] - chain_sequence: List[str] = [] + current_chains: List[str] = [] # ordered chain ids seen in the current model + first_chain: Optional[str] = None for line in atom_lines: if line.startswith(("ATOM", "HETATM")): chain = line[21] - if not chain_sequence or chain != chain_sequence[-1]: - chain_sequence.append(chain) - if len(chain_sequence) > 3 and chain_sequence[-4:] == ["A", "B", "C", "A"]: + # Chains are written contiguously; act only on a chain transition. + if not current_chains or chain != current_chains[-1]: + # A new triple helix begins when we return to the model's first + # chain after a full set of >=3 distinct chains. This does not + # hardcode A/B/C ordering, so non-standard chain labels still split. + if ( + first_chain is not None + and chain == first_chain + and len(set(current_chains)) >= 3 + ): models.append(current) current = [] - chain_sequence = ["A"] + current_chains = [] + first_chain = None + if first_chain is None: + first_chain = chain + current_chains.append(chain) current.append(line) if current: diff --git a/src/colbuilder/core/geometry/main_geometry.py b/src/colbuilder/core/geometry/main_geometry.py index 954e5d3..539590a 100644 --- a/src/colbuilder/core/geometry/main_geometry.py +++ b/src/colbuilder/core/geometry/main_geometry.py @@ -419,6 +419,11 @@ async def _handle_full_generation(self) -> Tuple[Optional[System], Optional[Path LOG.section("Mixing geometry...") mixing_dir = self.file_manager.ensure_mixing_dir() system, _ = await self.mixer_service.mix(system, temp_config, mixing_dir) + # Finalize from where the mixed, per-type caps actually live. The + # mixer writes caps into mixing_dir/; geometry_dir only holds + # the original single build type, so writing from it would miss the + # newly assigned mix types (FileNotFoundError or stale caps). + final_temp_dir = mixing_dir LOG.info("Mixing completed.") # Determine if replacement is requested @@ -601,7 +606,7 @@ async def _handle_full_generation(self) -> Tuple[Optional[System], Optional[Path LOG.info(f"Writing final mixed system PDB to {output_pdb_path}") system.write_pdb( - pdb_out=output_prefix, + pdb_out=output_pdb_path, fibril_length=temp_config.fibril_length, cleanup=False, temp_dir=final_temp_dir, @@ -696,6 +701,14 @@ async def build_geometry(self) -> Tuple[Optional[System], Optional[Path]]: GeometryGenerationError: If any step in the process fails """ try: + # Clear stale caps/type-subdirs from previous runs so a re-run cannot + # silently reuse old geometry (different fibril length, contact distance, etc.). + try: + if getattr(self, "file_manager", None) is not None: + self.file_manager.clear_generation_dirs() + except Exception as e: + LOG.debug(f"Could not clear generation dirs at run start: {e}") + # Allow direct replacement when a PDB is provided, even if replace_bool was not set explicitly if self.config.replace_file and not self.config.geometry_generator: self.config.replace_bool = True diff --git a/src/colbuilder/core/geometry/optimize.py b/src/colbuilder/core/geometry/optimize.py index bfce6cf..dff00d0 100644 --- a/src/colbuilder/core/geometry/optimize.py +++ b/src/colbuilder/core/geometry/optimize.py @@ -445,7 +445,7 @@ def _create_complementary_model(self, crosslink: Any, source_model: Any, system: ) system.add_model(model=new_model) - + if not self._node_exists_in_system(pr_candidate, system): pr_model = model.Model( id=new_id + 1, @@ -453,8 +453,24 @@ def _create_complementary_model(self, crosslink: Any, source_model: Any, system: transformation=pr_transformation, pdb_file=system.crystal.pdb_file ) - system.add_model(model=pr_model) - + # Only keep the point-reflection copy if it is actually + # crosslink-connected to the existing system. Adding it + # unconditionally can introduce a floating, unconnected + # triple helix (orphan molecule) into the fibril. + pr_connected = any( + connect.get_connect( + ref_model=system.get_model(model_id=mid), + model=pr_model, + ) + for mid in system.get_models() + ) + if pr_connected: + system.add_model(model=pr_model) + else: + LOG.debug( + f"Skipping unconnected point-reflection copy at {pr_candidate}" + ) + self._mark_model_crosslinks_paired(new_model) crosslink_id = f"{source_model.id}_{crosslink.resid}_{crosslink.type}" diff --git a/src/colbuilder/core/geometry/system.py b/src/colbuilder/core/geometry/system.py index 4a78db6..72c3fac 100644 --- a/src/colbuilder/core/geometry/system.py +++ b/src/colbuilder/core/geometry/system.py @@ -352,13 +352,22 @@ def write_pdb(self, pdb_out: Union[str, Path], fibril_length: float, cleanup: bo if crystal_header: f.write(crystal_header) - # Write model files if they exist (typically for initial crystal structures) + # Write model files if they exist (typically for initial crystal structures). + # Skip any model that will also be written from its caps file below, otherwise + # the same atoms are emitted twice (uncapped body + caps) -> 0 A overlaps. for model_id, model in self.system.items(): if hasattr(model, 'pdb_file') and Path(model.pdb_file).exists(): + model_type = getattr(model, 'type', None) + if model_type is not None and (model_type, float(model_id)) in caps_by_model: + LOG.debug( + f"Skipping pdb_file body for model {model_id}; " + f"caps file will be written instead (avoid duplicate atoms)" + ) + continue LOG.debug(f"Writing model {model_id} from pdb_file") self._write_model_file_content(f, model.pdb_file) written_models += 1 - model_type = getattr(model, 'type', 'unknown') + model_type = model_type if model_type is not None else 'unknown' written_by_type[model_type] = written_by_type.get(model_type, 0) + 1 # Write caps files: one per model ID, respecting assigned type diff --git a/src/colbuilder/core/geometry/unpaired_crosslinks.py b/src/colbuilder/core/geometry/unpaired_crosslinks.py index efb2d36..c5ccd2b 100644 --- a/src/colbuilder/core/geometry/unpaired_crosslinks.py +++ b/src/colbuilder/core/geometry/unpaired_crosslinks.py @@ -234,9 +234,15 @@ def _build_crosslink_data(self) -> Tuple[Dict[str, Dict[str, str]], Dict[str, Se continue if res_name not in crosslinks_by_type: continue - chain_id, resid = residues[-1] - line = f"{fname} {res_name} {resid} {chain_id}" - crosslinks_by_type[res_name][fname] = line + # Emit EVERY unpaired marker of this type in the model, not just the + # last one. A fragment can legitimately carry multiple same-type + # markers; dropping all but one leaves non-standard residues behind + # (which then break topology generation / simulation). + lines = [ + f"{fname} {res_name} {resid} {chain_id}" + for (chain_id, resid) in residues + ] + crosslinks_by_type[res_name][fname] = lines types_by_model.setdefault(fname, set()).add(res_name) crosslinks_by_type = {r: m for r, m in crosslinks_by_type.items() if m} @@ -310,13 +316,16 @@ def _build_unpaired_entries( all_models = set(mapping.keys()) unpaired = all_models - paired_models.get(res_type, set()) for model in unpaired: - parts = mapping[model].split() - if len(parts) < 4: - LOG.warning("Malformed mapping for %s: %s", model, mapping[model]) - continue - pdb_file, orig_res, resid, chain = parts[:4] - new_res = RESIDUE_TO_MUTATION.get(orig_res, DEFAULT_MUTATION) - entries.append((pdb_file, new_res, resid, chain, orig_res)) + raw = mapping[model] + model_lines = raw if isinstance(raw, list) else [raw] + for entry_line in model_lines: + parts = entry_line.split() + if len(parts) < 4: + LOG.warning("Malformed mapping for %s: %s", model, entry_line) + continue + pdb_file, orig_res, resid, chain = parts[:4] + new_res = RESIDUE_TO_MUTATION.get(orig_res, DEFAULT_MUTATION) + entries.append((pdb_file, new_res, resid, chain, orig_res)) def sort_key(entry: Tuple[str, str, str, str, str]): pdb_file, _, resid, chain, orig_res = entry diff --git a/src/colbuilder/core/sequence/sequence_generator.py b/src/colbuilder/core/sequence/sequence_generator.py index d737a38..e779fde 100644 --- a/src/colbuilder/core/sequence/sequence_generator.py +++ b/src/colbuilder/core/sequence/sequence_generator.py @@ -460,6 +460,13 @@ async def _load_crosslinks(self) -> None: ) self._crosslinks = [] + missing: List[str] = [] + + def _is_real_type(t: Optional[str]) -> bool: + return bool(t) and str(t).strip().upper() not in {"NONE", "NOCROSS"} + + def _is_explicit_none(t: Optional[str]) -> bool: + return bool(t) and str(t).strip().upper() in {"NONE", "NOCROSS"} if self.config.n_term_type: LOG.debug(f"Loading N-terminal crosslinks of type: {self.config.n_term_type}") @@ -472,6 +479,11 @@ async def _load_crosslinks(self) -> None: if n_crosslinks: self._crosslinks.extend(n_crosslinks) LOG.debug(f"Loaded {len(n_crosslinks)} N-terminal crosslinks") + elif _is_real_type(self.config.n_term_type): + missing.append( + f"N-terminal type '{self.config.n_term_type}' " + f"(combination '{self.config.n_term_combination}')" + ) if self.config.c_term_type: LOG.debug(f"Loading C-terminal crosslinks of type: {self.config.c_term_type}") @@ -484,6 +496,54 @@ async def _load_crosslinks(self) -> None: if c_crosslinks: self._crosslinks.extend(c_crosslinks) LOG.debug(f"Loaded {len(c_crosslinks)} C-terminal crosslinks") + elif _is_real_type(self.config.c_term_type): + missing.append( + f"C-terminal type '{self.config.c_term_type}' " + f"(combination '{self.config.c_term_combination}')" + ) + + # Crosslinking is enabled (we returned early otherwise). If a real + # crosslink type was requested but nothing matched, fail loudly instead + # of silently building an uncrosslinked structure. Setting a terminal + # type to "NONE" is the explicit, accepted way to request no crosslink. + if missing: + available = sorted(species_crosslinks["type"].unique().tolist()) + raise SequenceGenerationError( + "Crosslinking is enabled but no matching crosslinks were found for: " + + "; ".join(missing) + + f". Available types for '{self.config.species}': {available}. " + "Check the type name (case-sensitive), the combination, and the " + "species — or set the terminal type to 'NONE' for no crosslink.", + error_code="SEQ_ERR_002", + context={ + "species": self.config.species, + "n_term_type": self.config.n_term_type, + "c_term_type": self.config.c_term_type, + "missing": missing, + }, + ) + + # Enabled, but neither terminus requested a real type or an explicit + # 'NONE' — i.e. crosslinking was intended but never specified. + if ( + not self._crosslinks + and not _is_real_type(self.config.n_term_type) + and not _is_real_type(self.config.c_term_type) + and not _is_explicit_none(self.config.n_term_type) + and not _is_explicit_none(self.config.c_term_type) + ): + raise SequenceGenerationError( + "Crosslinking is enabled (crosslink: true) but no crosslink type was " + "specified. Set n_term_type/c_term_type to a valid type, or set them " + "to 'NONE' (or crosslink: false) to build without crosslinks.", + error_code="SEQ_ERR_002", + context={ + "species": self.config.species, + "n_term_type": self.config.n_term_type, + "c_term_type": self.config.c_term_type, + }, + ) + if not self._crosslinks: LOG.warning("No crosslinks were loaded despite crosslinks being enabled") diff --git a/src/colbuilder/core/topology/amber.py b/src/colbuilder/core/topology/amber.py index 7abd107..3f90f06 100644 --- a/src/colbuilder/core/topology/amber.py +++ b/src/colbuilder/core/topology/amber.py @@ -13,6 +13,7 @@ import numpy as np from colorama import Fore, Style from typing import List, Any, Optional, Dict, Union, Tuple, Set +import re from colbuilder.core.geometry.system import System from colbuilder.core.geometry.crosslink import read_crosslink, Crosslink @@ -171,8 +172,8 @@ def _are_compatible_crosslinks(self, cl1: Crosslink, cl2: Crosslink) -> bool: True if crosslinks can form a bond """ # Divalent crosslinks (D-type) - divalent_aldehyde = {'L4Y', 'L4X', 'LY4', 'LX4', 'LGX', 'LPS'} - divalent_amine = {'L5Y', 'L5X', 'LY5', 'LX5', 'AGS', 'APD'} + divalent_aldehyde = {'L4Y', 'L4X', 'LY4', 'LX4', 'LGX', 'LPS', 'LZD'} + divalent_amine = {'L5Y', 'L5X', 'LY5', 'LX5', 'AGS', 'APD', 'LZS'} # Trivalent crosslinks (T-type) trivalent_aldehyde = {'LYX', 'LXY', 'LYY', 'LXX'} @@ -225,7 +226,7 @@ def find_atom_indices_for_crosslinks( atom_by_index = {a['index']: a for a in atom_data} # Read coordinates from GRO file - gro_file = itp_file.replace('.itp', '.gro') + gro_file = str(Path(itp_file).with_suffix('.gro')) gro_coords: Dict[int, np.ndarray] = {} if os.path.exists(gro_file): with open(gro_file, 'r') as f: @@ -467,6 +468,20 @@ def add_crosslink_topology_to_itp( try: mapped_pairs = self.find_atom_indices_for_crosslinks(itp_file, crosslink_pairs, merged_pdb_file) + + # Surface mapping failures: a dropped pair means NO bond and NO + # exclusions for that crosslink -> close contacts that crash MD. + total = len(crosslink_pairs) + n_mapped = len(mapped_pairs) + if n_mapped < total: + LOG.error( + f"Crosslink mapping incomplete in {os.path.basename(itp_file)}: " + f"{n_mapped}/{total} pair(s) mapped, {total - n_mapped} dropped. " + f"Dropped crosslinks get no bond and no exclusions (likely clashes " + f"during equilibration). Check residue/atom-name coverage in " + f"_is_crosslink_atom and the 5 A marker cutoff." + ) + if not mapped_pairs: LOG.warning(f"No atom indices found for crosslink topology in {itp_file}") return @@ -498,6 +513,11 @@ def add_crosslink_topology_to_itp( except Exception as e: LOG.warning(f"Failed to add angles/dihedrals, but bonds were added successfully: {str(e)}") + try: + self._add_crosslink_exclusions(itp_file, valid_bond_data) + except Exception as e: + LOG.warning(f"Failed to add crosslink exclusions, bonds/angles still added: {str(e)}") + LOG.debug(f" Successfully added crosslink topology to {itp_file}") except Exception as e: @@ -567,6 +587,118 @@ def _add_crosslink_bonds(self, itp_file: str, valid_bond_data: List[Dict]) -> No with open(itp_file, 'w') as f: f.writelines(lines) + def _add_crosslink_exclusions( + self, + itp_file: str, + valid_bond_data: List[Dict], + cutoff: float = 4.5, + ) -> None: + """Write explicit [ exclusions ] between each pair of crosslinked residues. + + nrexcl=3 only excludes atoms within 3 bonds along the bond graph. Covalently + joining two residues (e.g. HLKNL: L4Y CE - L5Y NZ) brings several heavy atoms + into close contact that are more than 3 bonds apart through the new bond, so + they are NOT auto-excluded and clash during equilibration. Here we exclude all + atom pairs between the two joined residues that lie within `cutoff` Angstrom + (falling back to excluding every inter-residue pair if no coordinates exist). + """ + if not valid_bond_data: + return + + with open(itp_file, "r") as f: + lines = f.readlines() + + # Map atom index -> (residue_nr, residue_name) and residue -> [indices] + atom_res: Dict[int, Tuple[int, str]] = {} + res_atoms: Dict[Tuple[int, str], List[int]] = {} + atoms_section = False + for line in lines: + s = line.strip() + if s.startswith("[ atoms ]"): + atoms_section = True + continue + if atoms_section and s.startswith("["): + break + if atoms_section and s and not s.startswith(";"): + parts = s.split() + if len(parts) >= 8: + idx = int(parts[0]) + key = (int(parts[2]), parts[3]) + atom_res[idx] = key + res_atoms.setdefault(key, []).append(idx) + + # Coordinates (Angstrom) from the matching GRO, if present + gro_file = str(Path(itp_file).with_suffix(".gro")) + coords: Dict[int, np.ndarray] = {} + if os.path.exists(gro_file): + with open(gro_file, "r") as f: + glines = f.readlines() + for line in glines[2:-1]: + if len(line) >= 44: + try: + aidx = int(line[15:20].strip()) + coords[aidx] = np.array([ + float(line[20:28]) * 10, + float(line[28:36]) * 10, + float(line[36:44]) * 10, + ]) + except (ValueError, IndexError): + continue + + # Build exclusion sets between each crosslinked residue pair + excl: Dict[int, Set[int]] = {} + pairs_added = 0 + for bond_data in valid_bond_data: + a1, a2 = bond_data["atoms"] + r1 = atom_res.get(a1) + r2 = atom_res.get(a2) + if not r1 or not r2 or r1 == r2: + continue + for i in res_atoms.get(r1, []): + for j in res_atoms.get(r2, []): + if i == j: + continue + if coords: + pi, pj = coords.get(i), coords.get(j) + if pi is not None and pj is not None and np.linalg.norm(pi - pj) > cutoff: + continue + lo, hi = (i, j) if i < j else (j, i) + if hi not in excl.setdefault(lo, set()): + excl[lo].add(hi) + pairs_added += 1 + + if not pairs_added: + return + + # Locate an existing [ exclusions ] section (amber pdb2gmx output has none) + start = end = -1 + for i, line in enumerate(lines): + if line.strip().startswith("[ exclusions ]"): + start = i + elif start >= 0 and line.strip().startswith("[") and not line.strip().startswith("[ exclusions ]"): + end = i + break + + body = [f"{ai} " + " ".join(str(x) for x in sorted(excl[ai])) + "\n" for ai in sorted(excl)] + + if start >= 0: + insert_at = end if end >= 0 else len(lines) + for entry in reversed(body): + lines.insert(insert_at, entry) + else: + lines.append("\n[ exclusions ]\n") + lines.append("; crosslink through-space exclusions (beyond nrexcl)\n") + lines.extend(body) + + with open(itp_file, "w") as f: + f.writelines(lines) + + LOG.debug( + f"Added {pairs_added} crosslink exclusion pair(s) " + f"({'distance-filtered' if coords else 'all inter-residue'}) " + f"to {os.path.basename(itp_file)}" + ) + def _filter_angles_by_bonds( self, angles: List[Tuple[int, int, int]], @@ -900,8 +1032,13 @@ def write_itp( except Exception: pass - output_file = itp_file.parent / str(itp_file.name).replace("top", "itp") - + output_file = itp_file.with_suffix(".itp") + + # pdb2gmx writes a single moleculetype block. GROMACS <2023 names it + # "Protein_chain_A"; GROMACS >=2023 names it just "Protein". Match either, + # anchored so comments/other lines don't trigger. + molname_re = re.compile(r'^\s*Protein(?:_chain_\w+)?\s+\d+\s*$') + with open(output_file, 'w') as f: write = False for line in itp_model: @@ -909,7 +1046,7 @@ def write_itp( break if write: f.write(line) - elif 'Protein_chain_A' in line: + elif molname_re.match(line): f.write('[ moleculetype ]\n') f.write(f'{molecule_name} 3\n') write = True diff --git a/src/colbuilder/core/topology/crosslink.py b/src/colbuilder/core/topology/crosslink.py index 4a8267d..73eef6f 100644 --- a/src/colbuilder/core/topology/crosslink.py +++ b/src/colbuilder/core/topology/crosslink.py @@ -47,12 +47,16 @@ def __init__(self, cnt_model: Optional[int] = None) -> None: 'dihedrals': [] } - # Distance thresholds for different crosslink types (in Å) - # Adjust these values based on your coarse-grained model + # Distance thresholds for different crosslink types (in Å). + # Equilibrium bead-bead distances are ~2.3-4.2 A (see dlyxly3/dlyxly2/dly45 + # below). A loose 15 A cutoff lets one bead pair with several partners in + # crosslink-dense regions, creating spurious bonds. Keep the cutoff modest + # so only genuine partners (close in the assembled fibril) are bonded. self.crosslink_thresholds = { - 'LYX_LY2': 15.0, # Maximum distance for LYX SC4 - LY2 SC1 crosslinks - 'LYX_LY3': 15.0, # Maximum distance for LYX SC5 - LY3 SC1 crosslinks - 'L4Y_L5Y': 15.0 # Maximum distance for L4Y SC1 - L5Y SC2 crosslinks + 'LYX_LY2': 6.0, # Maximum distance for LYX SC4 - LY2 SC1 crosslinks + 'LYX_LY3': 6.0, # Maximum distance for LYX SC5 - LY3 SC1 crosslinks + 'LY2_LY3': 6.0, # Maximum distance for LYX SC5 - LY3 SC1 crosslinks + 'L4Y_L5Y': 6.0 # Maximum distance for L4Y SC1 - L5Y SC2 crosslinks } # HLKNL-crosslink parameters @@ -65,16 +69,18 @@ def __init__(self, cnt_model: Optional[int] = None) -> None: # PYD-crosslink parameters self.klyxly2: str = '9000' # LYX-LY2 bond force constant (kJ/mol/nm^2) self.klyxly3: str = '12000' # LYX-LY3 bond force constant (kJ/mol/nm^2) + self.klyx5ly2: str = '12000' # LYX-LY2 bond force constant (kJ/mol/nm^2) (conserve ring) self.dlyxly2: str = '0.290' # LYX-LY2 bond equilibrium distance (nm) self.dlyxly3: str = '0.230' # LYX-LY3 bond equilibrium distance (nm) + self.dlyx5ly2: str = '0.370' # LYX-LY2 bond equilibrium distance (nm) (added) # PYD-crosslink angle parameters (degrees) - self.al2yx_1: str = '100' # LY2-LYX TP1q-TC6q-TC4 angle - self.al2yx_2: str = '60' # LY2-LYX TQ2p-TP1q-TC4 angle - self.al2yx_3: str = '130' # LY2-LYX TP1q-TC4-SP2 angle - self.al3yx_1: str = '100' # LY3-LYX TP1q-TC6q-TC4 angle - self.al3yx_2: str = '110' # LY3-LYX TQ2p-TC6q-TC4 angle - self.al3yx_3: str = '130' # LY3-LYX TC6q-TC4-SP2 angle + self.al2yx_1: str = '100' # LY2-LYX TP1q-TC6q-TC4 angle R2 R4 R3/S3 + self.al2yx_2: str = '60' # LY2-LYX TQ2p-TP1q-TC4 angle R1 R2 R3 + self.al2yx_3: str = '130' # LY2-LYX TP1q-TC4-SP2 angle R2 B2 R3 + self.al3yx_1: str = '100' # LY3-LYX TP1q-TC6q-TC4 angle R2 R4 R3/S3 + self.al3yx_2: str = '110' # LY3-LYX TQ2p-TC6q-TC4 angle R1 R4 R3/S3 + self.al3yx_3: str = '130' # LY3-LYX TC6q-TC4-SP2 angle R4 S3 B3 LOG.debug(f"Initialized Crosslink for model counter {cnt_model}") @@ -110,7 +116,7 @@ def get_crosslink_coords(self, cnt_model: Optional[int] = None) -> List[List[flo (line[17:20] == 'L4Y' and line[12:15] == 'SC1') or (line[17:20] == 'L5Y' and line[12:15] == 'SC2')): - coords = [float(line[29:38]), float(line[38:46]), float(line[46:56])] + coords = [float(line[30:38]), float(line[38:46]), float(line[46:54])] self.crosslink_pdb.append([ str(it_pdb), # atom index as string @@ -214,6 +220,15 @@ def find_crosslink_pairs(self, cnt_model: Optional[int] = None) -> List[Tuple[Li if dist <= self.crosslink_thresholds['LYX_LY2']: self.crosslink_pairs.append((lyx_atom, ly2_atom)) LOG.info(f" Added LYX-LY2 pair: atoms {lyx_atom[0]} - {ly2_atom[0]} (distance: {dist:.3f} Å)") + + # Find LYX SC5 - LY2 SC1 pairs + for lyx_atom in lyx_sc5_atoms: + for ly2_atom in ly2_sc1_atoms: + dist = np.linalg.norm(np.array(lyx_atom[-3:]) - np.array(ly2_atom[-3:])) + LOG.debug(f" Distance LYX SC5 (atom {lyx_atom[0]}) - LY2 SC1 (atom {ly2_atom[0]}): {dist:.3f} Å") + if dist <= self.crosslink_thresholds['LYX_LY2']: + self.crosslink_pairs.append((lyx_atom, ly2_atom)) + LOG.info(f" Added LYX-LY2 pair: atoms {lyx_atom[0]} - {ly2_atom[0]} (distance: {dist:.3f} Å)") # Find LYX SC5 - LY3 SC1 pairs for lyx_atom in lyx_sc5_atoms: @@ -280,10 +295,21 @@ def set_crosslink_bonded(self, cnt_model: Optional[int] = None, ]) connections_found += 1 LOG.info(f"Added LYX-LY2 crosslink between {clx[0]} and {cly[0]} (distance: {dist:.3f} Å)") + + # LYX SC5 - LY2 SC1 crosslinks + elif (clx[1] == 'LYX' and clx[2] == 'SC5' and + cly[1] == 'LY2' and cly[2] == 'SC1'): + self.crosslink_bonded['bonds'].append([ + clx[0], cly[0], '1', self.dlyx5ly2, f"{self.klyx5ly2}\n" + ]) + #angles check again + connections_found += 1 + LOG.info(f"Added LYX-LY2 (SC5) crosslink between {clx[0]} and {cly[0]} (distance: {dist:.3f} Å)") + # LYX SC5 - LY3 SC1 crosslinks elif (clx[1] == 'LYX' and clx[2] == 'SC5' and - cly[1] == 'LY3' and cly[2] == 'SC1'): + cly[1] == 'LY3' and cly[2] == 'SC1') self.crosslink_bonded['bonds'].append([ clx[0], cly[0], '1', self.dlyxly3, f"{self.klyxly3}\n" @@ -335,6 +361,15 @@ def set_crosslink_bonded(self, cnt_model: Optional[int] = None, connections_found += 1 LOG.info(f" Added LYX-LY2 crosslink between {cly[0]} and {clx[0]} (distance: {dist:.3f} Å)") + elif (cly[1] == 'LYX' and cly[2] == 'SC5' and + clx[1] == 'LY2' and clx[2] == 'SC1'): + + self.crosslink_bonded['bonds'].append([ + cly[0], clx[0], '1', self.dlyx5ly2, f"{self.klyx5ly2}\n" + ]) + connections_found += 1 + LOG.info(f" Added LYX-LY2 (SC5) crosslink between {cly[0]} and {clx[0]} (distance: {dist:.3f} Å)") + elif (cly[1] == 'LYX' and cly[2] == 'SC5' and clx[1] == 'LY3' and clx[2] == 'SC1'): diff --git a/src/colbuilder/core/topology/itp.py b/src/colbuilder/core/topology/itp.py index 774e3e8..f00b438 100644 --- a/src/colbuilder/core/topology/itp.py +++ b/src/colbuilder/core/topology/itp.py @@ -257,6 +257,13 @@ def read_itp( elif bonded_type == "dihedrals": self.dihedrals[cnt_con].append(tokens) + # Atom-count offset: derive the connection's last atom index + # from the parsed atoms section, so merge offsets stay correct even if + # martinize2 emitted no [ position_restraints ] block. Equivalent to the + # posres-derived value when posres is present. + if cnt_con is not None and self.atoms[cnt_con]: + self.mol_ends[cnt_con] = [int(self.atoms[cnt_con][-1][0])] + except FileNotFoundError: LOG.error(f"ITP file not found: {itp_path}") raise diff --git a/src/colbuilder/core/topology/main_topology.py b/src/colbuilder/core/topology/main_topology.py index d35e444..20a3317 100644 --- a/src/colbuilder/core/topology/main_topology.py +++ b/src/colbuilder/core/topology/main_topology.py @@ -35,7 +35,8 @@ 'go-table.itp', 'col_go-sites.itp', '*.CG.pdb', - 'D' + # Intermediate per-type caps subdirs copied into the topology working dir + 'D', 'T', 'NC', 'DT', 'TD', 'M', } def cleanup_temporary_files(ff_name: str, temp_patterns: Set[str], search_dirs: Optional[List[Path]] = None) -> None: @@ -329,11 +330,12 @@ async def build_topology(system: System, config: ColbuilderConfig, file_manager: # Always return to original directory os.chdir(original_dir) - # Clean up temporary files unless in debug mode + # Clean up temporary files unless in debug mode. + # Only clean the working topology_dir, NOT output_topology_dir: the + # latter holds the deliverables and some temp-file globs could match + # and delete them. search_dirs = [topology_dir] - if output_topology_dir is not None: - search_dirs.append(output_topology_dir) - + if not config.topology_debug: cleanup_temporary_files( config.force_field or "unknown_force_field", diff --git a/src/colbuilder/core/topology/martini.py b/src/colbuilder/core/topology/martini.py index d119e50..e3d7a96 100644 --- a/src/colbuilder/core/topology/martini.py +++ b/src/colbuilder/core/topology/martini.py @@ -15,6 +15,7 @@ import cmd import os +import sys import subprocess import shutil from pathlib import Path @@ -33,6 +34,7 @@ get_active_conda_env, find_and_install_custom_force_field, get_conda_command_with_path, + find_martinize2_executable, ) from colbuilder.core.utils.logger import setup_logger from colbuilder.core.utils.files import FileManager @@ -324,17 +326,26 @@ def cap_pdb(self, pdb: Optional[List[str]] = None) -> Tuple[List[str], str, str] firsts = set(first_res.values()) if first_res else set() lasts = {name for _, name in last_res.values()} if last_res else set() - # Anything here means "don't ask martinize2 to apply a terminal mod": + # Residues meaning "this terminus is already capped / is a crosslink + # block", so martinize2 must NOT add another terminal modification. special_first = {"ACE", "CLA", "LY2", "LY3", "L4Y", "L5Y", "LYX"} special_last = {"ACE", "CLA", "LY2", "LY3", "L4Y", "L5Y", "LYX", "NME"} - # N-terminus: conservative (ACE often causes issues on GLN etc. in some FF) - nter_flag = "none" if (not firsts or (firsts & special_first)) else "none" + # N-terminus: the geometry stage (caps.py) pre-caps every chain with an + # ACE residue, so the first residue is normally ACE and needs no -nter + # modification. We always use 'none' (martinize2's ACE modification is + # also known to fail on some first residues, e.g. GLN). Warn if a chain + # is unexpectedly NOT pre-capped so it isn't silently left uncapped. + if firsts and not (firsts & special_first): + LOG.warning( + f"N-terminus not pre-capped (first residues={sorted(firsts)}); " + f"using -nter none. Check the geometry capping stage if unexpected." + ) + nter_flag = "none" - # C-terminus: allow NME only if all chains end on standard residues (not special, not NME) - cter_flag = "NME" - if not lasts or (lasts & special_last): - cter_flag = "none" + # C-terminus: apply the NME cap unless the chain already ends in a + # cap/crosslink block, in which case use 'none'. + cter_flag = "none" if (not lasts or (lasts & special_last)) else "NME" # LOG.debug(f"cap_pdb decided: -nter {nter_flag}, -cter {cter_flag} (first={firsts}, last={lasts})") return pdb, cter_flag, nter_flag @@ -447,7 +458,10 @@ def get_system_pdb(self, size: Optional[int] = None) -> List[str]: return [] def write_system_topology( - self, topology_file: str = "system.top", size: Optional[int] = None + self, + topology_file: str = "system.top", + size: Optional[int] = None, + processed_ids: Optional[List[int]] = None, ) -> None: """ Write final topology file for the system. @@ -457,10 +471,18 @@ def write_system_topology( topology_file : str Path to the output topology file size : Optional[int] - Number of models to include + Number of models considered (fallback range when processed_ids is None). + processed_ids : Optional[List[int]] + col_{id} indices that actually produced an .itp. When given, only these + are #included, preventing GROMACS errors from missing files when some + models failed or were skipped. """ - if not size: - LOG.warning("No size provided to write_system_topology") + if processed_ids is not None: + ids_for_includes = sorted({int(m) for m in processed_ids}) + elif size: + ids_for_includes = list(range(size)) + else: + LOG.warning("No size or processed_ids provided to write_system_topology") return try: @@ -470,9 +492,15 @@ def write_system_topology( f.write('#include "martini_v3.0.0.itp"\n') f.write('#include "go-sites.itp"\n\n') - for m in range(size): - f.write(f'#include "col_{m}.itp"\n') - f.write(f'#include "col_{m}_go-excl.itp"\n') + included = [] + for m in ids_for_includes: + # Belt-and-suspenders: never include a file that isn't on disk. + if not Path(f"col_{int(m)}.itp").exists(): + LOG.warning(f"Skipping missing topology include: col_{int(m)}.itp") + continue + f.write(f'#include "col_{int(m)}.itp"\n') + f.write(f'#include "col_{int(m)}_go-excl.itp"\n') + included.append(int(m)) f.write('\n#include "martini_v3.0.0_solvents_v1.itp"\n') f.write('#include "martini_v3.0.0_ions_v1.itp"\n') @@ -481,8 +509,8 @@ def write_system_topology( f.write("Collagen, Martini 3 and Go-Potentials \n") f.write("\n[ molecules ]\n") - for t in range(size): - f.write(f"col_{t} 1\n") + for t in included: + f.write(f"col_{int(t)} 1\n") self.write_go_topology(name_type="sites.itp") @@ -527,7 +555,10 @@ def translate_pdb(self, pdb: Optional[List[str]] = None) -> List[str]: translated = [] for line in pdb: - if line[0:6] in self.is_line: + # Only ATOM/HETATM carry coordinates. TER/ANISOU lines have no + # parseable x/y/z, so don't try (and don't emit a spurious + # "Bad coord formatting" warning for every TER record). + if line[0:6] in ("ATOM ", "HETATM"): try: # Columns: x[30:38], y[38:46], z[46:54] x = float(line[30:38]) @@ -561,6 +592,47 @@ def write_gro( return None +def _build_connected_groups(system: System) -> List[List[Any]]: + """ + Collapse crosslink-connected models into transitive groups. + + Mirrors Amber.get_connected_groups so that each physically connected set + of triple helices is emitted exactly once. model.connect is symmetric and + self-inclusive (e.g. {self} + direct neighbors); a BFS gives the full group. + """ + all_models = list(system.get_models()) + processed: set = set() + groups: List[List[Any]] = [] + + for model_id in all_models: + if model_id in processed: + continue + + model = system.get_model(model_id=model_id) + if not model or not getattr(model, "connect", None): + groups.append([model_id]) + processed.add(model_id) + continue + + group: set = set() + stack = [model_id] + while stack: + cur = stack.pop() + if cur in group: + continue + group.add(cur) + cur_model = system.get_model(model_id=cur) + if cur_model and getattr(cur_model, "connect", None): + for nb in cur_model.connect: + if nb not in group: + stack.append(nb) + + groups.append(sorted(group)) + processed.update(group) + + return groups + + def check_output_files(topology_dir: Path, expected_models: int): """Diagnostic function to check which files were created""" cg_files = list(topology_dir.glob("*.CG.pdb")) @@ -689,7 +761,9 @@ async def build_martini3( ) LOG.info(f"Step 3/{steps} Processing models with Martinize2") - processed_models = [] + processed_models = [] # fibril model_ids that succeeded — for counts/logging ONLY + # (col_N.itp file indices live in written_itp_ids) + written_itp_ids: List[int] = [] # col_{cnt_model} indices actually written try: martinize2_command = getattr(config, "martinize2_command", None) @@ -715,27 +789,38 @@ async def build_martini3( type_dir.mkdir(exist_ok=True, parents=True) models_list = [model_id for model_id in system.get_models()] + + # Group crosslink-connected models so each physical triple helix is + # written exactly once. Without this, every model re-merges its whole + # neighborhood and connected helices are emitted multiple times, + # producing duplicate beads (0 A overlaps) and inflated particle counts. + connected_groups = _build_connected_groups(system) + LOG.debug( + f"Martini: collapsed {len(models_list)} models into " + f"{len(connected_groups)} connected group(s)" + ) + model_status = {} failed_martinize, failed_contact, failed_go, failed_itp = [], [], [], [] processed_in_topology: set = set() - for model_id in tqdm(models_list, desc="Building topology", unit="%"): + for group in tqdm(connected_groups, desc="Building topology", unit="grp"): + # Representative model id drives all file naming for this group. + model_id = group[0] model = system.get_model(model_id=model_id) if model is None: - LOG.warning(f"Skipping model {model_id}: model not found in System") + LOG.warning(f"Skipping group {group}: representative model not found in System") model_status[model_id] = "no_model" continue - # build connect_ids fallback - sys_connect = getattr(model, "connect", None) - if sys_connect: - connect_ids = list(sys_connect) - else: - connect_ids = [model_id] # self-connection - LOG.debug(f"Model {model_id}: no connections in System; processing as self-connection") + # Process every member of the group exactly once, and make sure the + # downstream helpers that read connectivity from the System + # (merge_pdbs, Itp.allocate/read_model) see the full group. + connect_ids = list(group) + model.connect = list(group) if not connect_ids: - LOG.warning(f"Skipping model {model_id}: no connections and no fallback") + LOG.warning(f"Skipping group {group}: no members") model_status[model_id] = "no_connections" continue @@ -762,7 +847,30 @@ def _martinize_cmd(nter_flag: str, cter_flag: str) -> str: f"-nter {nter_flag} -cter {cter_flag} " f"-govs-include -govs-moltype col_{int(model_id)}.{int(connect_id)} -maxwarn 4" ) - return get_conda_command_with_path("martinize2", args) + # An explicit user override wins. + configured = getattr(config, "martinize2_command", None) + if configured and configured not in ("martinize2", "auto"): + return f"{configured} {args}" + # Prefer the martinize2 that lives in the SAME environment + # as the running interpreter: that is the vermouth into + # which the custom martini300C force field was just + # installed. This avoids picking a martinize2 from another + # Python (e.g. a pyenv shim ahead on PATH) whose vermouth + # does not know the custom force field ("Unknown force + # field martini300C"). + local_exe = Path(sys.executable).parent / "martinize2" + if local_exe.exists(): + return f"{local_exe} {args}" + # Otherwise fall back to a direct martinize2 on PATH, and + # only wrap in `conda run` when it is reachable solely via + # a conda env. + try: + exe, use_conda, _env = find_martinize2_executable() + except Exception: + exe, use_conda = "martinize2", True + if use_conda: + return get_conda_command_with_path("martinize2", args) + return f"{exe} {args}" cmd = _martinize_cmd(nter, cter) process = await asyncio.create_subprocess_shell( @@ -799,7 +907,7 @@ def _martinize_cmd(nter_flag: str, cter_flag: str) -> str: continue go_cmd = ( - f"python create_goVirt.py -s {int(model_id)}.{int(connect_id)}.CG.pdb " + f"{sys.executable} create_goVirt.py -s {int(model_id)}.{int(connect_id)}.CG.pdb " f"-f map.out --moltype col_{int(model_id)}.{int(connect_id)} --go_eps {go_epsilon}" ) go_process = await asyncio.create_subprocess_shell( @@ -826,6 +934,7 @@ def _martinize_cmd(nter_flag: str, cter_flag: str) -> str: itp_.go_to_pairs(model_id=int(model_id)) itp_.make_topology(model_id=int(model_id), cnt_model=cnt_model) processed_models.append(model_id) + written_itp_ids.append(cnt_model) # this col_{cnt_model}.itp now exists # Track what we processed for connect_id in connect_ids: @@ -883,7 +992,11 @@ def _martinize_cmd(nter_flag: str, cter_flag: str) -> str: try: final_topology_file = f"collagen_fibril_{config.species}.top" - martini.write_system_topology(topology_file=final_topology_file, size=cnt_model) + martini.write_system_topology( + topology_file=final_topology_file, + size=cnt_model, + processed_ids=written_itp_ids, + ) topology_file_path = Path(final_topology_file) if topology_file_path.exists(): @@ -914,7 +1027,9 @@ def _martinize_cmd(nter_flag: str, cter_flag: str) -> str: ) try: - if not config.debug: + # Gate intermediate-file cleanup on topology_debug (the flag meant for + # keeping topology intermediates), not the general logging `debug` flag. + if not getattr(config, "topology_debug", False): subprocess.run(r"rm \#*", shell=True, check=False) except Exception as e: LOG.warning(f"Error cleaning up temporary files: {str(e)}") diff --git a/src/colbuilder/core/utils/config.py b/src/colbuilder/core/utils/config.py index ec432d1..75eb897 100644 --- a/src/colbuilder/core/utils/config.py +++ b/src/colbuilder/core/utils/config.py @@ -259,7 +259,7 @@ class ColbuilderConfig(BaseModel): description="Automatically detect unpaired enzymatic crosslinks and replace them", ) ratio_replace_scope: Literal["enzymatic", "non_enzymatic", "all"] = Field( - default="non_enzymatic", + default="enzymatic", description="Scope of residues considered for ratio-based replacement", ) ratio_replace: Optional[float] = Field( @@ -387,8 +387,8 @@ def __init__(self, **data): data["working_directory"] = Path.cwd().resolve() super().__init__(**data) - if self.mix_bool and self.ratio_mix is not None: - self.ratio_mix = self._convert_ratio_mix(self.ratio_mix) + # ratio_mix is converted/validated (incl. sum-to-100) by the + # validate_ratio_mix field validator; no extra conversion needed here. self.solution_space = self._convert_to_tuple(self.solution_space) self.files_mix = tuple(self.files_mix) if self.files_mix else None self.set_mode() @@ -483,8 +483,12 @@ def validate_contact_distance(cls, value: Optional[float]) -> Optional[float]: return value @field_validator("fibril_length") - def validate_fibril_length(cls, v: float) -> float: - """Validate fibril length.""" + def validate_fibril_length(cls, v: Optional[float]) -> Optional[float]: + """Validate fibril length. None is allowed here; required-ness is enforced + per-mode by validate_fibril_length_required (avoids a confusing TypeError + when the value is simply omitted).""" + if v is None: + return None if v <= 0: raise ValueError("fibril_length must be greater than 0") LOG.debug(f"Validating fibril_length: {v}") @@ -631,16 +635,6 @@ def validate_files_mix(cls, value): """Validate files mix format.""" return tuple(value) if value else None - @property - def ratio_mix(self) -> Dict[str, int]: - """Get ratio mix property.""" - return self._ratio_mix - - @ratio_mix.setter - def ratio_mix(self, value: Union[str, Dict[str, int]]): - """Set ratio mix property.""" - self._ratio_mix = self._convert_ratio_mix(value) - @field_validator("manual_replacements", mode="before") def validate_manual_replacements( cls, value: Optional[Union[str, List[str], Tuple[str, ...]]] @@ -793,12 +787,31 @@ def validate_mix_config(cls, values): error_code="CFG_ERR_004", ) if values.fibril_length is None: - values.fibril_length = values.get( - "fibril_length", 40.0 - ) # Default to 40 for mixing + values.fibril_length = 40.0 # Default to 40 nm for mixing LOG.debug(f"Validated fibril_length for mixing: {values.fibril_length}") return values + @model_validator(mode="after") + def validate_fibril_length_required(self) -> "ColbuilderConfig": + """fibril_length is required for geometry generation (no sensible default).""" + if self.geometry_generator and self.fibril_length is None: + raise ConfigurationError( + "fibril_length (nm) is required for geometry generation", + error_code="CFG_ERR_006", + ) + return self + + @model_validator(mode="after") + def validate_topology_requirements(self) -> "ColbuilderConfig": + """force_field is required when generating topology.""" + if self.topology_generator and not self.force_field: + raise ConfigurationError( + "force_field is required for topology generation " + "(supported: 'amber99' or 'martini3')", + error_code="CFG_ERR_006", + ) + return self + @model_validator(mode="after") def validate_replace_config(self) -> "ColbuilderConfig": """Validate replacement configuration.""" diff --git a/src/colbuilder/core/utils/crosslink_detector.py b/src/colbuilder/core/utils/crosslink_detector.py index 001d2e5..e417eea 100644 --- a/src/colbuilder/core/utils/crosslink_detector.py +++ b/src/colbuilder/core/utils/crosslink_detector.py @@ -41,7 +41,22 @@ class CrosslinkDetector: } ALL_CROSSLINK_RESIDUES = DIVALENT_RESIDUES | TRIVALENT_RESIDUES - + + # Map user-facing crosslink type names (as specified in the config via + # n_term_type / c_term_type) to the expected structure category. + # 'D' = divalent (two participating residues), 'T' = trivalent (three). + DIVALENT_TYPES = { + 'HLKNL', 'LKNL', 'deHLNL', 'deHHLNL', + 'Pentosidine', 'Glucosepane', 'MOLD', + } + TRIVALENT_TYPES = { + 'PYD', 'DPD', 'PYL', 'DPL', + } + + # Type names that explicitly mean "no crosslink" and should be skipped + # when validating against the structure. + NO_CROSSLINK_TYPES = {'NONE', 'NOCROSS'} + @classmethod def detect_structure_type(cls, pdb_path: Path) -> str: """ @@ -78,7 +93,136 @@ def detect_structure_type(cls, pdb_path: Path) -> str: # No crosslinks found - could be a non-crosslinked structure LOG.info("No crosslinks detected, using default type D") return 'D' # Default to divalent - + + @classmethod + def categorize_type(cls, type_name: Optional[str]) -> Optional[str]: + """ + Map a user-facing crosslink type name to a structure category. + + Args: + type_name: Crosslink type as specified in the config (e.g. "HLKNL", + "PYD"). May be None or a "no crosslink" placeholder. + + Returns: + 'D' for divalent types, 'T' for trivalent types, or None when the + type is empty, a "no crosslink" placeholder, or unrecognized. + """ + if not type_name: + return None + name = str(type_name).strip() + if name.upper() in cls.NO_CROSSLINK_TYPES: + return None + if name in cls.DIVALENT_TYPES: + return 'D' + if name in cls.TRIVALENT_TYPES: + return 'T' + return None + + @classmethod + def validate_against_specified_types( + cls, + pdb_path: Path, + specified_types: List[Optional[str]], + ) -> None: + """ + Check that the crosslinks present in a PDB match the specified types. + + This guards against a common mistake where a user supplies a PDB that + contains, for example, trivalent (PYD) crosslinks while specifying a + divalent crosslink type (e.g. HLKNL) in the configuration, or vice + versa. + + Validation is skipped (no error raised) when: + - no crosslink types are specified (or all are "NONE"/"NOCROSS"), + - none of the specified types are recognized, + - the PDB contains no crosslink residues at all. + + Args: + pdb_path: Path to the input PDB file. + specified_types: Crosslink type names from the config, e.g. + ``[config.n_term_type, config.c_term_type]``. + + Raises: + GeometryGenerationError: If the structure contains crosslink + residues whose category was not requested by the specified + types (error code ``GEO_ERR_008``). + """ + # Import here to avoid a circular import at module load time. + from colbuilder.core.utils.exceptions import GeometryGenerationError + + expected_categories: Set[str] = set() + recognized_types: List[str] = [] + for type_name in specified_types: + category = cls.categorize_type(type_name) + if category is not None: + expected_categories.add(category) + recognized_types.append(str(type_name).strip()) + + # Nothing meaningful to validate against. + if not expected_categories: + LOG.debug( + "Skipping crosslink type validation: no recognized crosslink " + "types were specified" + ) + return + + residues = cls.get_all_residues(pdb_path) + divalent_found = sorted(residues & cls.DIVALENT_RESIDUES) + trivalent_found = sorted(residues & cls.TRIVALENT_RESIDUES) + + detected_categories: Set[str] = set() + if divalent_found: + detected_categories.add('D') + if trivalent_found: + detected_categories.add('T') + + # No crosslinks in the structure: cannot (and need not) validate types. + if not detected_categories: + LOG.debug( + "Skipping crosslink type validation: no crosslink residues " + "found in %s", pdb_path + ) + return + + # The PDB contains a category of crosslink that was not requested. + if not detected_categories.issubset(expected_categories): + category_labels = {'D': 'divalent', 'T': 'trivalent'} + detected_label = ", ".join( + category_labels[c] for c in sorted(detected_categories) + ) + expected_label = ", ".join( + category_labels[c] for c in sorted(expected_categories) + ) + detected_residues = sorted(set(divalent_found) | set(trivalent_found)) + + message = ( + f"The crosslinks found in '{pdb_path}' do not match the " + f"specified crosslink types.\n" + f" Specified types: {', '.join(recognized_types)} " + f"(expected {expected_label} crosslinks)\n" + f" Detected in PDB: {detected_label} crosslink residues " + f"{detected_residues}\n" + f"Update n_term_type/c_term_type to match the structure, or " + f"provide a PDB whose crosslinks match the specified types." + ) + LOG.error(message) + raise GeometryGenerationError( + message=message, + error_code="GEO_ERR_008", + context={ + "pdb_file": str(pdb_path), + "specified_types": recognized_types, + "expected_categories": sorted(expected_categories), + "detected_categories": sorted(detected_categories), + "detected_residues": detected_residues, + }, + ) + + LOG.debug( + "Crosslink type validation passed: specified %s, detected %s", + sorted(expected_categories), sorted(detected_categories), + ) + @classmethod def get_all_residues(cls, pdb_path: Path) -> Set[str]: """ diff --git a/src/colbuilder/core/utils/crosslinks.py b/src/colbuilder/core/utils/crosslinks.py index 04931c7..54eaebb 100644 --- a/src/colbuilder/core/utils/crosslinks.py +++ b/src/colbuilder/core/utils/crosslinks.py @@ -87,7 +87,7 @@ def parse_crosslink_position( - position_str: str, residue_type: str, atom_name: str + position_str: str, residue_type: str, atom_name: str, atom_name2: Optional[str] = None ) -> Optional[CrosslinkPosition]: """ Parse a position string into a CrosslinkPosition object. @@ -116,6 +116,7 @@ def parse_crosslink_position( chain_id=parts[1], residue_type=residue_type, atom_name=atom_name, + atom_name2=(atom_name2 if atom_name2 and atom_name2 != "NONE" else None), ) except (ValueError, IndexError) as e: raise SequenceGenerationError( @@ -174,7 +175,9 @@ def extract_crosslinks_from_dataframe( pos3 = None if "P3" in row and row["P3"] != "NONE": - pos3 = parse_crosslink_position(row["P3"], row["R3"], row["A31"]) + pos3 = parse_crosslink_position( + row["P3"], row["R3"], row["A31"], row.get("A32") + ) crosslinks.append( CrosslinkPair( @@ -429,7 +432,13 @@ def _prepare_crosslink_info(self) -> List[Dict[str, str]]: pair.position3.residue_type if pair.position3 else "NONE" ), "atom31": pair.position3.atom_name if pair.position3 else "NONE", - "atom32": pair.position3.atom_name if pair.position3 else "NONE", + # Second bond of the trivalent third residue uses A32 (atom_name2); + # fall back to A31 only if A32 is absent. + "atom32": ( + (pair.position3.atom_name2 or pair.position3.atom_name) + if pair.position3 + else "NONE" + ), } for pair in self.crosslink_pairs ] diff --git a/src/colbuilder/core/utils/data_structures.py b/src/colbuilder/core/utils/data_structures.py index fe256c7..59286f9 100644 --- a/src/colbuilder/core/utils/data_structures.py +++ b/src/colbuilder/core/utils/data_structures.py @@ -102,6 +102,7 @@ class CrosslinkPosition: chain_id: str residue_type: str atom_name: str + atom_name2: Optional[str] = None # second atom (CSV A32) for the third residue of a trivalent crosslink def __post_init__(self): if self.chain_id not in {"A", "B", "C"}: diff --git a/src/colbuilder/core/utils/error_codes.py b/src/colbuilder/core/utils/error_codes.py index 312ce9d..0e910b1 100644 --- a/src/colbuilder/core/utils/error_codes.py +++ b/src/colbuilder/core/utils/error_codes.py @@ -152,7 +152,7 @@ class ErrorInfo(NamedTuple): "Contact distance must be a positive number", "Fibril length must be a positive number less than 334", "Terminal combinations must follow format 'ResidueNumber.Chain - ResidueNumber.Chain'", - "Force field must be either 'amber99' or 'martini'", + "Force field must be either 'amber99' or 'martini3'", ], ), } @@ -276,6 +276,17 @@ class ErrorInfo(NamedTuple): "Verify chain terminations match chain count", ], ), + "GEO_ERR_008": ErrorInfo( + code="GEO_ERR_008", + message="Crosslinks in PDB do not match the specified crosslink types", + suggestions=[ + "Ensure n_term_type/c_term_type match the crosslinks present in the PDB", + "Divalent types (HLKNL, LKNL, deHLNL, deHHLNL) require a divalent PDB", + "Trivalent types (PYD, DPD, PYL, DPL) require a trivalent PDB", + "Set the crosslink types to match the input structure, or provide a matching PDB", + ], + docs_url="https://colbuilder.readthedocs.io/en/latest/geometry.html#input-requirements", + ), } # Topology-related errors @@ -284,7 +295,7 @@ class ErrorInfo(NamedTuple): code="TOP_ERR_001", message="Invalid force field specification", suggestions=[ - "Use a supported force field (currently only amber99)", + "Use a supported force field ('amber99' or 'martini3')", "Check force field spelling", "Verify force field is properly configured", "Ensure force field files are available in the expected location", diff --git a/src/colbuilder/core/utils/files.py b/src/colbuilder/core/utils/files.py index ec709e9..f7f568c 100644 --- a/src/colbuilder/core/utils/files.py +++ b/src/colbuilder/core/utils/files.py @@ -531,6 +531,38 @@ def ensure_mixing_dir(self) -> Path: return self.mixing_dir + def clear_generation_dirs(self) -> None: + """ + Remove stale per-model PDBs, caps files and type subdirectories from the + geometry, mixing and replacement temp directories at the start of a run. + + Without this, re-running with a different fibril length / contact distance + leaves old ``{id}.caps.pdb`` files behind; since the final PDB is assembled + by selecting caps per model id, a model whose caps were not regenerated this + run would silently be written from the previous run's geometry. + + Deliberately does NOT touch topology_gen or sequence_gen (those are consumed + in later phases and may legitimately persist within a run). + """ + type_subdirs = {"NC", "D", "T", "TD", "DT", "M", "DY"} + for base in (self.geometry_dir, self.mixing_dir, self.replacement_dir): + if not base.exists(): + continue + try: + for entry in base.iterdir(): + if entry.is_dir() and entry.name in type_subdirs: + shutil.rmtree(entry, ignore_errors=True) + elif entry.is_file() and ( + entry.name.endswith(".caps.pdb") or entry.suffix == ".pdb" + ): + try: + entry.unlink() + except Exception: + pass + except Exception as e: + LOG.debug(f"Could not clear generation dir {base}: {e}") + LOG.debug("Cleared stale caps/type subdirs from geometry/mixing/replacement temp dirs") + def cleanup(self, category: Optional[str] = None, force: bool = False) -> None: """ Clean up temporary files and directories.