Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions pyiohat/parsers/ident/glyco_decipher_1_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,88 @@ def transform_mod_entry(self, entry):

return ";".join(transformed_mods)

def add_glycans_to_modifications(self):
    """
    Build the 'modifications' column extended by the glycan of each row.

    Every row of self.df is handed to self._transform_glycan_entry, which
    returns the modification string with the glycan composition appended
    at its site position.
    Example: Oxidation:12;HexNAc(2)Hex(3)dHex(1)NeuAc(1):57

    Returns:
        The row-wise result of DataFrame.apply; self.df itself is not
        modified by this method.
    """
    row_transformer = self._transform_glycan_entry
    extended_mods = self.df.apply(row_transformer, axis=1)
    return extended_mods

def _transform_glycan_entry(self, row):
    """
    Return the 'modifications' value of one row with its glycan appended.

    The glycan composition is parsed from 'glycan_composition' as
    ``Name(count)`` pairs, renamed via the lookup below, written in a fixed
    sugar order and suffixed with the position returned by
    self._calculate_glycan_position.
    Example: Oxidation:12;HexNAc(2)Hex(3)dHex(1)NeuAc(1):57

    Args:
        row: mapping-like row (e.g. a pandas Series) providing
            'modifications' and 'glycan_composition' via ``.get``.

    Returns:
        The unchanged 'modifications' value if the glycan site or the
        composition is missing, or if no known sugar has a count > 0;
        otherwise the modifications joined with the glycan entry by ';'
        (or only the glycan entry when there are no modifications).
    """
    import logging

    # Rename glycan_composition names to the ones used in the pyiohat glycan
    # name_to_mass dict so they can be used by ucalc_mass etc.
    glyco_name_lookup = {
        "NulNAcA": "NeuAc",
        "NulNGcA": "NeuGc",
    }
    # Only sugars listed here are written to the output, in this order.
    canonical_order = ["HexNAc", "Hex", "dHex", "NeuAc", "NeuGc"]

    base_mods = row.get("modifications", "")
    gly_comp = row.get("glycan_composition", "")

    gly_site = self._calculate_glycan_position(row)

    if pd.isna(gly_site) or pd.isna(gly_comp):
        return base_mods

    # --- Parse glycan composition ---
    gly_dict = {}
    for match in re.finditer(r"(\w+)\((\d+)\)", gly_comp):
        full_name, count = match.groups()
        count = int(count)
        full_name = glyco_name_lookup.get(full_name, full_name)
        if count > 0:
            gly_dict[full_name] = gly_dict.get(full_name, 0) + count

    # --- Build glycan string in canonical order ---
    gly_str_parts = [
        f"{sugar}({gly_dict[sugar]})"
        for sugar in canonical_order
        if sugar in gly_dict
    ]
    if not gly_str_parts:
        return base_mods

    gly_str = "".join(gly_str_parts) + f":{gly_site}"

    # --- Combine with existing modifications ---
    # Debug-level log instead of print: this runs once per row.
    logging.getLogger(__name__).debug(
        "Added Glycan to Modifications: %s", gly_str
    )
    # A missing modifications cell (NaN/None) is truthy as NaN and cannot be
    # joined with a string, so treat it like "no modifications".
    if not isinstance(base_mods, str) and pd.isna(base_mods):
        base_mods = ""
    if base_mods:
        return ";".join([base_mods, gly_str])
    return gly_str

def _calculate_glycan_position(self, row):
    """
    Calculate the glycan position from 'glyco_decipher:GlycoSite' and
    'sequence_start'.

    'glyco_decipher:GlycoSite' is split on ';' and 'sequence_start' on
    '<|>'. The first site entry without a '/' that can be converted to an
    integer, together with the sequence start at the same index, yields
    the position ``site - start + 1``.

    Args:
        row: mapping-like row (e.g. a pandas Series) read via ``.get``.

    Returns:
        pd.NA if either value is missing, the integer position if one
        could be computed, otherwise the string "n".
    """

    def _as_text(value):
        # Cells may have been parsed as numbers instead of strings; an
        # integral float such as 57.0 must read as "57" to be parseable.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value)

    glyco_sites_raw = row.get("glyco_decipher:GlycoSite", "")
    sequence_starts_raw = row.get("sequence_start", "")

    if pd.isna(glyco_sites_raw) or pd.isna(sequence_starts_raw):
        return pd.NA

    glyco_sites = _as_text(glyco_sites_raw).split(";")
    sequence_starts = _as_text(sequence_starts_raw).split("<|>")

    for i, site in enumerate(glyco_sites):
        if "/" in site:
            # Not a single-number site
            continue
        try:
            return int(site) - int(sequence_starts[i]) + 1
        except (ValueError, IndexError):
            # Conversion failed or no sequence start at this index:
            # try the next site.
            continue

    # No usable single-number site found
    return "n"

def unify(self):
"""
Primary method to read and unify engine output.
Expand All @@ -190,5 +272,7 @@ def unify(self):
self.df["glycan_composition"] = self.convert_glycan_composition()
self.df["modifications"] = self.adjust_modifications()
self.process_unify_style()
# Adding glycans to modifications must happen after process_unify_style, since the sequence_start column is required for the glyco_site mapping.
self.df["modifications"] = self.add_glycans_to_modifications()

return self.df
78 changes: 71 additions & 7 deletions pyiohat/parsers/ident/pglyco_3_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def __init__(self, *args, **kwargs):

self.df = pd.read_csv(self.input_file, delimiter="\t")
self.df.dropna(axis=1, how="all", inplace=True)
original_columns = set(self.df.columns)
self.mapping_dict = {
"GlySpec": "pglyco:GlySpec",
"PepSpec": "pglyco:PepSpec",
Expand All @@ -34,7 +35,7 @@ def __init__(self, *args, **kwargs):
"Peptide": "sequence",
"Mod": "modifications",
"PeptideMH": "pglyco:PeptideMH",
"Glycan(H,N,A,F)": "pglyco:Glycan(H,N,A,F)",
"Glycan(A,F,G,H,N)": "pglyco:Glycan(A,F,G,H,N)",
"GlycanComposition": "glycan_composition",
"PlausibleStruct": "pglyco:PlausibleStruct",
"GlyID": "pglyco:GlyID",
Expand All @@ -50,8 +51,8 @@ def __init__(self, *args, **kwargs):
"GlyIonRatio": "pglyco:GlyIonRatio",
"byIonRatio": "pglyco:byIonRatio",
"czIonRatio": "pglyco:czIonRatio",
"GlyDecoy": "glycan_is_decoy",
"PepDecoy": "peptide_is_decoy",
"GlyDecoy": "pglyco:glycan_is_decoy",
"PepDecoy": "pglyco:peptide_is_decoy",
"Ion_163.06": "pglyco:Ion_163.06",
"Ion_366.14": "pglyco:Ion_366.14",
"Ion_204.09": "pglyco:Ion_204.09",
Expand All @@ -68,6 +69,9 @@ def __init__(self, *args, **kwargs):
# pprint(f"mapping dict")
# pprint(self.mapping_dict)
self.df.rename(columns=self.mapping_dict, inplace=True)
unmapped_columns = original_columns.difference(set(self.mapping_dict.keys()))
prefix_mapping_dict = {col: f"pglyco:{col}" for col in unmapped_columns}
self.df.rename(columns=prefix_mapping_dict, inplace=True)
# pprint(f"renamed df")
# pprint(self.df)
self.df.columns = self.df.columns.str.lstrip(" ")
Expand Down Expand Up @@ -113,7 +117,6 @@ def check_parser_compatibility(cls, file):
"Peptide",
"Mod",
"PeptideMH",
"Glycan(H,N,A,F)",
"GlycanComposition",
"PlausibleStruct",
"GlyID",
Expand Down Expand Up @@ -202,15 +205,75 @@ def transform_mod_entry(self, entry):

return ";".join(transformed_mods)

def add_glycans_to_modifications(self):
    """
    Build the 'modifications' column extended by the glycan of each row.

    Every row of self.df is handed to self._transform_glycan_entry, which
    returns the modification string with the glycan composition appended
    at the GlySite position.
    Example: Oxidation:12;HexNAc(2)Hex(3)dHex(1)NeuAc(1):57

    Returns:
        The row-wise result of DataFrame.apply; self.df itself is not
        modified by this method.
    """
    per_row = self._transform_glycan_entry
    with_glycans = self.df.apply(per_row, axis=1)
    return with_glycans

def _transform_glycan_entry(self, row):
    """
    Return the 'modifications' value of one row with its glycan appended.

    The glycan composition is parsed from 'glycan_composition' as
    single-letter ``X(count)`` pairs, translated to sugar names, written in
    a fixed sugar order and suffixed with the 'pglyco:GlySite' value.
    Example: Oxidation:12;HexNAc(2)Hex(3)dHex(1)NeuAc(1):57

    Args:
        row: mapping-like row (e.g. a pandas Series) providing
            'modifications', 'pglyco:GlySite' and 'glycan_composition'
            via ``.get``.

    Returns:
        The unchanged 'modifications' value if the site or the composition
        is missing, or if no known sugar has a count > 0; otherwise the
        modifications joined with the glycan entry by ';' (or only the
        glycan entry when there are no modifications).
    """
    import logging
    import re

    # Map single letters to sugar names as they are in the pyiohat
    # name_to_composition dict so they can contribute to ucalc_mass etc.
    pglyco_glyco_lookup = {
        "H": "Hex",
        "N": "HexNAc",
        "F": "dHex",
        "A": "NeuAc",
        "G": "NeuGc",
    }
    canonical_order = ["HexNAc", "Hex", "dHex", "NeuAc", "NeuGc"]

    base_mods = row.get("modifications", "")
    gly_site = row.get("pglyco:GlySite")
    gly_comp = row.get("glycan_composition", "")

    if pd.isna(gly_site) or pd.isna(gly_comp):
        return base_mods

    # A site column that contains missing values is stored as float; write
    # 57 rather than 57.0 into the modification string.
    if isinstance(gly_site, float) and gly_site.is_integer():
        gly_site = int(gly_site)

    # --- Parse glycan composition ---
    gly_dict = {}
    for match in re.finditer(r"([A-Z])\((\d+)\)", gly_comp):
        letter, count = match.groups()
        count = int(count)
        if count > 0 and letter in pglyco_glyco_lookup:
            full_name = pglyco_glyco_lookup[letter]
            gly_dict[full_name] = gly_dict.get(full_name, 0) + count

    # --- Build glycan string in canonical order ---
    gly_str_parts = [
        f"{sugar}({gly_dict[sugar]})"
        for sugar in canonical_order
        if sugar in gly_dict
    ]
    if not gly_str_parts:
        return base_mods

    gly_str = "".join(gly_str_parts) + f":{gly_site}"

    # --- Combine with existing modifications ---
    # Debug-level log instead of print: this runs once per row.
    logging.getLogger(__name__).debug(
        "Added Glycan to Modifications: %s", gly_str
    )
    # A missing modifications cell (NaN/None) is truthy as NaN and cannot be
    # joined with a string, so treat it like "no modifications".
    if not isinstance(base_mods, str) and pd.isna(base_mods):
        base_mods = ""
    if base_mods:
        return ";".join([base_mods, gly_str])
    return gly_str

def convert_is_decoy_columns(self):
    """
    Convert the 1/0 integer decoy flags to True/False boolean values.

    Reads the 'pglyco:glycan_is_decoy' and 'pglyco:peptide_is_decoy'
    columns of self.df and stores the converted values in the
    'glycan_is_decoy' and 'peptide_is_decoy' columns. The source columns
    are left untouched. Values other than 1 or 0 are not in the conversion
    map and therefore become NaN.
    """
    conversion_map = {1: True, 0: False}

    # The unified column is the engine-prefixed column without "pglyco:".
    for unified_name in ("glycan_is_decoy", "peptide_is_decoy"):
        self.df[unified_name] = self.df[f"pglyco:{unified_name}"].map(
            conversion_map
        )

def unify(self):
"""
Expand All @@ -225,9 +288,10 @@ def unify(self):
expand=True,
)[1]
self.df["sequence"] = self.df["sequence"].astype(str).str.replace("J", "N")
self.df["modifications"] = self.adjust_modifications()
self.df["modifications"] = self.add_glycans_to_modifications()
self.df["glycan_composition"] = self.convert_glycan_composition()
self.convert_is_decoy_columns()
self.df["modifications"] = self.adjust_modifications()
self.process_unify_style()

return self.df
Loading