computational-ms · mukul-kandwal · Sep 14, 2025
diff --git a/pyiohat/parsers/ident/glyco_decipher_1_parser.py b/pyiohat/parsers/ident/glyco_decipher_1_parser.py
@@ -175,6 +175,88 @@ def transform_mod_entry(self, entry):
 
         return ";".join(transformed_mods)
 
+    def add_glycans_to_modifications(self):
+        """
+        Return the 'modifications' entries extended by each row's glycan.
+
+        Every row of `self.df` is handed to `_transform_glycan_entry`; the
+        per-row results are returned and `self.df` is not assigned to here.
+        Example entry: Oxidation:12;HexNAc(2)Hex(3)dHex(1)NeuAc(1):57
+        """
+        extended_mods = self.df.apply(self._transform_glycan_entry, axis=1)
+        return extended_mods
+
+    def _transform_glycan_entry(self, row):
+        """
+        Return the 'modifications' value of one row with its glycan appended.
+
+        'glycan_composition' is scanned for `Name(count)` tokens, the names are
+        renamed via the lookup below, counts are summed per sugar, and the
+        sugars are written in a fixed order followed by `:<position>`, where
+        the position comes from `_calculate_glycan_position`.
+
+        Args:
+            row: mapping-like row (e.g. a pandas Series) offering `.get`.
+
+        Returns:
+            The unchanged 'modifications' value when the position or the
+            composition is missing or no listed sugar has a count above zero;
+            otherwise `<modifications>;<glycan>:<position>`, or
+            `<glycan>:<position>` alone when the row carries no modifications.
+        """
+        # Rename glycan_composition names to the ones used in the pyiohat
+        # glycan name_to_mass dict so they can be used in ucalc_mass etc.
+        glyco_name_lookup = {
+            "NulNAcA": "NeuAc",
+            "NulNGcA": "NeuGc",
+        }
+        canonical_order = ["HexNAc", "Hex", "dHex", "NeuAc", "NeuGc"]
+
+        base_mods = row.get("modifications", "")
+        gly_comp = row.get("glycan_composition", "")
+        gly_site = self._calculate_glycan_position(row)
+
+        if pd.isna(gly_site) or pd.isna(gly_comp):
+            return base_mods
+
+        # --- Parse glycan composition ---
+        gly_dict = {}
+        for match in re.finditer(r"(\w+)\((\d+)\)", gly_comp):
+            name = glyco_name_lookup.get(match.group(1), match.group(1))
+            count = int(match.group(2))
+            if count > 0:
+                gly_dict[name] = gly_dict.get(name, 0) + count
+
+        # --- Build glycan string in canonical order ---
+        # Only sugars listed in canonical_order are written; any other parsed
+        # name is left out of the glycan string.
+        gly_str_parts = [
+            f"{sugar}({gly_dict[sugar]})"
+            for sugar in canonical_order
+            if sugar in gly_dict
+        ]
+        if not gly_str_parts:
+            return base_mods
+
+        gly_str = "".join(gly_str_parts) + f":{gly_site}"
+
+        # --- Combine with existing modifications ---
+        # A missing (NaN/None) cell is truthy but cannot be joined as text, so
+        # it is treated like an empty modification string.
+        if pd.isna(base_mods) or not base_mods:
+            return gly_str
+        return ";".join([base_mods, gly_str])
+
+    def _calculate_glycan_position(self, row):
+        """
+        Calculates the glycan position based on glyco_decipher:GlycoSite and sequence_start.
+
+        'glyco_decipher:GlycoSite' is split on ";" and 'sequence_start' on
+        "<|>". The first site entry without "/" that parses as an integer is
+        combined with the start at the same index as `site - start + 1`.
+
+        Args:
+            row: mapping-like row (e.g. a pandas Series) offering `.get`.
+
+        Returns:
+            pd.NA when either value is missing, the int position of the first
+            usable site, or the string "n" when no site could be used.
+        """
+        glyco_sites_raw = row.get("glyco_decipher:GlycoSite", "")
+        sequence_starts_raw = row.get("sequence_start", "")
+
+        if pd.isna(glyco_sites_raw) or pd.isna(sequence_starts_raw):
+            return pd.NA
+
+        def as_text(value):
+            # Numeric cells have no .split(); whole-number floats such as 57.0
+            # are written as "57" so that int() can parse them again.
+            if isinstance(value, float) and value.is_integer():
+                return str(int(value))
+            return str(value)
+
+        glyco_sites = as_text(glyco_sites_raw).split(";")
+        sequence_starts = as_text(sequence_starts_raw).split("<|>")
+
+        for i, site in enumerate(glyco_sites):
+            if "/" in site:
+                # Entries containing "/" are not single-number sites
+                continue
+            try:
+                return int(site) - int(sequence_starts[i]) + 1
+            except (ValueError, IndexError):
+                # Conversion failed or no start exists at this index
+                continue
+
+        # If no single-number site found, return 'n'
+        return "n"
+
     def unify(self):
         """
         Primary method to read and unify engine output.
@@ -190,5 +272,7 @@ def unify(self):
         self.df["glycan_composition"] = self.convert_glycan_composition()
         self.df["modifications"] = self.adjust_modifications()
         self.process_unify_style()
+        # adding glycans to modifications must happen after process_unify_style since the sequence_start column is required for glyco_site mapping
+        self.df["modifications"] = self.add_glycans_to_modifications()
 
         return self.df
diff --git a/pyiohat/parsers/ident/pglyco_3_parser.py b/pyiohat/parsers/ident/pglyco_3_parser.py
@@ -21,6 +21,7 @@ def __init__(self, *args, **kwargs):
 
         self.df = pd.read_csv(self.input_file, delimiter="\t")
         self.df.dropna(axis=1, how="all", inplace=True)
+        original_columns = set(self.df.columns)
         self.mapping_dict = {
             "GlySpec": "pglyco:GlySpec",
             "PepSpec": "pglyco:PepSpec",
@@ -34,7 +35,7 @@ def __init__(self, *args, **kwargs):
             "Peptide": "sequence",
             "Mod": "modifications",
             "PeptideMH": "pglyco:PeptideMH",
-            "Glycan(H,N,A,F)": "pglyco:Glycan(H,N,A,F)",
+            "Glycan(A,F,G,H,N)": "pglyco:Glycan(A,F,G,H,N)",
             "GlycanComposition": "glycan_composition",
             "PlausibleStruct": "pglyco:PlausibleStruct",
             "GlyID": "pglyco:GlyID",
@@ -50,8 +51,8 @@ def __init__(self, *args, **kwargs):
             "GlyIonRatio": "pglyco:GlyIonRatio",
             "byIonRatio": "pglyco:byIonRatio",
             "czIonRatio": "pglyco:czIonRatio",
-            "GlyDecoy": "glycan_is_decoy",
-            "PepDecoy": "peptide_is_decoy",
+            "GlyDecoy": "pglyco:glycan_is_decoy",
+            "PepDecoy": "pglyco:peptide_is_decoy",
             "Ion_163.06": "pglyco:Ion_163.06",
             "Ion_366.14": "pglyco:Ion_366.14",
             "Ion_204.09": "pglyco:Ion_204.09",
@@ -68,6 +69,9 @@ def __init__(self, *args, **kwargs):
         # pprint(f"mapping dict")
         # pprint(self.mapping_dict)
         self.df.rename(columns=self.mapping_dict, inplace=True)
+        unmapped_columns = original_columns.difference(set(self.mapping_dict.keys()))
+        prefix_mapping_dict = {col: f"pglyco:{col}" for col in unmapped_columns}
+        self.df.rename(columns=prefix_mapping_dict, inplace=True)
         # pprint(f"renamed df")
         # pprint(self.df)
         self.df.columns = self.df.columns.str.lstrip(" ")
@@ -113,7 +117,6 @@ def check_parser_compatibility(cls, file):
             "Peptide",
             "Mod",
             "PeptideMH",
-            "Glycan(H,N,A,F)",
             "GlycanComposition",
             "PlausibleStruct",
             "GlyID",
@@ -202,15 +205,75 @@ def transform_mod_entry(self, entry):
 
         return ";".join(transformed_mods)
 
+    def add_glycans_to_modifications(self):
+        """
+        Return the 'modifications' entries extended by each row's glycan.
+
+        Every row of `self.df` is handed to `_transform_glycan_entry`; the
+        per-row results are returned and `self.df` is not assigned to here.
+        Example entry: Oxidation:12;HexNAc(2)Hex(3)dHex(1)NeuAc(1):57
+        """
+        extended_mods = self.df.apply(self._transform_glycan_entry, axis=1)
+        return extended_mods
+
+    def _transform_glycan_entry(self, row):
+        """
+        Return the 'modifications' value of one row with its glycan appended.
+
+        'glycan_composition' is scanned for single-capital-letter tokens such
+        as H(5)N(4); the letters are translated to sugar names, counts are
+        summed per sugar, and the sugars are written in a fixed order followed
+        by `:<pglyco:GlySite>`.
+
+        Args:
+            row: mapping-like row (e.g. a pandas Series) offering `.get`.
+
+        Returns:
+            The unchanged 'modifications' value when the site or composition is
+            missing or no known sugar has a count above zero; otherwise
+            `<modifications>;<glycan>:<site>`, or `<glycan>:<site>` alone when
+            the row carries no modifications.
+        """
+        # Imported locally so the method does not rely on a module-level import.
+        import re
+
+        # Map single letters to the sugar names used in the pyiohat
+        # name_to_composition dict so they can contribute to ucalc_mass etc.
+        pglyco_glyco_lookup = {
+            "H": "Hex",
+            "N": "HexNAc",
+            "F": "dHex",
+            "A": "NeuAc",
+            "G": "NeuGc",
+        }
+        canonical_order = ["HexNAc", "Hex", "dHex", "NeuAc", "NeuGc"]
+
+        base_mods = row.get("modifications", "")
+        gly_site = row.get("pglyco:GlySite")
+        gly_comp = row.get("glycan_composition", "")
+
+        if pd.isna(gly_site) or pd.isna(gly_comp):
+            return base_mods
+
+        # pandas holds an integer column that contains missing values as
+        # floats, which would render the site as e.g. "5.0"; write whole
+        # numbers as ints instead.
+        if isinstance(gly_site, float) and gly_site.is_integer():
+            gly_site = int(gly_site)
+
+        # --- Parse glycan composition ---
+        # NOTE(review): the pattern also matches the last capital of a longer
+        # token (e.g. the "H" in "pH(1)") - confirm such tokens cannot occur.
+        gly_dict = {}
+        for match in re.finditer(r"([A-Z])\((\d+)\)", gly_comp):
+            letter, count = match.group(1), int(match.group(2))
+            if count > 0 and letter in pglyco_glyco_lookup:
+                full_name = pglyco_glyco_lookup[letter]
+                gly_dict[full_name] = gly_dict.get(full_name, 0) + count
+
+        # --- Build glycan string in canonical order ---
+        gly_str_parts = [
+            f"{sugar}({gly_dict[sugar]})"
+            for sugar in canonical_order
+            if sugar in gly_dict
+        ]
+        if not gly_str_parts:
+            return base_mods
+
+        gly_str = "".join(gly_str_parts) + f":{gly_site}"
+
+        # --- Combine with existing modifications ---
+        # A missing (NaN/None) cell is truthy but cannot be joined as text, so
+        # it is treated like an empty modification string.
+        if pd.isna(base_mods) or not base_mods:
+            return gly_str
+        return ";".join([base_mods, gly_str])
+
     def convert_is_decoy_columns(self):
         """
-        Converts 1/0 integer values in 'glycan_is_decoy' and 'peptide_is_decoy'
-        columns to True/False boolean values.
+        Fill 'glycan_is_decoy' and 'peptide_is_decoy' with True/False values.
+
+        The 1/0 values are read from the 'pglyco:glycan_is_decoy' and
+        'pglyco:peptide_is_decoy' columns and mapped to booleans; `self.df` is
+        modified in place and nothing is returned.
         """
         conversion_map = {1: True, 0: False}
 
-        self.df["glycan_is_decoy"] = self.df["glycan_is_decoy"].map(conversion_map)
-        self.df["peptide_is_decoy"] = self.df["peptide_is_decoy"].map(conversion_map)
+        # (unified column, raw pglyco column) pairs, processed in this order
+        column_pairs = (
+            ("glycan_is_decoy", "pglyco:glycan_is_decoy"),
+            ("peptide_is_decoy", "pglyco:peptide_is_decoy"),
+        )
+        for unified_col, raw_col in column_pairs:
+            self.df[unified_col] = self.df[raw_col].map(conversion_map)
 
     def unify(self):
         """
@@ -225,9 +288,10 @@ def unify(self):
             expand=True,
         )[1]
         self.df["sequence"] = self.df["sequence"].astype(str).str.replace("J", "N")
+        self.df["modifications"] = self.adjust_modifications()
+        self.df["modifications"] = self.add_glycans_to_modifications()
         self.df["glycan_composition"] = self.convert_glycan_composition()
         self.convert_is_decoy_columns()
-        self.df["modifications"] = self.adjust_modifications()
         self.process_unify_style()
 
         return self.df