From eec57f0bd8c9692de429d0840305e55dd2e26d85 Mon Sep 17 00:00:00 2001 From: Jeremy Stone <74574922+jstone-uw@users.noreply.github.com> Date: Tue, 14 May 2024 14:33:28 -0700 Subject: [PATCH 1/5] Calculate score set statistics and expose in API. (#35) --- .../b823e14d6a00_score_set_statistics.py | 24 ++ src/mavedb/lib/score_sets.py | 233 +++++++++++++++++- src/mavedb/models/score_set.py | 1 + .../recalculate_score_set_statistics.py | 41 +++ src/mavedb/view_models/score_set.py | 39 +++ 5 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 alembic/versions/b823e14d6a00_score_set_statistics.py create mode 100644 src/mavedb/scripts/recalculate_score_set_statistics.py diff --git a/alembic/versions/b823e14d6a00_score_set_statistics.py b/alembic/versions/b823e14d6a00_score_set_statistics.py new file mode 100644 index 00000000..4af63178 --- /dev/null +++ b/alembic/versions/b823e14d6a00_score_set_statistics.py @@ -0,0 +1,24 @@ +"""Score set statistics + +Revision ID: b823e14d6a00 +Revises: 9702d32bacb3 +Create Date: 2024-05-12 18:37:23.203099 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = 'b823e14d6a00' +down_revision = '9702d32bacb3' +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column('scoresets', sa.Column('statistics', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + + +def downgrade(): + op.drop_column('scoresets', 'statistics') diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index ff9e55a7..ff7a0249 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -1,8 +1,10 @@ import csv import io +import logging import re -from typing import Any, BinaryIO, Iterable, Optional, Sequence +from typing import Any, BinaryIO, Iterable, List, Optional, Sequence, Tuple, Union +import mavehgvs import numpy as np import pandas as pd from pandas.testing import assert_index_equal @@ -42,6 +44,8 @@ from mavedb.models.variant import Variant from mavedb.view_models.search import ScoreSetsSearch +logger = logging.getLogger(__name__) + VariantData = dict[str, Optional[dict[str, dict]]] @@ -635,3 +639,230 @@ def columns_for_dataset(dataset: Optional[pd.DataFrame]) -> list[str]: return [] return [col for col in dataset.columns if col not in HGVSColumns.options()] + + +def get_score_set_target_lengths(score_set: ScoreSet): + dna_lengths: list[int] = [] + protein_lengths: list[int] = [] + for target_gene in score_set.target_genes: + if target_gene.target_sequence is not None: + if target_gene.target_sequence.sequence_type == "protein": # or "dna" + protein_length = len(target_gene.target_sequence.sequence) + protein_lengths.append(protein_length) + dna_lengths.append(protein_length * 3) # Infer DNA target length from protein. + elif target_gene.target_sequence.sequence_type == "dna": + dna_length = len(target_gene.target_sequence.sequence) + protein_lengths.append(0) # Do not infer a protein target length from DNA. + dna_lengths.append(dna_length) + else: + # Invalid sequence type + raise ValidationError("Invalid sequence type") + return { + "dna": dna_lengths, + "protein": protein_lengths, + } + + +def summarize_nt_mutations_in_variant(variant: Variant): + if variant.hgvs_nt is None: + return {"num_mutations": 0} + nt_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs.util.parse_variant_strings( + [variant.hgvs_nt] + ) + nt_variant: Optional[mavehgvs.Variant] = nt_variants[0][0] + if nt_variant is None: + return {"num_mutations": 0} + return { + "num_mutations": len(nt_variant.positions) if type(nt_variant.positions) is list else 1, + } + + +def summarize_pro_mutations_in_variant(variant: Variant): + if variant.hgvs_pro is None: + return {"num_mutations": 0, "most_severe_mutation_type": None} + pro_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs.util.parse_variant_strings( + [variant.hgvs_pro] + ) + pro_variant: Optional[mavehgvs.Variant] = pro_variants[0][0] + if pro_variant is None: + return {"num_mutations": 0, "most_severe_mutation_type": None} + # print(f"{pro_variant.sequence}, VTYPE: {pro_variant.variant_type}, {pro_variant.target_id}") + + has_synonmyous_mutations = False + has_missense_mutations = False + has_nonsense_mutations = False + has_other_mutations = False + + # The pro_variant object contains either one sequence or a list of them, and similarly one variation type or a list. + # We normalize both here, turning single elements into lists of length 1. + sequences: list[Union[str, tuple[str, str], None]] = ( + pro_variant.sequence if type(pro_variant.sequence) is list else [pro_variant.sequence] + ) + variation_types: list[str] = ( + pro_variant.variant_type if type(pro_variant.variant_type) is list else [pro_variant.variant_type] + ) + + # Look at each variation (mutation) and count synonymous, nonsense, and missense mutations. + for i, variation_type in enumerate(variation_types): + if variation_type == "equal": + has_synonmyous_mutations = True + elif variation_type == "sub": + sequence = sequences[i] + # For a substitution, there should be two sequence elements (WT and mutated). The second sequence element + # may in general be one of the following: + # - An amino acid + # - Ter (stop codon). MaveHGVS doesn't support the short notation *. + # - - or del. This should not occur when the variant type is "sub." + # - =. This should not occur when the variant type is "sub," but we allow it. + if type(sequence) is not tuple: + logger.warn(f"Variant {variant.urn} has a sequence inconsistent with its variant type.") + has_other_mutations = True + elif sequence[1] in ["*", "Ter"]: + has_nonsense_mutations = True + elif sequence[1] in ["-", "del"]: + logger.warn(f"Variant {variant.urn} has a sequence inconsistent with its variant type.") + has_other_mutations = True + elif sequence[1] == "=" or sequence[0] == sequence[1]: + has_synonmyous_mutations = True + else: + has_missense_mutations = True + else: + print(variation_type) + has_other_mutations = True + + # Set the variant type only if all variations are of the same type. + if has_other_mutations: + mutation_type = "other" + elif has_nonsense_mutations: + mutation_type = "nonsense" + elif has_missense_mutations: + mutation_type = "missense" + elif has_synonmyous_mutations: + mutation_type = "synonymous" + else: + mutation_type = "other" + + return { + "num_mutations": len(pro_variant.positions) if type(pro_variant.positions) is list else 1, + "most_severe_mutation_type": mutation_type, + } + + +def calculate_score_set_statistics(score_set: ScoreSet): + score_set.target_genes + + lengths = get_score_set_target_lengths(score_set) + dna_target_length = sum(lengths["dna"]) + pro_target_length = sum(lengths["protein"]) + + num_single_mutant_nt_variants = 0 + num_double_mutant_nt_variants = 0 + num_triple_plus_mutant_nt_variants = 0 + + num_single_mutant_pro_variants = 0 + num_double_mutant_pro_variants = 0 + num_triple_plus_mutant_pro_variants = 0 + + num_splice_variants = 0 + + num_missense_variants = 0 + num_nonsense_variants = 0 + num_synonymous_variants = 0 + + # Count of all AA- and NT-sequence mutations, to be used in determining the average number of mutations per position + num_nt_mutations = 0 + num_pro_mutations = 0 + + all_variants_have_hgvs_nt = True + all_variants_have_hgvs_pro = True + some_variants_have_hgvs_nt = False + some_variants_have_hgvs_pro = False + + for v in score_set.variants: + # if not re.search(r"\*$", v.hgvs_pro): + # continue + + variant_has_hgvs_nt = v.hgvs_nt is not None + variant_has_hgvs_pro = v.hgvs_pro is not None + all_variants_have_hgvs_nt = all_variants_have_hgvs_nt and variant_has_hgvs_nt + all_variants_have_hgvs_pro = all_variants_have_hgvs_pro and variant_has_hgvs_pro + some_variants_have_hgvs_nt = some_variants_have_hgvs_nt or variant_has_hgvs_nt + some_variants_have_hgvs_pro = some_variants_have_hgvs_pro or variant_has_hgvs_pro + + nt_mutation_summary = summarize_nt_mutations_in_variant(v) + pro_mutation_summary = summarize_pro_mutations_in_variant(v) + + num_nt_mutations = nt_mutation_summary["num_mutations"] + num_pro_mutations = pro_mutation_summary["num_mutations"] + + # Count variants by number of AA-sequence mutations. + if num_pro_mutations == 1: + num_single_mutant_pro_variants += 1 + elif num_pro_mutations == 2: + num_double_mutant_pro_variants += 1 + elif num_pro_mutations > 2: + num_triple_plus_mutant_pro_variants += 1 + + # Count variants by number of NT-sequence mutations. + if num_nt_mutations == 1: + num_single_mutant_nt_variants += 1 + elif num_nt_mutations == 2: + num_double_mutant_nt_variants += 1 + elif num_nt_mutations > 2: + num_triple_plus_mutant_nt_variants += 1 + + # Count variants with hgvs_splice set. + if v.hgvs_splice is not None: + num_splice_variants += 1 + + # Count protein sequence variants by most severe mutation type. Those with mutations other than missense + most_severe_pro_mutation_type: int = pro_mutation_summary["most_severe_mutation_type"] + if most_severe_pro_mutation_type == "nonsense": + num_nonsense_variants += 1 + elif most_severe_pro_mutation_type == "missense": + num_missense_variants += 1 + elif most_severe_pro_mutation_type == "synonymous": + num_synonymous_variants += 1 + + print(f"{v.hgvs_pro}: {num_nt_mutations}/{num_pro_mutations} ({most_severe_pro_mutation_type})") + + statistics = { + "num_splice_variants": num_splice_variants, + "target_length": { + "dna": dna_target_length, + "protein": pro_target_length, + }, + } + + if some_variants_have_hgvs_nt or some_variants_have_hgvs_pro: + statistics["num_variants_by_mutation_count"] = {} + + if some_variants_have_hgvs_nt: + statistics["num_variants_by_mutation_count"]["nt"] = { # type: ignore + "single": num_single_mutant_nt_variants, + "double": num_double_mutant_nt_variants, + "triple_or_more": num_triple_plus_mutant_nt_variants, + } + + if some_variants_have_hgvs_pro: + statistics["num_variants_by_mutation_count"]["pro"] = { # type: ignore + "single": num_single_mutant_pro_variants, + "double": num_double_mutant_pro_variants, + "triple_or_more": num_triple_plus_mutant_pro_variants, + } + + if all_variants_have_hgvs_nt or all_variants_have_hgvs_pro: + statistics["mean_num_mutations_per_position"] = {} + + if all_variants_have_hgvs_nt: + statistics["mean_num_mutations_per_position"]["dna"] = num_nt_mutations / dna_target_length if dna_target_length > 0 else 0 # type: ignore + + if all_variants_have_hgvs_pro: + statistics["num_variants_by_mutation_type"] = { + "missense": num_missense_variants, + "nonsense": num_nonsense_variants, + "synonymous": num_synonymous_variants, + } + statistics["mean_num_mutations_per_position"]["protein"] = num_pro_mutations / pro_target_length if pro_target_length > 0 else 0 # type: ignore + + return statistics diff --git a/src/mavedb/models/score_set.py b/src/mavedb/models/score_set.py index 7b3be75f..1deb23fa 100644 --- a/src/mavedb/models/score_set.py +++ b/src/mavedb/models/score_set.py @@ -73,6 +73,7 @@ class ScoreSet(Base): extra_metadata = Column(JSONB, nullable=False) dataset_columns = Column(JSONB, nullable=False, default={}) external_links = Column(JSONB, nullable=False, default={}) + statistics = Column(JSONB, nullable=True) normalised = Column(Boolean, nullable=False, default=False) private = Column(Boolean, nullable=False, default=True) diff --git a/src/mavedb/scripts/recalculate_score_set_statistics.py b/src/mavedb/scripts/recalculate_score_set_statistics.py new file mode 100644 index 00000000..a6b3aa0d --- /dev/null +++ b/src/mavedb/scripts/recalculate_score_set_statistics.py @@ -0,0 +1,41 @@ +""" +Recalculate and update statistics for all score sets. + +Usage: +``` +python3 -m mavedb.scripts.recalculate_score_set_statistics +``` +""" + +import argparse +import logging + + +from sqlalchemy import select + +from mavedb.lib.score_sets import calculate_score_set_statistics +from mavedb.lib.script_environment import init_script_environment +from mavedb.models.score_set import ScoreSet + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser( + prog='recalculate_score_set_statistics', + description='Calculate and store statistics for one score set or all score sets.' +) +parser.add_argument("-urn", help="URN of the score set to update") +args = parser.parse_args() + +db = init_script_environment() + +if args.urn is not None: + score_sets = db.scalars(select(ScoreSet).where(ScoreSet.urn == args.urn)).all() +else: + score_sets = db.scalars(select(ScoreSet)).all() + +for score_set in score_sets: + statistics = calculate_score_set_statistics(score_set) + print(statistics) + score_set.statistics = statistics + db.add(score_set) + db.commit() diff --git a/src/mavedb/view_models/score_set.py b/src/mavedb/view_models/score_set.py index 93e95f20..699e8450 100644 --- a/src/mavedb/view_models/score_set.py +++ b/src/mavedb/view_models/score_set.py @@ -51,6 +51,43 @@ def get(self, key: Any, default: Any = ...) -> Any: return super().get(key, default) +class ScoreSetStatisticsMutationCountDistribution(BaseModel): + single: int + double: int + triple_or_more: int + + +class ScoreSetStatisticsMutationCountDistributions(BaseModel): + nt: Optional[ScoreSetStatisticsMutationCountDistribution] + pro: Optional[ScoreSetStatisticsMutationCountDistribution] + + +class ScoreSetStatisticsMutationTypeDistribution(BaseModel): + missense: int + nonsense: int + synonymous: int + + +class ScoreSetStatisticsTargetLengths(BaseModel): + dna: Optional[int] + protein: Optional[int] + + +class ScoreSetStatisticsMutationsPerPosition(BaseModel): + dna: Optional[int] + protein: Optional[int] + + +class ScoreSetStatistics(BaseModel): + """Score set statistics generated as a dict by calculate_score_set_statistics.""" + + num_splice_variants: Optional[int] + num_variants_by_mutation_count: Optional[ScoreSetStatisticsMutationCountDistributions] + num_variants_by_mutation_type: Optional[ScoreSetStatisticsMutationTypeDistribution] + target_length: Optional[ScoreSetStatisticsTargetLengths] + mean_num_mutations_per_position: Optional[ScoreSetStatisticsMutationsPerPosition] + + class ScoreSetBase(BaseModel): """Base class for score set view models.""" @@ -266,6 +303,7 @@ class ScoreSet(SavedScoreSet): private: bool processing_state: Optional[ProcessingState] processing_errors: Optional[dict] + statistics: Optional[ScoreSetStatistics] class ScoreSetWithVariants(ScoreSet): @@ -296,3 +334,4 @@ class ScoreSetPublicDump(SavedScoreSet): private: bool processing_state: Optional[ProcessingState] processing_errors: Optional[Dict] + statistics: Optional[ScoreSetStatistics] From 7f99125f118d2d0462fda8b71cc66e9b83e96540 Mon Sep 17 00:00:00 2001 From: Jeremy Stone <74574922+jstone-uw@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:57:33 -0700 Subject: [PATCH 2/5] Score set statistics script usage instructions --- README.md | 40 +++++++++++++++++++ .../recalculate_score_set_statistics.py | 3 +- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index aad3e735..6f1c873e 100644 --- a/README.md +++ b/README.md @@ -135,3 +135,43 @@ We maintain deployment configuration options and steps within a [private reposit the production MaveDB environment. The main difference between the production setup and these local setups is that the worker and api services are split into distinct environments, allowing them to scale up or down individually dependent on need. + +## Scripts + +### export_public_data.py + +Usage: + +``` +python3 -m mavedb.scripts.export_public_data +``` + +This generates a ZIP archive named `mavedb-dump.zip` in the working directory. the ZIP file has the following contents: +- main.json: A JSON file providing metadata for all of the published experiment sets, experiments, and score sets +- variants/ + - [URN].counts.csv (for each variant URN): The score set's variant count columns, + sorted by variant number + - [URN].scores.csv (for each variant URN): The score set's variant count columns, + sorted by variant number + +In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets. +Experiments are nested in their parent experiment sets, and score sets in their parent experiments. + +The variant URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like +`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance. + +Unpublished data and data sets licensed other than under the Create Commmons Zero license are not included in the dump, +and user details are limited to ORCID IDs and names of contributors to published data sets. + + +### recalculate_score_set_statistics.py + +Usage: + +``` +python3 -m mavedb.scripts.recalculate_score_set_statistics + +python3 -m mavedb.scripts.recalculate_score_set_statistics -urn +``` + +This script recalculates statistics for all score sets or, if a URN is specified, for one score set. Statistics are stored as a JSON object in each score set's `statistics` property. diff --git a/src/mavedb/scripts/recalculate_score_set_statistics.py b/src/mavedb/scripts/recalculate_score_set_statistics.py index a6b3aa0d..aa8cc91f 100644 --- a/src/mavedb/scripts/recalculate_score_set_statistics.py +++ b/src/mavedb/scripts/recalculate_score_set_statistics.py @@ -4,6 +4,8 @@ Usage: ``` python3 -m mavedb.scripts.recalculate_score_set_statistics + +python3 -m mavedb.scripts.recalculate_score_set_statistics -urn ``` """ @@ -35,7 +37,6 @@ for score_set in score_sets: statistics = calculate_score_set_statistics(score_set) - print(statistics) score_set.statistics = statistics db.add(score_set) db.commit() From 551067c841dbfc8c03bb7d23c434dd65766d29d0 Mon Sep 17 00:00:00 2001 From: Jeremy Stone <74574922+jstone-uw@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:58:09 -0700 Subject: [PATCH 3/5] Added score set statistics calculation to the intake worker job. --- src/mavedb/worker/jobs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index d41baf81..f4b204e5 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -6,6 +6,7 @@ from sqlalchemy.orm import Session from mavedb.lib.score_sets import ( + calculate_score_set_statistics, columns_for_dataset, create_variants, create_variants_data, @@ -67,7 +68,8 @@ async def create_variants_for_score_set( variants_data = create_variants_data(validated_scores, validated_counts, None) create_variants(db, score_set, variants_data) - + score_set.statistics = calculate_score_set_statistics(score_set) + # Validation errors arise from problematic user data. These should be inserted into the database so failures can # be persisted to them. except ValidationError as e: From bdcc673623d7fb781d30f36226d5202ffada4488 Mon Sep 17 00:00:00 2001 From: Jeremy Stone <74574922+jstone-uw@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:58:50 -0700 Subject: [PATCH 4/5] Code cleanup --- src/mavedb/lib/score_sets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index ff7a0249..872beca5 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -727,7 +727,7 @@ def summarize_pro_mutations_in_variant(variant: Variant): else: has_missense_mutations = True else: - print(variation_type) + # print(variation_type) has_other_mutations = True # Set the variant type only if all variations are of the same type. @@ -824,7 +824,7 @@ def calculate_score_set_statistics(score_set: ScoreSet): elif most_severe_pro_mutation_type == "synonymous": num_synonymous_variants += 1 - print(f"{v.hgvs_pro}: {num_nt_mutations}/{num_pro_mutations} ({most_severe_pro_mutation_type})") + # print(f"{v.hgvs_pro}: {num_nt_mutations}/{num_pro_mutations} ({most_severe_pro_mutation_type})") statistics = { "num_splice_variants": num_splice_variants, From b94378bbefa3fc103b623eab75ed17940226f08d Mon Sep 17 00:00:00 2001 From: Jeremy Stone <74574922+jstone-uw@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:58:59 -0700 Subject: [PATCH 5/5] MyPy fixes --- src/mavedb/lib/score_sets.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 872beca5..52888571 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -5,6 +5,7 @@ from typing import Any, BinaryIO, Iterable, List, Optional, Sequence, Tuple, Union import mavehgvs +from mavehgvs import util as mavehgvs_util import numpy as np import pandas as pd from pandas.testing import assert_index_equal @@ -647,11 +648,11 @@ def get_score_set_target_lengths(score_set: ScoreSet): for target_gene in score_set.target_genes: if target_gene.target_sequence is not None: if target_gene.target_sequence.sequence_type == "protein": # or "dna" - protein_length = len(target_gene.target_sequence.sequence) + protein_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0 protein_lengths.append(protein_length) dna_lengths.append(protein_length * 3) # Infer DNA target length from protein. elif target_gene.target_sequence.sequence_type == "dna": - dna_length = len(target_gene.target_sequence.sequence) + dna_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0 protein_lengths.append(0) # Do not infer a protein target length from DNA. dna_lengths.append(dna_length) else: @@ -666,7 +667,7 @@ def get_score_set_target_lengths(score_set: ScoreSet): def summarize_nt_mutations_in_variant(variant: Variant): if variant.hgvs_nt is None: return {"num_mutations": 0} - nt_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs.util.parse_variant_strings( + nt_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings( [variant.hgvs_nt] ) nt_variant: Optional[mavehgvs.Variant] = nt_variants[0][0] @@ -680,7 +681,7 @@ def summarize_nt_mutations_in_variant(variant: Variant): def summarize_pro_mutations_in_variant(variant: Variant): if variant.hgvs_pro is None: return {"num_mutations": 0, "most_severe_mutation_type": None} - pro_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs.util.parse_variant_strings( + pro_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings( [variant.hgvs_pro] ) pro_variant: Optional[mavehgvs.Variant] = pro_variants[0][0]