diff --git a/README.md b/README.md
index aad3e735..6f1c873e 100644
--- a/README.md
+++ b/README.md
@@ -135,3 +135,43 @@ We maintain deployment configuration options and steps within a [private reposit
 the production MaveDB environment. The main difference between the production setup and these local setups is that
 the worker and api services are split into distinct environments, allowing them to scale up or down individually
 dependent on need.
+
+## Scripts
+
+### export_public_data.py
+
+Usage:
+
+```
+python3 -m mavedb.scripts.export_public_data
+```
+
+This generates a ZIP archive named `mavedb-dump.zip` in the working directory. The ZIP file has the following contents:
+- main.json: A JSON file providing metadata for all of the published experiment sets, experiments, and score sets
+- variants/
+  - [URN].counts.csv (for each score set URN): The score set's variant count columns,
+    sorted by variant number
+  - [URN].scores.csv (for each score set URN): The score set's variant score columns,
+    sorted by variant number
+
+In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets.
+Experiments are nested in their parent experiment sets, and score sets in their parent experiments.
+
+The score set URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like
+`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance.
+
+Unpublished data and data sets licensed under anything other than the Creative Commons Zero (CC0) license are not
+included in the dump, and user details are limited to ORCID IDs and names of contributors to published data sets.
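+
+As a quick illustration, the archive can be read with a short Python sketch like the one below. This is a minimal
+example rather than part of MaveDB itself; the nested property names `experiments`, `scoreSets`, and `urn` are
+assumptions about the metadata layout, so adjust them to match the actual contents of main.json.
+
+```python
+import json
+import zipfile
+
+import pandas as pd
+
+with zipfile.ZipFile("mavedb-dump.zip") as archive:
+    # Metadata for all published experiment sets, experiments, and score sets.
+    metadata = json.loads(archive.read("main.json"))
+    for experiment_set in metadata["experimentSets"]:
+        # Property names below are assumptions: experiments nest in experiment sets, score sets in experiments.
+        for experiment in experiment_set.get("experiments", []):
+            for score_set in experiment.get("scoreSets", []):
+                urn = score_set["urn"].removeprefix("urn:mavedb:")
+                # Load the score set's variant scores, sorted by variant number.
+                with archive.open(f"variants/{urn}.scores.csv") as scores_file:
+                    scores = pd.read_csv(scores_file)
+                print(urn, len(scores))
+```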
+
+
+### recalculate_score_set_statistics.py
+
+Usage:
+
+```
+python3 -m mavedb.scripts.recalculate_score_set_statistics
+
+python3 -m mavedb.scripts.recalculate_score_set_statistics -urn URN
+```
+
+This script recalculates statistics for all score sets or, if a URN is specified, for one score set. Statistics are
+stored as a JSON object in each score set's `statistics` property.
diff --git a/alembic/versions/b823e14d6a00_score_set_statistics.py b/alembic/versions/b823e14d6a00_score_set_statistics.py
new file mode 100644
index 00000000..4af63178
--- /dev/null
+++ b/alembic/versions/b823e14d6a00_score_set_statistics.py
@@ -0,0 +1,24 @@
+"""Score set statistics
+
+Revision ID: b823e14d6a00
+Revises: 9702d32bacb3
+Create Date: 2024-05-12 18:37:23.203099
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = 'b823e14d6a00'
+down_revision = '9702d32bacb3'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.add_column('scoresets', sa.Column('statistics', postgresql.JSONB(astext_type=sa.Text()), nullable=True))
+
+
+def downgrade():
+    op.drop_column('scoresets', 'statistics')
diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py
index ff9e55a7..52888571 100644
--- a/src/mavedb/lib/score_sets.py
+++ b/src/mavedb/lib/score_sets.py
@@ -1,8 +1,11 @@
 import csv
 import io
+import logging
 import re
-from typing import Any, BinaryIO, Iterable, Optional, Sequence
+from typing import Any, BinaryIO, Iterable, List, Optional, Sequence, Tuple, Union
+import mavehgvs
+from mavehgvs import util as mavehgvs_util
 
 import numpy as np
 import pandas as pd
 from pandas.testing import assert_index_equal
@@ -42,6 +45,8 @@
 from mavedb.models.variant import Variant
 from mavedb.view_models.search import ScoreSetsSearch
 
+logger = logging.getLogger(__name__)
+
 VariantData = dict[str, Optional[dict[str, dict]]]
 
 
@@ -635,3 +640,230 @@ def columns_for_dataset(dataset: Optional[pd.DataFrame]) -> list[str]:
         return []
 
     return [col for col in dataset.columns if col not in HGVSColumns.options()]
+
+
+def get_score_set_target_lengths(score_set: ScoreSet):
+    dna_lengths: list[int] = []
+    protein_lengths: list[int] = []
+    for target_gene in score_set.target_genes:
+        if target_gene.target_sequence is not None:
+            if target_gene.target_sequence.sequence_type == "protein":
+                protein_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0
+                protein_lengths.append(protein_length)
+                dna_lengths.append(protein_length * 3)  # Infer the DNA target length from the protein length.
+            elif target_gene.target_sequence.sequence_type == "dna":
+                dna_length = len(target_gene.target_sequence.sequence) if target_gene.target_sequence.sequence else 0
+                protein_lengths.append(0)  # Do not infer a protein target length from DNA.
+                dna_lengths.append(dna_length)
+            else:
+                # Invalid sequence type
+                raise ValidationError("Invalid sequence type")
+    return {
+        "dna": dna_lengths,
+        "protein": protein_lengths,
+    }
+
+
+def summarize_nt_mutations_in_variant(variant: Variant):
+    if variant.hgvs_nt is None:
+        return {"num_mutations": 0}
+    nt_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings(
+        [variant.hgvs_nt]
+    )
+    nt_variant: Optional[mavehgvs.Variant] = nt_variants[0][0]
+    if nt_variant is None:
+        return {"num_mutations": 0}
+    return {
+        "num_mutations": len(nt_variant.positions) if type(nt_variant.positions) is list else 1,
+    }
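+
+
+# Example (hypothetical): a variant whose hgvs_nt string is "c.[1A>G;3C>T]" yields {"num_mutations": 2}, a variant
+# with "c.1A>G" yields {"num_mutations": 1}, and a variant without an hgvs_nt string yields {"num_mutations": 0}.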
+
+
+def summarize_pro_mutations_in_variant(variant: Variant):
+    if variant.hgvs_pro is None:
+        return {"num_mutations": 0, "most_severe_mutation_type": None}
+    pro_variants: Tuple[List[Optional[mavehgvs.Variant]], List[Optional[str]]] = mavehgvs_util.parse_variant_strings(
+        [variant.hgvs_pro]
+    )
+    pro_variant: Optional[mavehgvs.Variant] = pro_variants[0][0]
+    if pro_variant is None:
+        return {"num_mutations": 0, "most_severe_mutation_type": None}
+
+    has_synonymous_mutations = False
+    has_missense_mutations = False
+    has_nonsense_mutations = False
+    has_other_mutations = False
+
+    # The pro_variant object contains either one sequence or a list of them, and similarly one variation type or a
+    # list. We normalize both here, turning single elements into lists of length 1.
+    sequences: list[Union[str, tuple[str, str], None]] = (
+        pro_variant.sequence if type(pro_variant.sequence) is list else [pro_variant.sequence]
+    )
+    variation_types: list[str] = (
+        pro_variant.variant_type if type(pro_variant.variant_type) is list else [pro_variant.variant_type]
+    )
+
+    # Look at each variation (mutation) and record whether synonymous, missense, nonsense, or other mutations occur.
+    for i, variation_type in enumerate(variation_types):
+        if variation_type == "equal":
+            has_synonymous_mutations = True
+        elif variation_type == "sub":
+            sequence = sequences[i]
+            # For a substitution, there should be two sequence elements (WT and mutated). The second sequence element
+            # may in general be one of the following:
+            # - An amino acid
+            # - Ter (stop codon). MaveHGVS doesn't support the short notation *.
+            # - - or del. This should not occur when the variant type is "sub."
+            # - =. This should not occur when the variant type is "sub," but we allow it.
+            if type(sequence) is not tuple:
+                logger.warning(f"Variant {variant.urn} has a sequence inconsistent with its variant type.")
+                has_other_mutations = True
+            elif sequence[1] in ["*", "Ter"]:
+                has_nonsense_mutations = True
+            elif sequence[1] in ["-", "del"]:
+                logger.warning(f"Variant {variant.urn} has a sequence inconsistent with its variant type.")
+                has_other_mutations = True
+            elif sequence[1] == "=" or sequence[0] == sequence[1]:
+                has_synonymous_mutations = True
+            else:
+                has_missense_mutations = True
+        else:
+            has_other_mutations = True
+
+    # Report the single most severe mutation type present, with precedence other > nonsense > missense > synonymous.
+    if has_other_mutations:
+        mutation_type = "other"
+    elif has_nonsense_mutations:
+        mutation_type = "nonsense"
+    elif has_missense_mutations:
+        mutation_type = "missense"
+    elif has_synonymous_mutations:
+        mutation_type = "synonymous"
+    else:
+        mutation_type = "other"
+
+    return {
+        "num_mutations": len(pro_variant.positions) if type(pro_variant.positions) is list else 1,
+        "most_severe_mutation_type": mutation_type,
+    }
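+
+
+# Example (hypothetical): for a variant whose hgvs_pro string is "p.[Trp24Ter;Gly56Ala]",
+# summarize_pro_mutations_in_variant returns {"num_mutations": 2, "most_severe_mutation_type": "nonsense"},
+# because the Ter substitution outranks the missense substitution in the precedence above.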
+
+
+def calculate_score_set_statistics(score_set: ScoreSet):
+    lengths = get_score_set_target_lengths(score_set)
+    dna_target_length = sum(lengths["dna"])
+    pro_target_length = sum(lengths["protein"])
+
+    num_single_mutant_nt_variants = 0
+    num_double_mutant_nt_variants = 0
+    num_triple_plus_mutant_nt_variants = 0
+
+    num_single_mutant_pro_variants = 0
+    num_double_mutant_pro_variants = 0
+    num_triple_plus_mutant_pro_variants = 0
+
+    num_splice_variants = 0
+
+    num_missense_variants = 0
+    num_nonsense_variants = 0
+    num_synonymous_variants = 0
+
+    # Totals of all NT- and AA-sequence mutations, used to compute the mean number of mutations per position.
+    total_num_nt_mutations = 0
+    total_num_pro_mutations = 0
+
+    all_variants_have_hgvs_nt = True
+    all_variants_have_hgvs_pro = True
+    some_variants_have_hgvs_nt = False
+    some_variants_have_hgvs_pro = False
+
+    for v in score_set.variants:
+        variant_has_hgvs_nt = v.hgvs_nt is not None
+        variant_has_hgvs_pro = v.hgvs_pro is not None
+        all_variants_have_hgvs_nt = all_variants_have_hgvs_nt and variant_has_hgvs_nt
+        all_variants_have_hgvs_pro = all_variants_have_hgvs_pro and variant_has_hgvs_pro
+        some_variants_have_hgvs_nt = some_variants_have_hgvs_nt or variant_has_hgvs_nt
+        some_variants_have_hgvs_pro = some_variants_have_hgvs_pro or variant_has_hgvs_pro
+
+        nt_mutation_summary = summarize_nt_mutations_in_variant(v)
+        pro_mutation_summary = summarize_pro_mutations_in_variant(v)
+
+        num_nt_mutations = nt_mutation_summary["num_mutations"]
+        num_pro_mutations = pro_mutation_summary["num_mutations"]
+        total_num_nt_mutations += num_nt_mutations
+        total_num_pro_mutations += num_pro_mutations
+
+        # Count variants by number of AA-sequence mutations.
+        if num_pro_mutations == 1:
+            num_single_mutant_pro_variants += 1
+        elif num_pro_mutations == 2:
+            num_double_mutant_pro_variants += 1
+        elif num_pro_mutations > 2:
+            num_triple_plus_mutant_pro_variants += 1
+
+        # Count variants by number of NT-sequence mutations.
+        if num_nt_mutations == 1:
+            num_single_mutant_nt_variants += 1
+        elif num_nt_mutations == 2:
+            num_double_mutant_nt_variants += 1
+        elif num_nt_mutations > 2:
+            num_triple_plus_mutant_nt_variants += 1
+
+        # Count variants with hgvs_splice set.
+        if v.hgvs_splice is not None:
+            num_splice_variants += 1
+
+        # Count protein-level variants by most severe mutation type. Variants whose most severe mutation type is
+        # "other" are not counted here.
+        most_severe_pro_mutation_type: Optional[str] = pro_mutation_summary["most_severe_mutation_type"]
+        if most_severe_pro_mutation_type == "nonsense":
+            num_nonsense_variants += 1
+        elif most_severe_pro_mutation_type == "missense":
+            num_missense_variants += 1
+        elif most_severe_pro_mutation_type == "synonymous":
+            num_synonymous_variants += 1
+
+    statistics = {
+        "num_splice_variants": num_splice_variants,
+        "target_length": {
+            "dna": dna_target_length,
+            "protein": pro_target_length,
+        },
+    }
+
+    if some_variants_have_hgvs_nt or some_variants_have_hgvs_pro:
+        statistics["num_variants_by_mutation_count"] = {}
+
+    if some_variants_have_hgvs_nt:
+        statistics["num_variants_by_mutation_count"]["nt"] = {  # type: ignore
+            "single": num_single_mutant_nt_variants,
+            "double": num_double_mutant_nt_variants,
+            "triple_or_more": num_triple_plus_mutant_nt_variants,
+        }
+
+    if some_variants_have_hgvs_pro:
+        statistics["num_variants_by_mutation_count"]["pro"] = {  # type: ignore
+            "single": num_single_mutant_pro_variants,
+            "double": num_double_mutant_pro_variants,
+            "triple_or_more": num_triple_plus_mutant_pro_variants,
+        }
+
+    if all_variants_have_hgvs_nt or all_variants_have_hgvs_pro:
+        statistics["mean_num_mutations_per_position"] = {}
+
+    if all_variants_have_hgvs_nt:
+        statistics["mean_num_mutations_per_position"]["dna"] = total_num_nt_mutations / dna_target_length if dna_target_length > 0 else 0  # type: ignore
+
+    if all_variants_have_hgvs_pro:
+        statistics["num_variants_by_mutation_type"] = {
+            "missense": num_missense_variants,
+            "nonsense": num_nonsense_variants,
+            "synonymous": num_synonymous_variants,
+        }
+        statistics["mean_num_mutations_per_position"]["protein"] = total_num_pro_mutations / pro_target_length if pro_target_length > 0 else 0  # type: ignore
+
+    return statistics
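+
+
+# Example of a statistics object returned by calculate_score_set_statistics (hypothetical values; the keys present
+# depend on which HGVS strings the score set's variants provide):
+# {
+#     "num_splice_variants": 0,
+#     "target_length": {"dna": 1503, "protein": 501},
+#     "num_variants_by_mutation_count": {
+#         "nt": {"single": 4800, "double": 150, "triple_or_more": 12},
+#         "pro": {"single": 4900, "double": 62, "triple_or_more": 0},
+#     },
+#     "num_variants_by_mutation_type": {"missense": 4200, "nonsense": 300, "synonymous": 462},
+#     "mean_num_mutations_per_position": {"dna": 3.4, "protein": 10.2},
+# }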
diff --git a/src/mavedb/models/score_set.py b/src/mavedb/models/score_set.py
index 7b3be75f..1deb23fa 100644
--- a/src/mavedb/models/score_set.py
+++ b/src/mavedb/models/score_set.py
@@ -73,6 +73,7 @@ class ScoreSet(Base):
     extra_metadata = Column(JSONB, nullable=False)
     dataset_columns = Column(JSONB, nullable=False, default={})
     external_links = Column(JSONB, nullable=False, default={})
+    statistics = Column(JSONB, nullable=True)
 
     normalised = Column(Boolean, nullable=False, default=False)
     private = Column(Boolean, nullable=False, default=True)
diff --git a/src/mavedb/scripts/recalculate_score_set_statistics.py b/src/mavedb/scripts/recalculate_score_set_statistics.py
new file mode 100644
index 00000000..aa8cc91f
--- /dev/null
+++ b/src/mavedb/scripts/recalculate_score_set_statistics.py
@@ -0,0 +1,42 @@
+"""
+Recalculate and update statistics for all score sets, or for a single score set specified by URN.
+
+Usage:
+```
+python3 -m mavedb.scripts.recalculate_score_set_statistics
+
+python3 -m mavedb.scripts.recalculate_score_set_statistics -urn URN
+```
+"""
+
+import argparse
+import logging
+
+from sqlalchemy import select
+
+from mavedb.lib.score_sets import calculate_score_set_statistics
+from mavedb.lib.script_environment import init_script_environment
+from mavedb.models.score_set import ScoreSet
+
+logger = logging.getLogger(__name__)
+
+parser = argparse.ArgumentParser(
+    prog='recalculate_score_set_statistics',
+    description='Calculate and store statistics for one score set or all score sets.'
+)
+parser.add_argument("-urn", help="URN of the score set to update")
+args = parser.parse_args()
+
+db = init_script_environment()
+
+if args.urn is not None:
+    score_sets = db.scalars(select(ScoreSet).where(ScoreSet.urn == args.urn)).all()
+else:
+    score_sets = db.scalars(select(ScoreSet)).all()
+
+for score_set in score_sets:
+    statistics = calculate_score_set_statistics(score_set)
+    score_set.statistics = statistics
+    db.add(score_set)
+    db.commit()
diff --git a/src/mavedb/view_models/score_set.py b/src/mavedb/view_models/score_set.py
index 93e95f20..699e8450 100644
--- a/src/mavedb/view_models/score_set.py
+++ b/src/mavedb/view_models/score_set.py
@@ -51,6 +51,43 @@ def get(self, key: Any, default: Any = ...) -> Any:
         return super().get(key, default)
 
 
+class ScoreSetStatisticsMutationCountDistribution(BaseModel):
+    single: int
+    double: int
+    triple_or_more: int
+
+
+class ScoreSetStatisticsMutationCountDistributions(BaseModel):
+    nt: Optional[ScoreSetStatisticsMutationCountDistribution]
+    pro: Optional[ScoreSetStatisticsMutationCountDistribution]
+
+
+class ScoreSetStatisticsMutationTypeDistribution(BaseModel):
+    missense: int
+    nonsense: int
+    synonymous: int
+
+
+class ScoreSetStatisticsTargetLengths(BaseModel):
+    dna: Optional[int]
+    protein: Optional[int]
+
+
+class ScoreSetStatisticsMutationsPerPosition(BaseModel):
+    dna: Optional[float]
+    protein: Optional[float]
+
+
+class ScoreSetStatistics(BaseModel):
+    """Score set statistics, generated as a dict by calculate_score_set_statistics."""
+
+    num_splice_variants: Optional[int]
+    num_variants_by_mutation_count: Optional[ScoreSetStatisticsMutationCountDistributions]
+    num_variants_by_mutation_type: Optional[ScoreSetStatisticsMutationTypeDistribution]
+    target_length: Optional[ScoreSetStatisticsTargetLengths]
+    mean_num_mutations_per_position: Optional[ScoreSetStatisticsMutationsPerPosition]
+
+
 class ScoreSetBase(BaseModel):
     """Base class for score set view models."""
 
@@ -266,6 +303,7 @@ class ScoreSet(SavedScoreSet):
     private: bool
     processing_state: Optional[ProcessingState]
     processing_errors: Optional[dict]
+    statistics: Optional[ScoreSetStatistics]
 
 
 class ScoreSetWithVariants(ScoreSet):
@@ -296,3 +334,4 @@ class ScoreSetPublicDump(SavedScoreSet):
     private: bool
     processing_state: Optional[ProcessingState]
     processing_errors: Optional[Dict]
+    statistics: Optional[ScoreSetStatistics]
diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py
index d41baf81..f4b204e5 100644
--- a/src/mavedb/worker/jobs.py
+++ b/src/mavedb/worker/jobs.py
@@ -6,6 +6,7 @@
 from sqlalchemy.orm import Session
 
 from mavedb.lib.score_sets import (
+    calculate_score_set_statistics,
     columns_for_dataset,
     create_variants,
     create_variants_data,
@@ -67,7 +68,8 @@ async def create_variants_for_score_set(
         variants_data = create_variants_data(validated_scores, validated_counts, None)
 
         create_variants(db, score_set, variants_data)
-
+        score_set.statistics = calculate_score_set_statistics(score_set)
+
         # Validation errors arise from problematic user data. These should be inserted into the database so failures can
         # be persisted to them.
     except ValidationError as e: