From 45309cfefac35cc4090d355aec4de3b4632746eb Mon Sep 17 00:00:00 2001
From: Spbd1 <148923621+Spbd1@users.noreply.github.com>
Date: Thu, 7 May 2026 02:36:12 +0000
Subject: [PATCH] Add 128-bit genome encoding layer

---
Review notes. Text between the "---" line above and the first "diff --git"
line is commentary only; it is not part of the commit message.

* The line structure of this patch was restored; every hunk length below
  matches the number of added lines that follow it.
* encoding.py: the integer validators now reject bool. bool subclasses int,
  so the previous isinstance(..., int) checks accepted Genome(True) and
  set_bits(..., value=True) and treated them as raw bits. A private helper,
  _is_plain_int, is shared by validate_raw_genome, _validate_bit_window and
  set_bits, and each of them documents what it raises.
* tests/test_genome_encoding.py: added tests for bool rejection, for field
  values that do not fit their bit width, for decode_genome() on a raw
  integer, and for the schema covering bits 0 through 127 with no gaps or
  overlaps.
* The "index" blob ids for encoding.py and the test file were recorded
  before these edits and no longer describe the edited content.
* random_genome.py: the callable-module shim sends a call to the legacy
  helper when the first positional argument is a random.Random, when
  gene_count is passed by keyword, or when the second positional argument
  is an int. Every other call, including a call with no arguments, reaches
  the new seed-based generator. NOTE(review): the legacy
  pishiegen.genome.operators.random_genome signature is not part of this
  patch, so confirm that no existing call site relies on a shape that now
  takes the new path.

 pishiegen/genome/decoder.py       |  29 +++++++
 pishiegen/genome/encoding.py      | 140 ++++++++++++++++++++++++++++++
 pishiegen/genome/random_genome.py |  49 +++++++++++
 pishiegen/genome/schema.py        |  99 +++++++++++++++++++++
 tests/test_genome_encoding.py     |  87 +++++++++++++++++++
 5 files changed, 404 insertions(+)
 create mode 100644 pishiegen/genome/decoder.py
 create mode 100644 pishiegen/genome/encoding.py
 create mode 100644 pishiegen/genome/random_genome.py
 create mode 100644 pishiegen/genome/schema.py
 create mode 100644 tests/test_genome_encoding.py

diff --git a/pishiegen/genome/decoder.py b/pishiegen/genome/decoder.py
new file mode 100644
index 0000000..e9dcee9
--- /dev/null
+++ b/pishiegen/genome/decoder.py
@@ -0,0 +1,29 @@
+"""Deterministic decoder for the compact 128-bit PishieGen genome.
+
+Decoding is intentionally mechanical: each named output is the unsigned integer
+stored in its documented bit range. No randomness or biological inference is
+performed.
+"""
+
+from __future__ import annotations
+
+from pishiegen.genome.encoding import Genome
+
+
+def decode_genome(genome: Genome | int) -> dict[str, int]:
+    """Decode a 128-bit genome into deterministic field values.
+
+    Args:
+        genome: Either a ``Genome`` instance or a raw integer constrained to bits
+            0 through 127.
+
+    Returns:
+        A dictionary in schema order, from ``base_coat_color`` (bits 0-7) through
+        ``reserved_experimental`` (bits 120-127).
+    """
+
+    if isinstance(genome, Genome):
+        compact = genome
+    else:
+        compact = Genome(genome)
+    return compact.to_dict()
diff --git a/pishiegen/genome/encoding.py b/pishiegen/genome/encoding.py
new file mode 100644
index 0000000..3cb4e45
--- /dev/null
+++ b/pishiegen/genome/encoding.py
@@ -0,0 +1,140 @@
+"""Bit-level utilities and dataclass for the 128-bit PishieGen genome.
+
+The compact genome stores named unsigned integer fields inside a single Python
+integer constrained to ``0 <= raw < 2**128``. Bit positions are zero-based from
+the least significant bit; each named field's inclusive range is defined in
+``pishiegen.genome.schema``. This is a computational genotype inspired by
+inheritance concepts, not a literal biological genome.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from pishiegen.genome.schema import (
+    FIELD_ORDER,
+    GENOME_BITS,
+    GENOME_MAX,
+    GENOME_MIN,
+    get_field_spec,
+)
+
+
+def _is_plain_int(value: object) -> bool:
+    """Return whether ``value`` is an ``int`` that is not a ``bool``."""
+
+    # ``bool`` subclasses ``int``; treating ``True``/``False`` as raw bits would
+    # hide caller mistakes, so the validators below refuse it explicitly.
+    return isinstance(value, int) and not isinstance(value, bool)
+
+
+def validate_raw_genome(raw: int) -> None:
+    """Validate that ``raw`` is an integer in the inclusive 128-bit range.
+
+    Valid genomes occupy bits 0 through 127, so the accepted range is
+    ``0 <= raw <= 2**128 - 1``.
+
+    Raises:
+        TypeError: If ``raw`` is not an ``int`` or is a ``bool``.
+        ValueError: If ``raw`` lies outside the 128-bit range.
+    """
+
+    if not _is_plain_int(raw):
+        raise TypeError("Genome raw value must be an int.")
+    if raw < GENOME_MIN or raw > GENOME_MAX:
+        raise ValueError(f"Genome raw value must satisfy 0 <= raw < 2**{GENOME_BITS}.")
+
+
+def _validate_bit_window(start: int, width: int) -> None:
+    """Validate a bit window within the 0-through-127 genome range.
+
+    Raises:
+        TypeError: If ``start`` or ``width`` is not an ``int`` or is a ``bool``.
+        ValueError: If ``width`` is not positive or the window leaves the genome.
+    """
+
+    if not _is_plain_int(start) or not _is_plain_int(width):
+        raise TypeError("Bit start and width must be integers.")
+    if width <= 0:
+        raise ValueError("Bit width must be positive.")
+    if start < 0 or start + width > GENOME_BITS:
+        raise ValueError(f"Bit window must fit within bits 0 through {GENOME_BITS - 1}.")
+
+
+def extract_bits(raw: int, start: int, width: int) -> int:
+    """Extract an unsigned value from ``raw`` over ``width`` bits at ``start``.
+
+    ``start`` is the least significant bit of the field. For example, extracting
+    ``start=8`` and ``width=8`` reads bits 8 through 15.
+    """
+
+    validate_raw_genome(raw)
+    _validate_bit_window(start, width)
+    return (raw >> start) & ((1 << width) - 1)
+
+
+def set_bits(raw: int, start: int, width: int, value: int) -> int:
+    """Set a bit field and return the updated 128-bit genome integer.
+
+    The field spans bits ``start`` through ``start + width - 1`` inclusively.
+    Bits outside that range are preserved exactly.
+
+    Raises:
+        TypeError: If ``value`` is not an ``int`` or is a ``bool``.
+        ValueError: If ``value`` is negative or does not fit in ``width`` bits.
+    """
+
+    validate_raw_genome(raw)
+    _validate_bit_window(start, width)
+    if not _is_plain_int(value):
+        raise TypeError("Bit field value must be an int.")
+    max_value = (1 << width) - 1
+    if value < 0 or value > max_value:
+        raise ValueError(f"Value {value!r} does not fit in {width} bits.")
+    mask = max_value << start
+    updated = (raw & ~mask) | (value << start)
+    validate_raw_genome(updated)
+    return updated
+
+
+@dataclass(frozen=True, slots=True)
+class Genome:
+    """Compact 128-bit computational genotype.
+
+    ``raw`` must satisfy ``0 <= raw < 2**128``. Named fields cover fixed bit
+    ranges from ``base_coat_color`` in bits 0 through 7 to
+    ``reserved_experimental`` in bits 120 through 127.
+    """
+
+    raw: int
+
+    def __post_init__(self) -> None:
+        """Validate the raw integer immediately after construction."""
+
+        self.validate()
+
+    def validate(self) -> None:
+        """Validate that this genome's raw integer fits bits 0 through 127."""
+
+        validate_raw_genome(self.raw)
+
+    def get_field(self, name: str) -> int:
+        """Return the unsigned value for a named field's configured bit range."""
+
+        spec = get_field_spec(name)
+        return extract_bits(self.raw, spec.start, spec.width)
+
+    def set_field(self, name: str, value: int) -> "Genome":
+        """Return a new genome with one named field changed.
+
+        Only the named field's inclusive bit range is modified; all other field
+        ranges remain unchanged.
+        """
+
+        spec = get_field_spec(name)
+        return Genome(set_bits(self.raw, spec.start, spec.width, value))
+
+    def to_dict(self) -> dict[str, int]:
+        """Decode every named bit range into a deterministic dictionary."""
+
+        return {name: self.get_field(name) for name in FIELD_ORDER}
diff --git a/pishiegen/genome/random_genome.py b/pishiegen/genome/random_genome.py
new file mode 100644
index 0000000..ae6f0bf
--- /dev/null
+++ b/pishiegen/genome/random_genome.py
@@ -0,0 +1,49 @@
+"""Random generator for compact 128-bit PishieGen genomes.
+
+The generator creates a single Python integer spanning bits 0 through 127. It is
+seedable for reproducible simulations and tests. The resulting values are a
+computational genotype, not a literal biological sequence.
+"""
+
+from __future__ import annotations
+
+import random
+import sys
+from types import ModuleType
+from typing import Any
+
+from pishiegen.genome.encoding import Genome
+from pishiegen.genome.operators import random_genome as _legacy_random_genome
+from pishiegen.genome.schema import GENOME_BITS
+
+
+def random_genome(seed: int | str | bytes | bytearray | None = None) -> Genome:
+    """Return a random 128-bit ``Genome`` with optional deterministic seeding.
+
+    Passing the same ``seed`` produces the same raw integer. Omitting ``seed``
+    uses Python's default entropy source for non-deterministic initialization.
+    The returned genome's raw value covers bits 0 through 127.
+    """
+
+    rng = random.Random(seed)
+    return Genome(rng.getrandbits(GENOME_BITS))
+
+
+class _CallableRandomGenomeModule(ModuleType):
+    """Keep ``from pishiegen.genome import random_genome`` callable.
+
+    Python binds imported submodules onto their parent package. Because this
+    module intentionally has the same name as the legacy package-level helper,
+    making the module callable preserves older call sites that generate the
+    existing gene-list genome representation.
+    """
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        if args and isinstance(args[0], random.Random):
+            return _legacy_random_genome(*args, **kwargs)
+        if "gene_count" in kwargs or (len(args) >= 2 and isinstance(args[1], int)):
+            return _legacy_random_genome(*args, **kwargs)
+        return random_genome(*args, **kwargs)
+
+
+sys.modules[__name__].__class__ = _CallableRandomGenomeModule
diff --git a/pishiegen/genome/schema.py b/pishiegen/genome/schema.py
new file mode 100644
index 0000000..e9a2f36
--- /dev/null
+++ b/pishiegen/genome/schema.py
@@ -0,0 +1,99 @@
+"""Schema for PishieGen's compact 128-bit computational genome.
+
+The 128-bit genome is a deterministic integer encoding, not a literal biological
+sequence. Bit ranges are inclusive and use zero-based indexing from the least
+significant bit. For example, ``base_coat_color`` occupies bits 0 through 7.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from types import MappingProxyType
+
+
+GENOME_BITS = 128
+"""Total number of bits in the compact genome integer."""
+
+GENOME_MIN = 0
+"""Smallest valid raw genome value."""
+
+GENOME_MAX = 2**GENOME_BITS - 1
+"""Largest valid raw genome value."""
+
+
+@dataclass(frozen=True, slots=True)
+class FieldSpec:
+    """Description of one named bit field in the 128-bit genome.
+
+    Attributes:
+        name: Stable field name used by encoders and decoders.
+        start: Inclusive zero-based start bit, counted from the least significant
+            bit of the raw integer.
+        end: Inclusive zero-based end bit, counted from the least significant bit
+            of the raw integer.
+    """
+
+    name: str
+    start: int
+    end: int
+
+    @property
+    def width(self) -> int:
+        """Return the number of bits occupied by this field."""
+
+        return self.end - self.start + 1
+
+    @property
+    def max_value(self) -> int:
+        """Return the largest unsigned value that fits in this field."""
+
+        return (1 << self.width) - 1
+
+    @property
+    def mask(self) -> int:
+        """Return this field's mask at its encoded bit position."""
+
+        return self.max_value << self.start
+
+
+_FIELD_SPECS = (
+    FieldSpec("base_coat_color", 0, 7),
+    FieldSpec("hidden_coat_color", 8, 15),
+    FieldSpec("agouti_tabby_pattern", 16, 19),
+    FieldSpec("pattern_intensity", 20, 23),
+    FieldSpec("fur_length_type", 24, 27),
+    FieldSpec("ear_morphology", 28, 31),
+    FieldSpec("tail_morphology", 32, 35),
+    FieldSpec("polydactyly", 36, 37),
+    FieldSpec("colorpoint_albino_locus", 38, 39),
+    FieldSpec("health_risk_loci", 40, 47),
+    FieldSpec("thermal_tolerance", 48, 55),
+    FieldSpec("camouflage_profile", 56, 63),
+    FieldSpec("agility_muscle", 64, 71),
+    FieldSpec("sensory_acuity", 72, 79),
+    FieldSpec("intelligence_cognition", 80, 87),
+    FieldSpec("circadian_tendency", 88, 95),
+    FieldSpec("mutation_markers", 96, 111),
+    FieldSpec("lineage_markers", 112, 119),
+    FieldSpec("reserved_experimental", 120, 127),
+)
+"""Ordered field specifications covering bits 0 through 127 exactly once."""
+
+FIELD_SPECS = MappingProxyType({field.name: field for field in _FIELD_SPECS})
+"""Immutable mapping from field names to their bit-range specifications."""
+
+FIELD_ORDER = tuple(field.name for field in _FIELD_SPECS)
+"""Stable field order used for deterministic dictionary output."""
+
+
+def get_field_spec(name: str) -> FieldSpec:
+    """Return the schema entry for ``name`` or raise ``KeyError``.
+
+    Field names map to inclusive bit ranges in the 128-bit integer. For example,
+    ``mutation_markers`` maps to bits 96 through 111.
+    """
+
+    try:
+        return FIELD_SPECS[name]
+    except KeyError as error:
+        raise KeyError(f"Unknown genome field: {name!r}") from error
diff --git a/tests/test_genome_encoding.py b/tests/test_genome_encoding.py
new file mode 100644
index 0000000..6102efb
--- /dev/null
+++ b/tests/test_genome_encoding.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import pytest
+
+from pishiegen.genome.decoder import decode_genome
+from pishiegen.genome.encoding import Genome, extract_bits, set_bits
+from pishiegen.genome.random_genome import random_genome
+from pishiegen.genome.schema import FIELD_ORDER, GENOME_MAX, get_field_spec
+
+
+def test_valid_128_bit_range() -> None:
+    assert Genome(0).raw == 0
+    assert Genome(GENOME_MAX).raw == GENOME_MAX
+
+    with pytest.raises(ValueError):
+        Genome(-1)
+    with pytest.raises(ValueError):
+        Genome(2**128)
+
+
+def test_field_extraction_correctness() -> None:
+    raw = 0
+    expected: dict[str, int] = {}
+    for index, name in enumerate(FIELD_ORDER):
+        spec = get_field_spec(name)
+        value = min(index + 1, spec.max_value)
+        raw = set_bits(raw, spec.start, spec.width, value)
+        expected[name] = value
+
+    genome = Genome(raw)
+
+    assert genome.to_dict() == expected
+    assert decode_genome(genome) == expected
+    assert extract_bits(raw, 96, 16) == expected["mutation_markers"]
+
+
+def test_setting_a_field_does_not_corrupt_other_fields() -> None:
+    original = Genome(GENOME_MAX)
+    updated = original.set_field("polydactyly", 0)
+
+    assert updated.get_field("polydactyly") == 0
+    for name in FIELD_ORDER:
+        if name != "polydactyly":
+            assert updated.get_field(name) == original.get_field(name)
+
+
+def test_random_genome_reproducibility_with_seed() -> None:
+    genome_a = random_genome(seed=42)
+    genome_b = random_genome(seed=42)
+    genome_c = random_genome(seed=43)
+
+    assert 0 <= genome_a.raw < 2**128
+    assert genome_a == genome_b
+    assert genome_a != genome_c
+
+
+def test_bool_is_rejected_as_integer_input() -> None:
+    with pytest.raises(TypeError):
+        Genome(True)
+    with pytest.raises(TypeError):
+        set_bits(0, 0, 1, True)
+    with pytest.raises(TypeError):
+        extract_bits(0, False, 8)
+
+
+def test_field_value_must_fit_its_bit_width() -> None:
+    spec = get_field_spec("polydactyly")
+
+    with pytest.raises(ValueError):
+        Genome(0).set_field("polydactyly", spec.max_value + 1)
+    with pytest.raises(ValueError):
+        Genome(0).set_field("polydactyly", -1)
+
+
+def test_decode_genome_accepts_raw_integer() -> None:
+    assert decode_genome(GENOME_MAX) == Genome(GENOME_MAX).to_dict()
+
+
+def test_schema_covers_every_bit_exactly_once() -> None:
+    next_bit = 0
+    for name in FIELD_ORDER:
+        spec = get_field_spec(name)
+        assert spec.start == next_bit
+        assert spec.width > 0
+        next_bit = spec.end + 1
+
+    assert next_bit == 128