Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions pishiegen/genome/decoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Deterministic decoder for the compact 128-bit PishieGen genome.

Decoding is intentionally mechanical: each named output is the unsigned integer
stored in its documented bit range. No randomness or biological inference is
performed.
"""

from __future__ import annotations

from pishiegen.genome.encoding import Genome


def decode_genome(genome: Genome | int) -> dict[str, int]:
    """Return every named genome field as an unsigned integer.

    Args:
        genome: A ``Genome`` instance, or a raw integer. A raw integer is
            wrapped in ``Genome`` first, so it goes through the same
            construction-time validation as any other genome.

    Returns:
        The dictionary produced by ``Genome.to_dict()`` for the genome.
    """
    # Normalise the input to a Genome so there is a single decoding path.
    wrapped = genome if isinstance(genome, Genome) else Genome(genome)
    return wrapped.to_dict()
119 changes: 119 additions & 0 deletions pishiegen/genome/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""Bit-level utilities and dataclass for the 128-bit PishieGen genome.

The compact genome stores named unsigned integer fields inside a single Python
integer constrained to ``0 <= raw < 2**128``. Bit positions are zero-based from
the least significant bit; each named field's inclusive range is defined in
``pishiegen.genome.schema``. This is a computational genotype inspired by
inheritance concepts, not a literal biological genome.
"""

from __future__ import annotations

from dataclasses import dataclass

from pishiegen.genome.schema import (
FIELD_ORDER,
GENOME_BITS,
GENOME_MAX,
GENOME_MIN,
get_field_spec,
)


def validate_raw_genome(raw: int) -> None:
    """Check that ``raw`` is an ``int`` within the genome's value range.

    The accepted range is ``GENOME_MIN <= raw <= GENOME_MAX`` (both ends
    inclusive).

    Raises:
        TypeError: If ``raw`` is not an ``int``.
        ValueError: If ``raw`` lies outside the accepted range.
    """
    if not isinstance(raw, int):
        raise TypeError("Genome raw value must be an int.")
    if not GENOME_MIN <= raw <= GENOME_MAX:
        raise ValueError(f"Genome raw value must satisfy 0 <= raw < 2**{GENOME_BITS}.")


def _validate_bit_window(start: int, width: int) -> None:
    """Check that ``width`` bits starting at ``start`` fit inside the genome.

    Raises:
        TypeError: If ``start`` or ``width`` is not an ``int``.
        ValueError: If ``width`` is not positive, ``start`` is negative, or the
            window extends past bit ``GENOME_BITS - 1``.
    """
    if not (isinstance(start, int) and isinstance(width, int)):
        raise TypeError("Bit start and width must be integers.")
    if width < 1:
        raise ValueError("Bit width must be positive.")
    # One past the highest bit the window touches.
    end_exclusive = start + width
    if start < 0 or end_exclusive > GENOME_BITS:
        raise ValueError(f"Bit window must fit within bits 0 through {GENOME_BITS - 1}.")


def extract_bits(raw: int, start: int, width: int) -> int:
    """Read the unsigned value held in ``width`` bits of ``raw`` from ``start``.

    ``start`` names the least significant bit of the window, so ``start=8``
    with ``width=8`` reads bits 8 through 15.

    Both the raw value and the bit window are validated first; whatever those
    validators raise propagates to the caller.
    """
    validate_raw_genome(raw)
    _validate_bit_window(start, width)
    # Move the window down to bit 0, then keep only its ``width`` low bits.
    low_bits_mask = (1 << width) - 1
    shifted = raw >> start
    return shifted & low_bits_mask


def set_bits(raw: int, start: int, width: int, value: int) -> int:
    """Return ``raw`` with one bit window overwritten by ``value``.

    The window covers bits ``start`` through ``start + width - 1`` inclusive;
    every bit outside it is carried over unchanged.

    Raises:
        TypeError: If ``value`` is not an ``int``.
        ValueError: If ``value`` is negative or too large for ``width`` bits.

    Errors raised while validating ``raw``, the bit window, or the updated
    result also propagate to the caller.
    """
    validate_raw_genome(raw)
    _validate_bit_window(start, width)
    if not isinstance(value, int):
        raise TypeError("Bit field value must be an int.")

    field_limit = (1 << width) - 1
    if not 0 <= value <= field_limit:
        raise ValueError(f"Value {value!r} does not fit in {width} bits.")

    # Zero the target window, then OR the new value into the cleared slot.
    cleared = raw & ~(field_limit << start)
    result = cleared | (value << start)
    validate_raw_genome(result)
    return result


@dataclass(frozen=True, slots=True)
class Genome:
    """Immutable wrapper around the compact genome integer.

    ``raw`` is validated with ``validate_raw_genome`` as soon as the instance
    is built. Named fields are looked up through ``get_field_spec`` and read
    or written with ``extract_bits`` / ``set_bits``.
    """

    raw: int

    def __post_init__(self) -> None:
        """Run validation right after the dataclass sets ``raw``."""
        self.validate()

    def validate(self) -> None:
        """Validate ``raw``; errors from ``validate_raw_genome`` propagate."""
        validate_raw_genome(self.raw)

    def get_field(self, name: str) -> int:
        """Return the unsigned value stored in the field called ``name``."""
        field = get_field_spec(name)
        return extract_bits(self.raw, field.start, field.width)

    def set_field(self, name: str, value: int) -> "Genome":
        """Return a new ``Genome`` whose field ``name`` holds ``value``.

        This instance is left untouched; only the bits belonging to the named
        field differ in the returned genome.
        """
        field = get_field_spec(name)
        new_raw = set_bits(self.raw, field.start, field.width, value)
        return Genome(new_raw)

    def to_dict(self) -> dict[str, int]:
        """Return all named fields as a dictionary in ``FIELD_ORDER`` order."""
        decoded: dict[str, int] = {}
        for field_name in FIELD_ORDER:
            decoded[field_name] = self.get_field(field_name)
        return decoded
49 changes: 49 additions & 0 deletions pishiegen/genome/random_genome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Random generator for compact 128-bit PishieGen genomes.

The generator creates a single Python integer spanning bits 0 through 127. It is
seedable for reproducible simulations and tests. The resulting values are a
computational genotype, not a literal biological sequence.
"""

from __future__ import annotations

import random
import sys
from types import ModuleType
from typing import Any

from pishiegen.genome.encoding import Genome
from pishiegen.genome.operators import random_genome as _legacy_random_genome
from pishiegen.genome.schema import GENOME_BITS


def random_genome(seed: int | str | bytes | bytearray | None = None) -> Genome:
    """Build a ``Genome`` from ``GENOME_BITS`` pseudo-random bits.

    A dedicated ``random.Random`` instance is created from ``seed``, so the
    same seed always yields the same raw integer and the module-level
    generator is not touched. With ``seed=None`` the seeding is left to
    ``random.Random``'s own default behaviour.
    """
    generator = random.Random(seed)
    raw_bits = generator.getrandbits(GENOME_BITS)
    return Genome(raw_bits)


class _CallableRandomGenomeModule(ModuleType):
    """Module type that can be called like the legacy helper function.

    This module shares its name with the older package-level helper, and
    Python binds an imported submodule onto its parent package under that
    name. Making the module object callable keeps
    ``from pishiegen.genome import random_genome`` usable as a function.
    """

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """Forward the call to the legacy helper or to ``random_genome``.

        The legacy helper is chosen when the first positional argument is a
        ``random.Random``, when ``gene_count`` is passed by keyword, or when a
        second positional argument is an ``int``. Every other call goes to the
        compact-genome ``random_genome`` defined in this module.
        """
        wants_legacy = (
            (bool(args) and isinstance(args[0], random.Random))
            or "gene_count" in kwargs
            or (len(args) >= 2 and isinstance(args[1], int))
        )
        target = _legacy_random_genome if wants_legacy else random_genome
        return target(*args, **kwargs)


# Re-class this already-imported module object as the callable subclass so the
# module itself can be invoked; see _CallableRandomGenomeModule for the reason.
sys.modules[__name__].__class__ = _CallableRandomGenomeModule
99 changes: 99 additions & 0 deletions pishiegen/genome/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Schema for PishieGen's compact 128-bit computational genome.

The 128-bit genome is a deterministic integer encoding, not a literal biological
sequence. Bit ranges are inclusive and use zero-based indexing from the least
significant bit. For example, ``base_coat_color`` occupies bits 0 through 7.
"""

from __future__ import annotations

from dataclasses import dataclass
from types import MappingProxyType


GENOME_BITS = 128
"""Width of the compact genome integer, in bits."""

GENOME_MIN = 0
"""Lowest raw value a genome may hold."""

GENOME_MAX = (1 << GENOME_BITS) - 1
"""Highest raw value a genome may hold: every one of ``GENOME_BITS`` bits set."""


@dataclass(frozen=True, slots=True)
class FieldSpec:
"""Description of one named bit field in the 128-bit genome.

Attributes:
name: Stable field name used by encoders and decoders.
start: Inclusive zero-based start bit, counted from the least significant
bit of the raw integer.
end: Inclusive zero-based end bit, counted from the least significant bit
of the raw integer.
"""

name: str
start: int
end: int

@property
def width(self) -> int:
"""Return the number of bits occupied by this field."""

return self.end - self.start + 1

@property
def max_value(self) -> int:
"""Return the largest unsigned value that fits in this field."""

return (1 << self.width) - 1

@property
def mask(self) -> int:
"""Return this field's mask at its encoded bit position."""

return self.max_value << self.start


_FIELD_SPECS = tuple(
    FieldSpec(field_name, first_bit, last_bit)
    for field_name, first_bit, last_bit in (
        ("base_coat_color", 0, 7),
        ("hidden_coat_color", 8, 15),
        ("agouti_tabby_pattern", 16, 19),
        ("pattern_intensity", 20, 23),
        ("fur_length_type", 24, 27),
        ("ear_morphology", 28, 31),
        ("tail_morphology", 32, 35),
        ("polydactyly", 36, 37),
        ("colorpoint_albino_locus", 38, 39),
        ("health_risk_loci", 40, 47),
        ("thermal_tolerance", 48, 55),
        ("camouflage_profile", 56, 63),
        ("agility_muscle", 64, 71),
        ("sensory_acuity", 72, 79),
        ("intelligence_cognition", 80, 87),
        ("circadian_tendency", 88, 95),
        ("mutation_markers", 96, 111),
        ("lineage_markers", 112, 119),
        ("reserved_experimental", 120, 127),
    )
)
"""Field specifications in schema order; each row is (name, start bit, end bit)."""

FIELD_SPECS = MappingProxyType({spec.name: spec for spec in _FIELD_SPECS})
"""Read-only mapping from each field name to its ``FieldSpec``."""

FIELD_ORDER = tuple(spec.name for spec in _FIELD_SPECS)
"""Field names in schema order, used for deterministic dictionary output."""


def get_field_spec(name: str) -> FieldSpec:
    """Look up the ``FieldSpec`` registered under ``name``.

    Raises:
        KeyError: If ``name`` is not a key of ``FIELD_SPECS``. The error names
            the unknown field and is chained to the original lookup failure.
    """
    try:
        spec = FIELD_SPECS[name]
    except KeyError as missing:
        raise KeyError(f"Unknown genome field: {name!r}") from missing
    return spec
54 changes: 54 additions & 0 deletions tests/test_genome_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from __future__ import annotations

import pytest

from pishiegen.genome.decoder import decode_genome
from pishiegen.genome.encoding import Genome, extract_bits, set_bits
from pishiegen.genome.random_genome import random_genome
from pishiegen.genome.schema import FIELD_ORDER, GENOME_MAX, get_field_spec


def test_valid_128_bit_range() -> None:
    """Both range boundaries are accepted; one step outside either is not."""
    for boundary in (0, GENOME_MAX):
        assert Genome(boundary).raw == boundary

    for out_of_range in (-1, 2**128):
        with pytest.raises(ValueError):
            Genome(out_of_range)


def test_field_extraction_correctness() -> None:
    """Values packed field by field are read back unchanged by every decoder."""
    expected: dict[str, int] = {}
    packed = 0
    # Give each field a distinct small value, capped at what the field can hold.
    for position, field_name in enumerate(FIELD_ORDER, start=1):
        field = get_field_spec(field_name)
        stored = min(position, field.max_value)
        packed = set_bits(packed, field.start, field.width, stored)
        expected[field_name] = stored

    genome = Genome(packed)

    assert genome.to_dict() == expected
    assert decode_genome(genome) == expected
    assert extract_bits(packed, 96, 16) == expected["mutation_markers"]


def test_setting_a_field_does_not_corrupt_other_fields() -> None:
    """Clearing one field of an all-ones genome leaves every other field as is."""
    all_ones = Genome(GENOME_MAX)
    cleared = all_ones.set_field("polydactyly", 0)

    assert cleared.get_field("polydactyly") == 0
    untouched = [name for name in FIELD_ORDER if name != "polydactyly"]
    for field_name in untouched:
        assert cleared.get_field(field_name) == all_ones.get_field(field_name)


def test_random_genome_reproducibility_with_seed() -> None:
    """The same seed reproduces a genome; a different seed gives another one."""
    first = random_genome(seed=42)
    repeat = random_genome(seed=42)
    other_seed = random_genome(seed=43)

    assert 0 <= first.raw < 2**128
    assert first == repeat
    assert first != other_seed