diff --git a/README.md b/README.md index 8af7c78..c6d39ad 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,38 @@ Why: - behaviour stays explainable, reproducible, and testable - correction risk is lower when every decision is rule-backed +## Shifted Number-Row Digit Support + +`cikmov` supports deterministic correction when shifted number-row symbols are typed instead of digits. + +Supported substitutions: + +```text +! -> 1 +@ -> 2 +" -> 2 +# -> 3 +£ -> 3 +$ -> 4 +% -> 5 +^ -> 6 +& -> 7 +* -> 8 +( -> 9 +) -> 0 +``` + +Scope rules: + +- mapping is the union of UK + US number-row shifted symbols (including Irish usage of UK layout) +- no keyboard-layout detection is performed at runtime +- substitutions are attempted only where grammar requires digits: + - outward digit positions + - district digit positions + - inward first character +- substitutions are not attempted in letter-only positions +- when stripping shifted symbols produces an already-valid compact postcode, symbols are treated as noise and not as digit substitutions + ## Public API ```php @@ -159,6 +191,10 @@ Scoring policy: - this reflects higher structural significance of outward geography encoding - ambiguity lowers confidence further - alternatives are capped at 5 entries for bounded output size +- shifted number-row symbol penalties: + - inward digit substitution: `-8` + - outward non-area digit substitution: `-14` + - outward area digit substitution: `-22` (reserved for completeness; current grammar does not place digits in outward area-letter slots) Ambiguity application policy: @@ -252,6 +288,24 @@ $result = Cikmov::analyse('EC1A 1AI'); // no correction is applied ``` +### 6) Shifted-digit correction + +```php +$result = Cikmov::analyse('EC1A !AL'); +// bestCandidate: "EC1A 1AL" +// confidence: 92 +// appliedPostcode: "EC1A 1AL" +``` + +### 7) Shifted symbol in letter position is rejected + +```php +$result = Cikmov::analyse('EC1A 1A!'); +// invalid: no shifted-digit substitution in 
letter-only positions +// bestCandidate: null +// appliedPostcode: null +``` + ## Embedded Postcode Areas The full area set is embedded and enforced: diff --git a/src/Internal/Analyser.php b/src/Internal/Analyser.php index d0aeacc..618aee1 100644 --- a/src/Internal/Analyser.php +++ b/src/Internal/Analyser.php @@ -11,6 +11,9 @@ final class Analyser { private const OUTWARD_SUBSTITUTION_BASE_PENALTY = 8; private const INWARD_SUBSTITUTION_BASE_PENALTY = 4; + private const OUTWARD_SHIFTED_DIGIT_AREA_PENALTY = 22; + private const OUTWARD_SHIFTED_DIGIT_PENALTY = 14; + private const INWARD_SHIFTED_DIGIT_PENALTY = 8; private const TIE_AMBIGUITY_PENALTY = 15; private const NEAR_AMBIGUITY_PENALTY = 6; private const ALTERNATIVE_SCORE_WINDOW = 4; @@ -80,7 +83,22 @@ public static function analyse(string $input, int $minConfidenceToApply): Result ); } - if (!preg_match('/[A-Z]/', $compact) || !preg_match('/[0-9]/', $compact)) { + $compactWithoutShiftedSymbols = PostcodeRules::stripShiftedDigitSymbols($compact); + if ($compactWithoutShiftedSymbols !== $compact && PostcodeRules::isValidCompact($compactWithoutShiftedSymbols)) { + $canonical = PostcodeRules::formatCompact($compactWithoutShiftedSymbols); + + return new Result( + input: $input, + normalizedInput: $canonical, + inputWasValid: true, + bestCandidate: $canonical, + confidence: 100, + appliedPostcode: $canonical, + alternatives: [] + ); + } + + if (!preg_match('/[A-Z]/', $compact) || !PostcodeRules::containsDigitLikeCharacter($compact)) { return new Result( input: $input, normalizedInput: $normalizedInput, @@ -206,11 +224,17 @@ private static function generateCandidates(string $compact): array continue; } + $areaLength = str_starts_with($pattern, 'AA') ? 
2 : 1; $optionsByPosition = []; $isPatternViable = true; foreach ($outwardTokens as $position => $token) { - $options = self::optionsForCharacter($outwardInput[$position], $token, true); + $options = self::optionsForCharacter( + character: $outwardInput[$position], + expectedToken: $token, + outward: true, + isOutwardAreaPosition: $position < $areaLength + ); if ($options === []) { $isPatternViable = false; break; @@ -273,12 +297,23 @@ private static function isClassCompatibleOutward(string $outward, string $patter return false; } - if ($token === 'D' && !ctype_digit($character)) { - return false; - } + if ($token !== 'L') { + if (ctype_digit($character)) { + if ($token === 'N' && $character === '0') { + return false; + } - if ($token === 'N' && (!ctype_digit($character) || $character === '0')) { - return false; + continue; + } + + $shiftedDigit = PostcodeRules::shiftedDigitReplacement($character); + if ($shiftedDigit === null) { + return false; + } + + if ($token === 'N' && $shiftedDigit === '0') { + return false; + } } } @@ -315,8 +350,12 @@ private static function walkCandidateOptions( /** * @return list */ - private static function optionsForCharacter(string $character, string $expectedToken, bool $outward): array - { + private static function optionsForCharacter( + string $character, + string $expectedToken, + bool $outward, + bool $isOutwardAreaPosition = false + ): array { $basePenalty = $outward ? 
self::OUTWARD_SUBSTITUTION_BASE_PENALTY : self::INWARD_SUBSTITUTION_BASE_PENALTY; $options = []; @@ -346,6 +385,14 @@ private static function optionsForCharacter(string $character, string $expectedT $options[] = ['char' => $replacement, 'penalty' => $basePenalty + $extraPenalty]; } } + + $shiftedDigit = PostcodeRules::shiftedDigitReplacement($character); + if ($shiftedDigit !== null && ($expectedToken !== 'N' || $shiftedDigit !== '0')) { + $options[] = [ + 'char' => $shiftedDigit, + 'penalty' => self::shiftedDigitPenalty($outward, $isOutwardAreaPosition), + ]; + } } $deduplicated = []; @@ -371,4 +418,17 @@ private static function optionsForCharacter(string $character, string $expectedT return $finalOptions; } + + private static function shiftedDigitPenalty(bool $outward, bool $isOutwardAreaPosition): int + { + if (!$outward) { + return self::INWARD_SHIFTED_DIGIT_PENALTY; + } + + if ($isOutwardAreaPosition) { + return self::OUTWARD_SHIFTED_DIGIT_AREA_PENALTY; + } + + return self::OUTWARD_SHIFTED_DIGIT_PENALTY; + } } diff --git a/src/Internal/PostcodeRules.php b/src/Internal/PostcodeRules.php index 796c143..99f5c86 100644 --- a/src/Internal/PostcodeRules.php +++ b/src/Internal/PostcodeRules.php @@ -15,6 +15,28 @@ final class PostcodeRules private const FORBIDDEN_FIRST_OUTWARD_LETTERS = 'QVX'; private const FORBIDDEN_SECOND_OUTWARD_LETTERS = 'IJZ'; private const AA9A_ALLOWED_FINAL_LETTERS = 'ABEHMNPRVWXY'; + private const SHIFTED_DIGIT_SYMBOLS = '!@"#$%^&*()'; + private const SHIFTED_DIGIT_ALIASES = [ + "\u{00A3}" => '#', + ]; + + /** + * @var array<string, string> + */ + private const SHIFTED_DIGIT_TO_DIGIT = [ + '!' 
=> '1', + '@' => '2', + '"' => '2', + '#' => '3', + "\u{00A3}" => '3', + '$' => '4', + '%' => '5', + '^' => '6', + '&' => '7', + '*' => '8', + '(' => '9', + ')' => '0', + ]; /** * @var array> @@ -172,9 +194,31 @@ final class PostcodeRules public static function compactFromInput(string $input): string { $normalized = strtoupper($input); - $compact = preg_replace('/[^A-Z0-9]+/', '', $normalized); + $normalized = strtr($normalized, self::SHIFTED_DIGIT_ALIASES); + $compact = preg_replace('/[^A-Z0-9!@"#$%\^&*()]+/', '', $normalized); + if ($compact === null || $compact === '') { + return ''; + } - return $compact ?? ''; + // Shifted symbols can validly stand in for digits, but never at postcode boundaries. + $compact = trim($compact, self::SHIFTED_DIGIT_SYMBOLS); + + return $compact; + } + + public static function containsDigitLikeCharacter(string $compact): bool + { + return strpbrk($compact, '0123456789' . self::SHIFTED_DIGIT_SYMBOLS) !== false; + } + + public static function shiftedDigitReplacement(string $character): ?string + { + return self::SHIFTED_DIGIT_TO_DIGIT[$character] ?? 
null; + } + + public static function stripShiftedDigitSymbols(string $compact): string + { + return str_replace(str_split(self::SHIFTED_DIGIT_SYMBOLS), '', $compact); } public static function displayFromCompact(string $compact): string diff --git a/tests/CikmovTest.php b/tests/CikmovTest.php index 999f87d..f86dcfd 100644 --- a/tests/CikmovTest.php +++ b/tests/CikmovTest.php @@ -86,6 +86,56 @@ public static function lowercaseAndNoiseProvider(): iterable yield 'extra spaces' => ['yo1 7hb', 'YO1 7HB']; } + #[DataProvider('surroundingShiftedNoiseProvider')] + public function testSurroundingShiftedSymbolsAreIgnoredAsNoise(string $input, string $canonical): void + { + $result = Cikmov::analyse($input); + + self::assertTrue($result->inputWasValid); + self::assertSame($canonical, $result->appliedPostcode); + self::assertSame(100, $result->confidence); + } + + /** + * @return iterable<string, array{string, string}> + */ + public static function surroundingShiftedNoiseProvider(): iterable + { + yield 'trailing !' => ['EC1A 1AL!', 'EC1A 1AL']; + yield 'leading !' => ['!EC1A 1AL', 'EC1A 1AL']; + yield 'wrapped by parentheses' => ['(EC1A 1AL)', 'EC1A 1AL']; + } + + public function testWrappedShiftedCorrectionStillCorrectsDeterministically(): void + { + $result = Cikmov::analyse('(EC!A 1AL)'); + + self::assertFalse($result->inputWasValid); + self::assertSame('EC1A 1AL', $result->bestCandidate); + self::assertSame('EC1A 1AL', $result->appliedPostcode); + } + + #[DataProvider('strayInsertedShiftedSymbolProvider')] + public function testStrayInsertedShiftedSymbolsAreTreatedAsNoise(string $input, string $canonical): void + { + $result = Cikmov::analyse($input); + + self::assertTrue($result->inputWasValid); + self::assertSame($canonical, $result->bestCandidate); + self::assertSame($canonical, $result->appliedPostcode); + self::assertSame(100, $result->confidence); + } + + /** + * @return iterable<string, array{string, string}> + */ + public static function strayInsertedShiftedSymbolProvider(): iterable + { + yield 'M district with inserted ! 
noise' => ['M!1 1AE', 'M1 1AE']; + yield 'SW district with inserted @ noise' => ['SW@1A 1AA', 'SW1A 1AA']; + yield 'inward separator with inserted ! noise' => ['EC1A !1AL', 'EC1A 1AL']; + } + #[DataProvider('inwardDigitConfusionProvider')] public function testInwardDigitConfusionsAreCorrected(string $input, string $expected): void { @@ -111,6 +161,86 @@ public static function inwardDigitConfusionProvider(): iterable yield 'G->6' => ['EC1A GAL', 'EC1A 6AL']; } + #[DataProvider('shiftedInwardDigitProvider')] + public function testShiftedInwardDigitCharactersAreCorrected(string $input, string $expected): void + { + $result = Cikmov::analyse($input); + + self::assertFalse($result->inputWasValid); + self::assertSame($expected, $result->bestCandidate); + self::assertSame($expected, $result->appliedPostcode); + self::assertSame(92, $result->confidence); + } + + /** + * @return iterable<string, array{string, string}> + */ + public static function shiftedInwardDigitProvider(): iterable + { + yield '!->1' => ['EC1A !AL', 'EC1A 1AL']; + yield '@->2' => ['EC1A @AL', 'EC1A 2AL']; + yield '"->2' => ['EC1A "AL', 'EC1A 2AL']; + yield '#->3' => ['EC1A #AL', 'EC1A 3AL']; + yield '£->3' => ["EC1A \u{00A3}AL", 'EC1A 3AL']; + yield '$->4' => ['EC1A $AL', 'EC1A 4AL']; + yield '%->5' => ['EC1A %AL', 'EC1A 5AL']; + yield '^->6' => ['EC1A ^AL', 'EC1A 6AL']; + yield '&->7' => ['EC1A &AL', 'EC1A 7AL']; + yield '*->8' => ['EC1A *AL', 'EC1A 8AL']; + yield '(->9' => ['EC1A (AL', 'EC1A 9AL']; + yield ')->0' => ['EC1A )AL', 'EC1A 0AL']; + } + + #[DataProvider('shiftedOutwardDigitProvider')] + public function testShiftedOutwardDigitCharactersAreCorrected(string $input, string $expected): void + { + $result = Cikmov::analyse($input); + + self::assertFalse($result->inputWasValid); + self::assertSame($expected, $result->bestCandidate); + self::assertSame($expected, $result->appliedPostcode); + self::assertSame(86, $result->confidence); + } + + /** + * @return iterable<string, array{string, string}> + */ + public static function shiftedOutwardDigitProvider(): 
iterable + { + yield 'AA9A digit' => ['EC!A 1AL', 'EC1A 1AL']; + yield 'AA9 digit' => ['YO( 7HB', 'YO9 7HB']; + } + + public function testShiftedOutwardNPositionStillRejectsZeroDistrict(): void + { + $result = Cikmov::analyse('SW)A 1AA'); + + self::assertFalse($result->inputWasValid); + self::assertNull($result->bestCandidate); + self::assertNull($result->appliedPostcode); + self::assertSame(0, $result->confidence); + } + + public function testShiftedDigitsAreNotAppliedInLetterPositions(): void + { + $result = Cikmov::analyse('EC1A 1A!'); + + self::assertFalse($result->inputWasValid); + self::assertNull($result->bestCandidate); + self::assertNull($result->appliedPostcode); + self::assertSame(0, $result->confidence); + } + + public function testShiftedDigitsCanCombineWithExistingConfusions(): void + { + $result = Cikmov::analyse('EC!A 1A1'); + + self::assertFalse($result->inputWasValid); + self::assertSame('EC1A 1AL', $result->bestCandidate); + self::assertSame(82, $result->confidence); + self::assertNull($result->appliedPostcode); + } + #[DataProvider('inwardLetterConfusionProvider')] public function testInwardLetterConfusionsAreCorrected(string $input, string $expected): void { @@ -410,6 +540,7 @@ public static function idempotencyProvider(): iterable { yield 'valid input' => ['EC1A 1AL']; yield 'correction input' => ['EC1A IAL']; + yield 'shifted correction input' => ['EC1A !AL']; yield 'area confusion correction' => ['Y01 7HB']; yield 'ambiguous input' => ['B01 8TH']; yield 'rejected input' => ['!!!!']; diff --git a/tests/PostcodeRulesTest.php b/tests/PostcodeRulesTest.php index c849256..4f658ac 100644 --- a/tests/PostcodeRulesTest.php +++ b/tests/PostcodeRulesTest.php @@ -77,6 +77,28 @@ public function testCompactFromInputStripsNoise(): void self::assertSame('WC2H7LT', PostcodeRules::compactFromInput(" wc2h-7lt\t")); } + public function testCompactFromInputRetainsShiftedDigitCharacters(): void + { + self::assertSame('EC!A"AL', PostcodeRules::compactFromInput(' 
ec!a "al ')); + } + + public function testCompactFromInputNormalizesPoundToHashAlias(): void + { + self::assertSame('EC1A#AL', PostcodeRules::compactFromInput("EC1A \u{00A3}AL")); + } + + public function testCompactFromInputTrimsSurroundingShiftedSymbols(): void + { + self::assertSame('EC1A1AL', PostcodeRules::compactFromInput('!EC1A 1AL!')); + self::assertSame('EC!A1AL', PostcodeRules::compactFromInput('(EC!A 1AL)')); + } + + public function testStripShiftedDigitSymbolsRemovesInsertedShiftedNoise(): void + { + self::assertSame('M11AE', PostcodeRules::stripShiftedDigitSymbols('M!11AE')); + self::assertSame('SW1A1AA', PostcodeRules::stripShiftedDigitSymbols('SW@1A1AA')); + } + public function testDisplayFromCompactSpacingRules(): void { self::assertSame('', PostcodeRules::displayFromCompact(''));