Skip to content

Commit ba27de2

Browse files
committed
refactor(psl-parser): remove offset tracking from tokenizer
Offsets on tokens are incompatible with the incremental parsing goal. The Tokenizer class now tracks position internally without exposing offsets on the Token interface.
1 parent 53ee1d5 commit ba27de2

2 files changed

Lines changed: 15 additions & 37 deletions

File tree

packages/1-framework/2-authoring/psl-parser/src/tokenizer.ts

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ export type TokenKind =
2424
export interface Token {
2525
readonly kind: TokenKind;
2626
readonly text: string;
27-
readonly offset: number;
2827
}
2928

3029
export class Tokenizer {
@@ -40,7 +39,7 @@ export class Tokenizer {
4039

4140
next(): Token {
4241
const token = this.#buffer.shift() ?? scan(this.#source, this.#pos);
43-
this.#pos = token.offset + token.text.length;
42+
this.#pos += token.text.length;
4443
return token;
4544
}
4645

@@ -50,7 +49,7 @@ export class Tokenizer {
5049
if (last?.kind === 'Eof') {
5150
break;
5251
}
53-
const scanPos = last !== undefined ? last.offset + last.text.length : this.#pos;
52+
const scanPos = this.#buffer.reduce((pos, t) => pos + t.text.length, this.#pos);
5453
this.#buffer.push(scan(this.#source, scanPos));
5554
}
5655
return (
@@ -61,7 +60,7 @@ export class Tokenizer {
6160

6261
function scan(source: string, pos: number): Token {
6362
if (pos >= source.length) {
64-
return { kind: 'Eof', text: '', offset: source.length };
63+
return { kind: 'Eof', text: '' };
6564
}
6665

6766
return (
@@ -75,7 +74,6 @@ function scan(source: string, pos: number): Token {
7574
scanPunctuation(source, pos) ?? {
7675
kind: 'Invalid' as const,
7776
text: readChar(source, pos),
78-
offset: pos,
7977
}
8078
);
8179
}
@@ -84,9 +82,9 @@ function scanNewline(source: string, pos: number): Token | undefined {
8482
const ch = source.charAt(pos);
8583
if (ch !== '\r' && ch !== '\n') return undefined;
8684
if (ch === '\r' && source.charAt(pos + 1) === '\n') {
87-
return { kind: 'Newline', text: '\r\n', offset: pos };
85+
return { kind: 'Newline', text: '\r\n' };
8886
}
89-
return { kind: 'Newline', text: ch, offset: pos };
87+
return { kind: 'Newline', text: ch };
9088
}
9189

9290
function scanWhitespace(source: string, pos: number): Token | undefined {
@@ -98,7 +96,7 @@ function scanWhitespace(source: string, pos: number): Token | undefined {
9896
if (c !== ' ' && c !== '\t') break;
9997
end++;
10098
}
101-
return { kind: 'Whitespace', text: source.slice(pos, end), offset: pos };
99+
return { kind: 'Whitespace', text: source.slice(pos, end) };
102100
}
103101

104102
function scanComment(source: string, pos: number): Token | undefined {
@@ -109,15 +107,15 @@ function scanComment(source: string, pos: number): Token | undefined {
109107
if (c === '\n' || c === '\r') break;
110108
end++;
111109
}
112-
return { kind: 'Comment', text: source.slice(pos, end), offset: pos };
110+
return { kind: 'Comment', text: source.slice(pos, end) };
113111
}
114112

115113
function scanAt(source: string, pos: number): Token | undefined {
116114
if (source.charAt(pos) !== '@') return undefined;
117115
if (source.charAt(pos + 1) === '@') {
118-
return { kind: 'DoubleAt', text: '@@', offset: pos };
116+
return { kind: 'DoubleAt', text: '@@' };
119117
}
120-
return { kind: 'At', text: '@', offset: pos };
118+
return { kind: 'At', text: '@' };
121119
}
122120

123121
function scanIdent(source: string, pos: number): Token | undefined {
@@ -132,7 +130,7 @@ function scanIdent(source: string, pos: number): Token | undefined {
132130
break;
133131
}
134132
}
135-
return { kind: 'Ident', text: source.slice(pos, end), offset: pos };
133+
return { kind: 'Ident', text: source.slice(pos, end) };
136134
}
137135

138136
function scanNumber(source: string, pos: number): Token | undefined {
@@ -153,7 +151,7 @@ function scanNumber(source: string, pos: number): Token | undefined {
153151
end++;
154152
}
155153
}
156-
return { kind: 'NumberLiteral', text: source.slice(pos, end), offset: pos };
154+
return { kind: 'NumberLiteral', text: source.slice(pos, end) };
157155
}
158156

159157
function scanString(source: string, pos: number): Token | undefined {
@@ -167,22 +165,22 @@ function scanString(source: string, pos: number): Token | undefined {
167165
}
168166
if (c === '"') {
169167
end++; // include closing quote
170-
return { kind: 'StringLiteral', text: source.slice(pos, end), offset: pos };
168+
return { kind: 'StringLiteral', text: source.slice(pos, end) };
171169
}
172170
if (c === '\n' || c === '\r') {
173171
// Unterminated: stop before newline
174-
return { kind: 'StringLiteral', text: source.slice(pos, end), offset: pos };
172+
return { kind: 'StringLiteral', text: source.slice(pos, end) };
175173
}
176174
end++;
177175
}
178176
// Unterminated at EOF
179-
return { kind: 'StringLiteral', text: source.slice(pos, end), offset: pos };
177+
return { kind: 'StringLiteral', text: source.slice(pos, end) };
180178
}
181179

182180
function scanPunctuation(source: string, pos: number): Token | undefined {
183181
const kind = PUNCTUATION[source.charAt(pos)];
184182
if (kind === undefined) return undefined;
185-
return { kind, text: source.charAt(pos), offset: pos };
183+
return { kind, text: source.charAt(pos) };
186184
}
187185

188186
function readChar(source: string, pos: number): string {

packages/1-framework/2-authoring/psl-parser/test/tokenizer.test.ts

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -304,26 +304,6 @@ describe('Tokenizer', () => {
304304
});
305305
});
306306

307-
describe('offsets', () => {
308-
it('each token offset equals sum of preceding text lengths', () => {
309-
const source = 'model User {\n id Int @id\n}\n';
310-
const tokens = collectAll(source);
311-
let expectedOffset = 0;
312-
for (const token of tokens) {
313-
expect(token.offset).toBe(expectedOffset);
314-
expectedOffset += token.text.length;
315-
}
316-
});
317-
318-
it('Eof offset equals source length', () => {
319-
const source = 'model User {}';
320-
const tokens = collectAll(source);
321-
const eof = tokens[tokens.length - 1]!;
322-
expect(eof.kind).toBe('Eof');
323-
expect(eof.offset).toBe(source.length);
324-
});
325-
});
326-
327307
describe('cursor API', () => {
328308
it('peek(0) returns the same token as a subsequent next()', () => {
329309
const t = new Tokenizer('model User');

0 commit comments

Comments (0)