Compare commits

..

4 commits

Author SHA1 Message Date
Yura Dupyn
3ec7005198 Introduce CodePointString 2026-04-25 11:20:21 +02:00
Yura Dupyn
ec6ba36220 Renaming 2026-04-25 11:17:47 +02:00
Yura Dupyn
9c72959cd3 Few helpers. Now getLineRange throws on out of bounds line. 2026-04-25 01:58:35 +02:00
Yura Dupyn
85bc9b05e1 Add cursor abstraction 2026-04-25 01:44:09 +02:00
2 changed files with 171 additions and 13 deletions

View file

@ -1,5 +1,8 @@
TypeScript library for handling source code strings without having to deal with intricacies of JS's UTF16 encoding. TypeScript library for handling source code strings without having to deal with intricacies of JS's UTF16 encoding.
# CodePointString
A thin wrapper around a string that stores it as an array of code points. It does not track newlines or offsets back into the original string.
# SourceText # SourceText
A sane, UTF-16-safe string wrapper specifically designed for parsing source code, tracking line numbers, and generating CLI error messages. A sane, UTF-16-safe string wrapper specifically designed for parsing source code, tracking line numbers, and generating CLI error messages.
Think of it as a fat wrapper for a string that understands more about the string, such as its line structure. Think of it as a fat wrapper for a string that understands more about the string, such as its line structure.
@ -19,6 +22,8 @@ It also allows for Spatial Tracking or various sub-regions within the source. It
- `SourceLocation` is basically a smart 2D coordinate equivalent to `(line, col)` (but also tracks `CodePointIndex`) - `SourceLocation` is basically a smart 2D coordinate equivalent to `(line, col)` (but also tracks `CodePointIndex`)
- `Span` is an interval determined by `start` and `end` SourceLocations - `Span` is an interval determined by `start` and `end` SourceLocations
# Source Cursor
- `SourceCursor` is a mutable cursor over `SourceRegion`. Primarily useful to build parsers on top of `SourceRegion`. It is line-aware.
# Rendering CLI Errors # Rendering CLI Errors
Secondary functionality is `function renderSpan(region: SourceRegion, span: Span, contextLines = 1): LineView[]` which is able to render spans of source-code as follows Secondary functionality is `function renderSpan(region: SourceRegion, span: Span, contextLines = 1): LineView[]` which is able to render spans of source-code as follows

View file

@ -21,13 +21,14 @@ export const DIGIT_9: CodePoint = char('9');
export const DOT: CodePoint = char('.'); export const DOT: CodePoint = char('.');
// Hex Boundaries // Hex Boundaries
export const LOWERCASE_a: CodePoint = char('a'); export const LOWERCASE_A: CodePoint = char('a');
export const UPPERCASE_A: CodePoint = char('A'); export const UPPERCASE_A: CodePoint = char('A');
export const LOWERCASE_f: CodePoint = char('f'); export const LOWERCASE_F: CodePoint = char('f');
export const UPPERCASE_F: CodePoint = char('F'); export const UPPERCASE_F: CodePoint = char('F');
export const LOWERCASE_z: CodePoint = char('z'); export const LOWERCASE_Z: CodePoint = char('z');
export const UPPERCASE_Z: CodePoint = char('Z'); export const UPPERCASE_Z: CodePoint = char('Z');
// === Predicates ===
export function isBetween(a: CodePoint, x: CodePoint, b: CodePoint): boolean { export function isBetween(a: CodePoint, x: CodePoint, b: CodePoint): boolean {
return a <= x && x <= b; return a <= x && x <= b;
@ -38,7 +39,7 @@ export function isDigit(x: CodePoint): boolean {
} }
export function isAsciiAlpha(x: CodePoint): boolean { export function isAsciiAlpha(x: CodePoint): boolean {
return isBetween(LOWERCASE_a, x, LOWERCASE_z) return isBetween(LOWERCASE_A, x, LOWERCASE_Z)
|| isBetween(UPPERCASE_A, x, UPPERCASE_Z); || isBetween(UPPERCASE_A, x, UPPERCASE_Z);
} }
@ -46,6 +47,17 @@ export function isAsciiAlphanumeric(x: CodePoint): boolean {
return isAsciiAlpha(x) || isDigit(x); return isAsciiAlpha(x) || isDigit(x);
} }
/**
 * Tells whether a code point is one of the four ASCII whitespace characters
 * this module recognises: space, tab, new line, or carriage return.
 *
 * @param cp Code point to classify.
 * @returns `true` when `cp` strictly equals one of those four constants.
 */
export function isAsciiWhitespace(cp: CodePoint): boolean {
  // `switch` compares with strict equality, exactly like a chain of `===`.
  switch (cp) {
    case SPACE:
    case TAB:
    case NEW_LINE:
    case CARRIAGE_RETURN:
      return true;
    default:
      return false;
  }
}
/**
 * Tells whether a code point is whitespace that does not end a line:
 * a space or a tab.
 *
 * @param cp Code point to classify.
 * @returns `true` for `SPACE` or `TAB`, `false` for everything else.
 */
export function isAsciiInlineWhitespace(cp: CodePoint): boolean {
  if (cp === SPACE) {
    return true;
  }
  return cp === TAB;
}
export type CodePointRef = { export type CodePointRef = {
char: CodePoint, char: CodePoint,
offset: StringIndex, offset: StringIndex,
@ -56,7 +68,51 @@ export type CodePointSpan = {
end: CodePointIndex, end: CodePointIndex,
} }
// === CodePointString ===
/**
 * Immutable sequence of code points decoded from a JavaScript string.
 *
 * Positions and `length` count code points rather than UTF-16 code units, so
 * a character above U+FFFF (a surrogate pair in the source string) occupies a
 * single slot. An unpaired surrogate is kept as its own element.
 */
export class CodePointString {
  /** The decoded code points, frozen after construction. */
  readonly codePoints: readonly CodePoint[];

  /**
   * Decodes `source` into its code points.
   *
   * @param source The string to decode.
   */
  constructor(source: string) {
    const decoded: CodePoint[] = [];
    // Iterating a string yields one code point per step: a surrogate pair
    // arrives as a single two-unit chunk, a lone surrogate as a one-unit chunk.
    for (const chunk of source) {
      decoded.push(chunk.codePointAt(0) as CodePoint);
    }
    this.codePoints = Object.freeze(decoded);
  }

  /** Named factory equivalent to `new CodePointString(s)`. */
  static makeFromString(s: string): CodePointString {
    return new CodePointString(s);
  }

  /**
   * Reads the code point stored at `index`.
   *
   * No bounds check is performed: an out-of-range index yields `undefined`
   * at runtime even though the declared return type is `CodePoint`.
   */
  codePointAt(index: CodePointIndex): CodePoint {
    const { codePoints } = this;
    return codePoints[index];
  }

  /** Number of code points held. */
  get length(): CodePointIndex {
    const { codePoints } = this;
    return codePoints.length;
  }

  /** Re-encodes the code points as an ordinary JavaScript string. */
  toString(): string {
    // An explicit arrow is required: passing `String.fromCodePoint` directly
    // would also feed it map's index and array arguments.
    return this.codePoints.map((cp) => String.fromCodePoint(cp)).join("");
  }
}
// === Source Text === // === Source Text ===
/**
 * Builds a `SourceText` from a raw string by forwarding to
 * `SourceText.makeFromString`.
 *
 * @deprecated Use `SourceText.makeFromString` instead.
 * @param s The raw source string.
 * @returns The `SourceText` produced by `SourceText.makeFromString(s)`.
 */
export function sourceText(s: string): SourceText {
  const text = SourceText.makeFromString(s);
  return text;
}
export class SourceText { export class SourceText {
readonly source: string; readonly source: string;
// TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string. // TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string.
@ -65,6 +121,10 @@ export class SourceText {
// Stores the CodePointIndex where each line begins // Stores the CodePointIndex where each line begins
readonly lineStarts: CodePointIndex[]; readonly lineStarts: CodePointIndex[];
/**
 * Named factory equivalent to `new SourceText(s)`.
 *
 * @param s The raw source string handed to the constructor.
 */
static makeFromString(s: string): SourceText {
  const text = new SourceText(s);
  return text;
}
constructor(rawSource: string) { constructor(rawSource: string) {
// TODO: This shouldn't really be a concern of the library. // TODO: This shouldn't really be a concern of the library.
// const source = rawSource.normalize('NFC'); // const source = rawSource.normalize('NFC');
@ -228,11 +288,12 @@ export class SourceText {
return this.sliceByCp(startCp, endCp); return this.sliceByCp(startCp, endCp);
} }
getLineRange(line: number): { start: CodePointIndex, end: CodePointIndex } {
tryGetLineRange(line: number): CodePointSpan | undefined {
const lineIndex = line - 1; const lineIndex = line - 1;
if (lineIndex < 0 || lineIndex >= this.lineStarts.length) { if (lineIndex < 0 || lineIndex >= this.lineStarts.length) {
// TODO: This is a bit suspicious. Maybe return undefined? return undefined;
return { start: 0, end: 0 };
} }
const start = this.lineStarts[lineIndex]; const start = this.lineStarts[lineIndex];
@ -240,12 +301,16 @@ export class SourceText {
? this.lineStarts[lineIndex + 1] ? this.lineStarts[lineIndex + 1]
: this.#chars.length; : this.#chars.length;
return { start, end }; return rawSpan(start, end);
} }
}
export function sourceText(s: string): SourceText { getLineRange(line: number): { start: CodePointIndex, end: CodePointIndex } {
return new SourceText(s); const range = this.tryGetLineRange(line);
if (range === undefined) {
throw new Error(`Line ${line} is out of bounds (line count: ${this.lineCount})`);
}
return range;
}
} }
// Creates a Span from two SourceLocations. // Creates a Span from two SourceLocations.
@ -296,6 +361,10 @@ export class SourceRegion {
return span(loc, loc); return span(loc, loc);
} }
/**
 * The extent of this region as a raw code point span, built from the
 * `index` of `span.start` and the `index` of `span.end`.
 */
get codePointSpan(): CodePointSpan {
  const { start, end } = this.span;
  return rawSpan(start.index, end.index);
}
*codePoints(): IterableIterator<[CodePointIndex, CodePoint]> { *codePoints(): IterableIterator<[CodePointIndex, CodePoint]> {
const start = this.span.start.index; const start = this.span.start.index;
const end = this.span.end.index; const end = this.span.end.index;
@ -366,6 +435,90 @@ export type SourceLocation = {
column: number; // 1-based column: number; // 1-based
} }
/**
 * Tells whether `inner` lies entirely within `outer`.
 *
 * Both bounds are compared inclusively, so a span contains itself.
 *
 * @param outer The candidate enclosing span.
 * @param inner The span that must fit inside `outer`.
 */
export function containsSpan(outer: CodePointSpan, inner: CodePointSpan): boolean {
  const startsInside = outer.start <= inner.start;
  const endsInside = inner.end <= outer.end;
  return startsInside && endsInside;
}
/**
 * Tells whether `index` falls inside `span`, treating the span as half-open:
 * `span.start` is included and `span.end` is excluded.
 *
 * @param span The span to test against.
 * @param index The code point index to look for.
 */
export function containsIndex(span: CodePointSpan, index: CodePointIndex): boolean {
  const atOrAfterStart = span.start <= index;
  const beforeEnd = index < span.end;
  return atOrAfterStart && beforeEnd;
}
// === Cursor ===
/**
 * Mutable cursor over the code points of a `SourceRegion`.
 *
 * The cursor holds a single `CodePointIndex`, initialised to the region's
 * start index. Reads and advances stop at the region's end index.
 */
export class SourceCursor {
  private index: CodePointIndex;

  /**
   * @param region The region to walk; the cursor begins at `region.span.start.index`.
   */
  constructor(public readonly region: SourceRegion) {
    const { start } = region.span;
    this.index = start.index;
  }

  /** The index the cursor is currently positioned at. */
  current(): CodePointIndex {
    return this.index;
  }

  /** Returns the current index so it can later be handed back to `restore`. */
  checkpoint(): CodePointIndex {
    return this.index;
  }

  /** Repositions the cursor at `index`. The value is stored as given, without validation. */
  restore(index: CodePointIndex) {
    this.index = index;
  }

  /** Reads the code point under the cursor without moving, or `undefined` at or past the region end. */
  peek(): CodePoint | undefined {
    const limit = this.region.span.end.index;
    return this.index >= limit
      ? undefined
      : this.region.codePointAt(this.index);
  }

  /** Reads the code point under the cursor and steps past it; at the end it returns `undefined` and does not move. */
  advance(): CodePoint | undefined {
    const taken = this.peek();
    if (taken !== undefined) {
      this.index += 1;
    }
    return taken;
  }

  /** `true` once the cursor index has reached (or passed) the region's end index. */
  isAtEnd(): boolean {
    const { end } = this.region.span;
    return this.index >= end.index;
  }

  /** Span running from `start` up to the current cursor index. */
  spanFrom(start: CodePointIndex): CodePointSpan {
    const stop = this.index;
    return rawSpan(start, stop);
  }

  /** One-code-point span at the cursor, or a point span when the cursor is at the end. */
  currentSpan(): CodePointSpan {
    if (this.isAtEnd()) {
      return pointSpan(this.index);
    }
    return rawSpan(this.index, this.index + 1);
  }

  /** Point span located at the region's end index. */
  eofSpan(): CodePointSpan {
    const { end } = this.region.span;
    return pointSpan(end.index);
  }

  /** Text covered by `span`, as produced by the region's `slice`. */
  slice(span: CodePointSpan): string {
    const { region } = this;
    return region.slice(span);
  }

  /** Location of the cursor, as reported by the region's source for the current index. */
  location(): SourceLocation {
    const { source } = this.region;
    return source.getLocation(this.index);
  }

  /**
   * Jumps to the start of the line following the cursor's current line.
   *
   * When that line lies beyond the region's last line, the cursor is placed
   * at the region's end index instead. The target is never past the region end.
   */
  moveToNextLineStart(): void {
    const { source, span } = this.region;
    const followingLine = source.getLocation(this.index).line + 1;
    const regionEnd = span.end.index;
    this.index = followingLine > span.end.line
      ? regionEnd
      : Math.min(source.getLineRange(followingLine).start, regionEnd);
  }
}
// === Rendering Utilities === // === Rendering Utilities ===
export type LineView = { export type LineView = {