Compare commits

...

4 commits

Author SHA1 Message Date
Yura Dupyn
3ec7005198 Introduce CodePointString 2026-04-25 11:20:21 +02:00
Yura Dupyn
ec6ba36220 Renaming 2026-04-25 11:17:47 +02:00
Yura Dupyn
9c72959cd3 Few helpers. Now getLineRange throws on out of bounds line. 2026-04-25 01:58:35 +02:00
Yura Dupyn
85bc9b05e1 Add cursor abstraction 2026-04-25 01:44:09 +02:00
2 changed files with 171 additions and 13 deletions

View file

@ -1,5 +1,8 @@
TypeScript library for handling source code strings without having to deal with intricacies of JS's UTF16 encoding.
# CodePointString
A wrapper around a string that is stored as a plain array of code points. It does not track newlines or offsets back into the original string.
# SourceText
A sane, UTF-16-safe string wrapper specifically designed for parsing source code, tracking line numbers, and generating CLI error messages.
Think of it as a fat wrapper around a string that also understands more about the string, such as its line structure.
@ -19,6 +22,8 @@ It also allows for Spatial Tracking or various sub-regions within the source. It
- `SourceLocation` is basically a smart 2D coordinate equivalent to `(line, col)` (but also tracks `CodePointIndex`)
- `Span` is an interval determined by its `start` and `end` `SourceLocation`s
# Source Cursor
- `SourceCursor` is a mutable cursor over a `SourceRegion`. It is primarily useful for building parsers on top of a `SourceRegion`. It is line-aware.
# Rendering CLI Errors
Secondary functionality is `function renderSpan(region: SourceRegion, span: Span, contextLines = 1): LineView[]` which is able to render spans of source-code as follows

View file

@ -21,13 +21,14 @@ export const DIGIT_9: CodePoint = char('9');
export const DOT: CodePoint = char('.');
// Hex Boundaries
export const LOWERCASE_a: CodePoint = char('a');
export const LOWERCASE_A: CodePoint = char('a');
export const UPPERCASE_A: CodePoint = char('A');
export const LOWERCASE_f: CodePoint = char('f');
export const LOWERCASE_F: CodePoint = char('f');
export const UPPERCASE_F: CodePoint = char('F');
export const LOWERCASE_z: CodePoint = char('z');
export const LOWERCASE_Z: CodePoint = char('z');
export const UPPERCASE_Z: CodePoint = char('Z');
// === Predicates ===
export function isBetween(a: CodePoint, x: CodePoint, b: CodePoint): boolean {
return a <= x && x <= b;
@ -38,7 +39,7 @@ export function isDigit(x: CodePoint): boolean {
}
/**
 * Reports whether `x` is an ASCII letter.
 *
 * A code point qualifies when it lies in the inclusive range
 * `LOWERCASE_A..LOWERCASE_Z` ('a'..'z') or `UPPERCASE_A..UPPERCASE_Z`
 * ('A'..'Z').
 *
 * @param x - Code point to classify.
 * @returns `true` for 'a'..'z' and 'A'..'Z', `false` otherwise.
 */
export function isAsciiAlpha(x: CodePoint): boolean {
  // Single return statement: a second `return` placed after the lowercase
  // check would never run and the uppercase range would be silently ignored.
  return isBetween(LOWERCASE_A, x, LOWERCASE_Z)
    || isBetween(UPPERCASE_A, x, UPPERCASE_Z);
}
@ -46,6 +47,17 @@ export function isAsciiAlphanumeric(x: CodePoint): boolean {
return isAsciiAlpha(x) || isDigit(x);
}
/**
 * Reports whether `cp` is one of the four ASCII whitespace code points
 * recognised here: `SPACE`, `TAB`, `NEW_LINE` or `CARRIAGE_RETURN`.
 *
 * @param cp - Code point to classify.
 * @returns `true` when `cp` strictly equals one of those constants.
 */
export function isAsciiWhitespace(cp: CodePoint): boolean {
  switch (cp) {
    case SPACE:
    case TAB:
    case NEW_LINE:
    case CARRIAGE_RETURN:
      return true;
    default:
      return false;
  }
}
/**
 * Reports whether `cp` is whitespace that does not end a line, i.e. exactly
 * `SPACE` or `TAB`.
 *
 * @param cp - Code point to classify.
 * @returns `true` for `SPACE` or `TAB`, `false` for everything else.
 */
export function isAsciiInlineWhitespace(cp: CodePoint): boolean {
  if (cp === SPACE) {
    return true;
  }
  return cp === TAB;
}
export type CodePointRef = {
char: CodePoint,
offset: StringIndex,
@ -56,7 +68,51 @@ export type CodePointSpan = {
end: CodePointIndex,
}
// === CodePointString ===
/**
 * Immutable sequence of Unicode code points decoded from a JavaScript string.
 *
 * Indexing and `length` are expressed in code points rather than UTF-16 code
 * units, so a character above U+FFFF occupies a single slot.
 */
export class CodePointString {
  /** Decoded code points; the array is frozen once construction finishes. */
  readonly codePoints: readonly CodePoint[];

  /**
   * Decodes `source` into its code points.
   *
   * @param source - String to decode. An unpaired surrogate is kept as its
   *   own entry holding the surrogate's code unit value.
   */
  constructor(source: string) {
    const decoded: CodePoint[] = [];
    // Iterating a string walks it by code point, so a surrogate pair arrives
    // as one two-unit chunk and everything else as a one-unit chunk.
    for (const chunk of source) {
      decoded.push(chunk.codePointAt(0) as CodePoint);
    }
    this.codePoints = Object.freeze(decoded);
  }

  /**
   * Named factory; equivalent to `new CodePointString(s)`.
   *
   * @param s - String to decode.
   */
  static makeFromString(s: string): CodePointString {
    return new CodePointString(s);
  }

  /**
   * Returns the code point stored at `index`.
   *
   * No bounds check is performed: an out-of-range index yields `undefined`
   * at runtime even though the declared return type is `CodePoint`.
   *
   * @param index - Zero-based position counted in code points.
   */
  codePointAt(index: CodePointIndex): CodePoint {
    const { codePoints } = this;
    return codePoints[index];
  }

  /** Number of code points held. */
  get length(): CodePointIndex {
    return this.codePoints.length;
  }

  /** Re-encodes the stored code points into a regular JavaScript string. */
  toString(): string {
    return this.codePoints
      .map((cp) => String.fromCodePoint(cp))
      .join("");
  }
}
// === Source Text ===
/**
 * Convenience factory that wraps a raw string in a `SourceText` by delegating
 * to `SourceText.makeFromString`.
 *
 * @deprecated Use `SourceText.makeFromString` instead.
 * @param s - Raw source string to wrap.
 * @returns The value returned by `SourceText.makeFromString(s)`.
 */
export function sourceText(s: string): SourceText {
return SourceText.makeFromString(s);
}
export class SourceText {
readonly source: string;
// TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string.
@ -65,6 +121,10 @@ export class SourceText {
// Stores the CodePointIndex where each line begins
readonly lineStarts: CodePointIndex[];
/**
 * Named factory for `SourceText`; equivalent to `new SourceText(s)`.
 *
 * @param s - Raw source string handed to the constructor unchanged.
 * @returns A freshly constructed `SourceText`.
 */
static makeFromString(s: string): SourceText {
return new SourceText(s);
}
constructor(rawSource: string) {
// TODO: This shouldn't really be a concern of the library.
// const source = rawSource.normalize('NFC');
@ -228,11 +288,12 @@ export class SourceText {
return this.sliceByCp(startCp, endCp);
}
getLineRange(line: number): { start: CodePointIndex, end: CodePointIndex } {
tryGetLineRange(line: number): CodePointSpan | undefined {
const lineIndex = line - 1;
if (lineIndex < 0 || lineIndex >= this.lineStarts.length) {
// TODO: This is a bit suspicious. Maybe return undefined?
return { start: 0, end: 0 };
return undefined;
}
const start = this.lineStarts[lineIndex];
@ -240,12 +301,16 @@ export class SourceText {
? this.lineStarts[lineIndex + 1]
: this.#chars.length;
return { start, end };
}
return rawSpan(start, end);
}
export function sourceText(s: string): SourceText {
return new SourceText(s);
/**
 * Strict counterpart of `tryGetLineRange`: returns the range it reports for
 * `line`, or throws when it reports none.
 *
 * @param line - Line number, passed through to `tryGetLineRange` unchanged.
 * @returns The `start`/`end` code point indices of the line.
 * @throws Error when `tryGetLineRange(line)` returns `undefined`.
 */
getLineRange(line: number): { start: CodePointIndex, end: CodePointIndex } {
  const found = this.tryGetLineRange(line);
  if (found !== undefined) {
    return found;
  }
  throw new Error(`Line ${line} is out of bounds (line count: ${this.lineCount})`);
}
}
// Creates a Span from two SourceLocations.
@ -296,6 +361,10 @@ export class SourceRegion {
return span(loc, loc);
}
/**
 * The region's extent as a plain `CodePointSpan`, built from the `index`
 * fields of `span.start` and `span.end`.
 */
get codePointSpan(): CodePointSpan {
  const { start, end } = this.span;
  return rawSpan(start.index, end.index);
}
*codePoints(): IterableIterator<[CodePointIndex, CodePoint]> {
const start = this.span.start.index;
const end = this.span.end.index;
@ -366,6 +435,90 @@ export type SourceLocation = {
column: number; // 1-based
}
/**
 * Reports whether `inner` lies entirely within `outer`.
 *
 * Both bounds are compared inclusively, so a span contains itself and an
 * empty span sitting exactly on either edge of `outer` counts as contained.
 *
 * @param outer - Candidate enclosing span.
 * @param inner - Span that must fit inside `outer`.
 */
export function containsSpan(outer: CodePointSpan, inner: CodePointSpan): boolean {
  if (!(outer.start <= inner.start)) {
    return false;
  }
  return inner.end <= outer.end;
}
/**
 * Reports whether `index` falls inside `span`, treating the span as
 * half-open: `span.start` is included and `span.end` is excluded.
 *
 * @param span - Span to test against.
 * @param index - Code point index to look for.
 */
export function containsIndex(span: CodePointSpan, index: CodePointIndex): boolean {
  const atOrAfterStart = span.start <= index;
  const beforeEnd = index < span.end;
  return atOrAfterStart && beforeEnd;
}
// === Cursor ===
/**
 * Mutable read position inside a `SourceRegion`.
 *
 * The cursor starts at the region's start index and never reads past the
 * region's end index. Positions are code point indices.
 */
export class SourceCursor {
  /** Index of the code point the cursor currently points at. */
  private index: CodePointIndex;

  /**
   * @param region - Region to walk; the cursor begins at `region.span.start.index`.
   */
  constructor(public readonly region: SourceRegion) {
    this.index = region.span.start.index;
  }

  /** Current position as a code point index. */
  current(): CodePointIndex {
    return this.index;
  }

  /** Current position, intended to be handed back to `restore` later. */
  checkpoint(): CodePointIndex {
    return this.index;
  }

  /**
   * Moves the cursor to `index`. The value is stored as given, with no
   * validation against the region's bounds.
   */
  restore(index: CodePointIndex) {
    this.index = index;
  }

  /**
   * Reads the code point under the cursor without moving.
   *
   * @returns The code point, or `undefined` once the cursor has reached the
   *   region's end index.
   */
  peek(): CodePoint | undefined {
    const limit = this.region.span.end.index;
    if (this.index >= limit) {
      return undefined;
    }
    return this.region.codePointAt(this.index);
  }

  /**
   * Reads the code point under the cursor and steps one position forward.
   *
   * @returns The consumed code point, or `undefined` (leaving the position
   *   untouched) when `peek` yields nothing.
   */
  advance(): CodePoint | undefined {
    const consumed = this.peek();
    if (consumed !== undefined) {
      this.index += 1;
    }
    return consumed;
  }

  /** Whether the cursor sits at or beyond the region's end index. */
  isAtEnd(): boolean {
    return this.index >= this.region.span.end.index;
  }

  /** Span running from `start` up to the current position. */
  spanFrom(start: CodePointIndex): CodePointSpan {
    return rawSpan(start, this.index);
  }

  /**
   * Span for the code point under the cursor: one position wide, or a point
   * span at the current position when the cursor is at the end.
   */
  currentSpan(): CodePointSpan {
    if (this.isAtEnd()) {
      return pointSpan(this.index);
    }
    return rawSpan(this.index, this.index + 1);
  }

  /** Point span located at the region's end index. */
  eofSpan(): CodePointSpan {
    return pointSpan(this.region.span.end.index);
  }

  /** Text covered by `span`, obtained from the region's `slice`. */
  slice(span: CodePointSpan): string {
    return this.region.slice(span);
  }

  /** Location of the current position as reported by the region's source. */
  location(): SourceLocation {
    return this.region.source.getLocation(this.index);
  }

  /**
   * Jumps to the start of the line after the current one.
   *
   * If that line lies beyond the region's last line, the cursor is placed at
   * the region's end index; otherwise the target is clamped so it never
   * exceeds the region's end index.
   */
  moveToNextLineStart(): void {
    const source = this.region.source;
    const nextLine = source.getLocation(this.index).line + 1;
    const regionEnd = this.region.span.end;

    if (nextLine > regionEnd.line) {
      this.index = regionEnd.index;
    } else {
      const { start: lineStart } = source.getLineRange(nextLine);
      this.index = Math.min(lineStart, regionEnd.index);
    }
  }
}
// === Rendering Utilities ===
export type LineView = {