diff --git a/README.md b/README.md index 7c9a8fd..5d92fc7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ TypeScript library for handling source code strings without having to deal with intricacies of JS's UTF16 encoding. +# CodePointString +A thin wrapper around a string that stores it as a plain array of code points. It does not track newlines or offsets back into the original string. + # SourceText A sane, UTF-16-safe string wrapper specifically designed for parsing source code, tracking line numbers, and generating CLI error messages. Think of it as a fat wrapper for a string that understand more info about the string like line structure. @@ -19,6 +22,8 @@ It also allows for Spatial Tracking or various sub-regions within the source. It - `SourceLocation` is basically a smart 2D coordinate equivalent to `(line, col)` (but also tracks `CodePointIndex`) - `Span` an interval determined by `start` and `end` SourceLocations +# Source Cursor +- `SourceCursor` is a mutable, line-aware cursor over a `SourceRegion`. It is primarily useful for building parsers on top of `SourceRegion`. 
# Rendering CLI Errors Secondary functionality is `function renderSpan(region: SourceRegion, span: Span, contextLines = 1): LineView[]` which is able to render spans of source-code as follows diff --git a/src/index.ts b/src/index.ts index 82ed23a..2db4378 100644 --- a/src/index.ts +++ b/src/index.ts @@ -21,13 +21,14 @@ export const DIGIT_9: CodePoint = char('9'); export const DOT: CodePoint = char('.'); // Hex Boundaries -export const LOWERCASE_a: CodePoint = char('a'); +export const LOWERCASE_A: CodePoint = char('a'); export const UPPERCASE_A: CodePoint = char('A'); -export const LOWERCASE_f: CodePoint = char('f'); +export const LOWERCASE_F: CodePoint = char('f'); export const UPPERCASE_F: CodePoint = char('F'); -export const LOWERCASE_z: CodePoint = char('z'); +export const LOWERCASE_Z: CodePoint = char('z'); export const UPPERCASE_Z: CodePoint = char('Z'); +// === Predicates === export function isBetween(a: CodePoint, x: CodePoint, b: CodePoint): boolean { return a <= x && x <= b; @@ -38,7 +39,7 @@ export function isDigit(x: CodePoint): boolean { } export function isAsciiAlpha(x: CodePoint): boolean { - return isBetween(LOWERCASE_a, x, LOWERCASE_z) + return isBetween(LOWERCASE_A, x, LOWERCASE_Z) || isBetween(UPPERCASE_A, x, UPPERCASE_Z); } @@ -46,6 +47,17 @@ export function isAsciiAlphanumeric(x: CodePoint): boolean { return isAsciiAlpha(x) || isDigit(x); } +export function isAsciiWhitespace(cp: CodePoint): boolean { + return cp === SPACE + || cp === TAB + || cp === NEW_LINE + || cp === CARRIAGE_RETURN; +} + +export function isAsciiInlineWhitespace(cp: CodePoint): boolean { + return cp === SPACE || cp === TAB; +} + export type CodePointRef = { char: CodePoint, offset: StringIndex, @@ -56,7 +68,51 @@ export type CodePointSpan = { end: CodePointIndex, } +// === CodePointString === +export class CodePointString { + readonly codePoints: readonly CodePoint[]; + + constructor(source: string) { + const codePointsInternal: CodePoint[] = []; + let i = 0; + while (i < 
source.length) { + const char = source.codePointAt(i) as CodePoint; + codePointsInternal.push(char); + + const size =(char > 0xFFFF ? 2 : 1); + i += size; + } + this.codePoints = Object.freeze(codePointsInternal); + } + + static makeFromString(s: string): CodePointString { + return new CodePointString(s); + } + + codePointAt(index: CodePointIndex): CodePoint { + return this.codePoints[index]; + } + + get length(): CodePointIndex { + return this.codePoints.length; + } + + toString(): string { + let result = ""; + for (const cp of this.codePoints) { + result += String.fromCodePoint(cp); + } + return result; + } +} + // === Source Text === +// TODO: Mark this function as `@deprecated` in its doc comment and +// direct callers to `SourceText.makeFromString` instead. +export function sourceText(s: string): SourceText { + return SourceText.makeFromString(s); +} + export class SourceText { readonly source: string; // TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string. @@ -65,6 +121,10 @@ export class SourceText { // Stores the CodePointIndex where each line begins readonly lineStarts: CodePointIndex[]; + static makeFromString(s: string): SourceText { + return new SourceText(s); + } + constructor(rawSource: string) { // TODO: This shouldn't really be a concern of the library. // const source = rawSource.normalize('NFC'); @@ -228,24 +288,29 @@ export class SourceText { return this.sliceByCp(startCp, endCp); } - getLineRange(line: number): { start: CodePointIndex, end: CodePointIndex } { + + tryGetLineRange(line: number): CodePointSpan | undefined { const lineIndex = line - 1; + if (lineIndex < 0 || lineIndex >= this.lineStarts.length) { - // TODO: This is a bit suspicious. Maybe return undefined? - return { start: 0, end: 0 }; + return undefined; } const start = this.lineStarts[lineIndex]; const end = (lineIndex + 1 < this.lineStarts.length) ? 
this.lineStarts[lineIndex + 1] : this.#chars.length; - - return { start, end }; - } -} -export function sourceText(s: string): SourceText { - return new SourceText(s); + return rawSpan(start, end); + } + + getLineRange(line: number): { start: CodePointIndex, end: CodePointIndex } { + const range = this.tryGetLineRange(line); + if (range === undefined) { + throw new Error(`Line ${line} is out of bounds (line count: ${this.lineCount})`); + } + return range; + } } // Creates a Span from two SourceLocations. @@ -296,6 +361,10 @@ export class SourceRegion { return span(loc, loc); } + get codePointSpan(): CodePointSpan { + return rawSpan(this.span.start.index, this.span.end.index); + } + *codePoints(): IterableIterator<[CodePointIndex, CodePoint]> { const start = this.span.start.index; const end = this.span.end.index; @@ -366,6 +435,90 @@ export type SourceLocation = { column: number; // 1-based } +export function containsSpan(outer: CodePointSpan, inner: CodePointSpan): boolean { + return outer.start <= inner.start && inner.end <= outer.end; +} + +export function containsIndex(span: CodePointSpan, index: CodePointIndex): boolean { + return span.start <= index && index < span.end; +} + +// === Cursor === + +export class SourceCursor { + private index: CodePointIndex; + + constructor(public readonly region: SourceRegion) { + this.index = region.span.start.index; + } + + current(): CodePointIndex { + return this.index; + } + + checkpoint(): CodePointIndex { + return this.index; + } + + restore(index: CodePointIndex) { + this.index = index; + } + + peek(): CodePoint | undefined { + if (this.index >= this.region.span.end.index) return undefined; + return this.region.codePointAt(this.index); + } + + advance(): CodePoint | undefined { + const cp = this.peek(); + if (cp === undefined) return undefined; + this.index += 1; + return cp; + } + + isAtEnd(): boolean { + return this.index >= this.region.span.end.index; + } + + spanFrom(start: CodePointIndex): CodePointSpan { + return 
rawSpan(start, this.index); + } + + currentSpan(): CodePointSpan { + return this.isAtEnd() + ? pointSpan(this.index) + : rawSpan(this.index, this.index + 1); + } + + eofSpan(): CodePointSpan { + return pointSpan(this.region.span.end.index); + } + + slice(span: CodePointSpan): string { + return this.region.slice(span); + } + + + location(): SourceLocation { + return this.region.source.getLocation(this.index); + } + + moveToNextLineStart(): void { + const loc = this.region.source.getLocation(this.index); + const nextLine = loc.line + 1; + + if (nextLine > this.region.span.end.line) { + this.index = this.region.span.end.index; + return; + } + + const range = this.region.source.getLineRange(nextLine); + this.index = Math.min(range.start, this.region.span.end.index); + } +} + + + // === Rendering Utilities === export type LineView = {