not stable lol

parent aaf13743ed
commit 6e6a5c03aa

.gitignore
@@ -1,4 +1,4 @@
 disco
 out
 *.o
 node_modules

@@ -3,8 +3,8 @@
 import { readFileSync } from "fs";
 import { compile } from "./compiler";
 import grammar from "./grammar";
-import { tokenize } from "./tokenizer";
 import colorize from "./util/asm/colorize";
+import tokenize from "./util/disco/tokenizer";
 import { printTokens } from "./util/utils";

 console.log();

earley.ts
@@ -37,6 +37,7 @@ export class Terminal extends Token { static terminal: true = true };
 // these tokens are special, for formatting and generalization reasons.
+export class $Newline extends Terminal { }
 export class $Whitespace extends Terminal { }
 export class $EOF extends Terminal { }

 function isTerminal(tokenClass: TokenClass): tokenClass is TerminalTokenClass {
     return tokenClass.terminal;

grammar.ts
@@ -1,50 +1,32 @@
-import { Grammar, NonTerminal, Production, Terminal, Token } from "./earley";
+import { $Newline, Grammar, NonTerminal, Production, Terminal, Token } from "./earley";
 import { AST } from './ast';

-export class $KeywordLink extends Terminal { }
-export class $KeywordEquals extends Terminal { }
-export class $KeywordLParen extends Terminal { }
-export class $KeywordRParen extends Terminal { }
-export class $KeywordConst extends Terminal { }
-
-export class $String extends Terminal {}
-export class $Identifier extends Terminal {}
-
-export class $Newline extends Terminal { }
-
-export class $Program extends NonTerminal { }
-export class $Statement extends NonTerminal { }
-export class $LinkStatement extends NonTerminal { }
-export class $VariableDeclaration extends NonTerminal { }
-export class $Expression extends NonTerminal { }
-export class $InvocationExpression extends NonTerminal { }
-export class $VariableReference extends NonTerminal { }
+import * as t from './util/disco/tokens';

 const ps: Production[] = [
-    { left: $Program, right: [$Statement], resolver: (s) => !!s ? AST.Body([s]) : AST.Body([]) },
-    { left: $Program, right: [$Statement, $Program], resolver: (s, ss) => !!s ? AST.Body([s, ...ss.value]) : ss},
+    { left: t.$Program, right: [t.$Statement], resolver: (s) => !!s ? AST.Body([s]) : AST.Body([]) },
+    { left: t.$Program, right: [t.$Statement, t.$Program], resolver: (s, ss) => !!s ? AST.Body([s, ...ss.value]) : ss},

-    { left: $Statement, right: [$Newline], resolver: () => false },
-    { left: $Statement, right: [$LinkStatement], resolver: a => a },
-    { left: $Statement, right: [$VariableDeclaration], resolver: a => a },
-    { left: $Statement, right: [$Expression], resolver: a => a },
+    { left: t.$Statement, right: [$Newline], resolver: () => false },
+    { left: t.$Statement, right: [t.$LinkStatement], resolver: a => a },
+    { left: t.$Statement, right: [t.$VariableDeclaration], resolver: a => a },
+    { left: t.$Statement, right: [t.$Expression], resolver: a => a },

-    { left: $Expression, right: [$String], resolver: (s: $String) => AST.String(s.value) },
-    { left: $Expression, right: [$InvocationExpression], resolver: a => a },
-    { left: $Expression, right: [$VariableReference], resolver: a => a },
+    { left: t.$Expression, right: [t.$String], resolver: (s: t.$String) => AST.String(s.value) },
+    { left: t.$Expression, right: [t.$InvocationExpression], resolver: a => a },
+    { left: t.$Expression, right: [t.$VariableReference], resolver: a => a },

-    { left: $VariableReference, right: [$Identifier], resolver: (identifier: $Identifier) => AST.VariableReference(identifier.value) },
+    { left: t.$VariableReference, right: [t.$Identifier], resolver: (identifier: t.$Identifier) => AST.VariableReference(identifier.value) },

-    { left: $InvocationExpression, right: [$Identifier, $KeywordLParen, $Expression, $KeywordRParen],
-        resolver: (identifier: $Identifier, _, arg: any, __) => AST.Invocation(identifier.value, arg) },
+    { left: t.$InvocationExpression, right: [t.$Identifier, t.$KeywordLParen, t.$Expression, t.$KeywordRParen],
+        resolver: (identifier: t.$Identifier, _, arg: any, __) => AST.Invocation(identifier.value, arg) },

-    { left: $VariableDeclaration, right: [$KeywordConst, $Identifier, $KeywordEquals, $Expression],
-        resolver: (_, identifier: $Identifier, __, value: any) => AST.Const(identifier.value, value) },
+    { left: t.$VariableDeclaration, right: [t.$KeywordConst, t.$Identifier, t.$KeywordEquals, t.$Expression],
+        resolver: (_, identifier: t.$Identifier, __, value: any) => AST.Const(identifier.value, value) },

-    { left: $LinkStatement, right: [$KeywordLink, $Identifier], resolver: (_, identifier: $Identifier) => AST.Link(identifier.value) },
+    { left: t.$LinkStatement, right: [t.$KeywordLink, t.$Identifier], resolver: (_, identifier: t.$Identifier) => AST.Link(identifier.value) },
-
 ]

-const grammar = new Grammar(ps, $Program);
+const grammar = new Grammar(ps, t.$Program);

 export default grammar;

tokenizer.ts
@@ -1,85 +0,0 @@
-import * as chalk from 'chalk';
-import { readFileSync, writeFileSync } from 'fs';
-import { $Identifier, $KeywordConst, $KeywordEquals, $KeywordLink, $KeywordLParen, $KeywordRParen, $Newline, $String } from './grammar';
-
-
-const keywords = new Map([
-    ['=', $KeywordEquals],
-    ['(', $KeywordLParen],
-    [')', $KeywordRParen],
-    ['link', $KeywordLink],
-    ['const', $KeywordConst],
-]);
-
-export function tokenize(string) {
-    let inString = false;
-    let escaping = false;
-    let tokens = [];
-    let token = '';
-    // let line = 1;
-    // let col = 1;
-    // const newline = () => (col = 1, line ++);
-    // const nextColumn = () => line ++;
-    const resetToken = () => token = '';
-    const addToken = (_token?) => {
-        if(_token) {
-            token = _token;
-        }
-        if(token.trim() !== '') {
-            if(keywords.has(token)) {
-                const kwTokenClass = keywords.get(token);
-                tokens.push(new kwTokenClass(0, 0, token));
-            } else if (isStringDelim(token[0]))
-                tokens.push(new $String(0, 0, token.substring(1, token.length - 1)));
-            else if (token === 'NEWLINE')
-                tokens.push(new $Newline(0, 0, token))
-            else
-                tokens.push(new $Identifier(0, 0, token));
-            resetToken();
-        }
-    }
-    // let _line = line;
-    // let _col = col;
-
-    const isWhitespace = (char) => [' ', '\n', '\t', '\r'].includes(char);
-    const isNewline = (char) => char === '\n';
-    const isSingleCharToken = (char) => ['(', ')', '='].includes(char);
-    const isStringDelim = (char) => ["'", '"'].includes(char);
-    const isEscapeChar = (char) => char === '\\';
-    const escape = (char) => (char === 'n' ? '\n'
-                            : char === 't' ? '\t'
-                            : char === 'r' ? '\r' : char)
-
-    for (const char of string) {
-        if(isNewline(char)) {
-            // newline();
-            addToken();
-            // only add newlines if we've actually started tokens...
-            if(tokens.length > 0)
-                addToken('NEWLINE')
-        } else if (escaping) {
-            token += escape(char)
-            escaping = false;
-        } else if (isStringDelim(char)) {
-            token += char;
-            inString = !inString;
-        } else if (inString) {
-            if(isEscapeChar(char)) {
-                escaping = true;
-            } else {
-                token += char
-            }
-        } else if(isSingleCharToken(char)) {
-            addToken();
-            addToken(char);
-        } else if(isWhitespace(char)) {
-            addToken();
-        } else {
-            token += char;
-        }
-        // if(!isNewline(char))
-        // nextColumn();
-    }
-
-    return tokens;
-}

@@ -59,7 +59,3 @@ const asmTokenizer = createTokenizer([
     [ /^[A-Za-z._][A-Za-z_]{0,}/, tokens.$Identifier]
 ])
 export default asmTokenizer;
-
-import input from './testInput';
-import { printTokens } from "../utils";
-printTokens(asmTokenizer(input));

@@ -1,3 +0,0 @@
-export function logASM(asm: string) {
-
-}

util/disco/tokenizer.ts
@@ -0,0 +1,9 @@
+import { $Newline } from "../../earley";
+import { createTokenizer } from "../generalTokenizer";
+import * as t from './tokens';
+
+export default createTokenizer([
+    [ /^[\r\t ]{1,}/, null],
+    [ /^\n/, $Newline],
+    [ /[a-zA-Z][A-Za-z0-9]{0,}/, t.$Identifier],
+])

util/disco/tokens.ts
@@ -0,0 +1,18 @@
+import { NonTerminal, Terminal } from "../../earley";
+
+export class $KeywordLink extends Terminal { }
+export class $KeywordEquals extends Terminal { }
+export class $KeywordLParen extends Terminal { }
+export class $KeywordRParen extends Terminal { }
+export class $KeywordConst extends Terminal { }
+
+export class $String extends Terminal {}
+export class $Identifier extends Terminal {}
+
+export class $Program extends NonTerminal { }
+export class $Statement extends NonTerminal { }
+export class $LinkStatement extends NonTerminal { }
+export class $VariableDeclaration extends NonTerminal { }
+export class $Expression extends NonTerminal { }
+export class $InvocationExpression extends NonTerminal { }
+export class $VariableReference extends NonTerminal { }

util/generalTokenizer.ts
@@ -1,6 +1,8 @@
 import { Terminal, TerminalTokenClass } from "../earley";
+import { Matcher } from "./regex";

 type TokenMatcher = [ RegExp, TerminalTokenClass ];
+type Index = number;

 interface Match {
     regex: RegExp;

@@ -10,18 +12,18 @@
 }

 // this is kinda bullshit lol exec is a dumb method.
-function getFirstMatch(r: RegExp, str: string): [number, string] {
+function getFirstMatch(r: RegExp | Matcher, str: string): [Index, string] {
+    if (r instanceof RegExp) {
         let matches = str.match(r);
         if(matches === null) return [-1, ''];
         return [matches.index, matches[0]];
+    }
 }

-export function createTokenizer(tokenMap: TokenMatcher[]) {
-
-    return function tokenize(str: string, l = 1, c = 1): Terminal[] {
-
-        const possibleMatches: Match[] = tokenMap
-            .map(([regex, tokenClass]) => {
+const getMatchesFromTokenMatcher =
+    (str: string) =>
+    ([regex, tokenClass]: TokenMatcher): Match =>
+{
     const [index, match] = getFirstMatch(regex, str);
     if(index === -1) return null;
     return {

@@ -30,7 +32,25 @@ export function createTokenizer(tokenMap: TokenMatcher[]) {
         length: match.length,
         matchedString: match
     }
-            })
+}
+
+const advanceLC = (l: number, c: number, str: string) => {
+    for(const char of str) {
+        c ++;
+        if(char === '\n') {
+            l ++;
+            c = 1;
+        }
+    }
+    return [l, c];
+}
+
+export function createTokenizer(tokenMap: TokenMatcher[]) {
+
+    return function tokenize(str: string, l = 1, c = 1): Terminal[] {
+
+        const possibleMatches: Match[] = tokenMap
+            .map(getMatchesFromTokenMatcher(str))
             .filter(v => !!v);

         const longestLength = possibleMatches

@@ -41,23 +61,23 @@
             .filter(v => v.length === longestLength);

         console.assert(longestMatches.length > 0, 'No token matches found');
         if(longestMatches.length === 0) process.exit(1);

-        const [{tokenClass, matchedString}] = longestMatches;
+        const {tokenClass, matchedString} = longestMatches[0];
         const length = matchedString.length;
-        const token = tokenClass ? new tokenClass(l, c, matchedString) : null;

         const rest = str.substring(length);
+        const token = tokenClass ? new tokenClass(l, c, matchedString) : null;
         if(rest === '') return [ token ];

-        for(const char of matchedString) {
-            c ++;
-            if(char === '\n') {
-                l ++;
-                c = 1;
-            }
-        }
+        [l, c] = advanceLC(l, c, str);
+        if(tokenClass) {
+            return [
+                new tokenClass(l, c, matchedString),
+                ...tokenize(rest, l, c)
+            ]
+        }

         return token ? [token, ...tokenize(rest, l, c)] : tokenize(rest, l, c);
     }
 }

util/regex.ts
@@ -0,0 +1,176 @@
+type Match = {
+    offset: number;
+    length: number;
+    text: string;
+    original: string;
+}
+
+const match = (offset: number, length: number, text: string, original: string): Match => {
+    return { offset, length, text, original };
+}
+
+export type Matcher = (str: string) => Match[]
+
+export const matchChar = (char: string): Matcher => {
+    const matcher = (test: string) => {
+        return test[0] === char[0] ? [match(0, 1, test[0], test)] : []
+    }
+    matcher.toString = () => {
+        return char;
+    }
+    return matcher;
+}
+
+export const matchCharClass = (chars: string[]): Matcher => {
+    const matcher = (test: string) => {
+        return chars.includes(test[0]) ? [match(0, 1, test[0], test)] : []
+    }
+    matcher.toString = () => {
+        return '[' + chars.join('') + ']';
+    }
+    return matcher;
+}
+
+const combineMatches = (a: Match, b: Match): Match => {
+    return match(
+        Math.min(a.offset, b.offset),
+        a.length + b.length,
+        a.text + b.text,
+        a.original.length > b.original.length ? a.original : b.original
+    )
+}
+
+export const matchSequence = (matcherA: Matcher, matcherB: Matcher): Matcher => {
+    const matcher = (test: string) => {
+        const matches = [];
+        for (const match of matcherA(test)) {
+            const rest = test.substring(match.length);
+            for (const restMatch of matcherB(rest)) {
+                matches.push(combineMatches(match, restMatch));
+            }
+        }
+        return matches;
+    }
+    matcher.toString = () => {
+        return matcherA.toString() + matcherB.toString();
+    }
+    return matcher;
+}
+
+const repeatMatcher = (matcher: Matcher, test: string, n: number): Match[] => {
+    if(n === 0) {
+        return [match(0, 0, '', test)];
+    }
+    const matches = matcher(test);
+    if(n === 1) {
+        return matches;
+    }
+    return matches.map(match => {
+        const rest = match.original.substring(match.length);
+        return repeatMatcher(matcher, rest, n - 1).map(nextMatch => combineMatches(match, nextMatch));
+    }).flat();
+}
+
+// this logic sucks lol
+// really you should just keep matching until you
+// have no more characters or you hit the match limit.
+// like this shit increases O by 2 on each nested call...
+// TODO /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\
+export const matchMany = (matcherA: Matcher, min = 1, max = Infinity): Matcher => {
+    const matcher = (test: string) => {
+        const rmatches: Match[] = [];
+        const limitedMax = Math.min(max, test.length);
+        for(let c = min; c <= limitedMax; c ++) {
+            const matches = repeatMatcher(matcherA, test, c);
+            rmatches.push(...matches);
+        }
+        return rmatches;
+    }
+    matcher.toString = () => {
+        return '(' + (matcherA.toString()) + '){' + (min === 0 ? '' : min) + ',' + (max === Infinity ? '' : max) + '}';
+    }
+    return matcher;
+}
+
+// variable names regex, theory...
+
+
+const matchers = [
+    matchChar('a'),
+    matchCharClass(['a', 'b', 'c']),
+    matchSequence(
+        matchChar('a'),
+        matchCharClass(['a', 'b', 'c'])
+    ),
+    matchMany(
+        matchCharClass(['a', 'b', 'c'])
+    ),
+    matchMany(
+        matchCharClass(['a', 'b', 'c']),
+        1,
+        1
+    ),
+];
+
+const tests = [
+    'a',
+    'b',
+    'c',
+    'd',
+    'ab',
+    'bc',
+    'cd',
+    'da',
+]
+
+console.clear();
+
+const logMatches = (ms: Match[]) => {
+    for(const match of ms) {
+        console.log(
+            ' '.repeat(8) +
+            chalk.white(match.original.substring(0, match.offset)) +
+            chalk.green(match.text) +
+            chalk.white(match.original.substring(match.offset + match.length))
+        );
+    }
+}
+
+const Y = true;
+const N = false;
+const testMatrix = [
+    [Y, N, N, N, N, N, N, N],
+    [Y, Y, Y, N, N, N, N, N],
+    [N, N, N, N, Y, N, N, N],
+    [Y, Y, Y, N, Y, Y, N, N],
+    [Y, Y, Y, N, N, N, N, N]
+]
+import * as chalk from 'chalk';
+// dirty levels off the CHARTS
+let i = 0, j = 0, p = 0, f = 0;
+for (const matcher of matchers) {
+    j = 0;
+    for (const testString of tests) {
+        const matches = matcher(testString).filter(match => match.length === testString.length);
+        if (matches.length > 0 === testMatrix[i][j]) {
+            p ++;
+        } else {
+            f ++;
+            console.log(
+                chalk.red('[ FAIL ]'),
+                chalk.ansi256(143)('/' + matcher.toString() + '/'),
+                'incorrectly returned',
+                matches.length,
+                'match' + (matches.length !== 1 ? 'es' : '') + ' for',
+                testString,
+            );
+            logMatches(matches);
+            console.log('')
+        }
+        j++;
+    }
+    i++
+}
+console.log('' + p + ' test' + (p !== 1 ? 's' : '') + ' passed.')
+console.log('' + f + ' test' + (f !== 1 ? 's' : '') + ' failed.')
+process.exit(f);

@@ -6,10 +6,10 @@ class $Plus extends Terminal { }
 class $Newline extends Terminal { }

 const tokenizer = createTokenizer([
-    { match: /^[0-9]{1,}$/, token: $Number },
-    { match: /^[\r\t ]{1,}$/, token: null },
-    { match: '\n', token: $Newline },
-    { match: '+', token: $Plus },
+    [ /^[0-9]{1,}$/, $Number ],
+    [ /^[\r\t ]{1,}$/, null ],
+    [ /\n/, $Newline ],
+    [ /+/, $Plus ],
 ])

 console.log(tokenizer("5 + \n 6 ").map(v => v.toString()).join(' '));

todo.md
@@ -3,8 +3,10 @@
 - [x] colorize the assembly output
 - [x] create generalTokenizer to make tokenization generic
 - [ ] rewrite disco tokenizer to the new generalTokenizer
 - [ ] explore defining non terminals in a grammar with just a string
     - possibly using tagged template strings??
 - [ ] add an EOF token to earley, and yknow, add it to the tokenizer.
-- [ ] add number support
+- [ ] add number support in consts
+- [ ] add comment support
+- [ ] add fixed length array support
 - [ ] organize AST elements into classes