diff --git a/.gitignore b/.gitignore
index e967f96..c36ca48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-disco
+out
 
 *.o
 node_modules
\ No newline at end of file
diff --git a/src/disco.ts b/src/disco.ts
index bf6f39b..6ca9956 100755
--- a/src/disco.ts
+++ b/src/disco.ts
@@ -3,8 +3,8 @@
 import { readFileSync } from "fs";
 import { compile } from "./compiler";
 import grammar from "./grammar";
-import { tokenize } from "./tokenizer";
 import colorize from "./util/asm/colorize";
+import tokenize from "./util/disco/tokenizer";
 import { printTokens } from "./util/utils";
 
 console.log();
diff --git a/src/earley.ts b/src/earley.ts
index fc12993..2c8ac6d 100644
--- a/src/earley.ts
+++ b/src/earley.ts
@@ -37,6 +37,7 @@ export class Terminal extends Token { static terminal: true = true };
 // these tokens are special, for formatting and generalization reasons.
 export class $Newline extends Terminal { }
 export class $Whitespace extends Terminal { }
+export class $EOF extends Terminal { }
 
 function isTerminal(tokenClass: TokenClass): tokenClass is TerminalTokenClass {
   return tokenClass.terminal;
diff --git a/src/grammar.ts b/src/grammar.ts
index 824007d..ec94c1d 100644
--- a/src/grammar.ts
+++ b/src/grammar.ts
@@ -1,50 +1,32 @@
-import { Grammar, NonTerminal, Production, Terminal, Token } from "./earley";
+import { $Newline, Grammar, NonTerminal, Production, Terminal, Token } from "./earley";
 import { AST } from './ast';
-
-export class $KeywordLink extends Terminal { }
-export class $KeywordEquals extends Terminal { }
-export class $KeywordLParen extends Terminal { }
-export class $KeywordRParen extends Terminal { }
-export class $KeywordConst extends Terminal { }
-
-export class $String extends Terminal {}
-export class $Identifier extends Terminal {}
-
-export class $Newline extends Terminal { }
-
-export class $Program extends NonTerminal { }
-export class $Statement extends NonTerminal { }
-export class $LinkStatement extends NonTerminal { }
-export class $VariableDeclaration extends NonTerminal { }
-export class $Expression extends NonTerminal { }
-export class $InvocationExpression extends NonTerminal { }
-export class $VariableReference extends NonTerminal { }
+import * as t from './util/disco/tokens';
 
 const ps: Production[] = [
-  { left: $Program, right: [$Statement], resolver: (s) => !!s ? AST.Body([s]) : AST.Body([]) },
-  { left: $Program, right: [$Statement, $Program], resolver: (s, ss) => !!s ? AST.Body([s, ...ss.value]) : ss},
+  { left: t.$Program, right: [t.$Statement], resolver: (s) => !!s ? AST.Body([s]) : AST.Body([]) },
+  { left: t.$Program, right: [t.$Statement, t.$Program], resolver: (s, ss) => !!s ? AST.Body([s, ...ss.value]) : ss},
 
-  { left: $Statement, right: [$Newline], resolver: () => false },
-  { left: $Statement, right: [$LinkStatement], resolver: a => a },
-  { left: $Statement, right: [$VariableDeclaration], resolver: a => a },
-  { left: $Statement, right: [$Expression], resolver: a => a },
+  { left: t.$Statement, right: [$Newline], resolver: () => false },
+  { left: t.$Statement, right: [t.$LinkStatement], resolver: a => a },
+  { left: t.$Statement, right: [t.$VariableDeclaration], resolver: a => a },
+  { left: t.$Statement, right: [t.$Expression], resolver: a => a },
 
-  { left: $Expression, right: [$String], resolver: (s: $String) => AST.String(s.value) },
-  { left: $Expression, right: [$InvocationExpression], resolver: a => a },
-  { left: $Expression, right: [$VariableReference], resolver: a => a },
+  { left: t.$Expression, right: [t.$String], resolver: (s: t.$String) => AST.String(s.value) },
+  { left: t.$Expression, right: [t.$InvocationExpression], resolver: a => a },
+  { left: t.$Expression, right: [t.$VariableReference], resolver: a => a },
 
-  { left: $VariableReference, right: [$Identifier], resolver: (identifier: $Identifier) => AST.VariableReference(identifier.value) },
+  { left: t.$VariableReference, right: [t.$Identifier], resolver: (identifier: t.$Identifier) => AST.VariableReference(identifier.value) },
 
-  { left: $InvocationExpression, right: [$Identifier, $KeywordLParen, $Expression, $KeywordRParen],
-    resolver: (identifier: $Identifier, _, arg: any, __) => AST.Invocation(identifier.value, arg) },
+  { left: t.$InvocationExpression, right: [t.$Identifier, t.$KeywordLParen, t.$Expression, t.$KeywordRParen],
+    resolver: (identifier: t.$Identifier, _, arg: any, __) => AST.Invocation(identifier.value, arg) },
 
-  { left: $VariableDeclaration, right: [$KeywordConst, $Identifier, $KeywordEquals, $Expression],
-    resolver: (_, identifier: $Identifier, __, value: any) => AST.Const(identifier.value, value) },
+  { left: t.$VariableDeclaration, right: [t.$KeywordConst, t.$Identifier, t.$KeywordEquals, t.$Expression],
+    resolver: (_, identifier: t.$Identifier, __, value: any) => AST.Const(identifier.value, value) },
 
-  { left: $LinkStatement, right: [$KeywordLink, $Identifier], resolver: (_, identifier: $Identifier) => AST.Link(identifier.value) },
+  { left: t.$LinkStatement, right: [t.$KeywordLink, t.$Identifier], resolver: (_, identifier: t.$Identifier) => AST.Link(identifier.value) },
 ]
 
-const grammar = new Grammar(ps, $Program);
+const grammar = new Grammar(ps, t.$Program);
 
 export default grammar;
\ No newline at end of file
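The token and nonterminal classes move out to `src/util/disco/tokens.ts` and come back in under the `t` namespace; only `$Newline` stays behind in `src/earley.ts`, since the tokenizer and parser both treat it specially. For orientation, here is a sketch of the production shape the rules above rely on, inferred from usage rather than quoted from `src/earley.ts`:

```ts
// Inferred sketch only, not the real src/earley.ts definition.
interface ProductionShape {
  left: unknown;                          // the nonterminal this rule defines
  right: unknown[];                       // the symbols it expands to, in order
  resolver: (...children: any[]) => any;  // folds resolved children into an AST node
}

// e.g. for input   const greeting = upper('hi')
// the $VariableDeclaration rule receives its four resolved children,
// [$KeywordConst, $Identifier, $KeywordEquals, <expression AST>], and returns
// AST.Const("greeting", AST.Invocation("upper", AST.String("hi"))).
```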
diff --git a/src/tokenizer.ts b/src/tokenizer.ts
deleted file mode 100644
index b54f5e5..0000000
--- a/src/tokenizer.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-import * as chalk from 'chalk';
-import { readFileSync, writeFileSync } from 'fs';
-import { $Identifier, $KeywordConst, $KeywordEquals, $KeywordLink, $KeywordLParen, $KeywordRParen, $Newline, $String } from './grammar';
-
-
-const keywords = new Map([
-  ['=', $KeywordEquals],
-  ['(', $KeywordLParen],
-  [')', $KeywordRParen],
-  ['link', $KeywordLink],
-  ['const', $KeywordConst],
-]);
-
-export function tokenize(string) {
-  let inString = false;
-  let escaping = false;
-  let tokens = [];
-  let token = '';
-  // let line = 1;
-  // let col = 1;
-  // const newline = () => (col = 1, line ++);
-  // const nextColumn = () => line ++;
-  const resetToken = () => token = '';
-  const addToken = (_token?) => {
-    if(_token) {
-      token = _token;
-    }
-    if(token.trim() !== '') {
-      if(keywords.has(token)) {
-        const kwTokenClass = keywords.get(token);
-        tokens.push(new kwTokenClass(0, 0, token));
-      } else if (isStringDelim(token[0]))
-        tokens.push(new $String(0, 0, token.substring(1, token.length - 1)));
-      else if (token === 'NEWLINE')
-        tokens.push(new $Newline(0, 0, token))
-      else
-        tokens.push(new $Identifier(0, 0, token));
-      resetToken();
-    }
-  }
-  // let _line = line;
-  // let _col = col;
-
-  const isWhitespace = (char) => [' ', '\n', '\t', '\r'].includes(char);
-  const isNewline = (char) => char === '\n';
-  const isSingleCharToken = (char) => ['(', ')', '='].includes(char);
-  const isStringDelim = (char) => ["'", '"'].includes(char);
-  const isEscapeChar = (char) => char === '\\';
-  const escape = (char) => (char === 'n' ? '\n'
-    : char === 't' ? '\t'
-    : char === 'r' ? '\r' : char)
-
-  for (const char of string) {
-    if(isNewline(char)) {
-      // newline();
-      addToken();
-      // only add newlines if we've actually started tokens...
-      if(tokens.length > 0)
-        addToken('NEWLINE')
-    } else if (escaping) {
-      token += escape(char)
-      escaping = false;
-    } else if (isStringDelim(char)) {
-      token += char;
-      inString = !inString;
-    } else if (inString) {
-      if(isEscapeChar(char)) {
-        escaping = true;
-      } else {
-        token += char
-      }
-    } else if(isSingleCharToken(char)) {
-      addToken();
-      addToken(char);
-    } else if(isWhitespace(char)) {
-      addToken();
-    } else {
-      token += char;
-    }
-    // if(!isNewline(char))
-    //   nextColumn();
-  }
-
-  return tokens;
-}
\ No newline at end of file
diff --git a/src/util/asm/tokenizer.ts b/src/util/asm/tokenizer.ts
index 6ab2ae7..e6f58cc 100644
--- a/src/util/asm/tokenizer.ts
+++ b/src/util/asm/tokenizer.ts
@@ -59,7 +59,3 @@ const asmTokenizer = createTokenizer([
   [ /^[A-Za-z._][A-Za-z_]{0,}/, tokens.$Identifier]
 ])
 export default asmTokenizer;
-
-import input from './testInput';
-import { printTokens } from "../utils";
-printTokens(asmTokenizer(input));
\ No newline at end of file
diff --git a/src/util/asmLogger.ts b/src/util/asmLogger.ts
deleted file mode 100644
index 66c28de..0000000
--- a/src/util/asmLogger.ts
+++ /dev/null
@@ -1,3 +0,0 @@
-export function logASM(asm: string) {
-
-}
\ No newline at end of file
diff --git a/src/util/disco/tokenizer.ts b/src/util/disco/tokenizer.ts
new file mode 100644
index 0000000..6646fc0
--- /dev/null
+++ b/src/util/disco/tokenizer.ts
@@ -0,0 +1,9 @@
+import { $Newline } from "../../earley";
+import { createTokenizer } from "../generalTokenizer";
+import * as t from './tokens';
+
+export default createTokenizer([
+  [ /^[\r\t ]{1,}/, null],
+  [ /^\n/, $Newline],
+  [ /^[a-zA-Z][A-Za-z0-9]{0,}/, t.$Identifier],
+])
\ No newline at end of file
diff --git a/src/util/disco/tokens.ts b/src/util/disco/tokens.ts
new file mode 100644
index 0000000..f7bdf27
--- /dev/null
+++ b/src/util/disco/tokens.ts
@@ -0,0 +1,18 @@
+import { NonTerminal, Terminal } from "../../earley";
+
+export class $KeywordLink extends Terminal { }
+export class $KeywordEquals extends Terminal { }
+export class $KeywordLParen extends Terminal { }
+export class $KeywordRParen extends Terminal { }
+export class $KeywordConst extends Terminal { }
+
+export class $String extends Terminal {}
+export class $Identifier extends Terminal {}
+
+export class $Program extends NonTerminal { }
+export class $Statement extends NonTerminal { }
+export class $LinkStatement extends NonTerminal { }
+export class $VariableDeclaration extends NonTerminal { }
+export class $Expression extends NonTerminal { }
+export class $InvocationExpression extends NonTerminal { }
+export class $VariableReference extends NonTerminal { }
\ No newline at end of file
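Note the identifier pattern now carries the `^` anchor like its siblings; `getFirstMatch` discards the match index, so an unanchored pattern would silently match past the cursor. The rewritten tokenizer only has matchers for whitespace, newlines, and identifiers so far; the keyword, string, and paren handling from the deleted `src/tokenizer.ts` still has to be ported (hence the open todo below). A hypothetical smoke test, assuming the same calling convention as the asm tokenizer:

```ts
// Hypothetical smoke test; paths assume a file under src/.
import tokenize from "./util/disco/tokenizer";

// Identifier-only input: '=', '(', ')' and string literals have no matcher
// yet, so they would hit the 'No token matches found' assert and exit.
// "link" also comes back as a plain $Identifier -- no keyword matchers yet.
const tokens = tokenize("link stdlib\nlink io");
console.log(tokens.map(tok => tok.toString()).join(" "));
```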
diff --git a/src/util/generalTokenizer.ts b/src/util/generalTokenizer.ts
index 551b64a..3dd71b2 100644
--- a/src/util/generalTokenizer.ts
+++ b/src/util/generalTokenizer.ts
@@ -1,6 +1,8 @@
 import { Terminal, TerminalTokenClass } from "../earley";
+import { Matcher } from "./regex";
 
 type TokenMatcher = [ RegExp, TerminalTokenClass ];
+type Index = number;
 
 interface Match {
   regex: RegExp;
@@ -10,10 +12,37 @@
 }
 
 // this is kinda bullshit lol exec is a dumb method.
-function getFirstMatch(r: RegExp, str: string): [number, string] {
-  let matches = str.match(r);
-  if(matches === null) return [-1, ''];
-  return [matches.index, matches[0]];
+function getFirstMatch(r: RegExp | Matcher, str: string): [Index, string] {
+  if (r instanceof RegExp) {
+    let matches = str.match(r);
+    if(matches === null) return [-1, ''];
+    return [matches.index, matches[0]];
+  }
+  // TODO: evaluate Matcher combinators from ./regex; until then, report no match.
+  return [-1, ''];
+}
+
+const getMatchesFromTokenMatcher =
+  (str: string) =>
+  ([regex, tokenClass]: TokenMatcher): Match =>
+{
+  const [index, match] = getFirstMatch(regex, str);
+  if(index === -1) return null;
+  return {
+    regex,
+    tokenClass,
+    length: match.length,
+    matchedString: match
+  }
+}
+
+// walk a matched string, bumping line/column counters as newlines go by
+const advanceLC = (l: number, c: number, str: string) => {
+  for(const char of str) {
+    c ++;
+    if(char === '\n') {
+      l ++;
+      c = 1;
+    }
+  }
+  return [l, c];
 }
 
 export function createTokenizer(tokenMap: TokenMatcher[]) {
@@ -21,16 +50,7 @@
 
   return function tokenize(str: string, l = 1, c = 1): Terminal[] {
     const possibleMatches: Match[] = tokenMap
-      .map(([regex, tokenClass]) => {
-        const [index, match] = getFirstMatch(regex, str);
-        if(index === -1) return null;
-        return {
-          regex,
-          tokenClass,
-          length: match.length,
-          matchedString: match
-        }
-      })
+      .map(getMatchesFromTokenMatcher(str))
      .filter(v => !!v);
 
     const longestLength = possibleMatches
@@ -41,23 +61,23 @@
       .filter(v => v.length === longestLength);
 
     console.assert(longestMatches.length > 0, 'No token matches found');
+    if(longestMatches.length === 0) process.exit(1);
 
-    const [{tokenClass, matchedString}] = longestMatches;
+    const {tokenClass, matchedString} = longestMatches[0];
     const length = matchedString.length;
-    const token = tokenClass ? new tokenClass(l, c, matchedString) : null;
-    const rest = str.substring(length);
+
+    const token = tokenClass ? new tokenClass(l, c, matchedString) : null;
+    const rest = str.substring(length);
 
     if(rest === '') return [ token ];
 
-    for(const char of matchedString) {
-      c ++;
-      if(char === '\n') {
-        l ++;
-        c = 1;
-      }
-    }
-    return token ? [token, ...tokenize(rest, l, c)] : tokenize(rest, l, c);
+    // advance over the consumed text only, then recurse on the remainder;
+    // null tokenClass entries (e.g. whitespace) are skipped, not emitted
+    [l, c] = advanceLC(l, c, matchedString);
+    if(tokenClass) {
+      return [ token, ...tokenize(rest, l, c) ];
+    }
+    return tokenize(rest, l, c);
   }
 }
\ No newline at end of file
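Three fixes folded into this hunk: `getFirstMatch` now returns a no-match result for the not-yet-handled `Matcher` branch instead of falling off the end, `advanceLC` advances over the matched string rather than the whole remaining input, and skipped tokens (a `null` token class) recurse instead of returning `undefined`. `createTokenizer` resolves overlapping patterns by maximal munch: every pattern runs against the remaining input, the longest match wins, and ties go to the earlier table entry (the length sort is stable). A sketch with hypothetical token classes:

```ts
// Hypothetical two-entry table to illustrate the longest-match rule;
// import paths assume a file under src/.
import { Terminal } from "./earley";
import { createTokenizer } from "./util/generalTokenizer";

class $KeywordConst extends Terminal { }
class $Identifier extends Terminal { }

const demo = createTokenizer([
  [ /^const/, $KeywordConst ],
  [ /^[a-zA-Z][A-Za-z0-9]{0,}/, $Identifier ],
]);

demo("constant"); // one $Identifier: the 8-char match beats the 5-char keyword
demo("const");    // $KeywordConst: a 5-char tie, broken by table order
```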
diff --git a/src/util/regex.ts b/src/util/regex.ts
new file mode 100644
index 0000000..d18b1d5
--- /dev/null
+++ b/src/util/regex.ts
@@ -0,0 +1,176 @@
+import * as chalk from 'chalk';
+
+type Match = {
+  offset: number;
+  length: number;
+  text: string;
+  original: string;
+}
+
+const match = (offset: number, length: number, text: string, original: string): Match => {
+  return { offset, length, text, original };
+}
+
+export type Matcher = (str: string) => Match[]
+
+export const matchChar = (char: string): Matcher => {
+  const matcher = (test: string) => {
+    return test[0] === char[0] ? [match(0, 1, test[0], test)] : []
+  }
+  matcher.toString = () => {
+    return char;
+  }
+  return matcher;
+}
+
+export const matchCharClass = (chars: string[]): Matcher => {
+  const matcher = (test: string) => {
+    return chars.includes(test[0]) ? [match(0, 1, test[0], test)] : []
+  }
+  matcher.toString = () => {
+    return '[' + chars.join('') + ']';
+  }
+  return matcher;
+}
+
+const combineMatches = (a: Match, b: Match): Match => {
+  return match(
+    Math.min(a.offset, b.offset),
+    a.length + b.length,
+    a.text + b.text,
+    a.original.length > b.original.length ? a.original : b.original
+  )
+}
+
+export const matchSequence = (matcherA: Matcher, matcherB: Matcher): Matcher => {
+  const matcher = (test: string) => {
+    const matches = [];
+    for (const match of matcherA(test)) {
+      const rest = test.substring(match.length);
+      for (const restMatch of matcherB(rest)) {
+        matches.push(combineMatches(match, restMatch));
+      }
+    }
+    return matches;
+  }
+  matcher.toString = () => {
+    return matcherA.toString() + matcherB.toString();
+  }
+  return matcher;
+}
+
+const repeatMatcher = (matcher: Matcher, test: string, n: number): Match[] => {
+  if(n === 0) {
+    return [match(0, 0, '', test)];
+  }
+  const matches = matcher(test);
+  if(n === 1) {
+    return matches;
+  }
+  return matches.map(match => {
+    const rest = match.original.substring(match.length);
+    return repeatMatcher(matcher, rest, n - 1).map(nextMatch => combineMatches(match, nextMatch));
+  }).flat();
+}
+
+// this logic sucks lol
+// really you should just keep matching until you
+// have no more characters or you hit the match limit.
+// like this shit increases O by 2 on each nested call...
+// TODO /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\
+export const matchMany = (matcherA: Matcher, min = 1, max = Infinity): Matcher => {
+  const matcher = (test: string) => {
+    const rmatches: Match[] = [];
+    const limitedMax = Math.min(max, test.length);
+    for(let c = min; c <= limitedMax; c ++) {
+      const matches = repeatMatcher(matcherA, test, c);
+      rmatches.push(...matches);
+    }
+    return rmatches;
+  }
+  matcher.toString = () => {
+    return '(' + (matcherA.toString()) + '){' + (min === 0 ? '' : min) + ',' + (max === Infinity ? '' : max) + '}';
+  }
+  return matcher;
+}
+
+// variable names regex, theory...
+
+const matchers = [
+  matchChar('a'),
+  matchCharClass(['a', 'b', 'c']),
+  matchSequence(
+    matchChar('a'),
+    matchCharClass(['a', 'b', 'c'])
+  ),
+  matchMany(
+    matchCharClass(['a', 'b', 'c'])
+  ),
+  matchMany(
+    matchCharClass(['a', 'b', 'c']),
+    1,
+    1
+  ),
+];
+
+const tests = [
+  'a',
+  'b',
+  'c',
+  'd',
+  'ab',
+  'bc',
+  'cd',
+  'da',
+]
+
+console.clear();
+
+const logMatches = (ms: Match[]) => {
+  for(const match of ms) {
+    console.log(
+      ' '.repeat(8) +
+      chalk.white(match.original.substring(0, match.offset)) +
+      chalk.green(match.text) +
+      chalk.white(match.original.substring(match.offset + match.length))
+    );
+  }
+}
+
+const Y = true;
+const N = false;
+const testMatrix = [
+  [Y, N, N, N, N, N, N, N],
+  [Y, Y, Y, N, N, N, N, N],
+  [N, N, N, N, Y, N, N, N],
+  [Y, Y, Y, N, Y, Y, N, N],
+  [Y, Y, Y, N, N, N, N, N]
+]
+
+// dirty levels off the CHARTS
+let i = 0, j = 0, p = 0, f = 0;
+for (const matcher of matchers) {
+  j = 0;
+  for (const testString of tests) {
+    const matches = matcher(testString).filter(match => match.length === testString.length);
+    if (matches.length > 0 === testMatrix[i][j]) {
+      p ++;
+    } else {
+      f ++;
+      console.log(
+        chalk.red('[ FAIL ]'),
+        chalk.ansi256(143)('/' + matcher.toString() + '/'),
+        'incorrectly returned',
+        matches.length,
+        'match' + (matches.length !== 1 ? 'es' : '') + ' for',
+        testString,
+      );
+      logMatches(matches);
+      console.log('')
+    }
+    j++;
+  }
+  i++
+}
+console.log('' + p + ' test' + (p !== 1 ? 's' : '') + ' passed.')
+console.log('' + f + ' test' + (f !== 1 ? 's' : '') + ' failed.')
+process.exit(f);
\ No newline at end of file
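(The `chalk` import has been hoisted to the top of the new file; it previously sat between the test matrix and the counters, which works only because TypeScript hoists imports.) The combinators compose the way regexes do; for instance, a pattern like `/[ab][abc]{0,3}/` can be assembled from the primitives above:

```ts
// Sketch: composing the exported combinators into /[ab][abc]{0,3}/.
// Note: importing ./util/regex currently also runs its self-tests and calls
// process.exit, so this is illustrative rather than practical as-is.
import { matchCharClass, matchMany, matchSequence } from "./util/regex";

const identifier = matchSequence(
  matchCharClass(['a', 'b']),
  matchMany(matchCharClass(['a', 'b', 'c']), 0, 3)  // min = 0 makes it optional
);

// Matchers return every prefix the pattern can consume, so "abc" yields
// "a", "ab", and "abc" -- ambiguity is kept rather than collapsed.
console.log(identifier('abc').map(m => m.text)); // [ 'a', 'ab', 'abc' ]
console.log(identifier.toString());              // [ab]([abc]){,3}
```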
diff --git a/src/util/tokenTest.ts b/src/util/tokenTest.ts
index 7ab47f1..cc1eb22 100644
--- a/src/util/tokenTest.ts
+++ b/src/util/tokenTest.ts
@@ -6,10 +6,10 @@
 class $Plus extends Terminal { }
 class $Newline extends Terminal { }
 
 const tokenizer = createTokenizer([
-  { match: /^[0-9]{1,}$/, token: $Number },
-  { match: /^[\r\t ]{1,}$/, token: null },
-  { match: '\n', token: $Newline },
-  { match: '+', token: $Plus },
+  [ /^[0-9]{1,}/, $Number ],
+  [ /^[\r\t ]{1,}/, null ],
+  [ /^\n/, $Newline ],
+  [ /^\+/, $Plus ],
 ])
 
 console.log(tokenizer("5 + \n 6 ").map(v => v.toString()).join(' '));
\ No newline at end of file
diff --git a/todo.md b/todo.md
index 38e4154..f095c32 100644
--- a/todo.md
+++ b/todo.md
@@ -3,8 +3,10 @@
 - [x] colorize the assembly output
 - [x] create generalTokenizer to make tokenization generic
 - [ ] rewrite disco tokenizer to the new generalTokenizer
+- [ ] explore defining non terminals in a grammar with just a string
+  - possibly using tagged template strings??
 - [ ] add an EOF token to earley, and yknow, add it to the tokenizer.
-- [ ] add number support
+- [ ] add number support in consts
 - [ ] add comment support
 - [ ] add fixed length array support
 - [ ] organize AST elements into classes
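Note the tokenTest table needed three fixes to actually run: `/+/` is an invalid regex literal (the quantifier must be escaped as `/^\+/`), and the `$`-anchored digit/whitespace patterns could never match mid-stream, so all four entries now use the bare `^` prefix anchor the other tokenizer tables use. On the tagged-template todo item, one possible shape (purely hypothetical; nothing in the repo implements this) is a tag whose interpolations carry the symbol classes, so the string part is just readable decoration:

```ts
// Hypothetical sketch of the tagged-template idea from the todo.
// `rule` and its "->" syntax do not exist anywhere in the codebase yet.
import { AST } from "./ast";
import * as t from "./util/disco/tokens";

const rule = (strings: TemplateStringsArray, ...symbols: any[]) => {
  // first interpolation is the left-hand side, the rest are the right-hand side
  const [left, ...right] = symbols;
  return (resolver: (...children: any[]) => any) => ({ left, right, resolver });
};

// Equivalent to the $LinkStatement production in src/grammar.ts.
const linkRule = rule`${t.$LinkStatement} -> ${t.$KeywordLink} ${t.$Identifier}`(
  (_, identifier) => AST.Link(identifier.value)
);
```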