not stable lol

parent aaf13743ed
commit 6e6a5c03aa

.gitignore
@@ -1,4 +1,4 @@
 disco
 out
 *.o
 node_modules

@@ -3,8 +3,8 @@
 import { readFileSync } from "fs";
 import { compile } from "./compiler";
 import grammar from "./grammar";
-import { tokenize } from "./tokenizer";
 import colorize from "./util/asm/colorize";
+import tokenize from "./util/disco/tokenizer";
 import { printTokens } from "./util/utils";

 console.log();

earley.ts
@@ -37,6 +37,7 @@ export class Terminal extends Token { static terminal: true = true };
 // these tokens are special, for formatting and generalization reasons.
+export class $Newline extends Terminal { }
 export class $Whitespace extends Terminal { }
 export class $EOF extends Terminal { }

 function isTerminal(tokenClass: TokenClass): tokenClass is TerminalTokenClass {
     return tokenClass.terminal;

grammar.ts
@@ -1,50 +1,32 @@
-import { Grammar, NonTerminal, Production, Terminal, Token } from "./earley";
+import { $Newline, Grammar, NonTerminal, Production, Terminal, Token } from "./earley";
 import { AST } from './ast';

-export class $KeywordLink extends Terminal { }
-export class $KeywordEquals extends Terminal { }
-export class $KeywordLParen extends Terminal { }
-export class $KeywordRParen extends Terminal { }
-export class $KeywordConst extends Terminal { }
-
-export class $String extends Terminal {}
-export class $Identifier extends Terminal {}
-
-export class $Newline extends Terminal { }
-
-export class $Program extends NonTerminal { }
-export class $Statement extends NonTerminal { }
-export class $LinkStatement extends NonTerminal { }
-export class $VariableDeclaration extends NonTerminal { }
-export class $Expression extends NonTerminal { }
-export class $InvocationExpression extends NonTerminal { }
-export class $VariableReference extends NonTerminal { }
+import * as t from './util/disco/tokens';

 const ps: Production[] = [
-    { left: $Program, right: [$Statement], resolver: (s) => !!s ? AST.Body([s]) : AST.Body([]) },
-    { left: $Program, right: [$Statement, $Program], resolver: (s, ss) => !!s ? AST.Body([s, ...ss.value]) : ss},
+    { left: t.$Program, right: [t.$Statement], resolver: (s) => !!s ? AST.Body([s]) : AST.Body([]) },
+    { left: t.$Program, right: [t.$Statement, t.$Program], resolver: (s, ss) => !!s ? AST.Body([s, ...ss.value]) : ss},

-    { left: $Statement, right: [$Newline], resolver: () => false },
-    { left: $Statement, right: [$LinkStatement], resolver: a => a },
-    { left: $Statement, right: [$VariableDeclaration], resolver: a => a },
-    { left: $Statement, right: [$Expression], resolver: a => a },
+    { left: t.$Statement, right: [$Newline], resolver: () => false },
+    { left: t.$Statement, right: [t.$LinkStatement], resolver: a => a },
+    { left: t.$Statement, right: [t.$VariableDeclaration], resolver: a => a },
+    { left: t.$Statement, right: [t.$Expression], resolver: a => a },

-    { left: $Expression, right: [$String], resolver: (s: $String) => AST.String(s.value) },
-    { left: $Expression, right: [$InvocationExpression], resolver: a => a },
-    { left: $Expression, right: [$VariableReference], resolver: a => a },
+    { left: t.$Expression, right: [t.$String], resolver: (s: t.$String) => AST.String(s.value) },
+    { left: t.$Expression, right: [t.$InvocationExpression], resolver: a => a },
+    { left: t.$Expression, right: [t.$VariableReference], resolver: a => a },

-    { left: $VariableReference, right: [$Identifier], resolver: (identifier: $Identifier) => AST.VariableReference(identifier.value) },
+    { left: t.$VariableReference, right: [t.$Identifier], resolver: (identifier: t.$Identifier) => AST.VariableReference(identifier.value) },

-    { left: $InvocationExpression, right: [$Identifier, $KeywordLParen, $Expression, $KeywordRParen],
-        resolver: (identifier: $Identifier, _, arg: any, __) => AST.Invocation(identifier.value, arg) },
+    { left: t.$InvocationExpression, right: [t.$Identifier, t.$KeywordLParen, t.$Expression, t.$KeywordRParen],
+        resolver: (identifier: t.$Identifier, _, arg: any, __) => AST.Invocation(identifier.value, arg) },

-    { left: $VariableDeclaration, right: [$KeywordConst, $Identifier, $KeywordEquals, $Expression],
-        resolver: (_, identifier: $Identifier, __, value: any) => AST.Const(identifier.value, value) },
+    { left: t.$VariableDeclaration, right: [t.$KeywordConst, t.$Identifier, t.$KeywordEquals, t.$Expression],
+        resolver: (_, identifier: t.$Identifier, __, value: any) => AST.Const(identifier.value, value) },

-    { left: $LinkStatement, right: [$KeywordLink, $Identifier], resolver: (_, identifier: $Identifier) => AST.Link(identifier.value) },
+    { left: t.$LinkStatement, right: [t.$KeywordLink, t.$Identifier], resolver: (_, identifier: t.$Identifier) => AST.Link(identifier.value) },
-
 ]

-const grammar = new Grammar(ps, $Program);
+const grammar = new Grammar(ps, t.$Program);

 export default grammar;

tokenizer.ts
@@ -1,85 +0,0 @@
-import * as chalk from 'chalk';
-import { readFileSync, writeFileSync } from 'fs';
-import { $Identifier, $KeywordConst, $KeywordEquals, $KeywordLink, $KeywordLParen, $KeywordRParen, $Newline, $String } from './grammar';
-
-
-const keywords = new Map([
-    ['=', $KeywordEquals],
-    ['(', $KeywordLParen],
-    [')', $KeywordRParen],
-    ['link', $KeywordLink],
-    ['const', $KeywordConst],
-]);
-
-export function tokenize(string) {
-    let inString = false;
-    let escaping = false;
-    let tokens = [];
-    let token = '';
-    // let line = 1;
-    // let col = 1;
-    // const newline = () => (col = 1, line ++);
-    // const nextColumn = () => line ++;
-    const resetToken = () => token = '';
-    const addToken = (_token?) => {
-        if(_token) {
-            token = _token;
-        }
-        if(token.trim() !== '') {
-            if(keywords.has(token)) {
-                const kwTokenClass = keywords.get(token);
-                tokens.push(new kwTokenClass(0, 0, token));
-            } else if (isStringDelim(token[0]))
-                tokens.push(new $String(0, 0, token.substring(1, token.length - 1)));
-            else if (token === 'NEWLINE')
-                tokens.push(new $Newline(0, 0, token))
-            else
-                tokens.push(new $Identifier(0, 0, token));
-            resetToken();
-        }
-    }
-    // let _line = line;
-    // let _col = col;
-
-    const isWhitespace = (char) => [' ', '\n', '\t', '\r'].includes(char);
-    const isNewline = (char) => char === '\n';
-    const isSingleCharToken = (char) => ['(', ')', '='].includes(char);
-    const isStringDelim = (char) => ["'", '"'].includes(char);
-    const isEscapeChar = (char) => char === '\\';
-    const escape = (char) => (char === 'n' ? '\n'
-                            : char === 't' ? '\t'
-                            : char === 'r' ? '\r' : char)
-
-    for (const char of string) {
-        if(isNewline(char)) {
-            // newline();
-            addToken();
-            // only add newlines if we've actually started tokens...
-            if(tokens.length > 0)
-                addToken('NEWLINE')
-        } else if (escaping) {
-            token += escape(char)
-            escaping = false;
-        } else if (isStringDelim(char)) {
-            token += char;
-            inString = !inString;
-        } else if (inString) {
-            if(isEscapeChar(char)) {
-                escaping = true;
-            } else {
-                token += char
-            }
-        } else if(isSingleCharToken(char)) {
-            addToken();
-            addToken(char);
-        } else if(isWhitespace(char)) {
-            addToken();
-        } else {
-            token += char;
-        }
-        // if(!isNewline(char))
-        // nextColumn();
-    }
-
-    return tokens;
-}

@@ -59,7 +59,3 @@ const asmTokenizer = createTokenizer([
     [ /^[A-Za-z._][A-Za-z_]{0,}/, tokens.$Identifier]
 ])
 export default asmTokenizer;
-
-import input from './testInput';
-import { printTokens } from "../utils";
-printTokens(asmTokenizer(input));

@@ -1,3 +0,0 @@
-export function logASM(asm: string) {
-
-}

util/disco/tokenizer.ts
@@ -0,0 +1,9 @@
+import { $Newline } from "../../earley";
+import { createTokenizer } from "../generalTokenizer";
+import * as t from './tokens';
+
+export default createTokenizer([
+    [ /^[\r\t ]{1,}/, null],
+    [ /^\n/, $Newline],
+    [ /[a-zA-Z][A-Za-z0-9]{0,}/, t.$Identifier],
+])

util/disco/tokens.ts
@@ -0,0 +1,18 @@
+import { NonTerminal, Terminal } from "../../earley";
+
+export class $KeywordLink extends Terminal { }
+export class $KeywordEquals extends Terminal { }
+export class $KeywordLParen extends Terminal { }
+export class $KeywordRParen extends Terminal { }
+export class $KeywordConst extends Terminal { }
+
+export class $String extends Terminal {}
+export class $Identifier extends Terminal {}
+
+export class $Program extends NonTerminal { }
+export class $Statement extends NonTerminal { }
+export class $LinkStatement extends NonTerminal { }
+export class $VariableDeclaration extends NonTerminal { }
+export class $Expression extends NonTerminal { }
+export class $InvocationExpression extends NonTerminal { }
+export class $VariableReference extends NonTerminal { }

util/generalTokenizer.ts
@@ -1,6 +1,8 @@
 import { Terminal, TerminalTokenClass } from "../earley";
+import { Matcher } from "./regex";

 type TokenMatcher = [ RegExp, TerminalTokenClass ];
+type Index = number;

 interface Match {
     regex: RegExp;

@@ -10,18 +12,18 @@
 }

 // this is kinda bullshit lol exec is a dumb method.
-function getFirstMatch(r: RegExp, str: string): [number, string] {
+function getFirstMatch(r: RegExp | Matcher, str: string): [Index, string] {
+    if (r instanceof RegExp) {
         let matches = str.match(r);
         if(matches === null) return [-1, ''];
         return [matches.index, matches[0]];
+    }
 }

-export function createTokenizer(tokenMap: TokenMatcher[]) {
-
-    return function tokenize(str: string, l = 1, c = 1): Terminal[] {
-
-        const possibleMatches: Match[] = tokenMap
-            .map(([regex, tokenClass]) => {
+const getMatchesFromTokenMatcher =
+    (str: string) =>
+    ([regex, tokenClass]: TokenMatcher): Match =>
+{
     const [index, match] = getFirstMatch(regex, str);
     if(index === -1) return null;
     return {

@@ -30,7 +32,25 @@ export function createTokenizer(tokenMap: TokenMatcher[]) {
         length: match.length,
         matchedString: match
     }
-            })
+}
+
+const advanceLC = (l: number, c: number, str: string) => {
+    for(const char of str) {
+        c ++;
+        if(char === '\n') {
+            l ++;
+            c = 1;
+        }
+    }
+    return [l, c];
+}
+
+export function createTokenizer(tokenMap: TokenMatcher[]) {
+
+    return function tokenize(str: string, l = 1, c = 1): Terminal[] {
+
+        const possibleMatches: Match[] = tokenMap
+            .map(getMatchesFromTokenMatcher(str))
             .filter(v => !!v);

         const longestLength = possibleMatches

@@ -41,23 +61,23 @@
             .filter(v => v.length === longestLength);

         console.assert(longestMatches.length > 0, 'No token matches found');
         if(longestMatches.length === 0) process.exit(1);

-        const [{tokenClass, matchedString}] = longestMatches;
+        const {tokenClass, matchedString} = longestMatches[0];
         const length = matchedString.length;
-        const token = tokenClass ? new tokenClass(l, c, matchedString) : null;

         const rest = str.substring(length);
+        const token = tokenClass ? new tokenClass(l, c, matchedString) : null;
         if(rest === '') return [ token ];

-        for(const char of matchedString) {
-            c ++;
-            if(char === '\n') {
-                l ++;
-                c = 1;
-            }
-        }
+        [l, c] = advanceLC(l, c, str);
+        if(tokenClass) {
+            return [
+                new tokenClass(l, c, matchedString),
+                ...tokenize(rest, l, c)
+            ]
+        }

         return token ? [token, ...tokenize(rest, l, c)] : tokenize(rest, l, c);
     }
 }

util/regex.ts
@@ -0,0 +1,176 @@
+type Match = {
+    offset: number;
+    length: number;
+    text: string;
+    original: string;
+}
+
+const match = (offset: number, length: number, text: string, original: string): Match => {
+    return { offset, length, text, original };
+}
+
+export type Matcher = (str: string) => Match[]
+
+export const matchChar = (char: string): Matcher => {
+    const matcher = (test: string) => {
+        return test[0] === char[0] ? [match(0, 1, test[0], test)] : []
+    }
+    matcher.toString = () => {
+        return char;
+    }
+    return matcher;
+}
+
+export const matchCharClass = (chars: string[]): Matcher => {
+    const matcher = (test: string) => {
+        return chars.includes(test[0]) ? [match(0, 1, test[0], test)] : []
+    }
+    matcher.toString = () => {
+        return '[' + chars.join('') + ']';
+    }
+    return matcher;
+}
+
+const combineMatches = (a: Match, b: Match): Match => {
+    return match(
+        Math.min(a.offset, b.offset),
+        a.length + b.length,
+        a.text + b.text,
+        a.original.length > b.original.length ? a.original : b.original
+    )
+}
+
+export const matchSequence = (matcherA: Matcher, matcherB: Matcher): Matcher => {
+    const matcher = (test: string) => {
+        const matches = [];
+        for (const match of matcherA(test)) {
+            const rest = test.substring(match.length);
+            for (const restMatch of matcherB(rest)) {
+                matches.push(combineMatches(match, restMatch));
+            }
+        }
+        return matches;
+    }
+    matcher.toString = () => {
+        return matcherA.toString() + matcherB.toString();
+    }
+    return matcher;
+}
+
+const repeatMatcher = (matcher: Matcher, test: string, n: number): Match[] => {
+    if(n === 0) {
+        return [match(0, 0, '', test)];
+    }
+    const matches = matcher(test);
+    if(n === 1) {
+        return matches;
+    }
+    return matches.map(match => {
+        const rest = match.original.substring(match.length);
+        return repeatMatcher(matcher, rest, n - 1).map(nextMatch => combineMatches(match, nextMatch));
+    }).flat();
+}
+
+// this logic sucks lol
+// really you should just keep matching until you
+// have no more characters or you hit the match limit.
+// like this shit increases O by 2 on each nested call...
+// TODO /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\ \/ /\
+export const matchMany = (matcherA: Matcher, min = 1, max = Infinity): Matcher => {
+    const matcher = (test: string) => {
+        const rmatches: Match[] = [];
+        const limitedMax = Math.min(max, test.length);
+        for(let c = min; c <= limitedMax; c ++) {
+            const matches = repeatMatcher(matcherA, test, c);
+            rmatches.push(...matches);
+        }
+        return rmatches;
+    }
+    matcher.toString = () => {
+        return '(' + (matcherA.toString()) + '){' + (min === 0 ? '' : min) + ',' + (max === Infinity ? '' : max) + '}';
+    }
+    return matcher;
+}
+
+// variable names regex, theory...
+
+
+const matchers = [
+    matchChar('a'),
+    matchCharClass(['a', 'b', 'c']),
+    matchSequence(
+        matchChar('a'),
+        matchCharClass(['a', 'b', 'c'])
+    ),
+    matchMany(
+        matchCharClass(['a', 'b', 'c'])
+    ),
+    matchMany(
+        matchCharClass(['a', 'b', 'c']),
+        1,
+        1
+    ),
+];
+
+const tests = [
+    'a',
+    'b',
+    'c',
+    'd',
+    'ab',
+    'bc',
+    'cd',
+    'da',
+]
+
+console.clear();
+
+const logMatches = (ms: Match[]) => {
+    for(const match of ms) {
+        console.log(
+            ' '.repeat(8) +
+            chalk.white(match.original.substring(0, match.offset)) +
+            chalk.green(match.text) +
+            chalk.white(match.original.substring(match.offset + match.length))
+        );
+    }
+}
+
+const Y = true;
+const N = false;
+const testMatrix = [
+    [Y, N, N, N, N, N, N, N],
+    [Y, Y, Y, N, N, N, N, N],
+    [N, N, N, N, Y, N, N, N],
+    [Y, Y, Y, N, Y, Y, N, N],
+    [Y, Y, Y, N, N, N, N, N]
+]
+import * as chalk from 'chalk';
+// dirty levels off the CHARTS
+let i = 0, j = 0, p = 0, f = 0;
+for (const matcher of matchers) {
+    j = 0;
+    for (const testString of tests) {
+        const matches = matcher(testString).filter(match => match.length === testString.length);
+        if (matches.length > 0 === testMatrix[i][j]) {
+            p ++;
+        } else {
+            f ++;
+            console.log(
+                chalk.red('[ FAIL ]'),
+                chalk.ansi256(143)('/' + matcher.toString() + '/'),
+                'incorrectly returned',
+                matches.length,
+                'match' + (matches.length !== 1 ? 'es' : '') + ' for',
+                testString,
+            );
+            logMatches(matches);
+            console.log('')
+        }
+        j++;
+    }
+    i++
+}
+console.log('' + p + ' test' + (p !== 1 ? 's' : '') + ' passed.')
+console.log('' + f + ' test' + (f !== 1 ? 's' : '') + ' failed.')
+process.exit(f);

@@ -6,10 +6,10 @@ class $Plus extends Terminal { }
 class $Newline extends Terminal { }

 const tokenizer = createTokenizer([
-    { match: /^[0-9]{1,}$/, token: $Number },
-    { match: /^[\r\t ]{1,}$/, token: null },
-    { match: '\n', token: $Newline },
-    { match: '+', token: $Plus },
+    [ /^[0-9]{1,}$/, $Number ],
+    [ /^[\r\t ]{1,}$/, null ],
+    [ /\n/, $Newline ],
+    [ /+/, $Plus ],
 ])

 console.log(tokenizer("5 + \n 6 ").map(v => v.toString()).join(' '));

todo.md
@@ -3,8 +3,10 @@
 - [x] colorize the assembly output
 - [x] create generalTokenizer to make tokenization generic
 - [ ] rewrite disco tokenizer to the new generalTokenizer
 - [ ] explore defining non terminals in a grammar with just a string
     - possibly using tagged template strings??
 - [ ] add an EOF token to earley, and yknow, add it to the tokenizer.
-- [ ] add number support
+- [ ] add number support in consts
+- [ ] add comment support
+- [ ] add fixed length array support
 - [ ] organize AST elements into classes