From aaf13743ed3bae70a0f89c005586988ad987ecd3 Mon Sep 17 00:00:00 2001
From: Bronwen
Date: Thu, 17 Mar 2022 16:47:57 -0400
Subject: [PATCH] weewoo

---
 disco.disco                  |   3 +-
 disco_test                   | Bin 8992 -> 33048 bytes
 disco_test.asm               |  18 +++---
 src/compiler.ts              |  26 +++++++--
 src/disco.ts                 |   3 +-
 src/util/asm/grammar.ts      |  11 +++-
 src/util/asm/tokenizer.ts    |  84 +++++++++++++++++++--------
 src/util/asm/tokens.ts       |  10 +++-
 src/util/generalTokenizer.ts | 109 ++++++++++++++++------------
 todo.md                      |  51 +++++++++++-----
 10 files changed, 201 insertions(+), 114 deletions(-)

diff --git a/disco.disco b/disco.disco
index a26ac76..757bfe7 100644
--- a/disco.disco
+++ b/disco.disco
@@ -1,2 +1,3 @@
 link log
-log("Hello World")
\ No newline at end of file
+const a = "a"
+log("hello world")
\ No newline at end of file
diff --git a/disco_test b/disco_test
index e128b8f358cac9219bda586acc72c50f05a28968..26cf2a8022b359da434e023555a2b471d7a1b8d9 100755
Binary files a/disco_test and b/disco_test differ
diff --git a/src/compiler.ts b/src/compiler.ts
--- a/src/compiler.ts
+++ b/src/compiler.ts
-      asmName + ':\n' + asm).join('\n')
+    return (
+      'section .text\n' +
+      ' global _main\n' +
+      '_main:\n' +
+      ' push rbp\n' +
+      ' mov rbp, rsp\n' +
+      statements.map(v => ` ${v}\n`).join('') +
+      ' mov rsp, rbp\n' +
+      ' pop rbp\n' +
+      ' mov rax, 0x02000001\n' +
+      ' mov rdi, 0\n' +
+      ' syscall\n' +
+      [...linkedLibraries.values()]
+        .map(({asmName, asm}) => asmName + ':\n' + asm)
+        .join('\n')
+    );
   } else {
     return 'section .text\n global _start\n_start:\n push rbp\n mov rbp, rsp\n ' + statements.join('\n ')
@@ -106,7 +117,10 @@ function compileVariable(name, value) {
   });
   if(value.type === 'string') {
     const variableName = compileStringLiteral(value.value);
-    statements.push('push ' + variableName)
+    if(process.platform === 'darwin')
+      statements.push(`push qword [rel ${variableName}]`);
+    else
+      statements.push('push ' + variableName);
   } else {
     console.error('dont know how to set a variable to a non string lol')
   }
diff --git a/src/disco.ts b/src/disco.ts
index bc91bd5..bf6f39b 100755
--- a/src/disco.ts
+++ b/src/disco.ts
@@ -32,8 +32,8 @@ const asmFile = compile(ast)
 try {
   console.log();
   console.log('=== ASM ===');
-  console.log(colorize(asmFile));
   require('fs').writeFileSync('disco_test.asm', asmFile);
+  console.log(colorize(asmFile));
   console.log();

   console.log('=== nasm ===');
@@ -59,6 +59,7 @@ function ld() {
   require('child_process').execSync([
     'ld', 'disco_test.o', '-o', 'disco_test',
+    '-no_pie',
     '-macosx_version_min', '11.0',
     '-L', '/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib',
     '-lSystem'
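
Both darwin-specific changes above are about position independence: the Mach-O x86-64 linker rejects the 32-bit absolute relocation a bare `push variableName` would need, so string variables are now read through a RIP-relative memory operand, and `ld` additionally gets `-no_pie`. (The `mov rax, 0x02000001` in the generated epilogue is the macOS BSD `exit` syscall: class 2 in the high byte, syscall number 1.) A minimal sketch of the platform switch as a standalone helper; the name `emitPushVariable` is hypothetical, and it assumes `variableName` labels a qword holding the string's address, so both branches push the same pointer:

```ts
// Hypothetical helper mirroring the platform switch in compileVariable above.
// Assumption: `variableName` labels a qword in .data that holds the string's
// address, so the macOS branch pushes the same pointer the ELF branch does.
function emitPushVariable(statements: string[], variableName: string): void {
  if (process.platform === 'darwin') {
    // Mach-O x86-64 disallows 32-bit absolute addressing of data symbols,
    // so address the operand relative to RIP instead.
    statements.push(`push qword [rel ${variableName}]`);
  } else {
    statements.push('push ' + variableName);
  }
}
```
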
diff --git a/src/util/asm/grammar.ts b/src/util/asm/grammar.ts
index 034e6c3..03f75f3 100644
--- a/src/util/asm/grammar.ts
+++ b/src/util/asm/grammar.ts
@@ -26,8 +26,12 @@ export default new Grammar([
   { left: t.$Line, right: [t.$Global, t.$Identifier], resolver: (_, {value}) => ` ${ansi(...keywordColor).bold('global')} ${ansi(...identifierColor)(value)}` },
   { left: t.$Line, right: [t.$Identifier, t.$Colon], resolver: ({value}) => `${ansi(...identifierColor)(value)}:` },
+  { left: t.$Line, right: [t.$Bits, t.$Number], resolver: (_, n) => `${ansi(...keywordColor).bold('bits')} ${ansi(...numberColor)(n.value)}`},
+  { left: t.$Line, right: [t.$Default, t.$Rel], resolver: () => `${ansi(...keywordColor).bold('default')} ${ansi(...keywordColor).bold('rel')}`},

   // actual instructions
+  { left: t.$Line, right: [t.$Push, t.$DataSize, t.$LBracket, t.$Rel, t.$Identifier, t.$RBracket],
+    resolver: (_, size, __, ___, identifier) => ` ${ansi(...instructionColor)('push')} ${size} ${ansi(...pointerColor)('[')}${ansi(...keywordColor).bold('rel')} ${ansi(...identifierColor)(identifier.value)}${ansi(...pointerColor)(']')}` },
   { left: t.$Line, right: [t.$Push, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('push')} ${v}` },
   { left: t.$Line, right: [t.$Pop, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('pop')} ${v}` },
   { left: t.$Line, right: [t.$Cmp, t.$Register, t.$Comma, t.$Value],
@@ -52,5 +56,10 @@ export default new Grammar([
   { left: t.$Value, right: [t.$Identifier], resolver: (v) => ansi(...identifierColor)(v.value) },

   { left: t.$CompoundString, right: [t.$Number], resolver: (n) => ansi(...numberColor)(n.value) },
-  { left: t.$CompoundString, right: [t.$Number, t.$Comma, t.$CompoundString], resolver: (n, _, ns) => ansi(...numberColor)(n.value) + ',' + ns }
+  { left: t.$CompoundString, right: [t.$Number, t.$Comma, t.$CompoundString], resolver: (n, _, ns) => ansi(...numberColor)(n.value) + ',' + ns },
+
+  { left: t.$DataSize, right: [t.$Word], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$DWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$QWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$OWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
 ], t.$Program);
\ No newline at end of file
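
The four `$DataSize` productions at the end differ only in their terminal. Assuming `Grammar` really does take a flat array of `{ left, right, resolver }` objects (which is all this diff shows), they could be generated instead of hand-written; a sketch reusing the file's existing `t`, `ansi`, and `keywordColor` imports:

```ts
// Sketch, not part of the patch: one $DataSize rule per size keyword.
// Spread ...dataSizeRules into the rule array in place of the four
// hand-written entries above.
const dataSizeRules = [t.$Word, t.$DWord, t.$QWord, t.$OWord].map(T => ({
  left: t.$DataSize,
  right: [T],
  resolver: (v: { value: string }) => ansi(...keywordColor).bold(v.value),
}));
```
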
diff --git a/src/util/asm/tokenizer.ts b/src/util/asm/tokenizer.ts
index 57c6614..6ab2ae7 100644
--- a/src/util/asm/tokenizer.ts
+++ b/src/util/asm/tokenizer.ts
@@ -4,28 +4,62 @@ import {
   $Newline,
 } from "./../../earley";

-export default createTokenizer([
-  { match: /^[\r\t ]{1,}$/, token: null },
-  { match: 'section', token: tokens.$Section },
-  { match: 'db', token: tokens.$Db },
-  { match: 'global', token: tokens.$Global },
-  { match: '\n', token: $Newline },
-  { match: ':', token: tokens.$Colon },
-  { match: ',', token: tokens.$Comma },
-  { match: '[', token: tokens.$LBracket },
-  { match: ']', token: tokens.$RBracket },
-  { match: '-', token: tokens.$Minus },
-  { match: 'mov', token: tokens.$Mov },
-  { match: 'push', token: tokens.$Push },
-  { match: 'pop', token: tokens.$Pop },
-  { match: 'call', token: tokens.$Call },
-  { match: 'syscall', token: tokens.$Syscall },
-  { match: 'ret', token: tokens.$Ret },
-  { match: 'je', token: tokens.$Je },
-  { match: 'jmp', token: tokens.$Jmp },
-  { match: 'cmp', token: tokens.$Cmp },
-  { match: 'inc', token: tokens.$Inc },
-  { match: /^[0-9]{1,}$/, token: tokens.$Number },
-  { match: /^(rbp|rsp|rax|rcx|rbx|rdx|rdi|rsi|al|bl|cl|dl|ah|bh|ch|dh|ax|bx|cx|dx|eax|ebx|ecx|edx)$/, token: tokens.$Register },
-  { match: /^[A-Za-z._][A-Za-z_]{0,}$/, token: tokens.$Identifier },
-])
\ No newline at end of file
+const asmTokenizer = createTokenizer([
+  // whitespaces
+  [ /^[\r\t ]{1,}/, null],
+  [ /^\n/, $Newline],
+
+  // keywords
+  [ /^section/, tokens.$Section],
+  [ /^db/, tokens.$Db],
+  [ /^global/, tokens.$Global],
+  [ /^bits/, tokens.$Bits],
+  [ /^default/, tokens.$Default],
+  [ /^rel/, tokens.$Rel],
+  [ /^word/, tokens.$Word],
+  [ /^dword/, tokens.$DWord],
+  [ /^qword/, tokens.$QWord],
+  [ /^oword/, tokens.$OWord],
+
+  // punctuation
+  [ /^:/, tokens.$Colon],
+  [ /^,/, tokens.$Comma],
+  [ /^\[/, tokens.$LBracket],
+  [ /^\]/, tokens.$RBracket],
+  [ /^-/, tokens.$Minus],
+
+  // instructions
+  [ /^mov/, tokens.$Mov],
+  [ /^push/, tokens.$Push],
+  [ /^pop/, tokens.$Pop],
+  [ /^syscall/, tokens.$Syscall],
+  [ /^ret/, tokens.$Ret],
+  [ /^je/, tokens.$Je],
+  [ /^jmp/, tokens.$Jmp],
+  [ /^cmp/, tokens.$Cmp],
+  [ /^inc/, tokens.$Inc],
+
+  // pseudo-instructions
+  [ /^call/, tokens.$Call],
+
+  // 8 bit general purpose registers...
+  [ /^(al|ah|bl|bh|cl|ch|dl|dh)/, tokens.$Register ],
+  // 16 bit general purpose registers...
+  [ /^(ax|bx|cx|dx)/, tokens.$Register ],
+  // 32 bit general purpose registers...
+  [ /^(eax|ebx|ecx|edx)/, tokens.$Register ],
+  // 64 bit general purpose registers...
+  [ /^(rax|rbx|rcx|rdx)/, tokens.$Register ],
+  // other registers, idk.
+  [ /^(rbp|rsp|rdi|rsi)/, tokens.$Register],
+
+  // user-defined
+  [ /^[0-9]{1,}/, tokens.$Number],
+  [ /^0x[0-9A-Fa-f]{1,}/, tokens.$Number],
+  [ /^[A-Za-z._][A-Za-z_]{0,}/, tokens.$Identifier]
+])
+export default asmTokenizer;
+
+import input from './testInput';
+import { printTokens } from "../utils";
+printTokens(asmTokenizer(input));
\ No newline at end of file
diff --git a/src/util/asm/tokens.ts b/src/util/asm/tokens.ts
index dfa9027..43edef6 100644
--- a/src/util/asm/tokens.ts
+++ b/src/util/asm/tokens.ts
@@ -21,6 +21,13 @@ export class $RBracket extends Terminal { }
 export class $Comma extends Terminal { }
 export class $Colon extends Terminal { }
 export class $Minus extends Terminal { }
+export class $Bits extends Terminal { }
+export class $Default extends Terminal { }
+export class $Rel extends Terminal { }
+export class $Word extends Terminal { }
+export class $DWord extends Terminal { }
+export class $QWord extends Terminal { }
+export class $OWord extends Terminal { }

 // varying tokens
 export class $Identifier extends Terminal { }
@@ -33,4 +40,5 @@ export class $Line extends NonTerminal { }
 export class $PointerDereference extends NonTerminal { }
 export class $Program extends NonTerminal { }
 export class $CompoundString extends NonTerminal { }
-export class $Value extends NonTerminal { }
\ No newline at end of file
+export class $Value extends NonTerminal { }
+export class $DataSize extends NonTerminal { }
\ No newline at end of file
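
Because these matchers now feed the longest-match tokenizer in the next file, table order only decides ties: an instruction entry beats the `$Identifier` rule on an exact-length tie, while any longer identifier wins outright, with no word-boundary anchors needed. An illustrative run (the import paths here are assumptions):

```ts
import asmTokenizer from './src/util/asm/tokenizer';
import { printTokens } from './src/util/utils';

// 'push' ties /^push/ against the identifier rule at length 4, and the
// earlier table entry wins ($Push); 'pusher' matches the identifier rule at
// length 6, strictly longer, so it tokenizes as $Identifier. '0x1F' prefers
// the hex rule (length 4) over plain digits (length 1) despite being listed
// later in the table.
printTokens(asmTokenizer('push pusher\npush 0x1F\n'));
```
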
diff --git a/src/util/generalTokenizer.ts b/src/util/generalTokenizer.ts
index 78dcb7d..551b64a 100644
--- a/src/util/generalTokenizer.ts
+++ b/src/util/generalTokenizer.ts
@@ -1,72 +1,63 @@
-import { TerminalTokenClass } from "../earley";
-import { inspect } from 'util';
+import { Terminal, TerminalTokenClass } from "../earley";

-interface TokenMatcher {
-  match: RegExp | string,
-  token: TerminalTokenClass
+type TokenMatcher = [ RegExp, TerminalTokenClass ];
+
+interface Match {
+  regex: RegExp;
+  length: number;
+  tokenClass: TerminalTokenClass;
+  matchedString: string;
+}
+
+// this is kinda bullshit lol exec is a dumb method.
+function getFirstMatch(r: RegExp, str: string): [number, string] {
+  let matches = str.match(r);
+  if(matches === null) return [-1, ''];
+  return [matches.index, matches[0]];
 }

 export function createTokenizer(tokenMap: TokenMatcher[]) {
-  return function tokenize(str: string) {
-    let tokens = [];
-    let token = '';
-    let line = 1, column = 0;
-    for(let i = 0; i < str.length; i ++) {
-      const char = str[i];
-      const lookahead = (i < str.length - 1 ? str[i + 1] : null)
-      column++;
-      token += char;
-      for(const {match: matcher, token: tokenClass} of tokenMap) {
-        if(typeof matcher === 'string') {
-          if(matcher === token) {
-            if(tokenClass !== null) {
-              tokens.push(new tokenClass(line, column - token.length + 1, token));
-            }
-            token = '';
-          } else {
-            // dw about it
-          }
-        } else {
-          // matcher is regex...
-          // * note: this only tests if token contains a match, not that it _is_ a match
-          if(matcher.test(token)) {
-            if(lookahead) {
-              if(!matcher.test(token + lookahead)) {
-                // the next character would not match, so this must be the match.
-                // ! PS: it is possible that even though this would no longer
-                // ! match, another matcher could still match more.
-                // ! in those cases, we would want to expand on this logic
-                // ! to only match if there are no matches for any matcher
-                // ! in the lookahead.
-                // ! in practice this means tracking all possible non lookahead
-                // ! matches, then testing them for their lookahead afterwards
-                // ! in another loop, and only tokenizing if you have only one
-                // ! option, and that option will fail on the lookahead.
-                if(tokenClass !== null) {
-                  tokens.push(new tokenClass(line, column - token.length + 1, token));
-                }
-                token = '';
-              } else {
-                // the lookahead matches this too, so we should probably hold off
-                // on tokenizing it...
-              }
-            } else {
-              if(tokenClass !== null) {
-                tokens.push(new tokenClass(line, column - token.length + 1, token));
-              }
-              token = '';
-            }
-          }
+  return function tokenize(str: string, l = 1, c = 1): Terminal[] {
+
+    const possibleMatches: Match[] = tokenMap
+      .map(([regex, tokenClass]) => {
+        const [index, match] = getFirstMatch(regex, str);
+        if(index === -1) return null;
+        return {
+          regex,
+          tokenClass,
+          length: match.length,
+          matchedString: match
         }
-      }
+      })
+      .filter(v => !!v);

+    const longestLength = possibleMatches
+      .map(v => v.length)
+      .reduce((a, v) => a > v ? a : v, -Infinity);
+
+    const longestMatches = possibleMatches
+      .filter(v => v.length === longestLength);
+
+    console.assert(longestMatches.length > 0, 'No token matches found');
+
+    const [{tokenClass, matchedString}] = longestMatches;
+    const length = matchedString.length;
+    const token = tokenClass ? new tokenClass(l, c, matchedString) : null;
+
+    const rest = str.substring(length);
+
+    if(rest === '') return [ token ];
+
+    for(const char of matchedString) {
+      c ++;
       if(char === '\n') {
-        line ++;
-        column = 0;
+        l ++;
+        c = 1;
       }
     }
-    return tokens;
+    return token ? [token, ...tokenize(rest, l, c)] : tokenize(rest, l, c);
   }
 }
\ No newline at end of file
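
The rewritten `tokenize` above is maximal munch: collect every `^`-anchored match, keep the longest, break ties in favor of the earliest table entry, then recurse on the remainder. One stack frame per token means a large enough source file could overflow the stack; here is an equivalent iterative loop with the same policy (a sketch, not part of the patch; it assumes `TerminalTokenClass` is constructable as `new (line, column, value)`, and the earley import path is a guess):

```ts
import { Terminal, TerminalTokenClass } from './src/earley'; // path assumed

// Same [pattern, class] pairs as TokenMatcher above; null means "skip token".
type Rule = [RegExp, TerminalTokenClass | null];

function tokenizeIteratively(rules: Rule[], str: string): Terminal[] {
  const out: Terminal[] = [];
  let l = 1, c = 1;
  while (str.length > 0) {
    // Longest match wins; on a tie the earlier table entry is kept.
    let best: { cls: TerminalTokenClass | null; text: string } | null = null;
    for (const [regex, cls] of rules) {
      const m = regex.exec(str); // every pattern is ^-anchored
      if (m && (best === null || m[0].length > best.text.length)) {
        best = { cls, text: m[0] };
      }
    }
    if (best === null) throw new Error(`no token matches at ${l}:${c}`);
    if (best.cls !== null) out.push(new best.cls(l, c, best.text));
    // Advance line/column over the consumed text, like the recursive version.
    for (const ch of best.text) {
      c++;
      if (ch === '\n') { l++; c = 1; }
    }
    str = str.substring(best.text.length);
  }
  return out;
}
```
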
diff --git a/todo.md b/todo.md
index ba0eae7..38e4154 100644
--- a/todo.md
+++ b/todo.md
@@ -1,15 +1,40 @@
 # Todo List

-[x] colorize the assembly output
-[ ] rewrite disco tokenizer to the new generalTokenizer
-[ ] add an EOF token to earley, and yknow, add it to the tokenizer.
-[ ] add number support
-[ ] add comment support
-[ ] add fixed length array support
-[ ] organize AST elements into classes
-[ ] better logging of the AST
-[ ] optionally artifically slow down compilation (for fun)
-[ ] implement functions
-[ ] implement some basic maths operations
-[ ] implement multi-argument invocations
-[ ] implement return values
+- [x] colorize the assembly output
+- [x] create generalTokenizer to make tokenization generic
+- [ ] rewrite disco tokenizer to the new generalTokenizer
+- [ ] add an EOF token to earley, and yknow, add it to the tokenizer.
+- [ ] add number support
+- [ ] add comment support
+- [ ] add fixed length array support
+- [ ] organize AST elements into classes
+- [ ] better logging of the AST
+- [ ] optionally artificially slow down compilation (for fun)
+- [ ] implement functions
+- [ ] implement some basic maths operations
+- [ ] implement multi-argument invocations
+- [ ] implement return values
+- [ ] write a regex compiler
+- [ ] write log in disco. create a library for just doing syscalls; the rest can be done in disco
+
+# Changelog
+
+- fixed macos compilation to use relative addressing (i think)
+- fixed a bug in the general tokenizer where some tokens failed to match properly
+
+---
+
+- create generalized tokenizer
+- implement assembly language grammar for syntax highlighting
+- create a vscode extension for syntax highlighting
+
+---
+
+- compile disco code to assembly as POC
+- create an AST for disco code
+- implement earley grammar for disco including:
+  - linking library functions
+  - calling functions
+  - string literals
+  - string variables
+- created earley parser
\ No newline at end of file