From aaf13743ed3bae70a0f89c005586988ad987ecd3 Mon Sep 17 00:00:00 2001
From: Bronwen
Date: Thu, 17 Mar 2022 16:47:57 -0400
Subject: [PATCH] weewoo

---
 disco.disco                  |   3 +-
 disco_test                   | Bin 8992 -> 33048 bytes
 disco_test.asm               |  18 +++---
 src/compiler.ts              |  26 +++++++--
 src/disco.ts                 |   3 +-
 src/util/asm/grammar.ts      |  11 +++-
 src/util/asm/tokenizer.ts    |  84 +++++++++++++++++++--------
 src/util/asm/tokens.ts       |  10 +++-
 src/util/generalTokenizer.ts | 109 ++++++++++++++++------------
 todo.md                      |  51 +++++++++++-----
 10 files changed, 201 insertions(+), 114 deletions(-)

diff --git a/disco.disco b/disco.disco
index a26ac76..757bfe7 100644
--- a/disco.disco
+++ b/disco.disco
@@ -1,2 +1,3 @@
 link log
-log("Hello World")
\ No newline at end of file
+const a = "a"
+log("hello world")
\ No newline at end of file
diff --git a/disco_test b/disco_test
index e128b8f358cac9219bda586acc72c50f05a28968..26cf2a8022b359da434e023555a2b471d7a1b8d9 100755
Binary files a/disco_test and b/disco_test differ
diff --git a/src/compiler.ts b/src/compiler.ts
--- a/src/compiler.ts
+++ b/src/compiler.ts
-      asmName + ':\n' + asm).join('\n')
+    return (
+      'section .text\n' +
+      ' global _main\n' +
+      '_main:\n' +
+      ' push rbp\n' +
+      ' mov rbp, rsp\n' +
+      statements.map(v => ` ${v}\n`).join('') +
+      ' mov rsp, rbp\n' +
+      ' pop rbp\n' +
+      ' mov rax, 0x02000001\n' +
+      ' mov rdi, 0\n' +
+      ' syscall\n' +
+      [...linkedLibraries.values()]
+        .map(({asmName, asm}) => asmName + ':\n' + asm)
+        .join('\n')
+    );
   } else {
     return 'section .text\n global _start\n_start:\n push rbp\n mov rbp, rsp\n ' + statements.join('\n ')
@@ -106,7 +117,10 @@ function compileVariable(name, value) {
   });
   if(value.type === 'string') {
     const variableName = compileStringLiteral(value.value);
-    statements.push('push ' + variableName)
+    if(process.platform === 'darwin')
+      statements.push(`push qword [rel ${variableName}]`);
+    else
+      statements.push('push ' + variableName);
   } else {
     console.error('dont know how to set a variable to a non string lol')
   }
diff --git a/src/disco.ts b/src/disco.ts
index bc91bd5..bf6f39b 100755
--- a/src/disco.ts
+++ b/src/disco.ts
@@ -32,8 +32,8 @@ const asmFile = compile(ast)
 try {
   console.log();
   console.log('=== ASM ===');
-  console.log(colorize(asmFile));
   require('fs').writeFileSync('disco_test.asm', asmFile);
+  console.log(colorize(asmFile));
   console.log();

   console.log('=== nasm ===');
@@ -59,6 +59,7 @@ function ld() {
   require('child_process').execSync([
     'ld', 'disco_test.o', '-o', 'disco_test',
+    '-no_pie',
     '-macosx_version_min', '11.0',
     '-L', '/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib',
     '-lSystem'
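
Both darwin-specific changes above are about position independence: the Mach-O x86-64 linker rejects the 32-bit absolute relocation a bare `push variableName` would need, so string variables are now read through a RIP-relative memory operand, and `ld` additionally gets `-no_pie`. (The `mov rax, 0x02000001` in the generated epilogue is the macOS BSD `exit` syscall: class 2 in the high byte, syscall number 1.) A minimal sketch of the platform switch as a standalone helper; the name `emitPushVariable` is hypothetical, and it assumes `variableName` labels a qword holding the string's address, so both branches push the same pointer:

```ts
// Hypothetical helper mirroring the platform switch in compileVariable above.
// Assumption: `variableName` labels a qword in .data that holds the string's
// address, so the macOS branch pushes the same pointer the ELF branch does.
function emitPushVariable(statements: string[], variableName: string): void {
  if (process.platform === 'darwin') {
    // Mach-O x86-64 disallows 32-bit absolute addressing of data symbols,
    // so address the operand relative to RIP instead.
    statements.push(`push qword [rel ${variableName}]`);
  } else {
    statements.push('push ' + variableName);
  }
}
```
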
diff --git a/src/util/asm/grammar.ts b/src/util/asm/grammar.ts
index 034e6c3..03f75f3 100644
--- a/src/util/asm/grammar.ts
+++ b/src/util/asm/grammar.ts
@@ -26,8 +26,12 @@ export default new Grammar([
   { left: t.$Line, right: [t.$Global, t.$Identifier], resolver: (_, {value}) => ` ${ansi(...keywordColor).bold('global')} ${ansi(...identifierColor)(value)}` },
   { left: t.$Line, right: [t.$Identifier, t.$Colon], resolver: ({value}) => `${ansi(...identifierColor)(value)}:` },
+  { left: t.$Line, right: [t.$Bits, t.$Number], resolver: (_, n) => `${ansi(...keywordColor).bold('bits')} ${ansi(...numberColor)(n.value)}`},
+  { left: t.$Line, right: [t.$Default, t.$Rel], resolver: () => `${ansi(...keywordColor).bold('default')} ${ansi(...keywordColor).bold('rel')}`},

   // actual instructions
+  { left: t.$Line, right: [t.$Push, t.$DataSize, t.$LBracket, t.$Rel, t.$Identifier, t.$RBracket],
+    resolver: (_, size, __, ___, identifier) => ` ${ansi(...instructionColor)('push')} ${size} ${ansi(...pointerColor)('[')}${ansi(...keywordColor).bold('rel')} ${ansi(...identifierColor)(identifier.value)}${ansi(...pointerColor)(']')}` },
   { left: t.$Line, right: [t.$Push, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('push')} ${v}` },
   { left: t.$Line, right: [t.$Pop, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('pop')} ${v}` },
   { left: t.$Line, right: [t.$Cmp, t.$Register, t.$Comma, t.$Value],
@@ -52,5 +56,10 @@ export default new Grammar([
   { left: t.$Value, right: [t.$Identifier], resolver: (v) => ansi(...identifierColor)(v.value) },

   { left: t.$CompoundString, right: [t.$Number], resolver: (n) => ansi(...numberColor)(n.value) },
-  { left: t.$CompoundString, right: [t.$Number, t.$Comma, t.$CompoundString], resolver: (n, _, ns) => ansi(...numberColor)(n.value) + ',' + ns }
+  { left: t.$CompoundString, right: [t.$Number, t.$Comma, t.$CompoundString], resolver: (n, _, ns) => ansi(...numberColor)(n.value) + ',' + ns },
+
+  { left: t.$DataSize, right: [t.$Word], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$DWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$QWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$OWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
 ], t.$Program);
\ No newline at end of file
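
The four `$DataSize` productions at the end differ only in their terminal. Assuming `Grammar` really does take a flat array of `{ left, right, resolver }` objects (which is all this diff shows), they could be generated instead of hand-written; a sketch reusing the file's existing `t`, `ansi`, and `keywordColor` imports:

```ts
// Sketch, not part of the patch: one $DataSize rule per size keyword.
// Spread ...dataSizeRules into the rule array in place of the four
// hand-written entries above.
const dataSizeRules = [t.$Word, t.$DWord, t.$QWord, t.$OWord].map(T => ({
  left: t.$DataSize,
  right: [T],
  resolver: (v: { value: string }) => ansi(...keywordColor).bold(v.value),
}));
```
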
diff --git a/src/util/asm/tokenizer.ts b/src/util/asm/tokenizer.ts
index 57c6614..6ab2ae7 100644
--- a/src/util/asm/tokenizer.ts
+++ b/src/util/asm/tokenizer.ts
@@ -4,28 +4,62 @@ import {
   $Newline,
 } from "./../../earley";

-export default createTokenizer([
-  { match: /^[\r\t ]{1,}$/, token: null },
-  { match: 'section', token: tokens.$Section },
-  { match: 'db', token: tokens.$Db },
-  { match: 'global', token: tokens.$Global },
-  { match: '\n', token: $Newline },
-  { match: ':', token: tokens.$Colon },
-  { match: ',', token: tokens.$Comma },
-  { match: '[', token: tokens.$LBracket },
-  { match: ']', token: tokens.$RBracket },
-  { match: '-', token: tokens.$Minus },
-  { match: 'mov', token: tokens.$Mov },
-  { match: 'push', token: tokens.$Push },
-  { match: 'pop', token: tokens.$Pop },
-  { match: 'call', token: tokens.$Call },
-  { match: 'syscall', token: tokens.$Syscall },
-  { match: 'ret', token: tokens.$Ret },
-  { match: 'je', token: tokens.$Je },
-  { match: 'jmp', token: tokens.$Jmp },
-  { match: 'cmp', token: tokens.$Cmp },
-  { match: 'inc', token: tokens.$Inc },
-  { match: /^[0-9]{1,}$/, token: tokens.$Number },
-  { match: /^(rbp|rsp|rax|rcx|rbx|rdx|rdi|rsi|al|bl|cl|dl|ah|bh|ch|dh|ax|bx|cx|dx|eax|ebx|ecx|edx)$/, token: tokens.$Register },
-  { match: /^[A-Za-z._][A-Za-z_]{0,}$/, token: tokens.$Identifier },
-])
\ No newline at end of file
+const asmTokenizer = createTokenizer([
+  // whitespaces
+  [ /^[\r\t ]{1,}/, null],
+  [ /^\n/, $Newline],
+
+  // keywords
+  [ /^section/, tokens.$Section],
+  [ /^db/, tokens.$Db],
+  [ /^global/, tokens.$Global],
+  [ /^bits/, tokens.$Bits],
+  [ /^default/, tokens.$Default],
+  [ /^rel/, tokens.$Rel],
+  [ /^word/, tokens.$Word],
+  [ /^dword/, tokens.$DWord],
+  [ /^qword/, tokens.$QWord],
+  [ /^oword/, tokens.$OWord],
+
+  // punctuation
+  [ /^:/, tokens.$Colon],
+  [ /^,/, tokens.$Comma],
+  [ /^\[/, tokens.$LBracket],
+  [ /^\]/, tokens.$RBracket],
+  [ /^-/, tokens.$Minus],
+
+  // instructions
+  [ /^mov/, tokens.$Mov],
+  [ /^push/, tokens.$Push],
+  [ /^pop/, tokens.$Pop],
+  [ /^syscall/, tokens.$Syscall],
+  [ /^ret/, tokens.$Ret],
+  [ /^je/, tokens.$Je],
+  [ /^jmp/, tokens.$Jmp],
+  [ /^cmp/, tokens.$Cmp],
+  [ /^inc/, tokens.$Inc],
+
+  // pseudo-instructions
+  [ /^call/, tokens.$Call],
+
+  // 8 bit general purpose registers...
+  [ /^(al|ah|bl|bh|cl|ch|dl|dh)/, tokens.$Register ],
+  // 16 bit general purpose registers...
+  [ /^(ax|bx|cx|dx)/, tokens.$Register ],
+  // 32 bit general purpose registers...
+  [ /^(eax|ebx|ecx|edx)/, tokens.$Register ],
+  // 64 bit general purpose registers...
+  [ /^(rax|rbx|rcx|rdx)/, tokens.$Register ],
+  // other registers, idk.
+  [ /^(rbp|rsp|rdi|rsi)/, tokens.$Register],
+
+  // user-defined
+  [ /^[0-9]{1,}/, tokens.$Number],
+  [ /^0x[0-9A-Fa-f]{1,}/, tokens.$Number],
+  [ /^[A-Za-z._][A-Za-z_]{0,}/, tokens.$Identifier]
+])
+export default asmTokenizer;
+
+import input from './testInput';
+import { printTokens } from "../utils";
+printTokens(asmTokenizer(input));
\ No newline at end of file
diff --git a/src/util/asm/tokens.ts b/src/util/asm/tokens.ts
index dfa9027..43edef6 100644
--- a/src/util/asm/tokens.ts
+++ b/src/util/asm/tokens.ts
@@ -21,6 +21,13 @@ export class $RBracket extends Terminal { }
 export class $Comma extends Terminal { }
 export class $Colon extends Terminal { }
 export class $Minus extends Terminal { }
+export class $Bits extends Terminal { }
+export class $Default extends Terminal { }
+export class $Rel extends Terminal { }
+export class $Word extends Terminal { }
+export class $DWord extends Terminal { }
+export class $QWord extends Terminal { }
+export class $OWord extends Terminal { }

 // varying tokens
 export class $Identifier extends Terminal { }
@@ -33,4 +40,5 @@ export class $Line extends NonTerminal { }
 export class $PointerDereference extends NonTerminal { }
 export class $Program extends NonTerminal { }
 export class $CompoundString extends NonTerminal { }
-export class $Value extends NonTerminal { }
\ No newline at end of file
+export class $Value extends NonTerminal { }
+export class $DataSize extends NonTerminal { }
\ No newline at end of file
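
Because these matchers now feed the longest-match tokenizer in the next file, table order only decides ties: an instruction entry beats the `$Identifier` rule on an exact-length tie, while any longer identifier wins outright, with no word-boundary anchors needed. An illustrative run (the import paths here are assumptions):

```ts
import asmTokenizer from './src/util/asm/tokenizer';
import { printTokens } from './src/util/utils';

// 'push' ties /^push/ against the identifier rule at length 4, and the
// earlier table entry wins ($Push); 'pusher' matches the identifier rule at
// length 6, strictly longer, so it tokenizes as $Identifier. '0x1F' prefers
// the hex rule (length 4) over plain digits (length 1) despite being listed
// later in the table.
printTokens(asmTokenizer('push pusher\npush 0x1F\n'));
```
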
diff --git a/src/util/generalTokenizer.ts b/src/util/generalTokenizer.ts
index 78dcb7d..551b64a 100644
--- a/src/util/generalTokenizer.ts
+++ b/src/util/generalTokenizer.ts
@@ -1,72 +1,63 @@
-import { TerminalTokenClass } from "../earley";
-import { inspect } from 'util';
+import { Terminal, TerminalTokenClass } from "../earley";

-interface TokenMatcher {
-  match: RegExp | string,
-  token: TerminalTokenClass
+type TokenMatcher = [ RegExp, TerminalTokenClass ];
+
+interface Match {
+  regex: RegExp;
+  length: number;
+  tokenClass: TerminalTokenClass;
+  matchedString: string;
+}
+
+// this is kinda bullshit lol exec is a dumb method.
+function getFirstMatch(r: RegExp, str: string): [number, string] {
+  let matches = str.match(r);
+  if(matches === null) return [-1, ''];
+  return [matches.index, matches[0]];
 }

 export function createTokenizer(tokenMap: TokenMatcher[]) {
-  return function tokenize(str: string) {
-    let tokens = [];
-    let token = '';
-    let line = 1, column = 0;
-    for(let i = 0; i < str.length; i ++) {
-      const char = str[i];
-      const lookahead = (i < str.length - 1 ? str[i + 1] : null)
-      column++;
-      token += char;
-      for(const {match: matcher, token: tokenClass} of tokenMap) {
-        if(typeof matcher === 'string') {
-          if(matcher === token) {
-            if(tokenClass !== null) {
-              tokens.push(new tokenClass(line, column - token.length + 1, token));
-            }
-            token = '';
-          } else {
-            // dw about it
-          }
-        } else {
-          // matcher is regex...
-          // * note: this only tests if token contains a match, not that it _is_ a match
-          if(matcher.test(token)) {
-            if(lookahead) {
-              if(!matcher.test(token + lookahead)) {
-                // the next character would not match, so this must be the match.
-                // ! PS: it is possible that even though this would no longer
-                // ! match, another matcher could still match more.
-                // ! in those cases, we would want to expand on this logic
-                // ! to only match if there are no matches for any matcher
-                // ! in the lookahead.
-                // ! in practice this means tracking all possible non lookahead
-                // ! matches, then testing them for their lookahead afterwards
-                // ! in another loop, and only tokenizing if you have only one
-                // ! option, and that option will fail on the lookahead.
-                if(tokenClass !== null) {
-                  tokens.push(new tokenClass(line, column - token.length + 1, token));
-                }
-                token = '';
-              } else {
-                // the lookahead matches this too, so we should probably hold off
-                // on tokenizing it...
-              }
-            } else {
-              if(tokenClass !== null) {
-                tokens.push(new tokenClass(line, column - token.length + 1, token));
-              }
-              token = '';
-            }
-          }
+  return function tokenize(str: string, l = 1, c = 1): Terminal[] {
+
+    const possibleMatches: Match[] = tokenMap
+      .map(([regex, tokenClass]) => {
+        const [index, match] = getFirstMatch(regex, str);
+        if(index === -1) return null;
+        return {
+          regex,
+          tokenClass,
+          length: match.length,
+          matchedString: match
         }
-      }
+      })
+      .filter(v => !!v);

+    const longestLength = possibleMatches
+      .map(v => v.length)
+      .reduce((a, v) => a > v ? a : v, -Infinity);
+
+    const longestMatches = possibleMatches
+      .filter(v => v.length === longestLength);
+
+    console.assert(longestMatches.length > 0, 'No token matches found');
+
+    const [{tokenClass, matchedString}] = longestMatches;
+    const length = matchedString.length;
+    const token = tokenClass ? new tokenClass(l, c, matchedString) : null;
+
+    const rest = str.substring(length);
+
+    if(rest === '') return [ token ];
+
+    for(const char of matchedString) {
+      c ++;
       if(char === '\n') {
-        line ++;
-        column = 0;
+        l ++;
+        c = 1;
       }
     }
-    return tokens;
+    return token ? [token, ...tokenize(rest, l, c)] : tokenize(rest, l, c);
   }
 }
\ No newline at end of file
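
The rewritten `tokenize` above is maximal munch: collect every `^`-anchored match, keep the longest, break ties in favor of the earliest table entry, then recurse on the remainder. One stack frame per token means a large enough source file could overflow the stack; here is an equivalent iterative loop with the same policy (a sketch, not part of the patch; it assumes `TerminalTokenClass` is constructable as `new (line, column, value)`, and the earley import path is a guess):

```ts
import { Terminal, TerminalTokenClass } from './src/earley'; // path assumed

// Same [pattern, class] pairs as TokenMatcher above; null means "skip token".
type Rule = [RegExp, TerminalTokenClass | null];

function tokenizeIteratively(rules: Rule[], str: string): Terminal[] {
  const out: Terminal[] = [];
  let l = 1, c = 1;
  while (str.length > 0) {
    // Longest match wins; on a tie the earlier table entry is kept.
    let best: { cls: TerminalTokenClass | null; text: string } | null = null;
    for (const [regex, cls] of rules) {
      const m = regex.exec(str); // every pattern is ^-anchored
      if (m && (best === null || m[0].length > best.text.length)) {
        best = { cls, text: m[0] };
      }
    }
    if (best === null) throw new Error(`no token matches at ${l}:${c}`);
    if (best.cls !== null) out.push(new best.cls(l, c, best.text));
    // Advance line/column over the consumed text, like the recursive version.
    for (const ch of best.text) {
      c++;
      if (ch === '\n') { l++; c = 1; }
    }
    str = str.substring(best.text.length);
  }
  return out;
}
```
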
diff --git a/todo.md b/todo.md
index ba0eae7..38e4154 100644
--- a/todo.md
+++ b/todo.md
@@ -1,15 +1,40 @@
 # Todo List

-[x] colorize the assembly output
-[ ] rewrite disco tokenizer to the new generalTokenizer
-[ ] add an EOF token to earley, and yknow, add it to the tokenizer.
-[ ] add number support
-[ ] add comment support
-[ ] add fixed length array support
-[ ] organize AST elements into classes
-[ ] better logging of the AST
-[ ] optionally artifically slow down compilation (for fun)
-[ ] implement functions
-[ ] implement some basic maths operations
-[ ] implement multi-argument invocations
-[ ] implement return values
+- [x] colorize the assembly output
+- [x] create generalTokenizer to make tokenization generic
+- [ ] rewrite disco tokenizer to the new generalTokenizer
+- [ ] add an EOF token to earley, and yknow, add it to the tokenizer.
+- [ ] add number support
+- [ ] add comment support
+- [ ] add fixed length array support
+- [ ] organize AST elements into classes
+- [ ] better logging of the AST
+- [ ] optionally artificially slow down compilation (for fun)
+- [ ] implement functions
+- [ ] implement some basic maths operations
+- [ ] implement multi-argument invocations
+- [ ] implement return values
+- [ ] write a regex compiler
+- [ ] write log in disco. create a library for just doing syscalls; the rest can be done in disco
+
+# Changelog
+
+- fixed macos compilation to use relative addressing (i think)
+- fixed a bug in the general tokenizer where some tokens failed to match properly
+
+---
+
+- create generalized tokenizer
+- implement assembly language grammar for syntax highlighting
+- create a vscode extension for syntax highlighting
+
+---
+
+- compile disco code to assembly as POC
+- create an AST for disco code
+- implement earley grammar for disco including:
+  - linking library functions
+  - calling functions
+  - string literals
+  - string variables
+- created earley parser
\ No newline at end of file