diff --git a/disco_test b/disco_test index e0aa67f..e128b8f 100755 Binary files a/disco_test and b/disco_test differ diff --git a/disco_test.asm b/disco_test.asm index 8e5b7c5..39c7bfb 100644 --- a/disco_test.asm +++ b/disco_test.asm @@ -1,15 +1,15 @@ - global _main section .data - UMRRSQMF db 72,101,108,108,111,32,87,111,114,108,100,0 + EFDNYLFZ db 72,101,108,108,111,32,87,111,114,108,100,0 section .text -_main: + global _start +_start: push rbp mov rbp, rsp - mov rdi, UMRRSQMF + mov rdi, EFDNYLFZ call _log mov rsp, rbp pop rbp - mov rax, 0x02000001 + mov rax, 60 mov rdi, 0 syscall _log: @@ -24,12 +24,12 @@ _log_loop: jmp _log_loop _log_loop_end: mov rdx, rbx - mov rax, 0x02000004 + mov rax, 1 mov rdi, 1 pop rsi syscall push 10 - mov rax, 0x02000004 + mov rax, 1 mov rdi, 1 mov rsi, rsp mov rdx, 1 diff --git a/src/disco.ts b/src/disco.ts index 62a30d7..bc91bd5 100755 --- a/src/disco.ts +++ b/src/disco.ts @@ -1,12 +1,11 @@ #!/usr/bin/env node -// const AST = require('../ast.js'); -// const compile = require('../compiler.js'); - import { readFileSync } from "fs"; import { compile } from "./compiler"; -import grammar, { $Newline } from "./grammar"; +import grammar from "./grammar"; import { tokenize } from "./tokenizer"; +import colorize from "./util/asm/colorize"; +import { printTokens } from "./util/utils"; console.log(); console.log('=== Original ==='); @@ -15,15 +14,12 @@ console.log(fileContents) console.log('=== Tokenization ==='); const tokens = tokenize(fileContents); -for(const token of tokens) { - process.stdout.write(token.toString() + ' '); - if(token instanceof $Newline) console.log(); -} +printTokens(tokens); console.log(); console.log('=== Parsing ==='); -const ast = grammar.solveFor(tokens)[0]; +const ast = grammar.solveFor(tokens, { silent: false })[0]; console.log(); console.log('=== AST ==='); @@ -36,7 +32,7 @@ const asmFile = compile(ast) try { console.log(); console.log('=== ASM ==='); - console.log(asmFile); + console.log(colorize(asmFile)); require('fs').writeFileSync('disco_test.asm', asmFile); console.log(); diff --git a/src/earley.ts b/src/earley.ts index 3c42a45..fc12993 100644 --- a/src/earley.ts +++ b/src/earley.ts @@ -1,15 +1,14 @@ -import * as chalk from 'chalk'; - -const rgb2ansi = (r: number, g: number, b: number) => r * 36 + g * 6 + b + 16 -const ansi = (r: number, g = r, b = r) => chalk.ansi256(rgb2ansi(r, g, b)); +import { ansi } from './util/utils'; export abstract class Token { l: number; c: number; + value: string; static terminal: boolean; - constructor(l: number, c: number) { + constructor(l: number, c: number, value: string) { this.l = l; this.c = c; + this.value = value; } static toString() { if(this.terminal) { @@ -35,6 +34,10 @@ export abstract class Token { export class NonTerminal extends Token { static terminal: false = false }; export class Terminal extends Token { static terminal: true = true }; +// these tokens are special, for formatting and generalization reasons. +export class $Newline extends Terminal { } +export class $Whitespace extends Terminal { } + function isTerminal(tokenClass: TokenClass): tokenClass is TerminalTokenClass { return tokenClass.terminal; } @@ -43,9 +46,9 @@ function isNonTerminal(tokenClass: TokenClass): tokenClass is NonTerminalTokenCl return !tokenClass.terminal; } -type TerminalTokenClass = { new(...args: any[]) : Terminal, terminal: true } -type NonTerminalTokenClass = { new(...args: any[]) : NonTerminal, terminal: false } -type TokenClass = TerminalTokenClass | NonTerminalTokenClass; +export type TerminalTokenClass = { new(...args: any[]) : Terminal, terminal: true } +export type NonTerminalTokenClass = { new(...args: any[]) : NonTerminal, terminal: false } +export type TokenClass = TerminalTokenClass | NonTerminalTokenClass; function getTokenClassFromToken(token: Token): TokenClass { return token.constructor as TokenClass; @@ -96,7 +99,7 @@ export class Grammar { this.startingSymbol = startingSymbol; } - solveFor(tokens: Token[]) { + solveFor(tokens: Token[], options: { silent: boolean } = { silent: true }) { const state = new TimeMachine(() => new SingleEarleyState()); const possibleStartingProductions = getProductionsForTokenClass(this.productions, this.startingSymbol) @@ -132,12 +135,12 @@ export class Grammar { // expand all non terminals here - console.log(ansi(3, 3, 0)('s') + ansi(4, 4, 0)(state.currentIndex) + ': ' + this.startingSymbol.toString()); - console.log(state.current.toString(), '\n') + if(!options.silent) console.log(ansi(3, 3, 0)('s') + ansi(4, 4, 0)(state.currentIndex) + ': ' + this.startingSymbol.toString()); + if(!options.silent) console.log(state.current.toString(), '\n') for(const token of tokens) { state.newState(); - console.log(ansi(3, 3, 0)('s') + ansi(4, 4, 0)(state.currentIndex) + ': ' + token.toString()); + if(!options.silent) console.log(ansi(3, 3, 0)('s') + ansi(4, 4, 0)(state.currentIndex) + ': ' + token.toString()); for(const partialMatch of state.previousState.partialMatches) { if(partialMatch.complete) continue; @@ -148,11 +151,15 @@ export class Grammar { } console.assert(state.current.partialMatches.length !== 0, ansi(4, 1, 1)('unexpected token ' + token.toString())) + if(state.current.partialMatches.length === 0) { + if(!options.silent) console.log(); + process.exit(1); + } state.current.partialMatches.forEach(expand); state.current.deduplicate() - console.log(state.current.toString(), '\n') + if(!options.silent) console.log(state.current.toString(), '\n') } const completedResolutions = []; diff --git a/src/grammar.ts b/src/grammar.ts index e16a213..824007d 100644 --- a/src/grammar.ts +++ b/src/grammar.ts @@ -7,21 +7,8 @@ export class $KeywordLParen extends Terminal { } export class $KeywordRParen extends Terminal { } export class $KeywordConst extends Terminal { } -export class $String extends Terminal { - value: string; - constructor(l: number, c: number, value: string) { - super(l, c); - this.value = value; - } -} - -export class $Identifier extends Terminal { - value: string; - constructor(l: number, c: number, value: string) { - super(l, c); - this.value = value; - } -} +export class $String extends Terminal {} +export class $Identifier extends Terminal {} export class $Newline extends Terminal { } diff --git a/src/tokenizer.ts b/src/tokenizer.ts index 09cfa74..b54f5e5 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -28,11 +28,11 @@ export function tokenize(string) { if(token.trim() !== '') { if(keywords.has(token)) { const kwTokenClass = keywords.get(token); - tokens.push(new kwTokenClass(0, 0)); + tokens.push(new kwTokenClass(0, 0, token)); } else if (isStringDelim(token[0])) tokens.push(new $String(0, 0, token.substring(1, token.length - 1))); else if (token === 'NEWLINE') - tokens.push(new $Newline(0, 0)) + tokens.push(new $Newline(0, 0, token)) else tokens.push(new $Identifier(0, 0, token)); resetToken(); @@ -82,4 +82,4 @@ export function tokenize(string) { } return tokens; -} +} \ No newline at end of file diff --git a/src/util/asm/colorize.ts b/src/util/asm/colorize.ts new file mode 100644 index 0000000..410e4bc --- /dev/null +++ b/src/util/asm/colorize.ts @@ -0,0 +1,6 @@ +import grammar from './grammar'; +import tokenize from './tokenizer'; + +export default function colorize(str: string): string { + return grammar.solveFor(tokenize(str))[0]; +} \ No newline at end of file diff --git a/src/util/asm/grammar.ts b/src/util/asm/grammar.ts new file mode 100644 index 0000000..034e6c3 --- /dev/null +++ b/src/util/asm/grammar.ts @@ -0,0 +1,56 @@ +import { Grammar, Production, $Newline } from "../../earley"; +import { ansi } from "../utils"; +import * as t from "./tokens"; + +// add EOF token to basic shit, and always add it to tokenizer +// const grammar = new Grammar(ps, ); +type ansiRGB = [number, number, number]; + +const registerColor: ansiRGB = [5, 3, 0]; +const numberColor: ansiRGB = [4, 4, 0]; +const keywordColor: ansiRGB = [2, 4, 0]; +const instructionColor: ansiRGB = [5, 1, 4]; +const syscallColor: ansiRGB = [5, 1, 5]; +const identifierColor: ansiRGB = [0, 4, 5]; +const pointerColor: ansiRGB = [3, 0, 5]; + +export default new Grammar([ + { left: t.$Program, right: [t.$Line], resolver: (s) => !!s ? s : '' }, + { left: t.$Program, right: [t.$Line, $Newline, t.$Program], resolver: (s, _, ss) => !!s ? s + '\n' + ss : ss}, + + // lines that arent instructions? idk man. + { left: t.$Line, right: [t.$Section, t.$Identifier], + resolver: (_, identifier) => `${ansi(...keywordColor).bold('section')} ${ansi(...identifierColor)(identifier.value)}` }, + { left: t.$Line, right: [t.$Identifier, t.$Db, t.$CompoundString], + resolver: (identifier, _, ns) => ` ${ansi(...identifierColor)(identifier.value)} ${ansi(...keywordColor).bold('db')} ${ns}` }, + { left: t.$Line, right: [t.$Global, t.$Identifier], + resolver: (_, {value}) => ` ${ansi(...keywordColor).bold('global')} ${ansi(...identifierColor)(value)}` }, + { left: t.$Line, right: [t.$Identifier, t.$Colon], resolver: ({value}) => `${ansi(...identifierColor)(value)}:` }, + + // actual instructions + { left: t.$Line, right: [t.$Push, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('push')} ${v}` }, + { left: t.$Line, right: [t.$Pop, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('pop')} ${v}` }, + { left: t.$Line, right: [t.$Cmp, t.$Register, t.$Comma, t.$Value], + resolver: (_, register, __, value) => ` ${ansi(...instructionColor)('cmp')} ${ansi(...registerColor)(register.value)}, ${value}`}, + { left: t.$Line, right: [t.$Je, t.$Identifier], resolver: (_, {value}) => ` ${ansi(...instructionColor)('je')} ${ansi(...identifierColor)(value)}` }, + { left: t.$Line, right: [t.$Jmp, t.$Identifier], resolver: (_, {value}) => ` ${ansi(...instructionColor)('jmp')} ${ansi(...identifierColor)(value)}` }, + { left: t.$Line, right: [t.$Ret], resolver: () => ` ${ansi(...keywordColor).bold('ret')}`}, + { left: t.$Line, right: [t.$Inc, t.$Register], resolver: (_, register) => ` ${ansi(...instructionColor)('inc')} ${ansi(...registerColor)(register.value)}` }, + { left: t.$Line, right: [t.$Syscall], resolver: () => ` ${ansi(...syscallColor).bold('syscall')}` }, + { left: t.$Line, right: [t.$Mov, t.$Register, t.$Comma, t.$Value], + resolver: (_, register, __, value) => ` ${ansi(...instructionColor)('mov')} ${ansi(...registerColor)(register.value)}, ${value}` }, + { left: t.$Line, right: [t.$Mov, t.$Register, t.$Comma, t.$PointerDereference], + resolver: (_, register, __, value) => ` ${ansi(...instructionColor)('mov')} ${ansi(...registerColor)(register.value)}, ${value}` }, + { left: t.$Line, right: [t.$Call, t.$Identifier], resolver: (_, {value}) => ` ${ansi(...keywordColor).bold('call')} ${ansi(...identifierColor)(value)}` }, + + { left: t.$PointerDereference, right: [t.$LBracket, t.$Value, t.$Minus, t.$Number, t.$RBracket], + resolver: (_, v, __, n) => `${ansi(...pointerColor)('[')}${v}-${ansi(...numberColor)(n.value)}${ansi(...pointerColor)(']')}` }, + { left: t.$PointerDereference, right: [t.$LBracket, t.$Value, t.$RBracket], resolver: (_, v) => `${ansi(...pointerColor)('[')}${v}${ansi(...pointerColor)(']')}` }, + + { left: t.$Value, right: [t.$Number], resolver: (v) => ansi(...numberColor)(v.value) }, + { left: t.$Value, right: [t.$Register], resolver: (v) => ansi(...registerColor)(v.value) }, + { left: t.$Value, right: [t.$Identifier], resolver: (v) => ansi(...identifierColor)(v.value) }, + + { left: t.$CompoundString, right: [t.$Number], resolver: (n) => ansi(...numberColor)(n.value) }, + { left: t.$CompoundString, right: [t.$Number, t.$Comma, t.$CompoundString], resolver: (n, _, ns) => ansi(...numberColor)(n.value) + ',' + ns } +], t.$Program); \ No newline at end of file diff --git a/src/util/asm/testInput.ts b/src/util/asm/testInput.ts new file mode 100644 index 0000000..fd3fadc --- /dev/null +++ b/src/util/asm/testInput.ts @@ -0,0 +1,53 @@ +export default `section .data + QVGWSIUM db 84,104,105,115,32,83,116,114,105,110,103,32,105,115,32,67,111,110,116,97,105,110,101,100,32,105,110,32,97,32,118,97,114,105,97,98,108,101,0 + ZYXGJUBF db 84,104,105,115,32,105,115,32,97,32,115,101,99,111,110,100,32,115,116,114,105,110,103,32,105,110,32,97,32,118,97,114,105,97,98,108,101,0 + GPBLFTCX db 104,101,108,108,111,0 + GXMDWCDF db 119,111,114,108,100,0 +section .text + global _start +_start: + push rbp + mov rbp, rsp + push QVGWSIUM + push ZYXGJUBF + mov rdi, GPBLFTCX + call _log + mov rdi, GXMDWCDF + call _log + mov rdi, [rbp - 8] + call _log + mov rdi, [rbp - 16] + call _log + mov rdi, [rbp - 8] + call _log + mov rdi, [rbp - 16] + call _log + mov rsp, rbp + pop rbp + mov rax, 60 + mov rdi, 0 + syscall +_log: + push rdi + mov rbx, 0 +_log_loop: + mov cl, [rdi] + cmp cl, 0 + je _log_loop_end + inc rdi + inc rbx + jmp _log_loop +_log_loop_end: + mov rdx, rbx + mov rax, 1 + mov rdi, 1 + pop rsi + syscall + push 10 + mov rax, 1 + mov rdi, 1 + mov rsi, rsp + mov rdx, 1 + syscall + pop rdi + ret`; \ No newline at end of file diff --git a/src/util/asm/tokenizer.ts b/src/util/asm/tokenizer.ts new file mode 100644 index 0000000..57c6614 --- /dev/null +++ b/src/util/asm/tokenizer.ts @@ -0,0 +1,31 @@ +import { createTokenizer } from "../generalTokenizer"; +import * as tokens from "./tokens"; +import { + $Newline, +} from "./../../earley"; + +export default createTokenizer([ + { match: /^[\r\t ]{1,}$/, token: null }, + { match: 'section', token: tokens.$Section }, + { match: 'db', token: tokens.$Db }, + { match: 'global', token: tokens.$Global }, + { match: '\n', token: $Newline }, + { match: ':', token: tokens.$Colon }, + { match: ',', token: tokens.$Comma }, + { match: '[', token: tokens.$LBracket }, + { match: ']', token: tokens.$RBracket }, + { match: '-', token: tokens.$Minus }, + { match: 'mov', token: tokens.$Mov }, + { match: 'push', token: tokens.$Push }, + { match: 'pop', token: tokens.$Pop }, + { match: 'call', token: tokens.$Call }, + { match: 'syscall', token: tokens.$Syscall }, + { match: 'ret', token: tokens.$Ret }, + { match: 'je', token: tokens.$Je }, + { match: 'jmp', token: tokens.$Jmp }, + { match: 'cmp', token: tokens.$Cmp }, + { match: 'inc', token: tokens.$Inc }, + { match: /^[0-9]{1,}$/, token: tokens.$Number }, + { match: /^(rbp|rsp|rax|rcx|rbx|rdx|rdi|rsi|al|bl|cl|dl|ah|bh|ch|dh|ax|bx|cx|dx|eax|ebx|ecx|edx)$/, token: tokens.$Register }, + { match: /^[A-Za-z._][A-Za-z_]{0,}$/, token: tokens.$Identifier }, +]) \ No newline at end of file diff --git a/src/util/asm/tokens.ts b/src/util/asm/tokens.ts new file mode 100644 index 0000000..dfa9027 --- /dev/null +++ b/src/util/asm/tokens.ts @@ -0,0 +1,36 @@ +import { Terminal, NonTerminal } from "../../earley"; + +// Instruction keywords... +export class $Mov extends Terminal { } +export class $Push extends Terminal { } +export class $Pop extends Terminal { } +export class $Call extends Terminal { } +export class $Syscall extends Terminal { } +export class $Ret extends Terminal { } +export class $Je extends Terminal { } +export class $Inc extends Terminal { } +export class $Cmp extends Terminal { } +export class $Jmp extends Terminal { } + +// keywords +export class $Section extends Terminal { } +export class $Global extends Terminal { } +export class $Db extends Terminal { } +export class $LBracket extends Terminal { } +export class $RBracket extends Terminal { } +export class $Comma extends Terminal { } +export class $Colon extends Terminal { } +export class $Minus extends Terminal { } + +// varying tokens +export class $Identifier extends Terminal { } +export class $String extends Terminal { } +export class $Number extends Terminal { } +export class $Register extends Terminal { } + +// non terminals +export class $Line extends NonTerminal { } +export class $PointerDereference extends NonTerminal { } +export class $Program extends NonTerminal { } +export class $CompoundString extends NonTerminal { } +export class $Value extends NonTerminal { } \ No newline at end of file diff --git a/src/util/asmLogger.ts b/src/util/asmLogger.ts new file mode 100644 index 0000000..66c28de --- /dev/null +++ b/src/util/asmLogger.ts @@ -0,0 +1,3 @@ +export function logASM(asm: string) { + +} \ No newline at end of file diff --git a/src/util/generalTokenizer.ts b/src/util/generalTokenizer.ts new file mode 100644 index 0000000..78dcb7d --- /dev/null +++ b/src/util/generalTokenizer.ts @@ -0,0 +1,72 @@ +import { TerminalTokenClass } from "../earley"; +import { inspect } from 'util'; + +interface TokenMatcher { + match: RegExp | string, + token: TerminalTokenClass +} + +export function createTokenizer(tokenMap: TokenMatcher[]) { + return function tokenize(str: string) { + let tokens = []; + let token = ''; + let line = 1, column = 0; + for(let i = 0; i < str.length; i ++) { + const char = str[i]; + const lookahead = (i < str.length - 1 ? str[i + 1] : null) + column++; + token += char; + + for(const {match: matcher, token: tokenClass} of tokenMap) { + if(typeof matcher === 'string') { + if(matcher === token) { + if(tokenClass !== null) { + tokens.push(new tokenClass(line, column - token.length + 1, token)); + } + token = ''; + } else { + // dw about it + } + } else { + // matcher is regex... + // * note: this only tests if token contains a match, not that it _is_ a match + if(matcher.test(token)) { + if(lookahead) { + if(!matcher.test(token + lookahead)) { + // the next character would not match, so this must be the match. + // ! PS: it is possible that even though this would no longer + // ! match, another matcher could still match more. + // ! in those cases, we would want to expand on this logic + // ! to only match if there are no matches for any matcher + // ! in the lookahead. + // ! in practice this means tracking all possible non lookahead + // ! matches, then testing them for their lookahead afterwards + // ! in another loop, and only tokenizing if you have only one + // ! option, and that option will fail on the lookahead. + if(tokenClass !== null) { + tokens.push(new tokenClass(line, column - token.length + 1, token)); + } + token = ''; + } else { + // the lookahead matches this too, so we should probably hold off + // on tokenizing it... + } + } else { + if(tokenClass !== null) { + tokens.push(new tokenClass(line, column - token.length + 1, token)); + } + token = ''; + } + } + } + } + + if(char === '\n') { + line ++; + column = 0; + } + } + + return tokens; + } +} \ No newline at end of file diff --git a/src/util/tokenTest.ts b/src/util/tokenTest.ts new file mode 100644 index 0000000..7ab47f1 --- /dev/null +++ b/src/util/tokenTest.ts @@ -0,0 +1,15 @@ +import { Terminal } from '../earley'; +import { createTokenizer } from './generalTokenizer'; + +class $Number extends Terminal { } +class $Plus extends Terminal { } +class $Newline extends Terminal { } + +const tokenizer = createTokenizer([ + { match: /^[0-9]{1,}$/, token: $Number }, + { match: /^[\r\t ]{1,}$/, token: null }, + { match: '\n', token: $Newline }, + { match: '+', token: $Plus }, +]) + +console.log(tokenizer("5 + \n 6 ").map(v => v.toString()).join(' ')); \ No newline at end of file diff --git a/src/util/utils.ts b/src/util/utils.ts new file mode 100644 index 0000000..218534c --- /dev/null +++ b/src/util/utils.ts @@ -0,0 +1,13 @@ +import * as chalk from 'chalk'; +import { Token, $Newline } from '../earley'; + +export function printTokens(tokens: Token[]) { + for(const token of tokens) { + process.stdout.write(token.toString() + ' '); + if(token instanceof $Newline) console.log(); + } + console.log(); +} + +const rgb2ansi = (r: number, g: number, b: number) => r * 36 + g * 6 + b + 16 +export const ansi = (r: number, g = r, b = r) => chalk.ansi256(rgb2ansi(r, g, b)); diff --git a/todo.md b/todo.md index 65c22a2..91c02c4 100644 --- a/todo.md +++ b/todo.md @@ -1,6 +1,7 @@ # Todo List -[ ] colorize the assembly output +[x] colorize the assembly output +[ ] rewrite disco tokenizer to the new generalTokenizer [ ] add number support [ ] add comment support [ ] add fixed length array support @@ -9,4 +10,4 @@ [ ] optionally artifically slow down compilation (for fun) [ ] implement some basic maths operations [ ] implement multi-argument invocations -[ ] implement return values \ No newline at end of file +[ ] implement return values