weewoo
parent 0bc4b561c6
commit aaf13743ed
@@ -1,2 +1,3 @@
 link log
-log("Hello World")
+const a = "a"
+log("hello world")

BIN disco_test: Binary file not shown.
@@ -1,15 +1,19 @@
+bits 64
+default rel
 section .data
-EFDNYLFZ db 72,101,108,108,111,32,87,111,114,108,100,0
+GSDGYLUR db 97,0
+STVGNPWI db 104,101,108,108,111,32,119,111,114,108,100,0
 section .text
- global _start
-_start:
+ global _main
+_main:
  push rbp
  mov rbp, rsp
- mov rdi, EFDNYLFZ
+ push qword [rel GSDGYLUR]
+ mov rdi, STVGNPWI
  call _log
  mov rsp, rbp
  pop rbp
- mov rax, 60
+ mov rax, 0x02000001
  mov rdi, 0
  syscall
 _log:

@@ -24,12 +28,12 @@ _log_loop:
  jmp _log_loop
 _log_loop_end:
  mov rdx, rbx
- mov rax, 1
+ mov rax, 0x02000004
  mov rdi, 1
  pop rsi
  syscall
  push 10
- mov rax, 1
+ mov rax, 0x02000004
  mov rdi, 1
  mov rsi, rsp
  mov rdx, 1
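Note on the two mov rax swaps above: syscall numbers are platform-specific. Linux x86-64 uses 60 for exit and 1 for write; XNU (macOS) places BSD syscalls in class 2, i.e. 0x2000000 + number, so exit(1) becomes 0x02000001 and write(4) becomes 0x02000004. A minimal TypeScript sketch of how the compiler could centralize this instead of swapping numbers inline (the syscalls helper is hypothetical, not part of this repo):

// Hypothetical helper: platform-specific syscall numbers, mirroring the
// inline 60 -> 0x02000001 and 1 -> 0x02000004 swaps in the diff above.
const syscalls = process.platform === 'darwin'
  ? { exit: 0x02000001, write: 0x02000004 } // XNU: 0x2000000 | BSD number
  : { exit: 60, write: 1 };                 // Linux x86-64 numbers

// Emit the exit sequence in the same style the compiler's text() section uses.
function exitAsm(code: number): string {
  return ` mov rax, ${syscalls.exit}\n mov rdi, ${code}\n syscall\n`;
}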
@@ -23,7 +23,7 @@ const localVariables = new Map();
 const sections = {
   preamble() {
     if(process.platform === 'darwin') {
-      return ' global _main\n';
+      return 'bits 64\ndefault rel\n';
     } else {
       return '';
     }
@@ -37,10 +37,21 @@ const sections = {
   },
   text() {
     if(process.platform === 'darwin') {
-      return 'section .text\n_main:\n push rbp\n mov rbp, rsp\n '
-        + statements.join('\n ')
-        + '\n mov rsp, rbp\n pop rbp\n mov rax, 0x02000001\n mov rdi, 0\n syscall\n'
-        + [...linkedLibraries.values()].map(({asmName, asm}) => asmName + ':\n' + asm).join('\n')
+      return (
+        'section .text\n' +
+        ' global _main\n' +
+        '_main:\n' +
+        ' push rbp\n' +
+        ' mov rbp, rsp\n' +
+        statements.map(v => ` ${v}\n`).join('') +
+        ' mov rsp, rbp\n' +
+        ' pop rbp\n' +
+        ' mov rax, 0x02000001\n' +
+        ' mov rdi, 0\n' +
+        ' syscall\n' + [...linkedLibraries.values()]
+          .map(({asmName, asm}) => asmName + ':\n' + asm)
+          .join('\n')
+      );
     } else {
       return 'section .text\n global _start\n_start:\n push rbp\n mov rbp, rsp\n '
         + statements.join('\n ')

@@ -106,7 +117,10 @@ function compileVariable(name, value) {
   });
   if(value.type === 'string') {
     const variableName = compileStringLiteral(value.value);
-    statements.push('push ' + variableName)
+    if(process.platform === 'darwin')
+      statements.push(`push qword [rel ${variableName}]`);
+    else
+      statements.push('push ' + variableName);
   } else {
     console.error('dont know how to set a variable to a non string lol')
   }
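Note on push qword [rel ...]: 64-bit Mach-O binaries are position-independent by default, so a .data symbol's absolute address cannot be encoded as the 32-bit immediate that push GSDGYLUR relies on under Linux. With default rel in the preamble, [rel name] assembles to RIP-relative addressing. Beware that the two forms are not equivalent, though: push name pushes the symbol's address, while push qword [rel name] pushes the eight bytes stored at it. A sketch distilling the compileVariable branch above (helper name hypothetical):

// Hypothetical helper extracted from the compileVariable change above.
function pushVariable(variableName: string): string {
  return process.platform === 'darwin'
    // RIP-relative memory operand: pushes the qword stored at the symbol.
    ? `push qword [rel ${variableName}]`
    // Absolute immediate: pushes the symbol's address (needs -no_pie to link).
    : `push ${variableName}`;
}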
@@ -32,8 +32,8 @@ const asmFile = compile(ast)
 try {
   console.log();
   console.log('=== ASM ===');
-  console.log(colorize(asmFile));
   require('fs').writeFileSync('disco_test.asm', asmFile);
+  console.log(colorize(asmFile));

   console.log();
   console.log('=== nasm ===');

@@ -59,6 +59,7 @@ function ld() {
   require('child_process').execSync([
     'ld', 'disco_test.o',
     '-o', 'disco_test',
+    '-no_pie',
     '-macosx_version_min', '11.0',
    '-L', '/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib',
    '-lSystem'
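For context, the full macOS pipeline this file drives is assemble-then-link. A sketch in the repo's own execSync style, with the nasm invocation assumed (-f macho64 is nasm's 64-bit Mach-O output format; the ld flags are the ones from the diff above):

import { execSync } from 'child_process';

// Assemble the generated assembly to a 64-bit Mach-O object, then link it.
execSync('nasm -f macho64 disco_test.asm -o disco_test.o');
execSync([
  'ld', 'disco_test.o',
  '-o', 'disco_test',
  '-no_pie',              // keep absolute 'push name' operands linkable
  '-macosx_version_min', '11.0',
  '-L', '/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib',
  '-lSystem',             // macOS effectively requires linking libSystem
].join(' '));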
@@ -26,8 +26,12 @@ export default new Grammar([
   { left: t.$Line, right: [t.$Global, t.$Identifier],
    resolver: (_, {value}) => ` ${ansi(...keywordColor).bold('global')} ${ansi(...identifierColor)(value)}` },
   { left: t.$Line, right: [t.$Identifier, t.$Colon], resolver: ({value}) => `${ansi(...identifierColor)(value)}:` },
+  { left: t.$Line, right: [t.$Bits, t.$Number], resolver: (_, n) => `${ansi(...keywordColor).bold('bits')} ${ansi(...numberColor)(n.value)}`},
+  { left: t.$Line, right: [t.$Default, t.$Rel], resolver: () => `${ansi(...keywordColor).bold('default')} ${ansi(...keywordColor).bold('rel')}`},

   // actual instructions
+  { left: t.$Line, right: [t.$Push, t.$DataSize, t.$LBracket, t.$Rel, t.$Identifier, t.$RBracket],
+   resolver: (_, size, __, ___, identifier) => ` ${ansi(...instructionColor)('push')} ${size} ${ansi(...pointerColor)('[')}${ansi(...keywordColor).bold('rel')} ${ansi(...identifierColor)(identifier.value)}${ansi(...pointerColor)(']')}` },
   { left: t.$Line, right: [t.$Push, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('push')} ${v}` },
   { left: t.$Line, right: [t.$Pop, t.$Value], resolver: (_, v) => ` ${ansi(...instructionColor)('pop')} ${v}` },
   { left: t.$Line, right: [t.$Cmp, t.$Register, t.$Comma, t.$Value],

@@ -52,5 +56,10 @@ export default new Grammar([
   { left: t.$Value, right: [t.$Identifier], resolver: (v) => ansi(...identifierColor)(v.value) },

   { left: t.$CompoundString, right: [t.$Number], resolver: (n) => ansi(...numberColor)(n.value) },
-  { left: t.$CompoundString, right: [t.$Number, t.$Comma, t.$CompoundString], resolver: (n, _, ns) => ansi(...numberColor)(n.value) + ',' + ns }
+  { left: t.$CompoundString, right: [t.$Number, t.$Comma, t.$CompoundString], resolver: (n, _, ns) => ansi(...numberColor)(n.value) + ',' + ns },
+
+  { left: t.$DataSize, right: [t.$Word], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$DWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$QWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
+  { left: t.$DataSize, right: [t.$OWord], resolver: (v) => ansi(...keywordColor).bold(v.value) },
 ], t.$Program);
@@ -4,28 +4,62 @@ import {
   $Newline,
 } from "./../../earley";

-export default createTokenizer([
-  { match: /^[\r\t ]{1,}$/, token: null },
-  { match: 'section', token: tokens.$Section },
-  { match: 'db', token: tokens.$Db },
-  { match: 'global', token: tokens.$Global },
-  { match: '\n', token: $Newline },
-  { match: ':', token: tokens.$Colon },
-  { match: ',', token: tokens.$Comma },
-  { match: '[', token: tokens.$LBracket },
-  { match: ']', token: tokens.$RBracket },
-  { match: '-', token: tokens.$Minus },
-  { match: 'mov', token: tokens.$Mov },
-  { match: 'push', token: tokens.$Push },
-  { match: 'pop', token: tokens.$Pop },
-  { match: 'call', token: tokens.$Call },
-  { match: 'syscall', token: tokens.$Syscall },
-  { match: 'ret', token: tokens.$Ret },
-  { match: 'je', token: tokens.$Je },
-  { match: 'jmp', token: tokens.$Jmp },
-  { match: 'cmp', token: tokens.$Cmp },
-  { match: 'inc', token: tokens.$Inc },
-  { match: /^[0-9]{1,}$/, token: tokens.$Number },
-  { match: /^(rbp|rsp|rax|rcx|rbx|rdx|rdi|rsi|al|bl|cl|dl|ah|bh|ch|dh|ax|bx|cx|dx|eax|ebx|ecx|edx)$/, token: tokens.$Register },
-  { match: /^[A-Za-z._][A-Za-z_]{0,}$/, token: tokens.$Identifier },
+const asmTokenizer = createTokenizer([
+  // whitespaces
+  [ /^[\r\t ]{1,}/, null],
+  [ /^\n/, $Newline],
+
+  // keywords
+  [ /^section/, tokens.$Section],
+  [ /^db/, tokens.$Db],
+  [ /^global/, tokens.$Global],
+  [ /^bits/, tokens.$Bits],
+  [ /^default/, tokens.$Default],
+  [ /^rel/, tokens.$Rel],
+  [ /^word/, tokens.$Word],
+  [ /^dword/, tokens.$DWord],
+  [ /^qword/, tokens.$QWord],
+  [ /^oword/, tokens.$OWord],
+
+  // punctuation
+  [ /^:/, tokens.$Colon],
+  [ /^,/, tokens.$Comma],
+  [ /^\[/, tokens.$LBracket],
+  [ /^\]/, tokens.$RBracket],
+  [ /^-/, tokens.$Minus],
+
+  // instructions
+  [ /^mov/, tokens.$Mov],
+  [ /^push/, tokens.$Push],
+  [ /^pop/, tokens.$Pop],
+  [ /^syscall/, tokens.$Syscall],
+  [ /^ret/, tokens.$Ret],
+  [ /^je/, tokens.$Je],
+  [ /^jmp/, tokens.$Jmp],
+  [ /^cmp/, tokens.$Cmp],
+  [ /^inc/, tokens.$Inc],
+
+  // pseudo-instructions
+  [ /^call/, tokens.$Call],
+
+  // 8 bit general purpose registers...
+  [ /^(al|ah|bl|bh|cl|ch|dl|dh)/, tokens.$Register ],
+  // 16 bit general purpose registers...
+  [ /^(ax|bx|cx|dx)/, tokens.$Register ],
+  // 32 bit general purpose registers...
+  [ /^(eax|ebx|ecx|edx)/, tokens.$Register ],
+  // 64 bit general purpose registers...
+  [ /^(rax|rbx|rcx|rdx)/, tokens.$Register ],
+  // other registers, idk.
+  [ /^(rbp|rsp|rdi|rsi)/, tokens.$Register],
+
+  // user-defined
+  [ /^[0-9]{1,}/, tokens.$Number],
+  [ /^0x[0-9A-Fa-f]{1,}/, tokens.$Number],
+  [ /^[A-Za-z._][A-Za-z_]{0,}/, tokens.$Identifier]
 ])
+export default asmTokenizer;
+
+import input from './testInput';
+import { printTokens } from "../utils";
+printTokens(asmTokenizer(input));
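A behavioral note on the new token map: every pattern is anchored with ^, and the rewritten generalTokenizer below always takes the longest match, breaking ties by map order. That is what keeps keywords and identifiers from fighting. An illustrative example with assumed output:

import asmTokenizer from './asmTokenizer'; // path assumed

// 'section' matches both /^section/ and the identifier pattern at length 7;
// the tie goes to $Section because it appears earlier in the map.
asmTokenizer('section .text'); // -> $Section, $Identifier('.text')

// As an identifier, 'sections' is one character longer, so maximal munch
// yields a single $Identifier instead of $Section plus a stray 's'.
asmTokenizer('sections');      // -> $Identifier('sections')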
@@ -21,6 +21,13 @@ export class $RBracket extends Terminal { }
 export class $Comma extends Terminal { }
 export class $Colon extends Terminal { }
 export class $Minus extends Terminal { }
+export class $Bits extends Terminal { }
+export class $Default extends Terminal { }
+export class $Rel extends Terminal { }
+export class $Word extends Terminal { }
+export class $DWord extends Terminal { }
+export class $QWord extends Terminal { }
+export class $OWord extends Terminal { }

 // varying tokens
 export class $Identifier extends Terminal { }

@@ -34,3 +41,4 @@ export class $PointerDereference extends NonTerminal { }
 export class $Program extends NonTerminal { }
 export class $CompoundString extends NonTerminal { }
 export class $Value extends NonTerminal { }
+export class $DataSize extends NonTerminal { }
@@ -1,72 +1,63 @@
-import { TerminalTokenClass } from "../earley";
+import { inspect } from 'util';
+import { Terminal, TerminalTokenClass } from "../earley";

-interface TokenMatcher {
-  match: RegExp | string,
-  token: TerminalTokenClass
+type TokenMatcher = [ RegExp, TerminalTokenClass ];
+
+interface Match {
+  regex: RegExp;
+  length: number;
+  tokenClass: TerminalTokenClass;
+  matchedString: string;
 }
+
+// this is kinda bullshit lol exec is a dumb method.
+function getFirstMatch(r: RegExp, str: string): [number, string] {
+  let matches = str.match(r);
+  if(matches === null) return [-1, ''];
+  return [matches.index, matches[0]];
+}

 export function createTokenizer(tokenMap: TokenMatcher[]) {
-  return function tokenize(str: string) {
-    let tokens = [];
-    let token = '';
-    let line = 1, column = 0;
-    for(let i = 0; i < str.length; i ++) {
-      const char = str[i];
-      const lookahead = (i < str.length - 1 ? str[i + 1] : null)
-      column++;
-      token += char;
-
-      for(const {match: matcher, token: tokenClass} of tokenMap) {
-        if(typeof matcher === 'string') {
-          if(matcher === token) {
-            if(tokenClass !== null) {
-              tokens.push(new tokenClass(line, column - token.length + 1, token));
-            }
-            token = '';
-          } else {
-            // dw about it
-          }
-        } else {
-          // matcher is regex...
-          // * note: this only tests if token contains a match, not that it _is_ a match
-          if(matcher.test(token)) {
-            if(lookahead) {
-              if(!matcher.test(token + lookahead)) {
-                // the next character would not match, so this must be the match.
-                // ! PS: it is possible that even though this would no longer
-                // ! match, another matcher could still match more.
-                // ! in those cases, we would want to expand on this logic
-                // ! to only match if there are no matches for any matcher
-                // ! in the lookahead.
-                // ! in practice this means tracking all possible non lookahead
-                // ! matches, then testing them for their lookahead afterwards
-                // ! in another loop, and only tokenizing if you have only one
-                // ! option, and that option will fail on the lookahead.
-                if(tokenClass !== null) {
-                  tokens.push(new tokenClass(line, column - token.length + 1, token));
-                }
-                token = '';
-              } else {
-                // the lookahead matches this too, so we should probably hold off
-                // on tokenizing it...
-              }
-            } else {
-              if(tokenClass !== null) {
-                tokens.push(new tokenClass(line, column - token.length + 1, token));
-              }
-              token = '';
-            }
-          }
-        }
-      }
-      if(char === '\n') {
-        line ++;
-        column = 0;
-      }
-    }
-
-    return tokens;
+  return function tokenize(str: string, l = 1, c = 1): Terminal[] {
+
+    const possibleMatches: Match[] = tokenMap
+      .map(([regex, tokenClass]) => {
+        const [index, match] = getFirstMatch(regex, str);
+        if(index === -1) return null;
+        return {
+          regex,
+          tokenClass,
+          length: match.length,
+          matchedString: match
+        }
+      })
+      .filter(v => !!v);
+
+    const longestLength = possibleMatches
+      .map(v => v.length)
+      .reduce((a, v) => a > v ? a : v, -Infinity);
+
+    const longestMatches = possibleMatches
+      .filter(v => v.length === longestLength);
+
+    console.assert(longestMatches.length > 0, 'No token matches found');
+
+    const [{tokenClass, matchedString}] = longestMatches;
+    const length = matchedString.length;
+    const token = tokenClass ? new tokenClass(l, c, matchedString) : null;
+
+    const rest = str.substring(length);
+
+    if(rest === '') return [ token ];
+
+    for(const char of matchedString) {
+      c ++;
+      if(char === '\n') {
+        l ++;
+        c = 1;
+      }
+    }
+
+    return token ? [token, ...tokenize(rest, l, c)] : tokenize(rest, l, c);
   }
 }
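A minimal usage sketch of the rewritten createTokenizer (token classes and paths assumed): it scores every pattern against the head of the input, keeps the longest match, emits a token unless the class is null, and recurses on the remainder while threading line/column through l and c.

import { createTokenizer } from './generalTokenizer'; // path assumed
import { $Number, $Identifier } from './tokens';      // token classes assumed

const tokenize = createTokenizer([
  [/^[\r\t ]{1,}/, null],          // whitespace: consumed but not emitted
  [/^[0-9]{1,}/, $Number],
  [/^[A-Za-z_]{1,}/, $Identifier],
]);

// '123 abc' -> $Number('123'), $Identifier('abc'); the whitespace match
// advances the input without producing a token.
console.log(tokenize('123 abc'));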
todo.md
@@ -1,15 +1,40 @@
 # Todo List

-[x] colorize the assembly output
-[ ] rewrite disco tokenizer to the new generalTokenizer
-[ ] add an EOF token to earley, and yknow, add it to the tokenizer.
-[ ] add number support
-[ ] add comment support
-[ ] add fixed length array support
-[ ] organize AST elements into classes
-[ ] better logging of the AST
-[ ] optionally artificially slow down compilation (for fun)
-[ ] implement functions
-[ ] implement some basic maths operations
-[ ] implement multi-argument invocations
-[ ] implement return values
+- [x] colorize the assembly output
+- [x] create generalTokenizer to make tokenization generic
+- [ ] rewrite disco tokenizer to the new generalTokenizer
+- [ ] add an EOF token to earley, and yknow, add it to the tokenizer.
+- [ ] add number support
+- [ ] add comment support
+- [ ] add fixed length array support
+- [ ] organize AST elements into classes
+- [ ] better logging of the AST
+- [ ] optionally artificially slow down compilation (for fun)
+- [ ] implement functions
+- [ ] implement some basic maths operations
+- [ ] implement multi-argument invocations
+- [ ] implement return values
+- [ ] write a regex compiler
+- [ ] write log in disco. create a library for just doing syscalls. the rest can be done in disco
+
+# Changelog
+
+- fixed macos compilation to use relative addressing (i think)
+- fixed a bug in the general tokenizer that failed to match some tokens properly.
+
+---
+
+- create generalized tokenizer
+- implement assembly language grammar for syntax highlighting
+- create a vscode extension for syntax highlighting
+
+---
+
+- compile disco code to assembly as POC
+- create an AST for disco code
+- implement earley grammar for disco including:
+  - linking library functions
+  - calling functions
+  - string literals
+  - string variables
+- created earley parser