From 377c008ebeb5894586397eab9ca7e059bb735b67 Mon Sep 17 00:00:00 2001
From: Henrik Hautakoski
Date: Sat, 10 Nov 2018 12:05:10 +0100
Subject: [PATCH] asm: lexer implementation.

---
 .gitignore     |   1 +
 Makefile       |  14 ++-
 src/as/as.c    |  73 +++++++++++++
 src/as/lexer.c | 277 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/as/lexer.h | 102 ++++++++++++++++++
 5 files changed, 463 insertions(+), 4 deletions(-)
 create mode 100644 src/as/as.c
 create mode 100644 src/as/lexer.c
 create mode 100644 src/as/lexer.h

diff --git a/.gitignore b/.gitignore
index 563e7eb..b979155 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.o
 m16vm
+/as
diff --git a/Makefile b/Makefile
index 422bd6f..45518e7 100644
--- a/Makefile
+++ b/Makefile
@@ -3,13 +3,19 @@ CC = gcc
 CFLAGS = -Iinclude -DMEM_SIZE=32 -DM16_DEBUG_MEM
 LD = $(CC)
 
-VM = m16vm
+PROGRAMS = m16vm as
 
-$(VM) : src/vm.o src/cpu.o src/mm.o src/instr_decode.o src/syscall.o src/program.o
-	$(LD) $(LDFLAGS)-o $@ $^
+all: $(PROGRAMS)
+
+m16vm : src/vm.o src/cpu.o src/mm.o src/instr_decode.o src/syscall.o src/program.o
+	$(LD) $(LDFLAGS) -o $@ $^
+
+as : src/as/as.o src/as/lexer.o
+	$(LD) $(LDFLAGS) -o $@ $^
 
 clean :
 	$(RM) src/*.o
+	$(RM) src/as/*.o
 
 distclean : clean
-	$(RM) $(VM)
+	$(RM) $(PROGRAMS)
diff --git a/src/as/as.c b/src/as/as.c
new file mode 100644
index 0000000..3e2052e
--- /dev/null
+++ b/src/as/as.c
@@ -0,0 +1,73 @@
+/* as.c
+ *
+ *   Copyright (C) 2012   Henrik Hautakoski
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *   MA 02110-1301, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "lexer.h"
+
+/**
+ * Print a short usage message to stderr.
+ *
+ * Returns -1 so that main() can return the result directly.
+ */
+int usage(char *program) {
+
+    fprintf(stderr, "Usage: %s <file>\n", program);
+    return -1;
+}
+
+/**
+ * Tokenize the file named by argv[1] and dump every token to stdout.
+ *
+ * Returns 0 on success and -1 on a usage error, if the file can not
+ * be opened or if the lexer rejects the input.
+ */
+int main(int argc, char **argv) {
+
+    FILE *fd;
+    struct lexer lex;
+    int rc = 0;
+
+    if (argc < 2)
+        return usage(argv[0]);
+
+    fd = fopen(argv[1], "r");
+    if (fd == NULL) {
+        perror(argv[1]);
+        return -1;
+    }
+
+    lexer_init(&lex, fd);
+
+    do {
+        if (lexer_get_next(&lex) < 0) {
+            // The lexer has already reported the error on stderr.
+            rc = -1;
+            break;
+        }
+
+        lexer_print_token(&lex.token);
+    } while(lex.token.type != TOKEN_EOI);
+
+    // The token dump does not end its last line by itself.
+    putchar('\n');
+
+    fclose(fd);
+    return rc;
+}
diff --git a/src/as/lexer.c b/src/as/lexer.c
new file mode 100644
index 0000000..7a1f7aa
--- /dev/null
+++ b/src/as/lexer.c
@@ -0,0 +1,277 @@
+
+#include <stdio.h>
+#include <string.h>
+#include "lexer.h"
+
+/**
+ * macros for the grammar.
+ */
+
+// Digits are defined as [0-9]
+#define number(x) ((x) >= '0' && (x) <= '9')
+
+// A number may additionally start with '-'
+#define first_number(x) (number(x) || (x) == '-' )
+
+// First character in strings can be [a-z][A-Z] or '_'
+#define first_string(x) \
+    ( ((x) >= 'a' && (x) <= 'z') \
+    || ((x) >= 'A' && (x) <= 'Z') \
+    || (x) == '_' )
+
+// All characters after the first can also be digits. A trailing ':'
+// is not part of the string, read_string() handles it separately.
+#define string(x) \
+    (first_string(x) || number(x))
+
+#define space(x) ((x) == ' ' || (x) == '\t' || (x) == '\r')
+
+// Size of the buffer used to match identifiers, including the '\0'.
+#define STRING_MAX 64
+
+// Highest valid register index.
+#define REG_MAX 0xF
+
+// Range of values that can be stored in the 16-bit token value.
+#define NUMBER_MIN (-0x8000)
+#define NUMBER_MAX 0xFFFF
+
+// Instruction mnemonics and the token type each one maps to.
+static const struct {
+    const char *name;
+    enum token_type type;
+} opcodes[] = {
+    { "noop", TOKEN_OPCODE_NOOP },
+    { "add",  TOKEN_OPCODE_ADD  },
+    { "movl", TOKEN_OPCODE_MOVL },
+    { "movh", TOKEN_OPCODE_MOVH },
+    { "ld",   TOKEN_OPCODE_LD   },
+    { "sw",   TOKEN_OPCODE_SW   },
+    { "beq",  TOKEN_OPCODE_BEQ  },
+    { "jmp",  TOKEN_OPCODE_JMP  },
+    { "jr",   TOKEN_OPCODE_JR   },
+    { "int",  TOKEN_OPCODE_INT  },
+};
+
+
+/**
+ * Helper functions
+ */
+
+/**
+ * Skip blanks and ';' comments and return the first character that
+ * starts a token. A newline is returned even inside a comment since
+ * it is a token of its own. Returns EOF at end of input.
+ */
+static int read_next(struct lexer *lex) {
+
+    int c, comment = 0;
+
+    while((c = fgetc(lex->fp)) != EOF) {
+
+        if (c == '\n')
+            break;
+
+        if (comment)
+            continue;
+
+        if (c == ';') {
+            comment = 1;
+        } else if (!space(c)) {
+            break;
+        }
+    }
+    return c;
+}
+
+/**
+ * Read an optionally negative decimal number from fp.
+ *
+ * '-' is only accepted as the very first character, so "12-3" yields
+ * 12 and leaves "-3" in the stream. The first character that is not
+ * part of the number is pushed back. *has_digits is set to non-zero
+ * if at least one digit was read.
+ *
+ * Once the magnitude exceeds NUMBER_MAX further digits are consumed
+ * but ignored, so the result stays out of range without overflowing.
+ */
+static int read_number(FILE *fp, int *has_digits) {
+
+    int c, first = 1, neg = 0, val = 0;
+
+    *has_digits = 0;
+
+    while((c = fgetc(fp)) != EOF) {
+        if (first && c == '-') {
+            first = 0;
+            neg = 1;
+            continue;
+        }
+        first = 0;
+
+        if (!number(c)) {
+            ungetc(c, fp);
+            break;
+        }
+        if (val <= NUMBER_MAX)
+            val = (val * 10) + (c - '0');
+        *has_digits = 1;
+    }
+
+    return neg ? -val : val;
+}
+
+/**
+ * Read an identifier from fp and classify it.
+ *
+ * Returns TOKEN_LABEL_DECL if the identifier is directly followed by
+ * ':' (which is consumed), the matching TOKEN_OPCODE_* if it is an
+ * instruction mnemonic and TOKEN_LABEL otherwise. The identifier
+ * text itself is not stored in the token.
+ */
+static int read_string(FILE *fp) {
+
+    int c, label_decl = 0;
+    size_t i = 0;
+    char buf[STRING_MAX];
+
+    while((c = fgetc(fp)) != EOF) {
+
+        if (string(c)) {
+            // Characters that do not fit are still consumed so that
+            // a long identifier stays a single token, but they are
+            // dropped instead of being written past the end of buf.
+            if (i < sizeof(buf) - 1)
+                buf[i++] = c;
+        } else {
+            if (c == ':') {
+                label_decl = 1;
+            } else {
+                ungetc(c, fp);
+            }
+            break;
+        }
+    }
+    buf[i] = '\0';
+
+    if (label_decl)
+        return TOKEN_LABEL_DECL;
+
+    for (i = 0; i < sizeof(opcodes) / sizeof(opcodes[0]); i++) {
+        if (!strcmp(opcodes[i].name, buf))
+            return opcodes[i].type;
+    }
+    return TOKEN_LABEL;
+}
+
+/**
+ * Exposed functions
+ */
+
+void lexer_init(struct lexer *lex, FILE *fp) {
+
+    lex->lineno = 1;
+    lex->fp = fp;
+    lex->token.type = TOKEN_EOI;
+}
+
+int lexer_get_next(struct lexer *lex) {
+
+    int num, has_digits;
+    int ch = read_next(lex);
+
+    // The newline token belongs to the line it terminates, so the
+    // counter is bumped when the token after it is read.
+    if (lex->token.type == TOKEN_EOL)
+        lex->lineno++;
+
+    switch(ch) {
+    case EOF : lex->token.type = TOKEN_EOI;
+        break;
+    case '\n' :
+        lex->token.type = TOKEN_EOL;
+        break;
+    case ',' : lex->token.type = TOKEN_ARG_SEP;
+        break;
+    case '$' :
+        lex->token.type = TOKEN_REG;
+        num = read_number(lex->fp, &has_digits);
+        // Register indices range from 0 to REG_MAX.
+        if (!has_digits || num < 0 || num > REG_MAX) {
+            fprintf(stderr, "ERROR: Invalid register value '%i' on line: %i\n", num, lex->lineno);
+            return -1;
+        }
+        lex->token.value.n = num;
+        break;
+    default:
+        if (first_number(ch)) {
+            ungetc(ch, lex->fp);
+            lex->token.type = TOKEN_NUMBER;
+            num = read_number(lex->fp, &has_digits);
+            if (!has_digits || num < NUMBER_MIN || num > NUMBER_MAX) {
+                fprintf(stderr, "ERROR: Invalid number on line: %i\n", lex->lineno);
+                return -1;
+            }
+            // Negative values wrap to their 16-bit two's complement.
+            lex->token.value.n = (uint16_t) num;
+        } else if (first_string(ch)) {
+            ungetc(ch, lex->fp);
+            lex->token.type = read_string(lex->fp);
+        } else {
+            fprintf(stderr, "ERROR: Invalid character '%c' on line: %i\n", ch, lex->lineno);
+            return -1;
+        }
+    }
+
+    lex->token.lineno = lex->lineno;
+    return 0;
+}
+
+void lexer_print_token(struct token *token) {
+
+    // Line number of the previously printed token.
+    static int lineno = 0;
+
+    if (token->lineno != lineno) {
+        lineno = token->lineno;
+        printf("\n%i: ", lineno);
+    }
+
+    switch(token->type) {
+    case TOKEN_OPCODE_NOOP : printf(" [OP NOOP] ");
+        break;
+    case TOKEN_OPCODE_ADD : printf(" [OP ADD] ");
+        break;
+    case TOKEN_OPCODE_MOVL : printf(" [OP MOVL] ");
+        break;
+    case TOKEN_OPCODE_MOVH : printf(" [OP MOVH] ");
+        break;
+    case TOKEN_OPCODE_LD : printf(" [OP LD] ");
+        break;
+    case TOKEN_OPCODE_SW : printf(" [OP SW] ");
+        break;
+    case TOKEN_OPCODE_BEQ : printf(" [OP BEQ] ");
+        break;
+    case TOKEN_OPCODE_JMP : printf(" [OP JMP] ");
+        break;
+    case TOKEN_OPCODE_JR : printf(" [OP JR] ");
+        break;
+    case TOKEN_OPCODE_INT : printf(" [OP INT] ");
+        break;
+    case TOKEN_LABEL : printf(" [LABEL] ");
+        break;
+    case TOKEN_LABEL_DECL : printf(" [LABEL DECL] ");
+        break;
+    case TOKEN_REG : printf(" [REG %i] ", token->value.n);
+        break;
+    case TOKEN_ARG_SEP : printf(" [SEP] ");
+        break;
+    case TOKEN_NUMBER : printf(" [NUM %i] ", token->value.n);
+        break;
+    case TOKEN_EOI : printf(" [EOI] ");
+        break;
+    case TOKEN_EOL : printf(" [EOL] ");
+        break;
+    default: printf(" [U] ");
+    }
+}
diff --git a/src/as/lexer.h b/src/as/lexer.h
new file mode 100644
index 0000000..e2a5319
--- /dev/null
+++ b/src/as/lexer.h
@@ -0,0 +1,102 @@
+/* lexer.h
+ *
+ *   Copyright (C) 2018   Henrik Hautakoski
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *   MA 02110-1301, USA.
+ */
+#ifndef ASM_LEXER_H
+#define ASM_LEXER_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+/**
+ * All token types.
+ */
+enum token_type {
+    TOKEN_EOI = -1, // End of input
+    TOKEN_EOL, // Newline
+    TOKEN_OPCODE_NOOP,
+    TOKEN_OPCODE_ADD,
+    TOKEN_OPCODE_MOVL,
+    TOKEN_OPCODE_MOVH,
+    TOKEN_OPCODE_LD,
+    TOKEN_OPCODE_SW,
+    TOKEN_OPCODE_BEQ,
+    TOKEN_OPCODE_JMP,
+    TOKEN_OPCODE_JR,
+    TOKEN_OPCODE_INT,
+    TOKEN_LABEL,
+    TOKEN_LABEL_DECL,
+    TOKEN_REG,
+    TOKEN_NUMBER,
+    TOKEN_ARG_SEP
+};
+
+/**
+ * Token structure.
+ *
+ * Holds information about a single token.
+ */
+struct token {
+    // Line number where the token was extracted from.
+    uint16_t lineno;
+
+    enum token_type type;
+
+    /*
+     * Token value, depending on type
+     * this can be a string or unsigned short.
+     *
+     * 'n' holds the register index of a TOKEN_REG and the value
+     * of a TOKEN_NUMBER. The lexer does not write 's'.
+     */
+    union {
+        uint16_t n;
+        char s[32];
+    } value;
+};
+
+/**
+ * Lexer state
+ */
+struct lexer {
+    uint16_t lineno; // current line number
+    FILE * fp; // File being lexed.
+    struct token token; // Current token
+};
+
+/**
+ * Initialize the lexer with a file pointer to the file
+ * that should be lexed.
+ */
+void lexer_init(struct lexer *lex, FILE *fp);
+
+/**
+ * Advance the lexer to the next token.
+ *
+ * The token is stored in lex->token. Returns 0 on success and -1
+ * if the input is invalid, in which case an error message has
+ * been printed to stderr.
+ */
+int lexer_get_next(struct lexer *lex);
+
+/**
+ * For debugging, prints the token to standard output.
+ */
+void lexer_print_token(struct token *token);
+
+#endif /* ASM_LEXER_H */