From 377c008ebeb5894586397eab9ca7e059bb735b67 Mon Sep 17 00:00:00 2001
From: Henrik Hautakoski <henrik.hautakoski@gmail.com>
Date: Sat, 10 Nov 2018 12:05:10 +0100
Subject: [PATCH] asm: lexer implementation.

---
 .gitignore     |   1 +
 Makefile       |  14 +++-
 src/as/as.c    |  54 ++++++++++++
 src/as/lexer.c | 221 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/as/lexer.h |  95 +++++++++++++++++++++
 5 files changed, 381 insertions(+), 4 deletions(-)
 create mode 100644 src/as/as.c
 create mode 100644 src/as/lexer.c
 create mode 100644 src/as/lexer.h
diff --git a/.gitignore b/.gitignore
index 563e7eb..b979155 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.o
 m16vm
+/as
diff --git a/Makefile b/Makefile
index 422bd6f..45518e7 100644
--- a/Makefile
+++ b/Makefile
@@ -3,13 +3,19 @@ CC = gcc
 CFLAGS = -Iinclude -DMEM_SIZE=32 -DM16_DEBUG_MEM
 LD = $(CC)
 
-VM = m16vm
+PROGRAMS = m16vm as
 
-$(VM) : src/vm.o src/cpu.o src/mm.o src/instr_decode.o src/syscall.o src/program.o
-	$(LD) $(LDFLAGS)-o $@ $^
+all: $(PROGRAMS)
+
+m16vm : src/vm.o src/cpu.o src/mm.o src/instr_decode.o src/syscall.o src/program.o
+	$(LD) $(LDFLAGS) -o $@ $^
+
+as : src/as/as.o src/as/lexer.o
+	$(LD) $(LDFLAGS) -o $@ $^
 
 clean :
 	$(RM) src/*.o
+	$(RM) src/as/*.o
 
 distclean : clean
-	$(RM) $(VM)
+	$(RM) $(PROGRAMS)
diff --git a/src/as/as.c b/src/as/as.c
new file mode 100644
index 0000000..3e2052e
--- /dev/null
+++ b/src/as/as.c
@@ -0,0 +1,54 @@
+/* as.c
+ *
+ *   Copyright (C) 2012   Henrik Hautakoski <henrik@fiktivkod.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *   MA 02110-1301, USA.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "lexer.h"
+
+// Prints the command synopsis to stderr and returns -1 so callers can exit with it.
+int usage(char *program) {
+	fprintf(stderr, "Usage: %s <file>\n", program);
+	return -1;
+}
+
+// Prints every token of the file named by argv[1]; returns 0, or -1 on any error.
+int main(int argc, char **argv) {
+	FILE *fd;
+	struct lexer lex;
+	int rc;
+
+	if (argc < 2)
+		return usage(argv[0]);
+
+	fd = fopen(argv[1], "r");
+	if (fd == NULL) {
+		perror(argv[1]);
+		return -1;
+	}
+	lexer_init(&lex, fd);
+
+	do {
+		// The lexer reports the problem itself; keep its status for the exit code.
+		if ((rc = lexer_get_next(&lex)) < 0)
+			break;
+		lexer_print_token(&lex.token);
+	} while(lex.token.type != TOKEN_EOI);
+	fclose(fd);
+	return rc;
+}
diff --git a/src/as/lexer.c b/src/as/lexer.c
new file mode 100644
index 0000000..7a1f7aa
--- /dev/null
+++ b/src/as/lexer.c
@@ -0,0 +1,221 @@
+
+#include <stdio.h>
+#include <string.h>
+#include "lexer.h"
+
+/**
+ * macros for the grammar.
+ */
+
+// Digits are defined as [0-9]
+#define number(x) ((x) >= '0' && (x) <= '9')
+
+// The first character of a number can however also be '-'
+#define first_number(x) (number(x) || (x) == '-' )
+
+// First character in strings can be [a-zA-Z] or '_'
+#define first_string(x)			\
+	(  ((x) >= 'a' && (x) <= 'z')   \
+	|| ((x) >= 'A' && (x) <= 'Z')	\
+	||  (x) == '_'			)
+
+// All characters after can also include numbers (a trailing ':' is handled in read_string)
+#define string(x) \
+	(first_string(x) || number(x))
+
+#define space(x) ((x) == ' ' || (x) == '\t' || (x) == '\r')
+
+
+/**
+ * Helper functions
+ */
+
+// Returns the next non-blank, non-comment character, '\n' at a line end, or EOF.
+static int read_next(struct lexer *lex) {
+ 	int c, comment = 0;
+
+ 	while((c = fgetc(lex->fp)) != EOF) {
+		// A newline is always returned, even when it terminates a comment.
+ 		if (c == '\n')
+			break;
+		// Inside a comment everything up to the newline is discarded.
+ 		if (comment)
+ 			continue;
+		// ';' starts a comment, blanks are skipped, anything else is returned.
+ 		if (c == ';') {
+ 			comment = 1;
+ 		} else if (!space(c)) {
+ 			break;
+ 		}
+ 	}
+ 	return c;
+ }
+
+// Reads a decimal number with an optional leading '-' from fp and returns it;
+// the first character that is not part of the number is pushed back.
+static int read_number(FILE *fp) {
+	int c, neg = 0, val = 0;
+
+	// Only a leading '-' is a sign: "12-3" is 12 followed by -3, not -123.
+	if ((c = fgetc(fp)) == '-')
+		neg = 1;
+	else if (c != EOF)
+		ungetc(c, fp);
+	while((c = fgetc(fp)) != EOF) {
+		if (!number(c)) {
+			ungetc(c, fp);
+			break;
+		}
+		val = (val * 10) + (c - '0');
+	}
+	return neg ? -val : val;
+}
+
+// Reads an identifier from fp and returns its token type: TOKEN_LABEL_DECL if a
+// ':' follows it, the opcode token for a known mnemonic, otherwise TOKEN_LABEL.
+static int read_string(FILE *fp) {
+	int c, label_decl = 0, i = 0;
+	char buf[64];
+
+	while((c = fgetc(fp)) != EOF && string(c)) {
+		// Over-long names are consumed whole but stored truncated, so the
+		// terminator written below always stays inside buf.
+		if (i < (int)sizeof(buf) - 1)
+			buf[i++] = c;
+	}
+	buf[i] = '\0';
+
+	// A ':' ending the name is consumed; any other character is pushed back.
+	if (c == ':')
+		label_decl = 1;
+	else if (c != EOF)
+		ungetc(c, fp);
+
+	if (label_decl) {
+		return TOKEN_LABEL_DECL;
+	} else if (!strcmp("noop", buf)) {
+		return TOKEN_OPCODE_NOOP;
+	} else if (!strcmp("add", buf)) {
+		return TOKEN_OPCODE_ADD;
+	} else if (!strcmp("movl", buf)) {
+		return TOKEN_OPCODE_MOVL;
+	} else if (!strcmp("movh", buf)) {
+		return TOKEN_OPCODE_MOVH;
+	} else if (!strcmp("ld", buf)) {
+		return TOKEN_OPCODE_LD;
+	} else if (!strcmp("sw", buf)) {
+		return TOKEN_OPCODE_SW;
+	} else if (!strcmp("beq", buf)) {
+		return TOKEN_OPCODE_BEQ;
+	} else if (!strcmp("jmp", buf)) {
+		return TOKEN_OPCODE_JMP;
+	} else if (!strcmp("jr", buf)) {
+		return TOKEN_OPCODE_JR;
+	} else if (!strcmp("int", buf)) {
+		return TOKEN_OPCODE_INT;
+	}
+	return TOKEN_LABEL;
+}
+
+/**
+ * Exposed functions
+ */
+
+// Binds the lexer to fp, marks the current token as TOKEN_EOI and starts at line 1.
+void lexer_init(struct lexer *lex, FILE *fp) {
+	lex->fp = fp;
+	lex->token.type = TOKEN_EOI;
+	lex->lineno = 1;
+}
+
+// Reads the next token into lex->token; returns 0, or -1 after printing an error.
+int lexer_get_next(struct lexer *lex) {
+	int num;
+	int ch = read_next(lex);
+	// The previous token ended a line, so this one belongs to the next line.
+	if (lex->token.type == TOKEN_EOL)
+		lex->lineno++;
+
+	switch(ch) {
+	case EOF : lex->token.type = TOKEN_EOI;
+		break;
+	case '\n' :
+		lex->token.type = TOKEN_EOL;
+		break;
+	case ',' : lex->token.type = TOKEN_ARG_SEP;
+		break;
+	case '$' :
+		lex->token.type = TOKEN_REG;
+		num = read_number(lex->fp);
+		// Valid registers are 0-15; test the full int before it is narrowed.
+		if (num < 0 || num > 0xF) {
+			fprintf(stderr, "ERROR: Invalid register value '%i' on line: %i\n", num, lex->lineno);
+			return -1;
+		}
+		lex->token.value.n = num;
+		break;
+	default:
+		if (first_number(ch)) {
+			ungetc(ch, lex->fp);
+			lex->token.type = TOKEN_NUMBER;
+			lex->token.value.n = read_number(lex->fp);
+		} else if (first_string(ch)) {
+			ungetc(ch, lex->fp);
+			lex->token.type = read_string(lex->fp);
+		} else {
+			fprintf(stderr, "ERROR: Invalid character '%c' on line: %i\n", ch, lex->lineno);
+			return -1;
+		}
+	}
+
+	lex->token.lineno = lex->lineno;
+	return 0;
+}
+
+// Debug output: prints the token, preceded by "\n<line>: " when its line number changes.
+void lexer_print_token(struct token *token) {
+	static int lineno = 0;	// line of the previously printed token, kept across calls
+
+	if (token->lineno != lineno) {
+		lineno = token->lineno;
+		printf("\n%i: ", lineno);
+	}
+
+	switch(token->type) {
+	case TOKEN_OPCODE_NOOP : printf(" [OP NOOP] ");
+		break;
+	case TOKEN_OPCODE_ADD : printf(" [OP ADD] ");
+		break;
+	case TOKEN_OPCODE_MOVL : printf(" [OP MOVL] ");
+		break;
+	case TOKEN_OPCODE_MOVH : printf(" [OP MOVH] ");
+		break;
+	case TOKEN_OPCODE_LD : printf(" [OP LD] ");
+		break;
+	case TOKEN_OPCODE_SW : printf(" [OP SW] ");
+		break;
+	case TOKEN_OPCODE_BEQ : printf(" [OP BEQ] ");
+		break;
+	case TOKEN_OPCODE_JMP : printf(" [OP JMP] ");
+		break;
+	case TOKEN_OPCODE_JR : printf(" [OP JR] ");
+		break;
+	case TOKEN_OPCODE_INT : printf(" [OP INT] ");
+		break;
+	case TOKEN_LABEL : printf(" [LABEL] ");
+		break;
+	case TOKEN_LABEL_DECL : printf(" [LABEL DECL] ");
+		break;
+	case TOKEN_REG : printf(" [REG %i] ", token->value.n);
+		break;
+	case TOKEN_ARG_SEP : printf(" [SEP] ");
+		break;
+	case TOKEN_NUMBER : printf(" [NUM %i] ", token->value.n);
+		break;
+	case TOKEN_EOI : printf(" [EOI] ");
+		break;
+	case TOKEN_EOL : printf(" [EOL] ");
+		break;
+	default: printf(" [U] ");
+	}
+}
diff --git a/src/as/lexer.h b/src/as/lexer.h
new file mode 100644
index 0000000..e2a5319
--- /dev/null
+++ b/src/as/lexer.h
@@ -0,0 +1,95 @@
+/* lexer.h
+ *
+ *   Copyright (C) 2018   Henrik Hautakoski <henrik@fiktivkod.org>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *   MA 02110-1301, USA.
+ */
+#ifndef ASM_LEXER_H
+#define ASM_LEXER_H
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/**
+ * All token types.
+ */
+enum token_type {
+	TOKEN_EOI = -1,		// End of input
+	TOKEN_EOL,		// Newline
+	TOKEN_OPCODE_NOOP,	// "noop"
+	TOKEN_OPCODE_ADD,	// "add"
+	TOKEN_OPCODE_MOVL,	// "movl"
+	TOKEN_OPCODE_MOVH,	// "movh"
+	TOKEN_OPCODE_LD,	// "ld"
+	TOKEN_OPCODE_SW,	// "sw"
+	TOKEN_OPCODE_BEQ,	// "beq"
+	TOKEN_OPCODE_JMP,	// "jmp"
+	TOKEN_OPCODE_JR,	// "jr"
+	TOKEN_OPCODE_INT,	// "int"
+	TOKEN_LABEL,		// Identifier that is not an opcode mnemonic
+	TOKEN_LABEL_DECL,	// Identifier directly followed by ':'
+	TOKEN_REG,		// '$' followed by a register number
+	TOKEN_NUMBER,		// Decimal number, optionally starting with '-'
+	TOKEN_ARG_SEP		// ','
+};
+
+/**
+ * Token structure.
+ *
+ * Holds information about a single token.
+ */
+struct token {
+	// Line number where the token was extracted from.
+	uint16_t 	lineno;
+
+	enum token_type	type;
+
+	/*
+	 * Token value, depending on type
+	 * this can be a string or unsigned short
+	 */
+	union {
+		uint16_t n;	// TOKEN_REG: register number, TOKEN_NUMBER: value
+		char	 s[32];	// NOTE(review): not written by lexer.c in this patch
+	} value;
+};
+
+/**
+ * Lexer state
+ */
+struct lexer {
+	uint16_t 	lineno;		// current line number, starting at 1
+	FILE *		fp;		// File being lexed; opened and closed by the caller
+	struct token 	token;		// Current token
+};
+
+/**
+ * Initialize the lexer with a file pointer to the file
+ * that should be lexed.
+ */
+void lexer_init(struct lexer *lex, FILE *fp);
+
+/**
+ * Advance to the next token (lex->token); returns 0 on success, -1 on error.
+ */
+int lexer_get_next(struct lexer *lex);
+
+/**
+ * For debugging, prints the token to standard output.
+ */
+void lexer_print_token(struct token *token);
+
+#endif /* ASM_LEXER_H */