| author | Mistivia <i@mistivia.com> | 2025-02-21 18:57:46 +0800 |
|---|---|---|
| committer | Mistivia <i@mistivia.com> | 2025-02-21 18:58:09 +0800 |
| commit | b747628cbfba50fe3d74f7b8ed316d0bd2d56bdc (patch) | |
| tree | 3fcc7cdb5285b80d02957136a87e96ced06eaa8d | |
| parent | 0f310a7f7ba2c3db3e36de1b9068a34f46ee5b17 (diff) | |
new assembler: tokenizer
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | Makefile | 8 |
| -rw-r--r-- | src/as_main.c | 25 |
| -rw-r--r-- | src/as_tokenizer.c | 198 |
| -rw-r--r-- | src/as_tokenizer.h | 36 |
| -rw-r--r-- | tests/test_as_tokenizer.c | 53 |
5 files changed, 316 insertions, 4 deletions
```diff
@@ -4,15 +4,15 @@
 ldflags = -lm
 cc = gcc
 csc = chicken-csc
 
-src = $(shell find src/ -name '*.c' -not -name 'main.c')
+src = $(shell find src/ -name '*.c' -not -name '*main.c')
 obj = $(src:.c=.o)
 tests=$(shell find tests/ -name '*.c')
 tests_bin=$(tests:.c=.bin)
 
-all: $(target) fvm-as
+all: $(target) fvm-as
 
-fvm-as: assembler/fvm-as.scm
-	$(csc) $< -o $@
+fvm-as: $(obj) src/as_main.c
+	$(cc) $(cflags) $(ldflags) $^ -o $@
 
 full: all $(tests_bin)
diff --git a/src/as_main.c b/src/as_main.c
new file mode 100644
index 0000000..8585fa1
--- /dev/null
+++ b/src/as_main.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+
+#include "as_tokenizer.h"
+
+// AST
+// ===
+//
+// <prog> ::= <stmts>
+// <stmts> ::= <stmt> <newline> | <stmt> <stmts> <newline>
+// <stmt> ::= <tag> <instr> | <instr> | <tag>
+// <instr> ::= <op> | <op> <arg> | <op> <label>
+// <tag> ::= <label> :
+// <op> ::= add | sub | mul | div | mod | eq
+
+int main(int argc, char** argv) {
+    if (argc != 2) {
+        fprintf(stderr, "Usage: fvm-as <inputfile>\n");
+        return 1;
+    }
+
+    FILE *fp = fopen(argv[1], "r");
+    TokenStream* ts = makeTokenStream(fp);
+    return 0;
+}
+
diff --git a/src/as_tokenizer.c b/src/as_tokenizer.c
new file mode 100644
index 0000000..9766372
--- /dev/null
+++ b/src/as_tokenizer.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "as_tokenizer.h"
+
+int InputStream_nextChar(InputStream *s) {
+    if (s->cursor == -1) {
+        return EOF;
+    }
+    if (s->buf_pos == s->cursor) {
+        size_t n = fread(s->buf, 1, INPUT_STREAM_BUF_SIZE, s->fp);
+        if (n == 0) {
+            s->cursor = -1;
+            return EOF;
+        }
+        s->buf_pos = n;
+        s->cursor = 0;
+    }
+    int c = s->buf[s->cursor];
+    s->cursor++;
+    if (c == '\n') {
+        s->line++;
+        s->col = 1;
+    } else {
+        s->col++;
+    }
+    return c;
+}
+
+int InputStream_peekChar(InputStream *s) {
+    if (s->cursor == -1) {
+        return EOF;
+    }
+    if (s->buf_pos == s->cursor) {
+        size_t n = fread(s->buf, 1, INPUT_STREAM_BUF_SIZE, s->fp);
+        if (n == 0) {
+            return EOF;
+        }
+        s->buf_pos = n;
+        s->cursor = 0;
+    }
+    return s->buf[s->cursor];
+}
+
+char* ops[] = {
+    "add", "sub", "mul", "div", "mod", "eq"
+};
+
+int isOp(const char* str) {
+    for (int i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
+        if (strcmp(ops[i], str) == 0) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int isStartOfIndentifier(int c) {
+    if (c >= 'a' && c <= 'z') {
+        return 1;
+    }
+    if (c >= 'A' && c <= 'Z') {
+        return 1;
+    }
+    if (c == '_') {
+        return 1;
+    }
+    return 0;
+}
+
+int isPartOfIndentifier(int c) {
+    if (isStartOfIndentifier(c)) {
+        return 1;
+    }
+    if (c >= '0' && c <= '9') {
+        return 1;
+    }
+    return 0;
+}
+
+Token nextTokenImpl(InputStream *s) {
+    int c;
+    while (1) {
+        c = InputStream_peekChar(s);
+        if (c == EOF) {
+            break;
+        }
+        if (c == '\n') {
+            InputStream_nextChar(s);
+            Token t = (Token){.type = NEWLINE, .line = s->line, .col = s->col};
+            return t;
+        }
+        if (c == ':') {
+            InputStream_nextChar(s);
+            return (Token){.type = COLON, .line = s->line, .col = s->col};
+        }
+        if (c == ' ' || c == '\t') {
+            InputStream_nextChar(s);
+            continue;
+        }
+        if (c >= '0' && c <= '9') {
+            int64_t ival = 0;
+            while (1) {
+                InputStream_nextChar(s);
+                ival = ival * 10 + (c - '0');
+                c = InputStream_peekChar(s);
+                if (c < '0' || c > '9') {
+                    break;
+                }
+            }
+            return (Token){.type = ARG, .ival = ival, .line = s->line, .col = s->col};
+        }
+        if (isStartOfIndentifier(c)) {
+            size_t line = s->line;
+            size_t col = s->col;
+            char *sval = malloc(256);
+            size_t i = 0;
+            while(1) {
+                if (i >= 255) {
+                    fprintf(stderr, "error: identifier too long\n");
+                    exit(1);
+                }
+                InputStream_nextChar(s);
+                sval[i++] = c;
+                c = InputStream_peekChar(s);
+                if (!isPartOfIndentifier(c)) {
+                    break;
+                }
+            }
+            sval[i] = '\0';
+            if (isOp(sval)) {
+                return (Token){.type = OP, .sval = sval, .line = line, .col = col};
+            }
+            return (Token){.type = LABEL, .sval = sval, .line = line, .col = col};
+        }
+        fprintf(stderr, "error: invalid character %c at line %d, col %d\n", c, s->line, s->col);
+    }
+    // end of file
+    return (Token){.type = ENDOFFILE};
+}
+
+Token *nextToken(TokenStream *ts) {
+    if (ts->buf != NULL) {
+        Token *t = ts->buf;
+        ts->buf = NULL;
+        return t;
+    }
+    Token *t = malloc(sizeof(Token));
+    *t = nextTokenImpl(ts->s);
+    return t;
+}
+
+Token *peekToken(TokenStream *ts) {
+    if (ts->buf != NULL) {
+        return ts->buf;
+    }
+    ts->buf = malloc(sizeof(Token));
+    *(ts->buf) = nextTokenImpl(ts->s);
+    return ts->buf;
+}
+
+void printToken(Token *t) {
+    switch (t->type) {
+    case OP:
+        printf("OP: %s, line: %d, col: %d\n", t->sval, t->line, t->col);
+        break;
+    case ARG:
+        printf("ARG: %ld, line: %d, col: %d\n", t->ival, t->line, t->col);
+        break;
+    case LABEL:
+        printf("LABEL: %s, line: %d, col: %d\n", t->sval, t->line, t->col);
+        break;
+    case COLON:
+        printf("COLON\n");
+        break;
+    case NEWLINE:
+        printf("NEWLINE\n");
+        break;
+    case ENDOFFILE:
+        printf("ENDOFFILE\n");
+        break;
+    }
+}
+
+TokenStream* makeTokenStream(FILE* fp) {
+    InputStream *s = malloc(sizeof(InputStream));
+    s->fp = fp;
+    s->buf = malloc(INPUT_STREAM_BUF_SIZE);
+    s->buf_pos = 0;
+    s->cursor = 0;
+    s->line = 1;
+    s->col = 1;
+    TokenStream *ts = malloc(sizeof(TokenStream));
+    ts->s = s;
+    ts->buf = NULL;
+    return ts;
+}
diff --git a/src/as_tokenizer.h b/src/as_tokenizer.h
new file mode 100644
index 0000000..fef8625
--- /dev/null
+++ b/src/as_tokenizer.h
@@ -0,0 +1,36 @@
+#include <stdint.h>
+#include <stdio.h>
+
+typedef enum {
+    OP, ARG, LABEL, COLON, NEWLINE, ENDOFFILE
+} TokenType;
+
+typedef struct {
+    TokenType type;
+    int line;
+    int col;
+    char *sval;
+    int64_t ival;
+    double fval;
+} Token;
+
+#define INPUT_STREAM_BUF_SIZE 1024
+
+typedef struct {
+    FILE *fp;
+    char *buf;
+    int buf_pos;
+    int cursor;
+    int line;
+    int col;
+} InputStream;
+
+typedef struct {
+    Token* buf;
+    InputStream *s;
+} TokenStream;
+
+Token *nextToken(TokenStream *ts);
+Token *peekToken(TokenStream *ts);
+void printToken(Token *t);
+TokenStream* makeTokenStream(FILE* fp);
diff --git a/tests/test_as_tokenizer.c b/tests/test_as_tokenizer.c
new file mode 100644
index 0000000..c32eddb
--- /dev/null
+++ b/tests/test_as_tokenizer.c
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "as_tokenizer.h"
+
+char *inputBuffer =
+    "start:\n"
+    "    add 1\n"
+    "    sub start\n"
+    "    div\n"
+    "    eq\n";
+
+char *expectedOutput =
+    "LABEL: start, line: 1, col: 1\n"
+    "COLON\n"
+    "NEWLINE\n"
+    "OP: add, line: 2, col: 5\n"
+    "ARG: 1, line: 2, col: 10\n"
+    "NEWLINE\n"
+    "OP: sub, line: 3, col: 5\n"
+    "LABEL: start, line: 3, col: 9\n"
+    "NEWLINE\n"
+    "OP: div, line: 4, col: 5\n"
+    "NEWLINE\n"
+    "OP: eq, line: 5, col: 5\n"
+    "NEWLINE\n";
+
+int main(int argc, char** argv) {
+    printf("[TEST] assembler tokenizer\n");
+    // make a memory buffer to FILE*
+    FILE *fp = fmemopen(inputBuffer, strlen(inputBuffer), "r");
+    TokenStream* ts = makeTokenStream(fp);
+
+    char *outputBuffer = malloc(10240);
+    // redirect stdout to a file
+    FILE *out = fmemopen(outputBuffer, 10240, "w");
+    FILE *origin_stdout = stdout;
+    stdout = out;
+    while (peekToken(ts)->type != ENDOFFILE) {
+        printToken(peekToken(ts));
+        nextToken(ts);
+    }
+    fclose(out);
+    stdout = origin_stdout;
+    // compare outputBuffer with expectedOutput
+    assert(strcmp(outputBuffer, expectedOutput) == 0);
+    printf("[PASS] assembler tokenizer\n");
+    return 0;
+}
+
+
```
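The committed `as_main.c` stops after constructing the `TokenStream`; no tokens are consumed yet. As a rough sketch (not part of this commit) of how the API declared in `src/as_tokenizer.h` is meant to be driven, the same peek/print/consume loop used by `tests/test_as_tokenizer.c` can be pointed at a file named on the command line:

```c
// Illustrative sketch only (not in this commit): reuses the functions
// introduced here (makeTokenStream, peekToken, nextToken, printToken)
// to dump the token stream of the file named on the command line.
#include <stdio.h>

#include "as_tokenizer.h"

int main(int argc, char** argv) {
    if (argc != 2) {
        fprintf(stderr, "Usage: fvm-as <inputfile>\n");
        return 1;
    }
    FILE *fp = fopen(argv[1], "r");
    if (fp == NULL) {  // the committed as_main.c skips this check
        fprintf(stderr, "error: cannot open %s\n", argv[1]);
        return 1;
    }
    TokenStream* ts = makeTokenStream(fp);
    // peek at the current token, print it, then advance
    while (peekToken(ts)->type != ENDOFFILE) {
        printToken(peekToken(ts));
        nextToken(ts);
    }
    fclose(fp);
    return 0;
}
```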
