From a1ab0ff26f23c82f15180051204eeb6279747c9a Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Sat, 3 Aug 2019 12:36:06 +0900 Subject: [PATCH] Add a tokenizer to allow space characters between tokens --- main.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++------- test.sh | 1 + 2 files changed, 105 insertions(+), 15 deletions(-) diff --git a/main.c b/main.c index 358a4b10..5cf2beee 100644 --- a/main.c +++ b/main.c @@ -1,33 +1,122 @@ +#include +#include +#include #include #include +#include -int main(int argc, char **argv) { - if (argc != 2) { - fprintf(stderr, "%s: invalid number of arguments\n", argv[0]); - return 1; - } +typedef enum { + TK_PUNCT, // Punctuators + TK_NUM, // Numeric literals + TK_EOF, // End-of-file markers +} TokenKind; - char *p = argv[1]; +// Token type +typedef struct Token Token; +struct Token { + TokenKind kind; // Token kind + Token *next; // Next token + int val; // If kind is TK_NUM, its value + char *loc; // Token location + int len; // Token length +}; - printf(" .globl main\n"); - printf("main:\n"); - printf(" mov $%ld, %%rax\n", strtol(p, &p, 10)); +// Reports an error and exit. +static void error(char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + exit(1); +} + +// Consumes the current token if it matches `s`. +static bool equal(Token *tok, char *op) { + return memcmp(tok->loc, op, tok->len) == 0 && op[tok->len] == '\0'; +} + +// Ensure that the current token is `s`. +static Token *skip(Token *tok, char *s) { + if (!equal(tok, s)) + error("expected '%s'", s); + return tok->next; +} + +// Ensure that the current token is TK_NUM. +static int get_number(Token *tok) { + if (tok->kind != TK_NUM) + error("expected a number"); + return tok->val; +} + +// Create a new token. +static Token *new_token(TokenKind kind, char *start, char *end) { + Token *tok = calloc(1, sizeof(Token)); + tok->kind = kind; + tok->loc = start; + tok->len = end - start; + return tok; +} + +// Tokenize `p` and returns new tokens. +static Token *tokenize(char *p) { + Token head = {}; + Token *cur = &head; while (*p) { - if (*p == '+') { + // Skip whitespace characters. + if (isspace(*p)) { p++; - printf(" add $%ld, %%rax\n", strtol(p, &p, 10)); continue; } - if (*p == '-') { + // Numeric literal + if (isdigit(*p)) { + cur = cur->next = new_token(TK_NUM, p, p); + char *q = p; + cur->val = strtoul(p, &p, 10); + cur->len = p - q; + continue; + } + + // Punctuator + if (*p == '+' || *p == '-') { + cur = cur->next = new_token(TK_PUNCT, p, p + 1); p++; - printf(" sub $%ld, %%rax\n", strtol(p, &p, 10)); continue; } - fprintf(stderr, "unexpected character: '%c'\n", *p); - return 1; + error("invalid token"); + } + + cur = cur->next = new_token(TK_EOF, p, p); + return head.next; +} + +int main(int argc, char **argv) { + if (argc != 2) + error("%s: invalid number of arguments", argv[0]); + + Token *tok = tokenize(argv[1]); + + printf(" .globl main\n"); + printf("main:\n"); + + // The first token must be a number + printf(" mov $%d, %%rax\n", get_number(tok)); + tok = tok->next; + + // ... followed by either `+ ` or `- `. + while (tok->kind != TK_EOF) { + if (equal(tok, "+")) { + printf(" add $%d, %%rax\n", get_number(tok->next)); + tok = tok->next->next; + continue; + } + + tok = skip(tok, "-"); + printf(" sub $%d, %%rax\n", get_number(tok)); + tok = tok->next; } printf(" ret\n"); diff --git a/test.sh b/test.sh index 9550bbcb..876e4e22 100755 --- a/test.sh +++ b/test.sh @@ -19,5 +19,6 @@ assert() { assert 0 0 assert 42 42 assert 21 '5+20-4' +assert 41 ' 12 + 34 - 5 ' echo OK