From 962399fab26c7dd370abf71db621a72c818d6caf Mon Sep 17 00:00:00 2001 From: Lukas Krickl Date: Sun, 19 Nov 2023 10:59:00 +0100 Subject: [PATCH] WIP: tokenizer --- src/test.c | 16 +++-- src/ulas.c | 194 +++++++++++++++++++++++++++++------------------------ 2 files changed, 114 insertions(+), 96 deletions(-) diff --git a/src/test.c b/src/test.c index 6fc8c51..b8aefcd 100644 --- a/src/test.c +++ b/src/test.c @@ -50,9 +50,10 @@ void test_tok(void) { TESTBEGIN("tok"); - assert_tok(" test tokens with, line / * + - , ; $1", - {"test", "tokens", "with", ",", "line", "/", "*", "+", "-", ",", - ";", "$1", NULL}); + assert_tok( + " test tokens with, line / * + - , ; $1 = == != > < >= <=", + {"test", "tokens", "with", ",", "line", "/", "*", "+", "-", ",", + ";", "$1", "=", "==", "!=", ">", "<", ">=", "<=", NULL}); assert_tokuntil(" this is a, test for tok , until", ',', {"this is a", "test for tok ", "until", NULL}); @@ -178,13 +179,12 @@ void test_preproc(void) { assert((expected_rc) == rc); \ } - -#define ASSERT_TOTOK(expected_val, expected_rc, token) \ +#define ASSERT_TOTOK(expected_val, expected_rc, token) \ { \ int rc = 0; \ struct ulas_tok tok = ulas_totok((token), strlen(token), &rc); \ assert((expected_rc) == rc); \ - assert(tok.type == (expected_val)); \ + assert(tok.type == (expected_val)); \ free(tok.val.strv); \ } @@ -220,10 +220,12 @@ void test_totok(void) { ASSERT_UNEXPECTED_TOTOK(-1, "1symbol123"); - // generic tokens with no value + // generic tokens with no value ASSERT_TOTOK(ULAS_EQ, 0, "=="); + ASSERT_TOTOK(ULAS_NEQ, 0, "!="); ASSERT_TOTOK('=', 0, "="); ASSERT_TOTOK('+', 0, "+"); + ASSERT_TOTOK('!', 0, "!"); TESTEND("totok"); } diff --git a/src/ulas.c b/src/ulas.c index 8cd5537..0279fa3 100644 --- a/src/ulas.c +++ b/src/ulas.c @@ -148,11 +148,19 @@ int ulas_tok(struct ulas_str *dst, const char **out_line, unsigned long n) { char c = line[i]; switch (c) { - case ',': case '+': case '-': case '*': case '/': + case '~': + case '|': + case '&': + case '%': + case '(': + case ')': + case '[': + case ']': + case ',': case '\\': case ULAS_TOK_COMMENT: if (WELD_TOKISTERM) { @@ -172,6 +180,18 @@ int ulas_tok(struct ulas_str *dst, const char **out_line, unsigned long n) { dst->buf[write++] = line[i++]; dst->buf[write++] = line[i++]; goto tokdone; + case '=': + case '<': + case '!': + case '>': + if (line[i + 1] == '=') { + dst->buf[write] = line[i]; + i++; + write++; + } + dst->buf[write] = line[i]; + write++; + break; default: if (isspace(line[i])) { goto tokdone; @@ -262,109 +282,105 @@ struct ulas_tok ulas_totok(char *buf, unsigned long n, int *rc) { unsigned char first = buf[0]; buf++; - switch (first) { - case '+': - case '-': - case '*': - case '/': - case '~': - case '|': - case '&': - case '%': - case '(': - case ')': - case '[': - case ']': - case ',': - case ';': + if (n == 1) { // single char tokens tok.type = first; - goto end; - case '"': - // string - tok.type = ULAS_STR; - - // FIXME: this likely mallocs a few extra bytes - // but honestly its probably fine - tok.val.strv = malloc(n * sizeof(char) + 1); - memset(tok.val.strv, 0, n); - - long i = 0; - while (*buf && *buf != '\"') { - if (*buf == '\\') { + } else { + switch (first) { + case '"': + // string + tok.type = ULAS_STR; + + // FIXME: this likely mallocs a few extra bytes + // but honestly its probably fine + tok.val.strv = malloc(n * sizeof(char) + 1); + memset(tok.val.strv, 0, n); + + long i = 0; + while (*buf && *buf != '\"') { + if (*buf == '\\') { + buf++; + tok.val.strv[i] = ulas_unescape(*buf, rc); + } else { + tok.val.strv[i] = *buf; + } + i++; buf++; - tok.val.strv[i] = ulas_unescape(*buf, rc); - } else { - tok.val.strv[i] = *buf; } - i++; - buf++; - } - tok.val.strv[i] = '\0'; + tok.val.strv[i] = '\0'; - if (*buf != '\"') { - *rc = -1; - ULASERR("Unterminated string sequence\n"); - goto end; - } - buf++; - break; - case '=': - if (*buf == '=') { - tok.type = ULAS_EQ; - buf++; - } else { - tok.type = first; - } - break; - case '!': - if (*buf == '=') { - tok.type = ULAS_NEQ; + if (*buf != '\"') { + *rc = -1; + ULASERR("Unterminated string sequence\n"); + goto end; + } buf++; - } else { - tok.type = first; - } - break; - default: - if (isdigit(first)) { - // integer - tok.type = ULAS_INT; - - // 0b prefix is not supported in strtol... so we implement it by hand - if (*buf == 'b') { + break; + case '=': + if (*buf == '=') { + tok.type = ULAS_EQ; buf++; - tok.val.intv = (int)strtol(buf, &buf, 2); - } else { - tok.val.intv = (int)strtol(buf - 1, &buf, 0); } - } else if (first == '\'') { - tok.type = ULAS_INT; - if (*buf == '\\') { + break; + case '!': + if (*buf == '=') { + tok.type = ULAS_NEQ; buf++; - tok.val.intv = ulas_unescape(*buf, rc); - } else { - tok.val.intv = (int)*buf; } - buf++; - if (*buf != '\'') { + break; + case '<': + if (*buf == '=') { + tok.type = ULAS_LTEQ; + buf++; + } + break; + case '>': + if (*buf == '=') { + tok.type = ULAS_GTEQ; + buf++; + } + break; + default: + if (isdigit(first)) { + // integer + tok.type = ULAS_INT; + + // 0b prefix is not supported in strtol... so we implement it by hand + if (*buf == 'b') { + buf++; + tok.val.intv = (int)strtol(buf, &buf, 2); + } else { + tok.val.intv = (int)strtol(buf - 1, &buf, 0); + } + } else if (first == '\'') { + tok.type = ULAS_INT; + if (*buf == '\\') { + buf++; + tok.val.intv = ulas_unescape(*buf, rc); + } else { + tok.val.intv = (int)*buf; + } + buf++; + if (*buf != '\'') { + *rc = -1; + ULASERR("Unterminated character sequence\n"); + goto end; + } + buf++; + break; + } else if (ulas_isname(buf - 1, n)) { + // literal token + // we resolve it later, will need to malloc here for now + tok.type = ULAS_SYMBOL; + tok.val.strv = strndup(buf - 1, n); + buf += n - 1; + } else { + ULASERR("Unexpected token: %s\n", buf); *rc = -1; - ULASERR("Unterminated character sequence\n"); goto end; } - buf++; break; - } else if (ulas_isname(buf - 1, n)) { - // literal token - // we resolve it later, will need to malloc here for now - tok.type = ULAS_SYMBOL; - tok.val.strv = strndup(buf - 1, n); - buf += n - 1; - } else { - ULASERR("Unexpected token: %s\n", buf); - *rc = -1; - goto end; } - break; } end: -- 2.30.2