From: Lukas Krickl Date: Tue, 7 Nov 2023 09:23:49 +0000 (+0100) Subject: Made tokenizer better at... tokenizing X-Git-Url: https://git.krickl.dev/?a=commitdiff_plain;h=91624db40e5295f974044c8e91350ffd2be0d194;p=ulas%2F.git Made tokenizer better at... tokenizing --- diff --git a/include/ulas.h b/include/ulas.h index 4c9c998..1d5963b 100644 --- a/include/ulas.h +++ b/include/ulas.h @@ -103,12 +103,14 @@ enum ulas_ppdirs { }; enum ulas_ppdefs { - ULAS_PP_DEF, - ULAS_PP_MACRO, + ULAS_PPDEF, + ULAS_PPMACRO, }; struct ulas_ppdef { enum ulas_ppdefs type; + char *name; + char *value; bool undef; }; @@ -203,22 +205,11 @@ int ulas_main(struct ulas_config cfg); char *ulas_strndup(const char *src, size_t n); -/** - * A token rule returns true when a token should end - * otherwise returns false - */ -typedef int (*ulas_tokrule)(int current); - // tokenisze according to pre-defined rules // returns the amount of bytes of line that were // consumed or -1 on error // returns 0 when no more tokens can be read -int ulas_tok(struct ulas_str *dst, const char *line, size_t n, - ulas_tokrule rule); - -// smae as ulas_tok but modifies line -int ulas_tokline(struct ulas_str *dst, const char **line, size_t n, - ulas_tokrule rule); +int ulas_tok(struct ulas_str *dst, const char **out_line, size_t n); /** * str diff --git a/src/test.c b/src/test.c index 5b963fc..f96ee58 100644 --- a/src/test.c +++ b/src/test.c @@ -8,42 +8,33 @@ #define TESTBEGIN(name) printf("[test %s]\n", (name)); #define TESTEND(name) printf("[%s ok]\n", (name)); -#define assert_tok(expected_tok, expected_ret, line, rule) \ - { \ - struct ulas_str dst = ulas_str(ULAS_TOKMAX); \ - memset(dst.buf, 0, ULAS_TOKMAX); \ - assert(ulas_tok(&dst, (line), ULAS_TOKMAX, (rule)) == (expected_ret)); \ - assert(strcmp(dst.buf, expected_tok) == 0); \ - ulas_strfree(&dst); \ - } - -#define assert_tokline(expected_n, line, rule, ...) \ +#define assert_tok(line, ...) \ { \ const char *expect[] = __VA_ARGS__; \ - size_t n = ULAS_TOKMAX; \ + size_t n = strlen(line); \ struct ulas_str dst = ulas_str(n); \ memset(dst.buf, 0, n); \ int i = 0; \ const char *pline = line; \ - while (ulas_tokline(&dst, &pline, n, rule)) { \ + while (ulas_tok(&dst, &pline, n)) { \ + puts(dst.buf); \ + assert(expect[i]); \ assert(strcmp(dst.buf, expect[i]) == 0); \ i++; \ } \ - assert(i == expected_n); \ + size_t expect_n = 0; \ + for (expect_n = 0; expect[expect_n]; expect_n++) { \ + } \ + assert(i == expect_n); \ ulas_strfree(&dst); \ } void test_tok(void) { TESTBEGIN("tok"); - assert_tok("test", 4, "test tokens", isspace); - assert_tok("test", 6, " test tokens", isspace); - assert_tok("tokens", 6, "tokens", isspace); - assert_tok("", 0, "", isspace); - assert_tok("", -1, NULL, isspace); - - assert_tokline(4, " test tokens with line", isspace, - {"test", "tokens", "with", "line"}); + assert_tok( + " test tokens with line / * + - , ;", + {"test", "tokens", "with", "line", "/", "*", "+", "-", ",", ";", NULL}); TESTEND("tok"); } diff --git a/src/ulas.c b/src/ulas.c index fbf8d7b..d937549 100644 --- a/src/ulas.c +++ b/src/ulas.c @@ -50,11 +50,8 @@ int ulas_main(struct ulas_config cfg) { return 0; } -int ulas_tok(struct ulas_str *dst, const char *line, size_t n, - ulas_tokrule rule) { - if (!dst->buf || !line || n == 0) { - return -1; - } +int ulas_tok(struct ulas_str *dst, const char **out_line, size_t n) { + const char *line = *out_line; ulas_strensr(dst, n + 1); int i = 0; @@ -63,32 +60,44 @@ int ulas_tok(struct ulas_str *dst, const char *line, size_t n, #define weld_tokcond (i < n && write < n && line[i]) // always skip leading terminators - while (weld_tokcond && rule(line[i])) { + while (weld_tokcond && isspace(line[i])) { i++; } - while (weld_tokcond) { - if (rule(line[i])) { - break; + char c = line[i]; + + switch (c) { + case ',': + case '+': + case '-': + case '*': + case '/': + case '\\': + case ULAS_TOK_COMMENT: + // single char tokens + dst->buf[write++] = line[i++]; + break; + // consume rest of the line but do not write anything to tokens + i = (int)n; + break; + default: + while (weld_tokcond) { + if (isspace(line[i])) { + break; + } + dst->buf[write] = line[i]; + i++; + write++; } - dst->buf[write] = line[i]; - i++; - write++; + break; } + #undef weld_tokcond dst->buf[write] = '\0'; - return i; -} -int ulas_tokline(struct ulas_str *dst, const char **line, size_t n, - ulas_tokrule rule) { - int rc = ulas_tok(dst, *line, n, rule); - if (rc == -1) { - return -1; - } - *line += rc; - return rc; + *out_line += i; + return i; } struct ulas_str ulas_str(size_t n) { @@ -118,17 +127,33 @@ void ulas_strfree(struct ulas_str *s) { char *ulas_preprocexpand(struct ulas_preproc *pp, const char *raw_line, size_t *n) { const char *praw_line = raw_line; - ulas_strensr(&pp->line, (*n) + 1); + memset(pp->line.buf, 0, pp->line.maxlen); + + int read = 0; // go through all tokens, see if a define matches the token, // if so expand it // only expand macros if they match toks[0] though! // otherwise memcpy the read bytes 1:1 into the new string - while (ulas_tokline(&pp->tok, &praw_line, *n, isalnum)) { + while ((read = ulas_tok(&pp->tok, &praw_line, *n))) { + bool found = false; + for (size_t i = 0; i < pp->defslen; i++) { + struct ulas_ppdef *def = &pp->defs[i]; + if (strncmp(def->name, pp->tok.buf, pp->tok.maxlen) != 0) { + continue; + } + + // if so... expand now + found = true; + } + // if not found: copy everythin from prev to the current raw_line point - + // tok lenght -> this keeps the line in-tact as is + if (!found) { + ulas_strensr(&pp->line, (*n) + 1); + strncat(pp->line.buf, praw_line - read, read); + } } - // TODO: actually expand here... - strncpy(pp->line.buf, raw_line, (*n) + 1); *n = strlen(pp->line.buf); return pp->line.buf; } @@ -147,7 +172,7 @@ int ulas_preprocline(struct ulas_preproc *pp, FILE *dst, const char *raw_line, enum ulas_ppdirs found_dir = ULAS_PPDIR_NONE; // check if the first token is any of the valid preproc directives - if (ulas_tokline(&pp->tok, &pline, n, isspace)) { + if (ulas_tok(&pp->tok, &pline, n)) { // not a preproc directive... if (pp->tok.buf[0] != ULAS_TOK_PREPROC_BEGIN) { goto found;