Made tokenizer better at... tokenizing
authorLukas Krickl <lukas@krickl.dev>
Tue, 7 Nov 2023 09:23:49 +0000 (10:23 +0100)
committerLukas Krickl <lukas@krickl.dev>
Tue, 7 Nov 2023 09:23:49 +0000 (10:23 +0100)
include/ulas.h
src/test.c
src/ulas.c

index 4c9c998fe030ffcb6eb9944e0a4b63f58cf405b6..1d5963bfe380948f648f19b03acaf9d3ba26d1d2 100644 (file)
@@ -103,12 +103,14 @@ enum ulas_ppdirs {
 };
 
 enum ulas_ppdefs {
-  ULAS_PP_DEF,
-  ULAS_PP_MACRO,
+  ULAS_PPDEF,
+  ULAS_PPMACRO,
 };
 
 struct ulas_ppdef {
   enum ulas_ppdefs type;
+  char *name;
+  char *value;
   bool undef;
 };
 
@@ -203,22 +205,11 @@ int ulas_main(struct ulas_config cfg);
 
 char *ulas_strndup(const char *src, size_t n);
 
-/**
- * A token rule returns true when a token should end
- * otherwise returns false
- */
-typedef int (*ulas_tokrule)(int current);
-
 // tokenisze according to pre-defined rules
 // returns the amount of bytes of line that were
 // consumed or -1 on error
 // returns 0 when no more tokens can be read
-int ulas_tok(struct ulas_str *dst, const char *line, size_t n,
-             ulas_tokrule rule);
-
-// smae as ulas_tok but modifies line
-int ulas_tokline(struct ulas_str *dst, const char **line, size_t n,
-                 ulas_tokrule rule);
+int ulas_tok(struct ulas_str *dst, const char **out_line, size_t n);
 
 /**
  * str
index 5b963fc1e2a6137cc1f8c7e0d2dd809bec8bae16..f96ee585ee0123052a1620a43b0becc9a0275b5c 100644 (file)
@@ -8,42 +8,33 @@
 #define TESTBEGIN(name) printf("[test %s]\n", (name));
 #define TESTEND(name) printf("[%s ok]\n", (name));
 
-#define assert_tok(expected_tok, expected_ret, line, rule)                     \
-  {                                                                            \
-    struct ulas_str dst = ulas_str(ULAS_TOKMAX);                               \
-    memset(dst.buf, 0, ULAS_TOKMAX);                                           \
-    assert(ulas_tok(&dst, (line), ULAS_TOKMAX, (rule)) == (expected_ret));     \
-    assert(strcmp(dst.buf, expected_tok) == 0);                                \
-    ulas_strfree(&dst);                                                        \
-  }
-
-#define assert_tokline(expected_n, line, rule, ...)                            \
+#define assert_tok(line, ...)                                                  \
   {                                                                            \
     const char *expect[] = __VA_ARGS__;                                        \
-    size_t n = ULAS_TOKMAX;                                                    \
+    size_t n = strlen(line);                                                   \
     struct ulas_str dst = ulas_str(n);                                         \
     memset(dst.buf, 0, n);                                                     \
     int i = 0;                                                                 \
     const char *pline = line;                                                  \
-    while (ulas_tokline(&dst, &pline, n, rule)) {                              \
+    while (ulas_tok(&dst, &pline, n)) {                                        \
+      puts(dst.buf);                                                           \
+      assert(expect[i]);                                                       \
       assert(strcmp(dst.buf, expect[i]) == 0);                                 \
       i++;                                                                     \
     }                                                                          \
-    assert(i == expected_n);                                                   \
+    size_t expect_n = 0;                                                       \
+    for (expect_n = 0; expect[expect_n]; expect_n++) {                         \
+    }                                                                          \
+    assert(i == expect_n);                                                     \
     ulas_strfree(&dst);                                                        \
   }
 
 void test_tok(void) {
   TESTBEGIN("tok");
 
-  assert_tok("test", 4, "test tokens", isspace);
-  assert_tok("test", 6, "  test tokens", isspace);
-  assert_tok("tokens", 6, "tokens", isspace);
-  assert_tok("", 0, "", isspace);
-  assert_tok("", -1, NULL, isspace);
-
-  assert_tokline(4, "  test  tokens   with   line", isspace,
-                 {"test", "tokens", "with", "line"});
+  assert_tok(
+      "  test  tokens   with   line / * + - , ;",
+      {"test", "tokens", "with", "line", "/", "*", "+", "-", ",", ";", NULL});
 
   TESTEND("tok");
 }
index fbf8d7b919a476cccd1123657214004aa1fff234..d93754962a17a105fa6f3ab47d459de7d1a55f0f 100644 (file)
@@ -50,11 +50,8 @@ int ulas_main(struct ulas_config cfg) {
   return 0;
 }
 
-int ulas_tok(struct ulas_str *dst, const char *line, size_t n,
-             ulas_tokrule rule) {
-  if (!dst->buf || !line || n == 0) {
-    return -1;
-  }
+int ulas_tok(struct ulas_str *dst, const char **out_line, size_t n) {
+  const char *line = *out_line;
   ulas_strensr(dst, n + 1);
 
   int i = 0;
@@ -63,32 +60,44 @@ int ulas_tok(struct ulas_str *dst, const char *line, size_t n,
 #define weld_tokcond (i < n && write < n && line[i])
 
   // always skip leading terminators
-  while (weld_tokcond && rule(line[i])) {
+  while (weld_tokcond && isspace(line[i])) {
     i++;
   }
 
-  while (weld_tokcond) {
-    if (rule(line[i])) {
-      break;
+  char c = line[i];
+
+  switch (c) {
+  case ',':
+  case '+':
+  case '-':
+  case '*':
+  case '/':
+  case '\\':
+  case ULAS_TOK_COMMENT:
+    // single char tokens
+    dst->buf[write++] = line[i++];
+    break;
+    // unreachable after the break above: would consume the rest of the line without writing a token
+    i = (int)n;
+    break;
+  default:
+    while (weld_tokcond) {
+      if (isspace(line[i])) {
+        break;
+      }
+      dst->buf[write] = line[i];
+      i++;
+      write++;
     }
-    dst->buf[write] = line[i];
-    i++;
-    write++;
+    break;
   }
+
 #undef weld_tokcond
 
   dst->buf[write] = '\0';
-  return i;
-}
 
-int ulas_tokline(struct ulas_str *dst, const char **line, size_t n,
-                 ulas_tokrule rule) {
-  int rc = ulas_tok(dst, *line, n, rule);
-  if (rc == -1) {
-    return -1;
-  }
-  *line += rc;
-  return rc;
+  *out_line += i;
+  return i;
 }
 
 struct ulas_str ulas_str(size_t n) {
@@ -118,17 +127,33 @@ void ulas_strfree(struct ulas_str *s) {
 char *ulas_preprocexpand(struct ulas_preproc *pp, const char *raw_line,
                          size_t *n) {
   const char *praw_line = raw_line;
-  ulas_strensr(&pp->line, (*n) + 1);
+  memset(pp->line.buf, 0, pp->line.maxlen);
+
+  int read = 0;
 
   // go through all tokens, see if a define matches the token,
   // if so expand it
   // only expand macros if they match toks[0] though!
   // otherwise memcpy the read bytes 1:1 into the new string
-  while (ulas_tokline(&pp->tok, &praw_line, *n, isalnum)) {
+  while ((read = ulas_tok(&pp->tok, &praw_line, *n))) {
+    bool found = false;
+    for (size_t i = 0; i < pp->defslen; i++) {
+      struct ulas_ppdef *def = &pp->defs[i];
+      if (strncmp(def->name, pp->tok.buf, pp->tok.maxlen) != 0) {
+        continue;
+      }
+
+      // if so... expand now
+      found = true;
+    }
+    // if not found: append the bytes just consumed (from praw_line - read up
+    // to praw_line) unchanged -> this keeps the line intact as is
+    if (!found) {
+      ulas_strensr(&pp->line, (*n) + 1);
+      strncat(pp->line.buf, praw_line - read, read);
+    }
   }
 
-  // TODO: actually expand here...
-  strncpy(pp->line.buf, raw_line, (*n) + 1);
   *n = strlen(pp->line.buf);
   return pp->line.buf;
 }
@@ -147,7 +172,7 @@ int ulas_preprocline(struct ulas_preproc *pp, FILE *dst, const char *raw_line,
   enum ulas_ppdirs found_dir = ULAS_PPDIR_NONE;
 
   // check if the first token is any of the valid preproc directives
-  if (ulas_tokline(&pp->tok, &pline, n, isspace)) {
+  if (ulas_tok(&pp->tok, &pline, n)) {
     // not a preproc directive...
     if (pp->tok.buf[0] != ULAS_TOK_PREPROC_BEGIN) {
       goto found;