WIP: tokenizer

author Lukas Krickl <lukas@krickl.dev>

Sun, 19 Nov 2023 09:59:00 +0000 (10:59 +0100)

committer Lukas Krickl <lukas@krickl.dev>

Sun, 19 Nov 2023 09:59:00 +0000 (10:59 +0100)
author Lukas Krickl <lukas@krickl.dev>
Sun, 19 Nov 2023 09:59:00 +0000 (10:59 +0100)
committer Lukas Krickl <lukas@krickl.dev>
Sun, 19 Nov 2023 09:59:00 +0000 (10:59 +0100)
diff --git a/src/test.c b/src/test.c

index 6fc8c51f1cbfcc8d48560f9820cf90c329710a73..b8aefcd56db66b374a48170b88bc101aa22f2918 100644 (file)
--- a/src/test.c
+++ b/src/test.c
@@ -50,9 +50,10 @@
  void test_tok(void) {
    TESTBEGIN("tok");
  
-  assert_tok("  test  tokens   with,   line / * + - , ; $1",
-             {"test", "tokens", "with", ",", "line", "/", "*", "+", "-", ",",
-              ";", "$1", NULL});
+  assert_tok(
+      "  test  tokens   with,   line / * + - , ; $1 = == != > < >= <=",
+      {"test", "tokens", "with", ",",  "line", "/", "*", "+",  "-",  ",",
+       ";",    "$1",     "=",    "==", "!=",   ">", "<", ">=", "<=", NULL});
  
    assert_tokuntil(" this is a, test for tok , until", ',',
                    {"this is a", "test for tok ", "until", NULL});
@@ -178,13 +179,12 @@ void test_preproc(void) {
      assert((expected_rc) == rc);                                               \
    }
  
-
-#define ASSERT_TOTOK(expected_val, expected_rc, token)                  \
+#define ASSERT_TOTOK(expected_val, expected_rc, token)                         \
    {                                                                            \
      int rc = 0;                                                                \
      struct ulas_tok tok = ulas_totok((token), strlen(token), &rc);             \
      assert((expected_rc) == rc);                                               \
-    assert(tok.type == (expected_val));                                           \
+    assert(tok.type == (expected_val));                                        \
      free(tok.val.strv);                                                        \
    }
  
@@ -220,10 +220,12 @@ void test_totok(void) {
  
    ASSERT_UNEXPECTED_TOTOK(-1, "1symbol123");
  
-  // generic tokens with no value 
+  // generic tokens with no value
    ASSERT_TOTOK(ULAS_EQ, 0, "==");
+  ASSERT_TOTOK(ULAS_NEQ, 0, "!=");
    ASSERT_TOTOK('=', 0, "=");
    ASSERT_TOTOK('+', 0, "+");
+  ASSERT_TOTOK('!', 0, "!");
  
    TESTEND("totok");
  }
diff --git a/src/ulas.c b/src/ulas.c

index 8cd55378ae45868770021d82f054e607b20734a2..0279fa386ca5c4dc14fdaa3771cb7aea523604ad 100644 (file)
--- a/src/ulas.c
+++ b/src/ulas.c
@@ -148,11 +148,19 @@ int ulas_tok(struct ulas_str *dst, const char **out_line, unsigned long n) {
      char c = line[i];
  
      switch (c) {
-    case ',':
      case '+':
      case '-':
      case '*':
      case '/':
+    case '~':
+    case '|':
+    case '&':
+    case '%':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case ',':
      case '\\':
      case ULAS_TOK_COMMENT:
        if (WELD_TOKISTERM) {
@@ -172,6 +180,18 @@ int ulas_tok(struct ulas_str *dst, const char **out_line, unsigned long n) {
        dst->buf[write++] = line[i++];
        dst->buf[write++] = line[i++];
        goto tokdone;
+    case '=':
+    case '<':
+    case '!':
+    case '>':
+      if (line[i + 1] == '=') {
+        dst->buf[write] = line[i];
+        i++;
+        write++;
+      }
+      dst->buf[write] = line[i];
+      write++;
+      break;
      default:
        if (isspace(line[i])) {
          goto tokdone;
@@ -262,109 +282,105 @@ struct ulas_tok ulas_totok(char *buf, unsigned long n, int *rc) {
    unsigned char first = buf[0];
    buf++;
  
-  switch (first) {
-  case '+':
-  case '-':
-  case '*':
-  case '/':
-  case '~':
-  case '|':
-  case '&':
-  case '%':
-  case '(':
-  case ')':
-  case '[':
-  case ']':
-  case ',':
-  case ';':
+  if (n == 1) {
      // single char tokens
      tok.type = first;
-    goto end;
-  case '"':
-    // string
-    tok.type = ULAS_STR;
-
-    // FIXME: this likely mallocs a few extra bytes
-    // but honestly its probably fine
-    tok.val.strv = malloc(n * sizeof(char) + 1);
-    memset(tok.val.strv, 0, n);
-
-    long i = 0;
-    while (*buf && *buf != '\"') {
-      if (*buf == '\\') {
+  } else {
+    switch (first) {
+    case '"':
+      // string
+      tok.type = ULAS_STR;
+
+      // FIXME: this likely mallocs a few extra bytes
+      // but honestly its probably fine
+      tok.val.strv = malloc(n * sizeof(char) + 1);
+      memset(tok.val.strv, 0, n);
+
+      long i = 0;
+      while (*buf && *buf != '\"') {
+        if (*buf == '\\') {
+          buf++;
+          tok.val.strv[i] = ulas_unescape(*buf, rc);
+        } else {
+          tok.val.strv[i] = *buf;
+        }
+        i++;
          buf++;
-        tok.val.strv[i] = ulas_unescape(*buf, rc);
-      } else {
-        tok.val.strv[i] = *buf;
        }
-      i++;
-      buf++;
-    }
-    tok.val.strv[i] = '\0';
+      tok.val.strv[i] = '\0';
  
-    if (*buf != '\"') {
-      *rc = -1;
-      ULASERR("Unterminated string sequence\n");
-      goto end;
-    }
-    buf++;
-    break;
-  case '=':
-    if (*buf == '=') {
-      tok.type = ULAS_EQ;
-      buf++;
-    } else {
-      tok.type = first;
-    }
-    break;
-  case '!':
-    if (*buf == '=') {
-      tok.type = ULAS_NEQ;
+      if (*buf != '\"') {
+        *rc = -1;
+        ULASERR("Unterminated string sequence\n");
+        goto end;
+      }
        buf++;
-    } else {
-      tok.type = first;
-    }
-    break;
-  default:
-    if (isdigit(first)) {
-      // integer
-      tok.type = ULAS_INT;
-
-      // 0b prefix is not supported in strtol... so we implement it by hand
-      if (*buf == 'b') {
+      break;
+    case '=':
+      if (*buf == '=') {
+        tok.type = ULAS_EQ;
          buf++;
-        tok.val.intv = (int)strtol(buf, &buf, 2);
-      } else {
-        tok.val.intv = (int)strtol(buf - 1, &buf, 0);
        }
-    } else if (first == '\'') {
-      tok.type = ULAS_INT;
-      if (*buf == '\\') {
+      break;
+    case '!':
+      if (*buf == '=') {
+        tok.type = ULAS_NEQ;
          buf++;
-        tok.val.intv = ulas_unescape(*buf, rc);
-      } else {
-        tok.val.intv = (int)*buf;
        }
-      buf++;
-      if (*buf != '\'') {
+      break;
+    case '<':
+      if (*buf == '=') {
+        tok.type = ULAS_LTEQ;
+        buf++;
+      }
+      break;
+    case '>':
+      if (*buf == '=') {
+        tok.type = ULAS_GTEQ;
+        buf++;
+      }
+      break;
+    default:
+      if (isdigit(first)) {
+        // integer
+        tok.type = ULAS_INT;
+
+        // 0b prefix is not supported in strtol... so we implement it by hand
+        if (*buf == 'b') {
+          buf++;
+          tok.val.intv = (int)strtol(buf, &buf, 2);
+        } else {
+          tok.val.intv = (int)strtol(buf - 1, &buf, 0);
+        }
+      } else if (first == '\'') {
+        tok.type = ULAS_INT;
+        if (*buf == '\\') {
+          buf++;
+          tok.val.intv = ulas_unescape(*buf, rc);
+        } else {
+          tok.val.intv = (int)*buf;
+        }
+        buf++;
+        if (*buf != '\'') {
+          *rc = -1;
+          ULASERR("Unterminated character sequence\n");
+          goto end;
+        }
+        buf++;
+        break;
+      } else if (ulas_isname(buf - 1, n)) {
+        // literal token
+        // we resolve it later, will need to malloc here for now
+        tok.type = ULAS_SYMBOL;
+        tok.val.strv = strndup(buf - 1, n);
+        buf += n - 1;
+      } else {
+        ULASERR("Unexpected token: %s\n", buf);
          *rc = -1;
-        ULASERR("Unterminated character sequence\n");
          goto end;
        }
-      buf++;
        break;
-    } else if (ulas_isname(buf - 1, n)) {
-      // literal token
-      // we resolve it later, will need to malloc here for now
-      tok.type = ULAS_SYMBOL;
-      tok.val.strv = strndup(buf - 1, n);
-      buf += n - 1;
-    } else {
-      ULASERR("Unexpected token: %s\n", buf);
-      *rc = -1;
-      goto end;
      }
-    break;
    }
  
  end:
author	Lukas Krickl <lukas@krickl.dev>
	Sun, 19 Nov 2023 09:59:00 +0000 (10:59 +0100)
committer	Lukas Krickl <lukas@krickl.dev>
	Sun, 19 Nov 2023 09:59:00 +0000 (10:59 +0100)
src/test.c		patch | blob | history
src/ulas.c		patch | blob | history