Mon 16 Mar 11:09:06 CET 2026

2026-03-16 11:12:40 +01:00 · 2026-03-16 11:12:40 +01:00 · 3c12254ac5
commit 3c12254ac5
parent 713711821b
1 changed files with 485 additions and 0 deletions
--- a/src/lexer.c
+++ b/src/lexer.c
@ -0,0 +1,485 @@
+#include "config.h"
+#include "types.h"
+#include "tokens.h"
+#include "lexer.h"
+#include "parsef.h"
+#include "printf.h"
+#include "ccall.h"
+#include "cconst.h"
+#include "log.h"
+
+/* Newline string; NextToken points L->arg here for the '\n' character literal. */
+static char NL[]="\n";
+
+/*
+  Keyword spellings, terminated by a NULL entry.
+  All name lists must be sorted (wrt. first character)!
+  LexerInit derives the first-character jump table from this order: entries
+  starting with 'A'..'Z' come first, every entry starting with another
+  character is grouped together at the end.
+  keyword[k] and tokens[k] are looked up with the same index k, so both
+  tables must list their entries in the same order and use identical
+  HAS_LOCK guards.
+*/
+const char * keyword[] = {
+  "AND",
+  "AWAIT",
+  "BREAK",
+  "CALL",
+  "CONST",
+  "CONTINUE",
+  "DATA",
+  "DELAY",
+  "DO",
+  "ELSE",
+  "END",
+  "EVENT",
+  "FOR",
+  "FORK",
+  "FUNC",
+  "GO",
+  "IF",
+#if HAS_LOCK > 0
+  "LOCK",
+#endif
+  "MOD",
+  "NOT", 
+  "ON",
+  "OR",
+  "PROC",
+  "REPEAT",
+  "RETURN",   
+  "STOP",
+  "THEN",
+#if HAS_LOCK > 0
+  "UNLOCK",
+#endif
+  "VAR",
+  "WHILE",
+  "YIELD",
+  /* non-letter keywords: must stay after all letter keywords */
+  "--",
+  "<=",     
+  ">=",
+  "<>",
+  NULL
+};
+
+/*
+  The zero terminated token dictionary needed for scalability.
+  Parallel to keyword[]: tokens[k] is the token value returned when keyword[k]
+  is recognized. The keyword search loops in NextToken stop at the 0 entry,
+  so no entry before the terminator may be 0.
+*/
+const token_t tokens[] = {
+  TAND,
+  TAWAIT,
+  TBREAK,
+  TCALL,
+  TCONST,
+  TCONTINUE, 
+  TDATA,
+  TDELAY,
+  TDO, 
+  TELSE,
+  TEND,
+  TEVENT,
+  TFOR,
+  TFORK,
+  TFUNC,
+  TGO,
+  TIF,
+#if HAS_LOCK > 0
+  TLOCK,
+#endif
+  TMODULO,
+  TNOT,
+  TON,
+  TOR,
+  TPROC, 
+  TREPEAT,
+  TRETURN,
+  TSTOP,
+  TTHEN,
+#if HAS_LOCK > 0
+  TUNLOCK,
+#endif
+  TVARDEF,
+  TWHILE,
+  TYIELD,
+  /* tokens of the non-letter keywords "--", "<=", ">=", "<>" */
+  TREM,
+  TLEQ,
+  TGEQ, 
+  TNEQ,
+  0
+};
+
+extern int verbose;
+
+/*
+  Computed keyword first character group jump table for fast keyword table search
+  A=0, B=1, ...
+  index=26 => all keywords that do not start with a letter
+  Each slot holds (index of the first keyword of the group)+1; 0 marks an empty group.
+*/
+static index_t keywordgroup[27];
+
+/* upper case (ASCII only), don't trust the builtins on microcontrollers.
+   Note: the argument is evaluated more than once, pass side-effect free expressions only. */
+#define btoupper(ch) ((ch) >= 'a' && (ch) <= 'z' ? (ch)-32 : (ch))
+
+/* spelling of keyword number i */
+#define GetKeyword(i) ((char*)keyword[(i)])
+
+/* tokens read here are token_t constructed from multi byte sequences.
+   Out-of-range indices (including the wrapped-around "empty group" index -1)
+   yield 0, which terminates the keyword search loops. The bound is the number
+   of table ELEMENTS, not the table size in bytes. */
+#define GetTokenValue(i) ((i) >= sizeof(tokens)/sizeof(tokens[0]) ? 0 : tokens[(i)])
+
+/* skip to next first character group if any, k+1 is returned to distinguish between empty group and keyword index 0 */
+#define GetKeywordGroup(ch) ((ch)>='A'&&(ch)<='Z'?keywordgroup[(ch)-'A']:keywordgroup[26])
+
+/* it is indeed k+1, see above */
+#define SetKeywordGroup(ch,k) (keywordgroup[(ch)>='A'&&(ch)<='Z'?(ch)-'A':26]=(k)+1)
+
+// Peek at the next token: lex it via NextToken, record where it ended in
+// L->next and rewind L->input to where it was. The previous token is kept in
+// L->lasttoken; L->token holds the peeked token afterwards.
+void LookAheadToken(lexer_t *L) {
+  char *rewind = L->input;   // position to restore after peeking
+
+  L->lasttoken = L->token;
+  NextToken(L);
+  L->next  = L->input;       // end of the peeked token, used by SkipToken
+  L->input = rewind;
+#if PROFILE>0
+  L->ntoken--;               // take back the token count until SkipToken commits it
+#endif
+}
+
+// Commit a preceding LookAheadToken: continue lexing at the position
+// recorded in L->next, i.e. behind the peeked token.
+void SkipToken (lexer_t *L) {
+#if PROFILE>0
+  L->ntoken++;
+#endif
+  L->input = L->next;
+}
+
+/* Advance L->input past a run of blanks and tabs (newlines are not skipped). */
+static void whitespaces(lexer_t *L) {
+  char *p = L->input;
+
+  for (;;) {
+    char ch = *p;
+    if (ch != ' ' && ch != '\t') break;
+    p++;
+  }
+  L->input = p;
+}
+
+/*
+  One-time init: computes the keyword first-character jump table.
+
+  For every distinct first character in keyword[] the index of the first
+  keyword of that group is stored (as index+1) in keywordgroup. Letter groups
+  are processed in table order; the first keyword that does not start with
+  'A'..'Z' opens the shared "other" group (slot 26) and ends the scan, so all
+  non-letter keywords must be placed after the letter keywords.
+
+  L is not used; the table is file-global.
+*/
+void LexerInit(lexer_t *L) {
+  int j=0;
+  (void)L;
+  // initialize and compute keyword jump table
+  memset(keywordgroup,0,sizeof(keywordgroup));
+  while (keyword[j]!=NULL) {
+    char c=keyword[j][0];
+    SetKeywordGroup(c,j);
+    // everything from the first non-letter keyword on belongs to slot 26
+    if (c<'A'||c>'Z') break;
+    // skip the remaining keywords of this first-character group
+    while (keyword[j]!=NULL && keyword[j][0]==c) j++;
+  }
+}
+
+/*
+  Reset the complete lexer state and attach a new input buffer.
+  All fields are zeroed first; afterwards the three position pointers
+  (input, next, last) refer to the start of the buffer, the current token is
+  EOL and line counting starts at 1.
+*/
+void LexerSetup(lexer_t *L, char * input) {
+  memset(L,0,sizeof(*L));
+#if DEBUG > 0
+  log(LOGINFO,"[PLX] Lexer Setup.\n",NULL);
+#endif
+  L->input = input;
+  L->next  = input;
+  L->last  = input;
+  L->line  = 1;
+  L->token = EOL;
+
+#if PROFILE>0
+  L->ntoken = 0;
+#endif
+}
+
+
+/* True for characters that may be part of a name: letters, digits, '@' and '_'. */
+static int LexIsNameChar(char c) {
+  return (c >= '0' && c <= '9') || (c >= '@' && c <= 'Z') ||
+         (c >= 'a' && c <= 'z') || c == '_';
+}
+
+/*
+  Main lexer. L->input is incremented, last position is saved, result is in L->token and L->x|L->name ..
+  This lexer version supports only linear input buffers.
+  A version for ring buffers (e.g., from link rx buffers) is required.
+
+  Results by token:
+    TNUMBER      value in L->x
+    TCHAR        L->arg points to the character, L->arglen is 1
+    TSTRING      L->arg/L->arglen describe the text (inside the input buffer or a cconst string)
+    TCCALL       L->ix is the index into ccalls
+    TIDENTIFIER  upper-cased name in L->name
+    TERROR       malformed character/string literal or over-long identifier
+    EOL          end of input or newline (a newline also increments L->line)
+    keyword      the token value from tokens[]
+    otherwise    the single input character itself
+
+  Side effect: lower case letters of words are converted to upper case
+  directly in the input buffer, so the buffer must be writable.
+*/
+void NextToken (lexer_t *L) {
+  address_t k, l, i;
+  char *ir;
+
+  L->last=L->input;
+
+  /* 0. Consume white spaces */
+  whitespaces(L);
+
+  /* end of line token; tested after skipping blanks so that trailing blanks
+     can never make the lexer step over the terminating zero byte */
+  if (*L->input == 0) {
+    L->token = EOL;
+    return;
+  }
+
+#if PROFILE>0
+  L->ntoken++;
+#endif
+
+  /* 1.0 comment? skip up to (not including) the newline, which is then
+     delivered as EOL by the single character code at the end */
+  if (*L->input=='-' && *(L->input+1)=='-') {
+    while (*L->input && *L->input!='\n') L->input++;
+    if (!*L->input) { L->token=EOL; return; }
+  }
+
+  /* 1a. unsigned numbers, value returned in L->x */
+#if HAS_FLOAT==0
+  if (*L->input <= '9' && *L->input >= '0') {
+    L->input += ParseNumber(L->input, &L->x);
+#else
+  if ((*L->input <= '9' && *L->input >= '0') || *L->input == '.') {
+    L->input += ParseNumber2(L->input, &L->x);
+#endif
+    // number in expression
+    L->token = TNUMBER;
+    L->next  = L->input;
+    return;
+  }
+  // 1b. character 'x', value returned in L->arg/arglen
+  if (*L->input=='\'') {
+    L->input++;
+    if (*L->input=='\\') {
+      // escape sequence
+      L->input++;
+      switch (*L->input) {
+        case 'n': L->arg=NL; break;
+        // \\ and \' (and any unknown escape) stand for the character itself
+        default:  L->arg=L->input; break;
+      }
+    } else
+      L->arg=L->input;
+    // input ended inside the literal: do not step over the terminating zero
+    if (*L->input==0) {
+      L->token=TERROR;
+      return;
+    }
+    L->input++;
+    if (*L->input!='\'') {
+      L->token=TERROR;
+      return;
+    }
+    L->arglen=1;
+    L->token=TCHAR;
+    L->input++;
+    return;
+  }
+  // 1.c string "text", value returned in L->arg/arglen
+  if (*L->input=='"') {
+    L->input++;
+    L->arg=L->input;
+    // string can be empty; test before advancing so an unterminated string
+    // stops at the terminating zero instead of running past it
+    while (*L->input!=0 && *L->input!='"') L->input++;
+    if (*L->input!='"') {
+      L->token=TERROR;
+      return;
+    }
+    L->arglen=L->input-L->arg;
+    L->token=TSTRING;
+    L->input++;
+    return;
+  }
+  /*
+   *	Keywords and variables
+   *
+   *	Isolate a word, input points to the beginning, l is the length of the word.
+   *	ir points to the end of the word after isolating.
+   *	A word consists of letters, '@' and '_' only; digits end it.
+   */
+
+  l = 0;
+  ir = L->input;
+  for (;;) {
+    if (*ir >= 'a' && *ir <= 'z') {
+      *ir -= 32; /* toupper code, changing the input buffer directly */
+      ir++;
+      l++;
+    } else if ((*ir >= '@' && *ir <= 'Z') || *ir == '_') {
+      ir++;
+      l++;
+    } else {
+      break;
+    }
+  }
+
+  /*
+   *  2. Keywords
+   *	Ir is reused here to implement string compares
+   *	scanning the keyword array.
+   *	Once a keyword is detected the input buffer is advanced
+   *	by its length, and the token value is returned.
+   *
+   *	Keywords are an array of null terminated strings.
+   *  They are always matched uppercase.
+   */
+  if (l==0) {
+    // special characters, no starting letter
+    k = GetKeywordGroup(*L->input)-1;
+
+    // an empty group yields index -1, for which GetTokenValue returns 0
+    while (GetTokenValue(k) != 0) {
+      char c;
+      ir = GetKeyword(k); i = 0;
+      c=/*btoupper*/(*(L->input + i)); /* name was already case converted above */
+
+      if (c!=*ir) { k++; continue; }
+      while (*(ir + i) != 0) {
+        if (*(ir + i) != c) {
+          k++; i = 0;
+          break;
+        } else
+          i++;
+        c=btoupper(*(L->input + i));
+      }
+      if (i == 0)
+        continue;
+      // we cannot check if this is only a partial match!
+      L->input += i;
+      L->token = GetTokenValue(k);
+      if (L->token == TREM) {
+        // consume remark line
+        while (*L->input != '\0' && *L->input!='\n') L->input++;
+      }
+      L->next=L->input;
+      return;
+    }
+  } else {
+    // only keywords starting with a letter
+    k = GetKeywordGroup(*L->input)-1; // get keyword group start based on first character
+#if DEBUG>2
+    print_format("lexer keyword? starting with token k=%d\n",k);
+#endif
+    if (k>=0) while (GetTokenValue(k) != 0) {
+      char c;
+      ir = GetKeyword(k); i = 0;
+      c=/*btoupper*/(*(L->input + i));
+      if (c!=*ir) {
+        // skip to next character block
+        if (c<*ir) break; // nothing more to search for; no keyword
+        if (*ir < 'A' || *ir>'Z') break; // no more keywords starting with a letter
+        if (GetKeywordGroup(c)>0) { k=GetKeywordGroup(c)-1; continue; }
+        else break;
+      }
+      while (*(ir + i) != 0) {
+        if (*(ir + i) != c || i==l) {
+          k++; i = 0;
+          break;
+        } else
+          i++;
+        c=btoupper(*(L->input + i));
+      }
+      if (i == 0)
+        continue;
+      if (LexIsNameChar(*(L->input + i))) {
+        // only partial match: the name continues (letter, digit, '@' or '_')
+        k++; i=0;
+        continue;
+      }
+      L->input += i;
+      L->token = GetTokenValue(k);
+      if (L->token == TREM) {
+        // consume remark line
+        while (*L->input != '\0' && *L->input!='\n') L->input++;
+      }
+      L->next=L->input;
+      return;
+    }
+  }
+
+/*
+  Ccall extensions must be checked after built-in keyword search
+*/
+  k = 0;
+  if (ccalls) while (ccalls[k].name != 0) {
+    char c;
+    ir = ccalls[k].name;
+    i = 0;
+    c = /*btoupper*/(*(L->input));
+    if (c<*ir) break; // not a ccall
+    if (c!=*ir) {
+      // jump to the entry given by the next link, if there is one
+      if (ccalls[k].next) {
+        k=ccalls[k].next;
+        continue;
+      } else break;
+    }
+    while (*(ir + i) != 0) {
+      if (*(ir + i) != btoupper(*(L->input + i))) {
+        k++;
+        i = 0;
+        break;
+      } else
+        i++;
+    }
+    if (i == 0)
+      continue;
+    if (LexIsNameChar(*(L->input + i))) {
+      // only partial match: the name continues
+      k++;
+      continue;
+    }
+
+    L->input  += i;
+    L->next   = L->input;
+    L->token  = TCCALL;
+    L->ix     = k;
+    return;
+  }
+
+/*
+  Cconst extensions must be checked after built-in keyword search
+*/
+  k = 0;
+  if (cconst) while (cconst[k].name != 0) {
+    char c;
+    ir = cconst[k].name;
+    i = 0;
+    c = /*btoupper*/(*(L->input));
+    if (c<*ir) break; // not a cconst
+    if (c!=*ir) {
+      // jump to the entry given by the next link, if there is one
+      if (cconst[k].next) {
+        k=cconst[k].next;
+        continue;
+      } else break;
+    }
+    while (*(ir + i) != 0) {
+      if (*(ir + i) != btoupper(*(L->input + i))) {
+        k++;
+        i = 0;
+        break;
+      } else
+        i++;
+    }
+    if (i == 0)
+      continue;
+    if (LexIsNameChar(*(L->input + i))) {
+      // only partial match: the name continues
+      k++;
+      continue;
+    }
+
+    L->input  += i;
+    L->next   = L->input;
+    if (cconst[k].string) {
+      // string constant: value carries the string length
+      L->token  = TSTRING;
+      L->arg    = cconst[k].string;
+      L->arglen = (index_t)cconst[k].value;
+    } else {
+      L->token  = TNUMBER;
+      L->x      = cconst[k].value;
+    }
+    return;
+  }
+
+  // X. Identifier
+  if (l > 0 && l <= MAXNAMELEN) {
+    l=0;
+    L->token = TIDENTIFIER; // can be variable, string variable, array variable, function, resolved in code.CompileLine
+    // The word length checked above does not count digits, so the copy itself
+    // must be bounded too. At most MAXNAMELEN characters are stored (plus the
+    // terminator, as before); a longer name is consumed and reported as TERROR.
+    while (LexIsNameChar(*L->input)) {
+      if (l < MAXNAMELEN) {
+        L->name[l] = *L->input;
+        l++;
+      } else {
+        L->token = TERROR;
+      }
+      L->input++;
+    }
+    L->name[l]=0;
+    L->next=L->input;
+    return;
+  }
+
+  /* other single characters are parsed and stored */
+  L->token = *L->input;
+  if (*L->input == '\n') { L->line++; L->token = EOL; }
+  L->input++;
+
+  return;
+
+}
+