Mon 16 Mar 11:09:06 CET 2026

This commit is contained in:
sbosse 2026-03-16 11:12:40 +01:00
parent 713711821b
commit 3c12254ac5

485
src/lexer.c Normal file
View File

@ -0,0 +1,485 @@
#include "config.h"
#include "types.h"
#include "tokens.h"
#include "lexer.h"
#include "parsef.h"
#include "printf.h"
#include "ccall.h"
#include "cconst.h"
#include "log.h"
/* One-character newline string; NextToken points L->arg at it for the character literal '\n'. */
static char NL[]="\n";
/*
Keyword spellings, NULL terminated. Index k here corresponds to tokens[k] below.

All name lists must be sorted (wrt. first character)!
- Entries sharing a first character must be contiguous: LexerInit stores only
  one start index per first letter.
- Keywords starting with a letter must come first, in ascending order of their
  first letter: LexerInit stops grouping at the first entry that does not start
  with A-Z, and the letter search in NextToken gives up as soon as the input
  character is smaller than the keyword's first character.
- The remaining (symbol) keywords form one trailing group (slot 26 of the jump table).
*/
const char * keyword[] = {
"AND",
"AWAIT",
"BREAK",
"CALL",
"CONST",
"CONTINUE",
"DATA",
"DELAY",
"DO",
"ELSE",
"END",
"EVENT",
"FOR",
"FORK",
"FUNC",
"GO",
"IF",
#if HAS_LOCK > 0
"LOCK",
#endif
"MOD",
"NOT",
"ON",
"OR",
"PROC",
"REPEAT",
"RETURN",
"STOP",
"THEN",
#if HAS_LOCK > 0
"UNLOCK",
#endif
"VAR",
"WHILE",
"YIELD",
/* symbol keywords: everything from here on belongs to the non-letter group */
"--", /* remark, maps to TREM */
"<=",
">=",
"<>",
NULL
};
/*
The zero terminated token dictionary: tokens[k] is the token code returned for keyword[k].
Both tables must list their entries in the same order and use the same HAS_LOCK guards.
The trailing 0 ends the keyword search loops in NextToken (they run while GetTokenValue(k) != 0).
*/
const token_t tokens[] = {
TAND,
TAWAIT,
TBREAK,
TCALL,
TCONST,
TCONTINUE,
TDATA,
TDELAY,
TDO,
TELSE,
TEND,
TEVENT,
TFOR,
TFORK,
TFUNC,
TGO,
TIF,
#if HAS_LOCK > 0
TLOCK,
#endif
TMODULO,
TNOT,
TON,
TOR,
TPROC,
TREPEAT,
TRETURN,
TSTOP,
TTHEN,
#if HAS_LOCK > 0
TUNLOCK,
#endif
TVARDEF,
TWHILE,
TYIELD,
TREM,
TLEQ,
TGEQ,
TNEQ,
0
};
/* Defined in another translation unit; not referenced in the part of this file shown here. */
extern int verbose;
/*
Computed keyword first character group jump table for fast keyword table search.
Slot index: A=0, B=1, ... Z=25
Slot 26 => all keywords that do not start with a letter A-Z.
Each slot holds (index of the group's first entry in keyword[]) + 1;
0 means that no keyword starts with that character. Filled by LexerInit.
*/
static index_t keywordgroup[27];
/*
 * Helper macros for the keyword search.
 * Every parameter and every expansion is parenthesized so that arbitrary
 * expressions can be passed. Parameters are evaluated more than once:
 * do not pass expressions with side effects.
 */
/* upper case of an ASCII letter, don't trust the builtins on microcontrollers */
#define btoupper(ch) (((ch) >= 'a' && (ch) <= 'z') ? (ch)-32 : (ch))
/* spelling of keyword i (const is cast away, the string must not be written to) */
#define GetKeyword(i) ((char*)keyword[(i)])
/*
 * token code of keyword i, or 0 if i lies outside the table.
 * The bound is the number of table entries, not the size in bytes, so the
 * check stays correct if token_t is wider than one byte.
 */
#define GetTokenValue(i) (((i) >= sizeof(tokens)/sizeof(tokens[0])) ? 0 : tokens[(i)])
/* start of the first character group of ch if any, k+1 is returned to distinguish between empty group (0) and keyword index 0 */
#define GetKeywordGroup(ch) (((ch)>='A'&&(ch)<='Z') ? keywordgroup[(ch)-'A'] : keywordgroup[26])
/* it is indeed k+1 that is stored, see above */
#define SetKeywordGroup(ch,k) (keywordgroup[((ch)>='A'&&(ch)<='Z') ? (ch)-'A' : 26]=(k)+1)
/*
 * Look ahead: scan the next token without consuming it.
 *
 * Afterwards L->token holds the token that follows, L->lasttoken the token
 * that was current before the call, L->next points behind the scanned token
 * and L->input is back at its old position. Use SkipToken to accept the token.
 * Everything else NextToken writes (L->last, value fields, L->line) is not rolled back.
 */
void LookAheadToken(lexer_t *L) {
  char *resume = L->input;   /* where scanning continues after the peek */

  L->lasttoken = L->token;
  NextToken(L);
  L->next = L->input;
  L->input = resume;
#if PROFILE>0
  /* a peeked token was counted by NextToken but is not consumed yet */
  L->ntoken -= 1;
#endif
}
/*
 * Accept the token scanned by a preceding LookAheadToken:
 * the input position jumps to L->next, i.e. behind that token.
 */
void SkipToken (lexer_t *L) {
#if PROFILE>0
  /* re-add the count that LookAheadToken took back */
  L->ntoken += 1;
#endif
  L->input = L->next;
}
/* Advance L->input over blanks and tabs; line breaks are not skipped. */
static void whitespaces(lexer_t *L) {
  char *p = L->input;

  while (*p == ' ' || *p == '\t') {
    p++;
  }
  L->input = p;
}
/*
 One-time init: computes the first character jump table (keywordgroup)
 from the keyword list.

 For every leading run of keywords starting with the same letter A-Z the
 index of the first entry is recorded (stored as index+1, 0 = no keyword).
 The first keyword that does not start with A-Z marks the beginning of the
 symbol keywords; its index is recorded in slot 26. If there is no such
 keyword, slot 26 stays empty and no letter group is overwritten.

 L is not used; the jump table is file-global.
*/
void LexerInit(lexer_t *L) {
  int j=0;
  char c;
  (void)L;
  // initialize and compute keyword jump table
  memset(keywordgroup,0,sizeof(keywordgroup));
  if (keyword[0]==NULL) return;   // empty keyword list, nothing to index
  c=keyword[0][0];
  while (keyword[j]!=NULL && c>='A' && c<='Z') {
    SetKeywordGroup(c,j);
    // skip all keywords of this group
    while (keyword[j]!=NULL && keyword[j][0]==c) j++;
    if (keyword[j]!=NULL) c=keyword[j][0];
  }
  // remaining entries (if any) are the keywords not starting with a letter
  if (keyword[j]!=NULL) SetKeywordGroup(c,j);
}
/*
 * Reset the lexer state and attach a new input buffer.
 * All fields are cleared first; then the three position pointers (input,
 * next, last) are set to the start of input, the current token to EOL and
 * the line counter to 1.
 */
void LexerSetup(lexer_t *L, char * input) {
  memset(L, 0, sizeof(lexer_t));
#if DEBUG > 0
  log(LOGINFO,"[PLX] Lexer Setup.\n",NULL);
#endif
  L->input = input;
  L->next = input;
  L->last = input;
  L->token = EOL;
  L->line = 1;
#if PROFILE>0
  L->ntoken = 0;
#endif
}
/*
 Main lexer: scans one token starting at L->input.

 Result:
   L->token   the token (keyword token, TNUMBER, TCHAR, TSTRING, TCCALL,
              TIDENTIFIER, TERROR, EOL or the plain character code)
   L->x       value of a TNUMBER
   L->arg,
   L->arglen  text of a TCHAR / TSTRING (points into the input buffer, into NL
              or into a cconst string; not zero terminated at arglen)
   L->ix      ccalls index of a TCCALL
   L->name    zero terminated name of a TIDENTIFIER
 L->last is set to the position where scanning started (before white space),
 L->input is advanced behind the token. L->next is updated on most, but not
 on all return paths.

 Letters of a scanned word are converted to upper case in place, i.e. the
 input buffer is modified.

 The scanner never moves L->input behind the terminating zero of the input.

 This lexer version supports only linear input buffers.
 A version for ring buffers (e.g., from link rx buffers) is required.
*/
void NextToken (lexer_t *L) {
  address_t k, l, i;
  char *ir;
  L->last=L->input;
  /* 0. Consume white spaces, then test for the end of the input.
        The test must come after the skip: otherwise trailing blanks would
        end up in the single character case below, which advances the input. */
  whitespaces(L);
  if (*L->input == 0) {
    L->token = EOL;
    return;
  }
#if PROFILE>0
  L->ntoken++;
#endif
  /* 1.0 comment? Skip up to the line end; the '\n' itself is handled at the bottom */
  if (*L->input=='-' && *(L->input+1)=='-') {
    while (*L->input && *L->input!='\n') L->input++;
    if (!*L->input) { L->token=EOL; return; }
  }
  /* 1a. unsigned numbers, value returned in L->x */
#if HAS_FLOAT==0
  if (*L->input <= '9' && *L->input >= '0') {
    L->input += ParseNumber(L->input, &L->x);
#else
  if ((*L->input <= '9' && *L->input >= '0') || *L->input == '.') {
    L->input += ParseNumber2(L->input, &L->x);
#endif
    // number in expression
    L->token = TNUMBER;
    L->next = L->input;
    return;
  }
  // 1b. character 'x', value returned in L->arg/arglen
  if (*L->input=='\'') {
    L->input++;
    if (*L->input=='\\') {
      // escape sequence
      L->input++;
      switch (*L->input) {
        case 'n': L->arg=NL; break;
        // any other escaped character stands for itself (e.g. '\'' and '\\')
        default: L->arg=L->input; break;
      }
    } else
      L->arg=L->input;
    if (*L->input==0) {
      // input ends inside the literal, do not step over the terminator
      L->token=TERROR;
      return;
    }
    L->input++;
    if (*L->input!='\'') {
      L->token=TERROR;
      return;
    }
    L->arglen=1;
    L->token=TCHAR;
    L->input++;
    return;
  }
  // 1.c string "text", value returned in L->arg/arglen
  if (*L->input=='"') {
    L->input++;
    L->arg=L->input;
    // string can be empty; stop at the closing quote or at the end of input
    while (*L->input!=0 && *L->input!='"') L->input++;
    if (*L->input!='"') {
      // unterminated string
      L->token=TERROR;
      return;
    }
    L->arglen=L->input-L->arg;
    L->token=TSTRING;
    L->input++;
    return;
  }
  /*
   * Keywords and variables
   *
   * Isolate a word, input points to the beginning, l is the length of the word.
   * ir points to the end of the word after isolating.
   * A word consists of letters, '@' and '_' only; digits end it.
   */
  l = 0;
  ir = L->input;
  for (;;) {
    if (*ir >= 'a' && *ir <= 'z') {
      *ir -= 32; /* toupper code, changing the input buffer directly */
      ir++;
      l++;
    } else if ((*ir >= '@' && *ir <= 'Z') || *ir == '_') {
      ir++;
      l++;
    } else {
      break;
    }
  }
  /*
   * 2. Keywords
   * Ir is reused here to implement string compares
   * scanning the keyword array.
   * Once a keyword is detected the input buffer is advanced
   * by its length, and the token value is returned.
   *
   * Keywords are an array of null terminated strings.
   * They are always matched uppercase.
   */
  if (l==0) {
    // special characters, no starting letter: search the symbol keyword group
    k = GetKeywordGroup(*L->input)-1;
    while (GetTokenValue(k) != 0) {
      char c;
      ir = GetKeyword(k); i = 0;
      c=/*btoupper*/(*(L->input + i)); /* name was already case converted above */
      if (c!=*ir) { k++; continue; }
      while (*(ir + i) != 0) {
        if (*(ir + i) != c) {
          k++; i = 0;
          break;
        } else
          i++;
        c=btoupper(*(L->input + i));
      }
      if (i == 0)
        continue;
      // we cannot check if this is only a partial match!
      L->input += i;
      L->token = GetTokenValue(k);
      if (L->token == TREM) {
        // consume remark line
        while (*L->input != '\0' && *L->input!='\n') L->input++;
      }
      L->next=L->input;
      return;
    }
  } else {
    // only keywords starting with a letter
    k = GetKeywordGroup(*L->input)-1; // get keyword group start based on first character
#if DEBUG>2
    print_format("lexer keyword? starting with token k=%d\n",k);
#endif
    if (k>=0) while (GetTokenValue(k) != 0) {
      char c;
      ir = GetKeyword(k); i = 0;
      c=/*btoupper*/(*(L->input + i));
      if (c!=*ir) {
        // skip to next character block
        if (c<*ir) break; // nothing more to search for; no keyword
        if (*ir < 'A' || *ir>'Z') break; // no more keywords starting with a letter
        if (GetKeywordGroup(c)>0) { k=GetKeywordGroup(c)-1; continue; }
        else break;
      }
      while (*(ir + i) != 0) {
        // i==l: the keyword is longer than the isolated word
        if (*(ir + i) != c || i==l) {
          k++; i = 0;
          break;
        } else
          i++;
        c=btoupper(*(L->input + i));
      }
      if (i == 0)
        continue;
      c = btoupper(*(L->input + i));
      // NOTE(review): '_' and '@' are identifier characters but are not tested
      // here, so e.g. IF_X is scanned as keyword IF followed by _X - confirm intent.
      if ((c>='A' && c<='Z') || (c>='0' && c<='9')) {
        // only partial match
        k++; i=0;
        continue;
      }
      L->input += i;
      L->token = GetTokenValue(k);
      if (L->token == TREM) {
        // consume remark line
        while (*L->input != '\0' && *L->input!='\n') L->input++;
      }
      L->next=L->input;
      return;
    }
  }
  /*
    Ccall extensions must be checked after built-in keyword search.
    The search stops as soon as the input character is smaller than the first
    character of an entry; ccalls[k].next (if non-zero) is used as the index
    to continue with when the first character differs.
  */
  k = 0;
  if (ccalls) while (ccalls[k].name != 0) {
    char c;
    ir = ccalls[k].name;
    // printf("ccalls[%d] %s\n",k,ir);
    i = 0;
    c = /*btoupper*/(*(L->input));
    if (c<*ir) break; // not a ccall
    if (c!=*ir) {
      if (ccalls[k].next) {
        k=ccalls[k].next;
        // printf("ccjump %c %d\n",c,k);
        continue;
      } else break;
    }
    while (*(ir + i) != 0) {
      if (*(ir + i) != btoupper(*(L->input + i))) {
        k++;
        i = 0;
        break;
      } else
        i++;
    }
    if (i == 0)
      continue;
    c = /*btoupper*/(*(L->input + i));
    if ((c>='A' && c<='Z') || (c>='0' && c<='9')) {
      // only partial match
      k++;
      continue;
    }
    L->input += i;
    L->next = L->input;
    L->token = TCCALL;
    L->ix = k;
    return;
  }
  /*
    Cconst extensions must be checked after built-in keyword search.
    Same search as for ccalls. An entry with a string yields TSTRING (value is
    used as the string length), any other entry yields TNUMBER.
  */
  k = 0;
  if (cconst) while (cconst[k].name != 0) {
    char c;
    ir = cconst[k].name;
    // printf("cconst[%d] %s\n",k,ir);
    i = 0;
    c = /*btoupper*/(*(L->input));
    if (c<*ir) break; // not a cconst
    if (c!=*ir) {
      if (cconst[k].next) {
        k=cconst[k].next;
        // printf("ccjump %c %d\n",c,k);
        continue;
      } else break;
    }
    while (*(ir + i) != 0) {
      if (*(ir + i) != btoupper(*(L->input + i))) {
        k++;
        i = 0;
        break;
      } else
        i++;
    }
    if (i == 0)
      continue;
    c = /*btoupper*/(*(L->input + i));
    if ((c>='A' && c<='Z') || (c>='0' && c<='9')) {
      // only partial match
      k++;
      continue;
    }
    L->input += i;
    L->next = L->input;
    if (cconst[k].string) {
      L->token = TSTRING;
      L->arg = cconst[k].string;
      L->arglen = (index_t)cconst[k].value;
    } else {
      L->token = TNUMBER;
      L->x = cconst[k].value;
    }
    return;
  }
  // X. Identifier
  // NOTE(review): a word with more than MAXNAMELEN leading letters is not
  // handled here and falls through to the single character case - confirm intent.
  if (l > 0 && l <= MAXNAMELEN) {
    l=0;
    L->token = TIDENTIFIER; // can be variable, string variable, array variable, function, resolved in code.CompileLine
    // The name continues over digits and further letters (letters following a
    // digit were not case converted by the word scan above).
    while ((*L->input >= '0' && *L->input <= '9') || (*L->input >= '@' && *L->input <= 'Z') ||
           (*L->input >= 'a' && *L->input <= 'z') || (*L->input == '_')) {
      if (l < MAXNAMELEN) {
        L->name[l] = *L->input;
        l++;
      } else {
        // name too long: never write behind L->name (assumed to hold
        // MAXNAMELEN characters plus terminator), report an error instead
        L->token = TERROR;
      }
      L->input++;
    }
    L->name[l]=0;
    L->next=L->input;
    return;
  }
  /* other single characters are parsed and stored; the input is not at its end here */
  L->token = *L->input;
  if (*L->input == '\n') { L->line++; L->token = EOL; }
  L->input++;
  return;
}