Mon 16 Mar 11:09:06 CET 2026
This commit is contained in:
parent
713711821b
commit
3c12254ac5
485
src/lexer.c
Normal file
485
src/lexer.c
Normal file
|
|
@ -0,0 +1,485 @@
|
|||
#include "config.h"
|
||||
#include "types.h"
|
||||
#include "tokens.h"
|
||||
#include "lexer.h"
|
||||
#include "parsef.h"
|
||||
#include "printf.h"
|
||||
#include "ccall.h"
|
||||
#include "cconst.h"
|
||||
#include "log.h"
|
||||
|
||||
/* Storage for a newline character; NextToken points L->arg here when it
   reads the character literal escape '\n'. */
static char NL[]="\n";
|
||||
|
||||
/*
  All name lists must be sorted (wrt. first character)!

  NULL terminated keyword dictionary. LexerInit derives the first-character
  jump table from this order: all keywords starting with the same letter must
  be adjacent, letter keywords come first, and keywords that do not start
  with a letter (operators, remark) form the last group.
  Entry i of this table corresponds to entry i of tokens[].
*/
const char * keyword[] = {
  "AND", "AWAIT",
  "BREAK",
  "CALL", "CONST", "CONTINUE",
  "DATA", "DELAY", "DO",
  "ELSE", "END", "EVENT",
  "FOR", "FORK", "FUNC",
  "GO",
  "IF",
#if HAS_LOCK > 0
  "LOCK",
#endif
  "MOD",
  "NOT",
  "ON", "OR",
  "PROC",
  "REPEAT", "RETURN",
  "STOP",
  "THEN",
#if HAS_LOCK > 0
  "UNLOCK",
#endif
  "VAR",
  "WHILE",
  "YIELD",
  /* keywords without a starting letter */
  "--",
  "<=",
  ">=",
  "<>",
  NULL
};
|
||||
|
||||
/* the zero terminated token dictionary needed for scalability */
|
||||
const token_t tokens[] = {
|
||||
TAND,
|
||||
TAWAIT,
|
||||
TBREAK,
|
||||
TCALL,
|
||||
TCONST,
|
||||
TCONTINUE,
|
||||
TDATA,
|
||||
TDELAY,
|
||||
TDO,
|
||||
TELSE,
|
||||
TEND,
|
||||
TEVENT,
|
||||
TFOR,
|
||||
TFORK,
|
||||
TFUNC,
|
||||
TGO,
|
||||
TIF,
|
||||
#if HAS_LOCK > 0
|
||||
TLOCK,
|
||||
#endif
|
||||
TMODULO,
|
||||
TNOT,
|
||||
TON,
|
||||
TOR,
|
||||
TPROC,
|
||||
TREPEAT,
|
||||
TRETURN,
|
||||
TSTOP,
|
||||
TTHEN,
|
||||
#if HAS_LOCK > 0
|
||||
TUNLOCK,
|
||||
#endif
|
||||
TVARDEF,
|
||||
TWHILE,
|
||||
TYIELD,
|
||||
TREM,
|
||||
TLEQ,
|
||||
TGEQ,
|
||||
TNEQ,
|
||||
0
|
||||
};
|
||||
|
||||
/* Defined in another translation unit; not referenced in the part of this
   file reviewed here. */
extern int verbose;
|
||||
|
||||
/*
|
||||
Computed keyword first character group jump table for fast keyword table search
|
||||
A=0, B=1, ...
|
||||
index=26 => all other non-alphanumeric keywords
|
||||
*/
|
||||
static index_t keywordgroup[27];
|
||||
|
||||
/* upper case, don't trust the builtins on microcontrollers
   (ch is evaluated more than once: pass expressions without side effects) */
#define btoupper(ch) ((ch) >= 'a' && (ch) <= 'z' ? (ch) - 32 : (ch))

/* keyword string of dictionary entry i */
#define GetKeyword(i) ((char*)keyword[(i)])

/* tokens read here are token_t constructed from multi byte sequences;
   any index outside of the tokens[] array (counted in elements, not bytes)
   yields the terminator value 0 */
#define GetTokenValue(i) ((i) >= sizeof(tokens)/sizeof(tokens[0]) ? 0 : tokens[(i)])

/* skip to next first character group if any, k+1 is returned to distinguish between empty group and keyword index 0 */
#define GetKeywordGroup(ch) ((ch) >= 'A' && (ch) <= 'Z' ? keywordgroup[(ch) - 'A'] : keywordgroup[26])

/* it is indeed k+1, see above */
#define SetKeywordGroup(ch,k) (keywordgroup[(ch) >= 'A' && (ch) <= 'Z' ? (ch) - 'A' : 26] = (k) + 1)
|
||||
|
||||
// Lookahead of next token
|
||||
void LookAheadToken(lexer_t *L) {
|
||||
char *bi=L->input;
|
||||
L->lasttoken=L->token;
|
||||
NextToken(L);
|
||||
L->next=L->input;
|
||||
L->input=bi;
|
||||
#if PROFILE>0
|
||||
L->ntoken--;
|
||||
#endif
|
||||
}
|
||||
|
||||
// following LookAheadToken
|
||||
void SkipToken (lexer_t *L) {
|
||||
L->input=L->next;
|
||||
#if PROFILE>0
|
||||
L->ntoken++;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* consume whitespaces */
|
||||
static void whitespaces(lexer_t *L) {
|
||||
while (*L->input == ' ' || *L->input == '\t') L->input++;
|
||||
}
|
||||
|
||||
/*
|
||||
One-time init
|
||||
*/
|
||||
void LexerInit(lexer_t *L) {
|
||||
int i,j=0;
|
||||
// initialize and compute keyword jump table
|
||||
memset(keywordgroup,0,27*sizeof(index_t));
|
||||
char c=keyword[0][0];
|
||||
do {
|
||||
if (c<'A'||c>'Z') break;
|
||||
SetKeywordGroup(c,j);
|
||||
while (keyword[j]!=NULL && keyword[j][0]==c) j++;
|
||||
if (keyword[j]!=NULL) c=keyword[j][0];
|
||||
} while(keyword[j]!=NULL);
|
||||
SetKeywordGroup(c,j);
|
||||
}
|
||||
|
||||
/* Initialize the lexer state and assign input */
|
||||
void LexerSetup(lexer_t *L, char * input) {
|
||||
memset(L,0,sizeof(lexer_t));
|
||||
#if DEBUG > 0
|
||||
log(LOGINFO,"[PLX] Lexer Setup.\n",NULL);
|
||||
#endif
|
||||
L->last=L->next=L->input=input;
|
||||
L->token=EOL;
|
||||
L->line=1;
|
||||
|
||||
#if PROFILE>0
|
||||
L->ntoken=0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Main lexer. L->input is incremented, last position is saved, result is in L->token and L->x|L->name ..
|
||||
This lexer version supports only linear input buffers.
|
||||
A version for ring buffers (e.g., from link rx buffers) is requied.
|
||||
*/
|
||||
void NextToken (lexer_t *L) {
|
||||
address_t k, l, i;
|
||||
char *ir;
|
||||
char quotechar;
|
||||
|
||||
L->last=L->input;
|
||||
/* end of line token */
|
||||
if (*L->input == 0) {
|
||||
L->token = EOL;
|
||||
return;
|
||||
}
|
||||
|
||||
/* 0. Consume white spaces */
|
||||
|
||||
whitespaces(L);
|
||||
|
||||
#if PROFILE>0
|
||||
L->ntoken++;
|
||||
#endif
|
||||
|
||||
|
||||
/* 1.0 comment? */
|
||||
if (*L->input=='-' && *(L->input+1)=='-') {
|
||||
while (*L->input && *L->input!='\n') L->input++;
|
||||
if (!*L->input) { L->token=EOL; return; }
|
||||
}
|
||||
|
||||
/* 1a. unsigned numbers, value returned in L->x */
|
||||
#if HAS_FLOAT==0
|
||||
if (*L->input <= '9' && *L->input >= '0') {
|
||||
L->input += ParseNumber(L->input, &L->x);
|
||||
#else
|
||||
if ((*L->input <= '9' && *L->input >= '0') || *L->input == '.') {
|
||||
L->input += ParseNumber2(L->input, &L->x);
|
||||
#endif
|
||||
// number in expression
|
||||
L->token = TNUMBER;
|
||||
L->next = L->input;
|
||||
return;
|
||||
}
|
||||
// 1b. character 'x', value returned in L->arg/arglen
|
||||
if (*L->input=='\'') {
|
||||
L->input++;
|
||||
if (*L->input=='\\') {
|
||||
//escape sequence
|
||||
L->input++;
|
||||
switch (*L->input) {
|
||||
case 'n': L->arg=NL;
|
||||
}
|
||||
} else
|
||||
L->arg=L->input;
|
||||
L->input++;
|
||||
if (*L->input!='\'') {
|
||||
L->token=TERROR;
|
||||
return;
|
||||
}
|
||||
L->arglen=1;
|
||||
L->token=TCHAR;
|
||||
L->input++;
|
||||
return;
|
||||
}
|
||||
// 1.c string "text", value returned in L->arg/arglen
|
||||
if (*L->input=='"') {
|
||||
L->input++;
|
||||
L->arg=L->input;
|
||||
// string can be empyt
|
||||
if (*L->input!='"') do { *L->input++; } while (*L->input!=0 && *L->input!='"');
|
||||
if (*L->input!='"') {
|
||||
L->token=TERROR;
|
||||
return;
|
||||
}
|
||||
L->arglen=L->input-L->arg;
|
||||
L->token=TSTRING;
|
||||
L->input++;
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Keywords and variables
|
||||
*
|
||||
* Isolate a word, input points to the beginning, l is the length of the word.
|
||||
* ir points to the end of the word after isolating.
|
||||
*/
|
||||
|
||||
l = 0;
|
||||
ir = L->input;
|
||||
while (-1) {
|
||||
if (*ir >= 'a' && *ir <= 'z') {
|
||||
*ir -= 32; /* toupper code, changing the input buffer directly */
|
||||
ir++;
|
||||
l++;
|
||||
} else if ((*ir >= '@' && *ir <= 'Z') || *ir == '_') {
|
||||
ir++;
|
||||
l++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* 2. Keywords
|
||||
* Ir is reused here to implement string compares
|
||||
* scanning the keyword array.
|
||||
* Once a keyword is detected the input buffer is advanced
|
||||
* by its length, and the token value is returned.
|
||||
*
|
||||
* Keywords are an array of null terminated strings.
|
||||
* They are always matched uppercase.
|
||||
*/
|
||||
if (l==0) {
|
||||
// special characters, no starting letter
|
||||
k = GetKeywordGroup(*L->input)-1;
|
||||
|
||||
while (GetTokenValue(k) != 0) {
|
||||
char c;
|
||||
ir = GetKeyword(k); i = 0;
|
||||
c=/*btoupper*/(*(L->input + i)); /* name was alreday case converted above */
|
||||
|
||||
if (c!=*ir) { k++; continue; }
|
||||
while (*(ir + i) != 0) {
|
||||
if (*(ir + i) != c) {
|
||||
k++; i = 0;
|
||||
break;
|
||||
} else
|
||||
i++;
|
||||
c=btoupper(*(L->input + i));
|
||||
}
|
||||
if (i == 0)
|
||||
continue;
|
||||
// we cannot check if this is only a partial match!
|
||||
L->input += i;
|
||||
L->token = GetTokenValue(k);
|
||||
if (L->token == TREM) {
|
||||
// consume remark line
|
||||
while (*L->input != '\0' && *L->input!='\n') L->input++;
|
||||
}
|
||||
L->next=L->input;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// onyl keyword starting with a letter
|
||||
k = GetKeywordGroup(*L->input)-1; // get keyword group start based on first character
|
||||
#if DEBUG>2
|
||||
print_format("lexer keyword? starting with token k=%d\n",k);
|
||||
#endif
|
||||
if (k>=0) while (GetTokenValue(k) != 0) {
|
||||
char c;
|
||||
ir = GetKeyword(k); i = 0;
|
||||
c=/*btoupper*/(*(L->input + i));
|
||||
if (c!=*ir) {
|
||||
// skip to next character block
|
||||
if (c<*ir) break; // nothing more to search for; no keyword
|
||||
if (*ir < 'A' || *ir>'Z') break; // no more keywords starting with a letter
|
||||
if (GetKeywordGroup(c)>0) { k=GetKeywordGroup(c)-1; continue; }
|
||||
else break;
|
||||
}
|
||||
while (*(ir + i) != 0) {
|
||||
if (*(ir + i) != c || i==l) {
|
||||
k++; i = 0;
|
||||
break;
|
||||
} else
|
||||
i++;
|
||||
c=btoupper(*(L->input + i));
|
||||
}
|
||||
if (i == 0)
|
||||
continue;
|
||||
c = btoupper(*(L->input + i));
|
||||
if ((c>='A' && c<='Z') || (c>='0' && c<='9')) {
|
||||
// only partial match
|
||||
k++; i=0;
|
||||
continue;
|
||||
}
|
||||
L->input += i;
|
||||
L->token = GetTokenValue(k);
|
||||
if (L->token == TREM) {
|
||||
// consume remark line
|
||||
while (*L->input != '\0' && *L->input!='\n') L->input++;
|
||||
}
|
||||
L->next=L->input;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Ccall extensions must be checked after built-in keyword search
|
||||
*/
|
||||
k = 0;
|
||||
if (ccalls) while (ccalls[k].name != 0) {
|
||||
char c;
|
||||
ir = ccalls[k].name;
|
||||
// printf("ccalls[%d] %s\n",k,ir);
|
||||
i = 0;
|
||||
c = /*btoupper*/(*(L->input));
|
||||
if (c<*ir) break; // not a ccall
|
||||
if (c!=*ir) {
|
||||
if (ccalls[k].next) {
|
||||
k=ccalls[k].next;
|
||||
// printf("ccjump %c %d\n",c,k);
|
||||
continue;
|
||||
} else break;
|
||||
}
|
||||
while (*(ir + i) != 0) {
|
||||
if (*(ir + i) != btoupper(*(L->input + i))) {
|
||||
k++;
|
||||
i = 0;
|
||||
break;
|
||||
} else
|
||||
i++;
|
||||
}
|
||||
if (i == 0)
|
||||
continue;
|
||||
c = /*btoupper*/(*(L->input + i));
|
||||
if ((c>='A' && c<='Z') || (c>='0' && c<='9')) {
|
||||
// only partial match
|
||||
k++;
|
||||
continue;
|
||||
}
|
||||
|
||||
L->input += i;
|
||||
L->next = L->input;
|
||||
L->token = TCCALL;
|
||||
L->ix = k;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
Cconst extensions must be checked after built-in keyword search
|
||||
*/
|
||||
k = 0;
|
||||
if (cconst) while (cconst[k].name != 0) {
|
||||
char c;
|
||||
ir = cconst[k].name;
|
||||
// printf("ccalls[%d] %s\n",k,ir);
|
||||
i = 0;
|
||||
c = /*btoupper*/(*(L->input));
|
||||
if (c<*ir) break; // not a ccall
|
||||
if (c!=*ir) {
|
||||
if (cconst[k].next) {
|
||||
k=cconst[k].next;
|
||||
// printf("ccjump %c %d\n",c,k);
|
||||
continue;
|
||||
} else break;
|
||||
}
|
||||
while (*(ir + i) != 0) {
|
||||
if (*(ir + i) != btoupper(*(L->input + i))) {
|
||||
k++;
|
||||
i = 0;
|
||||
break;
|
||||
} else
|
||||
i++;
|
||||
}
|
||||
if (i == 0)
|
||||
continue;
|
||||
c = /*btoupper*/(*(L->input + i));
|
||||
if ((c>='A' && c<='Z') || (c>='0' && c<='9')) {
|
||||
// only partial match
|
||||
k++;
|
||||
continue;
|
||||
}
|
||||
|
||||
L->input += i;
|
||||
L->next = L->input;
|
||||
if (cconst[k].string) {
|
||||
L->token = TSTRING;
|
||||
L->arg = cconst[k].string;
|
||||
L->arglen = (index_t)cconst[k].value;
|
||||
} else {
|
||||
L->token = TNUMBER;
|
||||
L->x = cconst[k].value;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// X. Identifier
|
||||
if (l > 0 && l <= MAXNAMELEN) {
|
||||
int i;
|
||||
l=0;
|
||||
L->token = TIDENTIFIER; // can be variable, string variable, array variable, function, resolved in code.CompileLine
|
||||
while (((*L->input >= '0' && *L->input <= '9') || (*L->input >= '@' && *L->input <= 'Z') ||
|
||||
(*L->input >= 'a' && *L->input <= 'z') || (*L->input == '_')) &&
|
||||
*L->input != 0) {
|
||||
L->name[l] = *L->input;
|
||||
L->input++; l++;
|
||||
}
|
||||
L->name[l]=0;
|
||||
L->next=L->input;
|
||||
return;
|
||||
}
|
||||
|
||||
/* other single characters are parsed and stored */
|
||||
L->token = *L->input;
|
||||
if (*L->input == '\n') { L->line++; L->token = EOL; }
|
||||
L->input++;
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user