diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..dc0c162 --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,485 @@ +#include "config.h" +#include "types.h" +#include "tokens.h" +#include "lexer.h" +#include "parsef.h" +#include "printf.h" +#include "ccall.h" +#include "cconst.h" +#include "log.h" + +static char NL[]="\n"; + +/* + All name lists must be sorted (wrt. first character)! +*/ +const char * keyword[] = { + "AND", + "AWAIT", + "BREAK", + "CALL", + "CONST", + "CONTINUE", + "DATA", + "DELAY", + "DO", + "ELSE", + "END", + "EVENT", + "FOR", + "FORK", + "FUNC", + "GO", + "IF", +#if HAS_LOCK > 0 + "LOCK", +#endif + "MOD", + "NOT", + "ON", + "OR", + "PROC", + "REPEAT", + "RETURN", + "STOP", + "THEN", +#if HAS_LOCK > 0 + "UNLOCK", +#endif + "VAR", + "WHILE", + "YIELD", + "--", + "<=", + ">=", + "<>", + NULL +}; + +/* the zero terminated token dictionary needed for scalability */ +const token_t tokens[] = { + TAND, + TAWAIT, + TBREAK, + TCALL, + TCONST, + TCONTINUE, + TDATA, + TDELAY, + TDO, + TELSE, + TEND, + TEVENT, + TFOR, + TFORK, + TFUNC, + TGO, + TIF, +#if HAS_LOCK > 0 + TLOCK, +#endif + TMODULO, + TNOT, + TON, + TOR, + TPROC, + TREPEAT, + TRETURN, + TSTOP, + TTHEN, +#if HAS_LOCK > 0 + TUNLOCK, +#endif + TVARDEF, + TWHILE, + TYIELD, + TREM, + TLEQ, + TGEQ, + TNEQ, + 0 +}; + +extern int verbose; + +/* + Computed keyword first character group jump table for fast keyword table search + A=0, B=1, ... + index=26 => all other non-alphanumeric keywords +*/ +static index_t keywordgroup[27]; + +/* upper case, don't trust the buildins on microcontrollers */ +#define btoupper(ch) (ch >= 'a' && ch <= 'z'?ch-32:ch) + +#define GetKeyword(i) (char*)keyword[i] + +/* tokens read here are token_t constructed from multi byte sequences */ +#define GetTokenValue(i) (i >= sizeof(tokens)?0:tokens[i]) + +/* skip to next first character group if any, k+1 is returned to distinguish between empty group and keyword index 0 */ +#define GetKeywordGroup(ch) (ch>='A'&&ch<='Z'?keywordgroup[ch-'A']:keywordgroup[26]) + +/* it is indeed k+1, see above */ +#define SetKeywordGroup(ch,k) keywordgroup[ch>='A'&&ch<='Z'?ch-'A':26]=k+1 + +// Lookahead of next token +void LookAheadToken(lexer_t *L) { + char *bi=L->input; + L->lasttoken=L->token; + NextToken(L); + L->next=L->input; + L->input=bi; + #if PROFILE>0 + L->ntoken--; + #endif +} + +// following LookAheadToken +void SkipToken (lexer_t *L) { + L->input=L->next; + #if PROFILE>0 + L->ntoken++; + #endif +} + +/* consume whitespaces */ +static void whitespaces(lexer_t *L) { + while (*L->input == ' ' || *L->input == '\t') L->input++; +} + +/* + One-time init +*/ +void LexerInit(lexer_t *L) { + int i,j=0; + // initialize and compute keyword jump table + memset(keywordgroup,0,27*sizeof(index_t)); + char c=keyword[0][0]; + do { + if (c<'A'||c>'Z') break; + SetKeywordGroup(c,j); + while (keyword[j]!=NULL && keyword[j][0]==c) j++; + if (keyword[j]!=NULL) c=keyword[j][0]; + } while(keyword[j]!=NULL); + SetKeywordGroup(c,j); +} + +/* Initialize the lexer state and assign input */ +void LexerSetup(lexer_t *L, char * input) { + memset(L,0,sizeof(lexer_t)); +#if DEBUG > 0 + log(LOGINFO,"[PLX] Lexer Setup.\n",NULL); +#endif + L->last=L->next=L->input=input; + L->token=EOL; + L->line=1; + +#if PROFILE>0 + L->ntoken=0; +#endif +} + + +/* + Main lexer. L->input is incremented, last position is saved, result is in L->token and L->x|L->name .. + This lexer version supports only linear input buffers. + A version for ring buffers (e.g., from link rx buffers) is requied. +*/ +void NextToken (lexer_t *L) { + address_t k, l, i; + char *ir; + char quotechar; + + L->last=L->input; + /* end of line token */ + if (*L->input == 0) { + L->token = EOL; + return; + } + + /* 0. Consume white spaces */ + + whitespaces(L); + +#if PROFILE>0 + L->ntoken++; +#endif + + + /* 1.0 comment? */ + if (*L->input=='-' && *(L->input+1)=='-') { + while (*L->input && *L->input!='\n') L->input++; + if (!*L->input) { L->token=EOL; return; } + } + + /* 1a. unsigned numbers, value returned in L->x */ +#if HAS_FLOAT==0 + if (*L->input <= '9' && *L->input >= '0') { + L->input += ParseNumber(L->input, &L->x); +#else + if ((*L->input <= '9' && *L->input >= '0') || *L->input == '.') { + L->input += ParseNumber2(L->input, &L->x); +#endif + // number in expression + L->token = TNUMBER; + L->next = L->input; + return; + } + // 1b. character 'x', value returned in L->arg/arglen + if (*L->input=='\'') { + L->input++; + if (*L->input=='\\') { + //escape sequence + L->input++; + switch (*L->input) { + case 'n': L->arg=NL; + } + } else + L->arg=L->input; + L->input++; + if (*L->input!='\'') { + L->token=TERROR; + return; + } + L->arglen=1; + L->token=TCHAR; + L->input++; + return; + } + // 1.c string "text", value returned in L->arg/arglen + if (*L->input=='"') { + L->input++; + L->arg=L->input; + // string can be empyt + if (*L->input!='"') do { *L->input++; } while (*L->input!=0 && *L->input!='"'); + if (*L->input!='"') { + L->token=TERROR; + return; + } + L->arglen=L->input-L->arg; + L->token=TSTRING; + L->input++; + return; + } + /* + * Keywords and variables + * + * Isolate a word, input points to the beginning, l is the length of the word. + * ir points to the end of the word after isolating. + */ + + l = 0; + ir = L->input; + while (-1) { + if (*ir >= 'a' && *ir <= 'z') { + *ir -= 32; /* toupper code, changing the input buffer directly */ + ir++; + l++; + } else if ((*ir >= '@' && *ir <= 'Z') || *ir == '_') { + ir++; + l++; + } else { + break; + } + } + + /* + * 2. Keywords + * Ir is reused here to implement string compares + * scanning the keyword array. + * Once a keyword is detected the input buffer is advanced + * by its length, and the token value is returned. + * + * Keywords are an array of null terminated strings. + * They are always matched uppercase. + */ + if (l==0) { + // special characters, no starting letter + k = GetKeywordGroup(*L->input)-1; + + while (GetTokenValue(k) != 0) { + char c; + ir = GetKeyword(k); i = 0; + c=/*btoupper*/(*(L->input + i)); /* name was alreday case converted above */ + + if (c!=*ir) { k++; continue; } + while (*(ir + i) != 0) { + if (*(ir + i) != c) { + k++; i = 0; + break; + } else + i++; + c=btoupper(*(L->input + i)); + } + if (i == 0) + continue; + // we cannot check if this is only a partial match! + L->input += i; + L->token = GetTokenValue(k); + if (L->token == TREM) { + // consume remark line + while (*L->input != '\0' && *L->input!='\n') L->input++; + } + L->next=L->input; + return; + } + } else { + // onyl keyword starting with a letter + k = GetKeywordGroup(*L->input)-1; // get keyword group start based on first character +#if DEBUG>2 + print_format("lexer keyword? starting with token k=%d\n",k); +#endif + if (k>=0) while (GetTokenValue(k) != 0) { + char c; + ir = GetKeyword(k); i = 0; + c=/*btoupper*/(*(L->input + i)); + if (c!=*ir) { + // skip to next character block + if (c<*ir) break; // nothing more to search for; no keyword + if (*ir < 'A' || *ir>'Z') break; // no more keywords starting with a letter + if (GetKeywordGroup(c)>0) { k=GetKeywordGroup(c)-1; continue; } + else break; + } + while (*(ir + i) != 0) { + if (*(ir + i) != c || i==l) { + k++; i = 0; + break; + } else + i++; + c=btoupper(*(L->input + i)); + } + if (i == 0) + continue; + c = btoupper(*(L->input + i)); + if ((c>='A' && c<='Z') || (c>='0' && c<='9')) { + // only partial match + k++; i=0; + continue; + } + L->input += i; + L->token = GetTokenValue(k); + if (L->token == TREM) { + // consume remark line + while (*L->input != '\0' && *L->input!='\n') L->input++; + } + L->next=L->input; + return; + } + } + +/* + Ccall extensions must be checked after built-in keyword search +*/ + k = 0; + if (ccalls) while (ccalls[k].name != 0) { + char c; + ir = ccalls[k].name; + // printf("ccalls[%d] %s\n",k,ir); + i = 0; + c = /*btoupper*/(*(L->input)); + if (c<*ir) break; // not a ccall + if (c!=*ir) { + if (ccalls[k].next) { + k=ccalls[k].next; + // printf("ccjump %c %d\n",c,k); + continue; + } else break; + } + while (*(ir + i) != 0) { + if (*(ir + i) != btoupper(*(L->input + i))) { + k++; + i = 0; + break; + } else + i++; + } + if (i == 0) + continue; + c = /*btoupper*/(*(L->input + i)); + if ((c>='A' && c<='Z') || (c>='0' && c<='9')) { + // only partial match + k++; + continue; + } + + L->input += i; + L->next = L->input; + L->token = TCCALL; + L->ix = k; + return; + } + +/* + Cconst extensions must be checked after built-in keyword search +*/ + k = 0; + if (cconst) while (cconst[k].name != 0) { + char c; + ir = cconst[k].name; + // printf("ccalls[%d] %s\n",k,ir); + i = 0; + c = /*btoupper*/(*(L->input)); + if (c<*ir) break; // not a ccall + if (c!=*ir) { + if (cconst[k].next) { + k=cconst[k].next; + // printf("ccjump %c %d\n",c,k); + continue; + } else break; + } + while (*(ir + i) != 0) { + if (*(ir + i) != btoupper(*(L->input + i))) { + k++; + i = 0; + break; + } else + i++; + } + if (i == 0) + continue; + c = /*btoupper*/(*(L->input + i)); + if ((c>='A' && c<='Z') || (c>='0' && c<='9')) { + // only partial match + k++; + continue; + } + + L->input += i; + L->next = L->input; + if (cconst[k].string) { + L->token = TSTRING; + L->arg = cconst[k].string; + L->arglen = (index_t)cconst[k].value; + } else { + L->token = TNUMBER; + L->x = cconst[k].value; + } + return; + } + + // X. Identifier + if (l > 0 && l <= MAXNAMELEN) { + int i; + l=0; + L->token = TIDENTIFIER; // can be variable, string variable, array variable, function, resolved in code.CompileLine + while (((*L->input >= '0' && *L->input <= '9') || (*L->input >= '@' && *L->input <= 'Z') || + (*L->input >= 'a' && *L->input <= 'z') || (*L->input == '_')) && + *L->input != 0) { + L->name[l] = *L->input; + L->input++; l++; + } + L->name[l]=0; + L->next=L->input; + return; + } + + /* other single characters are parsed and stored */ + L->token = *L->input; + if (*L->input == '\n') { L->line++; L->token = EOL; } + L->input++; + + return; + +} +