#include "lexer.h"
|
|
#include "aliases.h"
|
|
#include "dstring.h"
|
|
#include "lexer_data.h"
|
|
#include <assert.h>
|
|
#include <ctype.h>
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#define MAX_KEYWORD_LENGTH 5
|
|
#define UNICODE_LENGTH 4
|
|
#define MAX_STACK_CAPACITY 1024
|
|
#define STRING_BUF_START_CAPACITY 1024
|
|
|
|
typedef struct {
|
|
lexer_state_t stack[MAX_STACK_CAPACITY];
|
|
u64 size;
|
|
} state_stack_t;
|
|
|
|
typedef enum {
|
|
LEXER_STRING_KEYWORD,
|
|
LEXER_STRING_UNICODE,
|
|
} lex_str_type;
|
|
|
|
typedef struct {
|
|
char str[MAX_KEYWORD_LENGTH + 1];
|
|
} keyword_t;
|
|
|
|
typedef struct {
|
|
char codepoint[UNICODE_LENGTH];
|
|
} unicode_t;
|
|
|
|
typedef struct {
|
|
lex_str_type type;
|
|
u64 size;
|
|
union {
|
|
keyword_t keyword;
|
|
unicode_t unicode;
|
|
};
|
|
} lexer_string_t;
|
|
|
|
struct lexer_s {
|
|
u64 cursor;
|
|
u64 line;
|
|
u64 column;
|
|
u64 text_length;
|
|
const char *text;
|
|
lexer_state_t current;
|
|
lexer_state_t next;
|
|
state_stack_t stack;
|
|
lexer_string_t keyword;
|
|
lexer_string_t codepoint;
|
|
dstr_t *current_string;
|
|
bool token_ready;
|
|
token_t token;
|
|
bool has_extra_token;
|
|
token_t extra_token;
|
|
dstr_t *error_message;
|
|
char current_char;
|
|
lexer_input_t current_input;
|
|
};
|
|
|
|
INTERNAL lexer_input_t char_type(char input);
|
|
|
|
INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value);
|
|
INTERNAL lexer_state_t stack_pop(state_stack_t *stack);
|
|
|
|
INTERNAL token_t dstr_to_numerical_token(const dstr_t *str);
|
|
INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type,
|
|
token_value_t value);
|
|
|
|
INTERNAL void finalise_state_transition(lexer_t *lexer);
|
|
INTERNAL void handle_object_end(lexer_t *lexer);
|
|
INTERNAL void handle_array_end(lexer_t *lexer);
|
|
INTERNAL void handle_string_end(lexer_t *lexer);
|
|
INTERNAL void post_keyword(lexer_t *lexer);
|
|
INTERNAL void set_numerical_token(lexer_t *lexer);
|
|
|
|
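/*
 * Table-driven state machine: state_table[current][input] holds the lexer
 * state reached when an input of class `input` is seen in state `current`.
 * The transition entries live in lexer_state_transitions.table and are
 * textually included below.
 */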
INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
#include "lexer_state_transitions.table"
};

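/*
 * Allocates and initialises a lexer.  A previously allocated instance passed
 * in through *lexer is freed first.  On failure *lexer ends up as NULL.
 */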
void lexer_init(lexer_t **lexer) {
  if (*lexer) {
    lexer_free(lexer);
  }

  *lexer = (lexer_t *)malloc(sizeof(lexer_t));

  if (!(*lexer)) {
    return;
  }

  (*lexer)->cursor = 0;
  (*lexer)->line = 1;
  (*lexer)->column = 0;
  (*lexer)->text_length = 0;
  (*lexer)->text = "";
  (*lexer)->current = LEXER_STATE_START;
  (*lexer)->next = LEXER_STATE_START;
  (*lexer)->stack.size = 0;
  (*lexer)->keyword.type = LEXER_STRING_KEYWORD;
  (*lexer)->keyword.size = 0;
  (*lexer)->codepoint.type = LEXER_STRING_UNICODE;
  (*lexer)->codepoint.size = 0;
  (*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY);
  (*lexer)->error_message = dstr_with_capacity(STRING_BUF_START_CAPACITY);
  (*lexer)->token_ready = false;
  (*lexer)->token = (token_t){0};
  (*lexer)->has_extra_token = false;
  (*lexer)->extra_token = (token_t){0};

  // Both dynamic buffers are required; abort initialisation if either
  // allocation failed.
  if (!((*lexer)->current_string) || !((*lexer)->error_message)) {
    lexer_free(lexer);
  }
}

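/* Releases the lexer's string buffers and the lexer itself; *lexer is reset to NULL. */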
void lexer_free(lexer_t **lexer) {
  if (!(*lexer)) {
    return;
  }

  dstr_free(&((*lexer)->current_string));
  dstr_free(&((*lexer)->error_message));

  free(*lexer);
  *lexer = NULL;
}

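/*
 * Returns the next token from the input.  Passing a non-NULL `text` starts
 * lexing that buffer from the beginning; passing NULL continues with the
 * buffer supplied on an earlier call.  The function returns as soon as a
 * token is ready or an error is encountered; when the buffer is exhausted it
 * returns LEX_ERR_NONE together with a zero-initialised token.
 *
 * A minimal usage sketch (the lex_result_t member names `err` and `token`
 * are assumed here from the positional initialisers below; check lexer.h for
 * the actual declaration):
 *
 *   lexer_t *lexer = NULL;
 *   lexer_init(&lexer);
 *
 *   lex_result_t result = get_next_token(lexer, "{\"key\": 42}");
 *
 *   while (result.err.errno == LEX_ERR_NONE &&
 *          result.token.type != TK_NO_TOKEN) {
 *     print_token(result.token);
 *     result = get_next_token(lexer, NULL);
 *   }
 *
 *   lexer_free(&lexer);
 */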
lex_result_t get_next_token(lexer_t *lexer, const char *text) {
  if (text != NULL) {
    lexer->cursor = 0;
    lexer->text = text;
    lexer->text_length = strlen(text);
  }

  dstr_clear(lexer->current_string);

  while (lexer->cursor < lexer->text_length) {
    if (lexer->has_extra_token) {
      lexer->has_extra_token = false;

      return (lex_result_t){
          (lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
          lexer->extra_token,
      };
    }

    lexer->current_char = lexer->text[(lexer->cursor)++];

    lexer->current_input = char_type(lexer->current_char);

    lexer->next = state_table[lexer->current][lexer->current_input];

    finalise_state_transition(lexer);

    if (lexer->current_input == LEXER_INPUT_NEWLINE) {
      ++(lexer->line);
      lexer->column = 0;
    } else {
      ++(lexer->column);
    }

    if (lexer->current == LEXER_STATE_ERROR) {
      char msg[STRING_BUF_START_CAPACITY + 1];
      memset(msg, 0, STRING_BUF_START_CAPACITY + 1);

      u64 slice_length = 20;
      char slice[slice_length];
      snprintf(slice, slice_length, "%s", &(lexer->text[lexer->cursor - 1]));

      snprintf(
          msg, STRING_BUF_START_CAPACITY,
          "\n(%llu:%llu) Encountered an error while parsing the following:\n%s",
          (unsigned long long)lexer->line, (unsigned long long)lexer->column,
          slice);

      dstr_update(&(lexer->error_message), msg);

      return (lex_result_t){
          (lex_err_t){.errno = LEX_ERR_INVALID,
                      .msg = dstr_to_cstr(lexer->error_message)},
          (token_t){0},
      };
    } else if (lexer->token_ready) {
      lexer->token_ready = false;

      return (lex_result_t){
          (lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
          lexer->token,
      };
    }
  }

  // Deliver a pending extra token even when the input buffer has already been
  // fully consumed, so the closing brace/bracket of the final collection is
  // not lost between calls.
  if (lexer->has_extra_token) {
    lexer->has_extra_token = false;

    return (lex_result_t){
        (lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
        lexer->extra_token,
    };
  }

  return (lex_result_t){
      (lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
      (token_t){0},
  };
}

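/* Prints a one-line, human-readable description of `token` to stdout. */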
void print_token(token_t token) {
  i32 num_padding = 4;

  printf("{LINE: %*llu, COLUMN: %*llu, TYPE: ", num_padding,
         (unsigned long long)token.line, num_padding,
         (unsigned long long)token.column);

  i32 token_type_padding = 15;

  switch (token.type) {
  case TK_NO_TOKEN:
    break;
  case TK_L_BRACE:
    printf("%*s, VALUE: N/A", token_type_padding, "TK_L_BRACE");
    break;
  case TK_R_BRACE:
    printf("%*s, VALUE: N/A", token_type_padding, "TK_R_BRACE");
    break;
  case TK_L_BRACKET:
    printf("%*s, VALUE: N/A", token_type_padding, "TK_L_BRACKET");
    break;
  case TK_R_BRACKET:
    printf("%*s, VALUE: N/A", token_type_padding, "TK_R_BRACKET");
    break;
  case TK_NULL:
    printf("%*s, VALUE: N/A", token_type_padding, "TK_NULL");
    break;
  case TK_BOOL:
    printf("%*s, VALUE: %s", token_type_padding, "TK_BOOL",
           token.value.boolean ? "true" : "false");
    break;
  case TK_STR_KEY:
    printf("%*s, VALUE: %s", token_type_padding, "TK_STR_KEY",
           token.value.string);
    break;
  case TK_STR_VAL:
    printf("%*s, VALUE: %s", token_type_padding, "TK_STR_VAL",
           token.value.string);
    break;
  case TK_INTEGER:
    printf("%*s, VALUE: %lld", token_type_padding, "TK_INTEGER",
           (long long)token.value.num_int);
    break;
  case TK_DOUBLE:
    printf("%*s, VALUE: %f", token_type_padding, "TK_DOUBLE",
           token.value.num_frac);
    break;
  }

  printf("}\n");
}

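/*
 * Maps a raw character to the lexer_input_t class used to index the state
 * table.  Characters without a dedicated class fall back to LEXER_INPUT_OTHER.
 */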
INTERNAL lexer_input_t char_type(char input) {
  // The cast to unsigned char avoids undefined behaviour when isspace() is
  // handed a negative char value.
  if (input == '\n') {
    return LEXER_INPUT_NEWLINE;
  } else if (isspace((unsigned char)input)) {
    return LEXER_INPUT_WHITE_SPACE;
  } else if (input >= '1' && input <= '9') {
    return LEXER_INPUT_NON_ZERO;
  }

  switch (input) {
  case '{':
    return LEXER_INPUT_OPEN_BRACE;
  case '}':
    return LEXER_INPUT_CLOSE_BRACE;
  case '[':
    return LEXER_INPUT_OPEN_BRACKET;
  case ']':
    return LEXER_INPUT_CLOSE_BRACKET;
  case ',':
    return LEXER_INPUT_COMMA;
  case ':':
    return LEXER_INPUT_COLON;
  case '"':
    return LEXER_INPUT_DOUBLE_QUOTE;
  case '\\':
    return LEXER_INPUT_BACK_SLASH;
  case '/':
    return LEXER_INPUT_FORWARD_SLASH;
  case 'a':
    return LEXER_INPUT_LOWER_A;
  case 'b':
    return LEXER_INPUT_LOWER_B;
  case 'c':
    return LEXER_INPUT_LOWER_C;
  case 'd':
    return LEXER_INPUT_LOWER_D;
  case 'e':
    return LEXER_INPUT_LOWER_E;
  case 'f':
    return LEXER_INPUT_LOWER_F;
  case 'l':
    return LEXER_INPUT_LOWER_L;
  case 'n':
    return LEXER_INPUT_LOWER_N;
  case 'r':
    return LEXER_INPUT_LOWER_R;
  case 's':
    return LEXER_INPUT_LOWER_S;
  case 't':
    return LEXER_INPUT_LOWER_T;
  case 'u':
    return LEXER_INPUT_LOWER_U;
  case 'A':
    return LEXER_INPUT_UPPER_A;
  case 'B':
    return LEXER_INPUT_UPPER_B;
  case 'C':
    return LEXER_INPUT_UPPER_C;
  case 'D':
    return LEXER_INPUT_UPPER_D;
  case 'E':
    return LEXER_INPUT_UPPER_E;
  case 'F':
    return LEXER_INPUT_UPPER_F;
  case '-':
    return LEXER_INPUT_MINUS;
  case '+':
    return LEXER_INPUT_PLUS;
  case '.':
    return LEXER_INPUT_DECIMAL;
  case '0':
    return LEXER_INPUT_ZERO;
  default:
    return LEXER_INPUT_OTHER;
  }
}

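/* Pushes a state onto the fixed-size stack; a push beyond capacity is silently dropped. */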
void stack_push(state_stack_t *stack, lexer_state_t state) {
  if (stack->size >= MAX_STACK_CAPACITY) {
    return;
  }

  stack->stack[(stack->size)++] = state;
}

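/* Pops the top state, or returns LEXER_STATE_ERROR if the stack is empty. */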
lexer_state_t stack_pop(state_stack_t *stack) {
  if (stack->size == 0) {
    return LEXER_STATE_ERROR;
  }

  lexer_state_t state = stack->stack[--(stack->size)];

  return state;
}

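/*
 * Converts the accumulated number text into a TK_INTEGER or TK_DOUBLE token
 * value.  Line/column information is filled in by the caller.
 */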
token_t dstr_to_numerical_token(const dstr_t *str) {
  token_t token = {0};

  // A decimal point or an exponent marks the literal as a double; everything
  // else is parsed as an integer.
  bool is_double = dstr_find(str, ".") != -1 || dstr_find(str, "e") != -1 ||
                   dstr_find(str, "E") != -1;

  token.type = is_double ? TK_DOUBLE : TK_INTEGER;

  if (is_double) {
    token.value.num_frac = strtod(dstr_to_cstr(str), NULL);
  } else {
    token.value.num_int = strtoll(dstr_to_cstr(str), NULL, 10);
  }

  return token;
}

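/* Assigns all fields of a token in one place. */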
void set_token(token_t *token, u64 line, u64 column, token_type type,
               token_value_t value) {
  *token = (token_t){
      .line = line,
      .column = column,
      .type = type,
      .value = value,
  };
}

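/*
 * Applies the side effects of the transition selected by the state table:
 * emits tokens, pushes/pops the collection stack, accumulates string and
 * number characters, and may override lexer->next before it is committed to
 * lexer->current.
 */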
void finalise_state_transition(lexer_t *lexer) {
  switch (lexer->next) {
  case LEXER_STATE_OBJECT_START:
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
              (token_value_t){0});

    stack_push(&(lexer->stack), LEXER_STATE_OBJECT);

    lexer->next = LEXER_STATE_OBJECT;

    break;
  case LEXER_STATE_OBJECT_END:
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
      if (dstr_length(lexer->current_string) > 0) {
        set_numerical_token(lexer);
      }

      break;
    default:
      break;
    }

    if (lexer->current_input == LEXER_INPUT_CLOSE_BRACE) {
      handle_object_end(lexer);
    }

    break;
  case LEXER_STATE_ARRAY_START:
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
              (token_value_t){0});

    stack_push(&(lexer->stack), LEXER_STATE_ARRAY);

    lexer->next = LEXER_STATE_ARRAY;

    break;
  case LEXER_STATE_ARRAY_END:
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
      if (dstr_length(lexer->current_string) > 0) {
        set_numerical_token(lexer);
      }

      break;
    default:
      break;
    }

    if (lexer->current_input == LEXER_INPUT_CLOSE_BRACKET) {
      handle_array_end(lexer);
    }

    break;
  case LEXER_STATE_KEY:
    if (lexer->current == LEXER_STATE_OBJECT) {
      stack_push(&(lexer->stack), LEXER_STATE_KEY);

      lexer->next = LEXER_STATE_STRING;
    }

    break;
  case LEXER_STATE_KEY_END:
    lexer->next = LEXER_STATE_VALUE;

    break;
  case LEXER_STATE_STRING:
    if (lexer->current == LEXER_STATE_VALUE ||
        lexer->current == LEXER_STATE_ARRAY) {
      stack_push(&(lexer->stack), LEXER_STATE_VALUE);
      break;
    }

    // break left out intentionally
  case LEXER_STATE_ESCAPE_SEQUENCE:
  case LEXER_STATE_UNICODE_HEX1:
  case LEXER_STATE_UNICODE_HEX2:
  case LEXER_STATE_UNICODE_HEX3:
  case LEXER_STATE_UNICODE_HEX4:
  case LEXER_STATE_DECIMAL:
  case LEXER_STATE_NUMBER:
  case LEXER_STATE_FRACTION:
  case LEXER_STATE_EXPONENT:
  case LEXER_STATE_EXP_SIGN:
  case LEXER_STATE_POWER:
    dstr_append(&(lexer->current_string), lexer->current_char);

    break;
  case LEXER_STATE_STRING_END:
    if (lexer->current_input == LEXER_INPUT_DOUBLE_QUOTE) {
      handle_string_end(lexer);
    }

    break;
  case LEXER_STATE_TRUE:
  case LEXER_STATE_FALSE:
  case LEXER_STATE_NULL:
    post_keyword(lexer);

    break;
  case LEXER_STATE_VALUE_END:
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
      if (dstr_length(lexer->current_string) > 0) {
        set_numerical_token(lexer);
      }

      break;
    default:
      break;
    }

    // Return to the enclosing collection state; an empty stack here means the
    // value had no enclosing object or array.
    lexer->next = lexer->stack.size > 0
                      ? lexer->stack.stack[lexer->stack.size - 1]
                      : LEXER_STATE_ERROR;

    break;
  case LEXER_STATE_NUMBER_END:
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
      set_numerical_token(lexer);

      break;
    default:
      break;
    }

    break;
  case LEXER_STATE_ERROR:
  case LEXER_STATE_START:
  case LEXER_STATE_VALUE:
  case LEXER_STATE_OBJECT:
  case LEXER_STATE_ARRAY:
  case LEXER_STATE_LAST_COLLECTION:
  case LEXER_STATE_T:
  case LEXER_STATE_TR:
  case LEXER_STATE_TRU:
  case LEXER_STATE_F:
  case LEXER_STATE_FA:
  case LEXER_STATE_FAL:
  case LEXER_STATE_FALS:
  case LEXER_STATE_N:
  case LEXER_STATE_NU:
  case LEXER_STATE_NUL:
  case LEXER_STATE_KEYWORD_END:
  case COUNT_LEXER_STATES:
    break;
  }

  lexer->current = lexer->next;
}

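/*
 * Emits a TK_R_BRACE token for a closing '}'.  If another token is already
 * pending (for example the number the brace terminated), the brace is parked
 * in the extra token slot so both are delivered in order.
 */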
void handle_object_end(lexer_t *lexer) {
  // Guard against a '}' that has no matching '{' on the stack.
  if (lexer->stack.size == 0) {
    lexer->next = LEXER_STATE_ERROR;

    return;
  }

  lexer_state_t last = lexer->stack.stack[lexer->stack.size - 1];

  if (last != LEXER_STATE_OBJECT) {
    lexer->next = LEXER_STATE_ERROR;

    return;
  }

  if (lexer->stack.size > 0) {
    stack_pop(&(lexer->stack));
  } else {
    lexer->next = LEXER_STATE_LAST_COLLECTION;
  }

  token_t *token;

  if (lexer->token_ready) {
    lexer->has_extra_token = true;
    token = &(lexer->extra_token);
  } else {
    lexer->token_ready = true;
    token = &(lexer->token);
  }

  set_token(token, lexer->line, lexer->column, TK_R_BRACE, (token_value_t){0});
}

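/*
 * Emits a TK_R_BRACKET token for a closing ']'.  As with objects, a pending
 * token is preserved by parking the bracket in the extra token slot.
 */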
void handle_array_end(lexer_t *lexer) {
  // Guard against a ']' that has no matching '[' on the stack.
  if (lexer->stack.size == 0) {
    lexer->next = LEXER_STATE_ERROR;

    return;
  }

  lexer_state_t last = lexer->stack.stack[lexer->stack.size - 1];

  if (last != LEXER_STATE_ARRAY) {
    lexer->next = LEXER_STATE_ERROR;

    return;
  }

  if (lexer->stack.size > 0) {
    stack_pop(&(lexer->stack));
  } else {
    lexer->next = LEXER_STATE_LAST_COLLECTION;
  }

  token_t *token;

  if (lexer->token_ready) {
    lexer->has_extra_token = true;
    token = &(lexer->extra_token);
  } else {
    lexer->token_ready = true;
    token = &(lexer->token);
  }

  set_token(token, lexer->line, lexer->column, TK_R_BRACKET,
            (token_value_t){0});
}

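/*
 * Emits the accumulated string as TK_STR_KEY or TK_STR_VAL depending on
 * whether it was opened in key or value position (recorded on the state stack
 * when the opening quote was seen).
 */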
void handle_string_end(lexer_t *lexer) {
  lexer_state_t string_type = stack_pop(&(lexer->stack));

  lexer->token_ready = true;
  token_t *token = &(lexer->token);
  u64 column = lexer->column - dstr_length(lexer->current_string);
  token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};

  if (string_type == LEXER_STATE_KEY) {
    set_token(token, lexer->line, column, TK_STR_KEY, value);
  } else if (string_type == LEXER_STATE_VALUE) {
    set_token(token, lexer->line, column, TK_STR_VAL, value);
  }
}

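/*
 * Emits the token for a completed `true`, `false` or `null` keyword,
 * back-dating the column to the start of the keyword, and moves the lexer to
 * LEXER_STATE_KEYWORD_END.
 */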
void post_keyword(lexer_t *lexer) {
  u64 keyword_char_count;
  u64 column;

  token_t *token = &(lexer->token);

  switch (lexer->next) {
  case LEXER_STATE_NULL:
    keyword_char_count = 4;

    column = lexer->column - keyword_char_count;

    set_token(token, lexer->line, column, TK_NULL, (token_value_t){0});

    break;
  case LEXER_STATE_TRUE:
    keyword_char_count = 4;

    column = lexer->column - keyword_char_count;

    set_token(token, lexer->line, column, TK_BOOL,
              (token_value_t){.boolean = true});

    break;
  case LEXER_STATE_FALSE:
    keyword_char_count = 5;

    column = lexer->column - keyword_char_count;

    set_token(token, lexer->line, column, TK_BOOL,
              (token_value_t){.boolean = false});

    break;
  default:
    lexer->next = LEXER_STATE_ERROR;

    return;
  }

  lexer->token_ready = true;

  lexer->next = LEXER_STATE_KEYWORD_END;
}

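/*
 * Emits the number accumulated in current_string, back-dating the column to
 * where the literal started.
 */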
void set_numerical_token(lexer_t *lexer) {
  lexer->token_ready = true;
  u64 column = lexer->column - dstr_length(lexer->current_string);

  token_t token = dstr_to_numerical_token(lexer->current_string);

  set_token(&(lexer->token), lexer->line, column, token.type, token.value);
}