diff --git a/include/lexer/lexer.h b/include/lexer/lexer.h index 129dfb8..0c02fe5 100644 --- a/include/lexer/lexer.h +++ b/include/lexer/lexer.h @@ -4,9 +4,6 @@ #include "aliases.h" #include -#define VALID_JSON true -#define INVALID_JSON false - typedef const char *str_view_t; typedef enum { diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 4454a45..cca1c17 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -1,6 +1,7 @@ #include "lexer.h" #include "aliases.h" #include "dstring.h" +#include "lexer_data.h" #include #include #include @@ -13,43 +14,6 @@ #define MAX_STACK_CAPACITY 1024 #define STRING_BUF_START_CAPACITY 1024 -typedef enum { - // GENERAL STATES - LEXER_STATE_START, - LEXER_STATE_ERROR, - LEXER_STATE_VALUE, - // COLLECTION STATES - LEXER_STATE_OBJECT_START, - LEXER_STATE_OBJECT, - LEXER_STATE_OBJECT_END, - LEXER_STATE_ARRAY_START, - LEXER_STATE_ARRAY, - LEXER_STATE_ARRAY_END, - LEXER_STATE_LAST_COLLECTION, - // OBJECT STATES - LEXER_STATE_KEY, - // NUMBER STATES - LEXER_STATE_DECIMAL, - LEXER_STATE_NUMBER, - LEXER_STATE_FRACTION, - LEXER_STATE_EXPONENT, - LEXER_STATE_EXP_SIGN, - LEXER_STATE_POWER, - LEXER_STATE_NUMBER_END, - // STRING STATES - LEXER_STATE_STRING, - LEXER_STATE_STRING_END, - LEXER_STATE_ESCAPE_SEQUENCE, - LEXER_STATE_UNICODE_HEX, - // KEYWORD STATES - LEXER_STATE_TRUE, - LEXER_STATE_FALSE, - LEXER_STATE_NULL, - LEXER_STATE_KEYWORD_END, - - COUNT_LEXER_STATES, -} lexer_state_t; - typedef struct { lexer_state_t stack[MAX_STACK_CAPACITY]; u64 size; @@ -84,6 +48,7 @@ struct lexer_s { u64 text_length; const char *text; lexer_state_t current; + lexer_state_t next; state_stack_t stack; lexer_string_t keyword; lexer_string_t codepoint; @@ -95,9 +60,8 @@ struct lexer_s { dstr_t *error_message; }; -#if 0 INTERNAL lexer_input_t char_type(char input); -#endif + INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value); INTERNAL lexer_state_t stack_pop(state_stack_t *stack); @@ -111,31 +75,13 @@ INTERNAL token_t dstr_to_numerical_token(const dstr_t *str); INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type, token_value_t value); -INTERNAL void lexer_state_machine(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_lexer_start(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_last_collection(char input); -INTERNAL lexer_state_t handle_collection_end(lexer_t *lexer, char input); -INTERNAL void handle_input_after_collection_end(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_object(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_array(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_key(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_value(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_string(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_string_end(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_escape_sequence(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_decimal(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_number(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_fraction(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_exponent(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_exp_sign(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_power(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_number_end(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_keyword(char input); -INTERNAL lexer_state_t handle_true(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_false(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_null(lexer_t *lexer, char input); -INTERNAL lexer_state_t handle_keyword_end(lexer_t *lexer, char input); +INTERNAL void finalise_state_transition(lexer_t *lexer); +INTERNAL void post_keyword(lexer_t *lexer); +INTERNAL void set_numerical_token(lexer_t *lexer); + +INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = { +#include "lexer_state_transitions.table" +}; void lexer_init(lexer_t **lexer) { if (*lexer) { @@ -154,6 +100,7 @@ void lexer_init(lexer_t **lexer) { (*lexer)->text_length = 0; (*lexer)->text = ""; (*lexer)->current = LEXER_STATE_START; + (*lexer)->next = LEXER_STATE_START; (*lexer)->keyword.type = LEXER_STRING_KEYWORD; (*lexer)->codepoint.type = LEXER_STRING_UNICODE; (*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY); @@ -203,9 +150,13 @@ lex_result_t get_next_token(lexer_t *lexer, const char *text) { c = lexer->text[(lexer->cursor)++]; - lexer_state_machine(lexer, c); + lexer_input_t current_input = char_type(c); - if (c == '\n') { + lexer->next = state_table[lexer->current][current_input]; + + finalise_state_transition(lexer); + + if (current_input == LEXER_INPUT_NEWLINE) { ++(lexer->line); lexer->column = 0; } else { @@ -301,9 +252,10 @@ void print_token(token_t token) { printf("}\n"); } -#if 0 INTERNAL lexer_input_t char_type(char input) { - if (isspace(input)) { + if (input == '\n') { + return LEXER_INPUT_NEWLINE; + } else if (isspace(input)) { return LEXER_INPUT_WHITE_SPACE; } else if (input >= '1' && input <= '9') { return LEXER_INPUT_NON_ZERO; @@ -376,7 +328,6 @@ INTERNAL lexer_input_t char_type(char input) { return LEXER_INPUT_OTHER; } } -#endif void stack_push(state_stack_t *stack, lexer_state_t state) { if (stack->size + 1 >= MAX_STACK_CAPACITY) { @@ -483,6 +434,104 @@ void set_token(token_t *token, u64 line, u64 column, token_type type, }; } +void finalise_state_transition(lexer_t *lexer) { + switch (lexer->next) { + case LEXER_STATE_OBJECT_START: + stack_push(&(lexer->stack), LEXER_STATE_OBJECT); + + lexer->next = LEXER_STATE_OBJECT; + + break; + case LEXER_STATE_ARRAY_START: + stack_push(&(lexer->stack), LEXER_STATE_ARRAY); + + lexer->next = LEXER_STATE_ARRAY; + + break; + case LEXER_STATE_TRUE: + case LEXER_STATE_FALSE: + case LEXER_STATE_NULL: + post_keyword(lexer); + + break; + case LEXER_STATE_VALUE_END: + switch (lexer->current) { + case LEXER_STATE_NUMBER: + case LEXER_STATE_FRACTION: + case LEXER_STATE_POWER: + case LEXER_STATE_NUMBER_END: + set_numerical_token(lexer); + + break; + default: + break; + } + + lexer->next = lexer->stack.stack[lexer->stack.size - 1]; + + break; + case LEXER_STATE_NUMBER_END: + set_numerical_token(lexer); + + break; + } + + lexer->current = lexer->next; +} + +void post_keyword(lexer_t *lexer) { + u64 keyword_char_count; + u64 column; + + token_t *token = &(lexer->token); + + switch (lexer->current) { + case LEXER_STATE_NULL: + keyword_char_count = 4; + + column = lexer->column - keyword_char_count; + + set_token(token, lexer->line, column, TK_NULL, (token_value_t){0}); + + break; + case LEXER_STATE_TRUE: + keyword_char_count = 4; + + column = lexer->column - keyword_char_count; + + set_token(token, lexer->line, column, TK_BOOL, + (token_value_t){.boolean = true}); + + break; + case LEXER_STATE_FALSE: + keyword_char_count = 5; + + column = lexer->column - keyword_char_count; + + set_token(token, lexer->line, column, TK_BOOL, + (token_value_t){.boolean = false}); + + break; + default: + lexer->current = LEXER_STATE_ERROR; + + return; + } + + lexer->token_ready = true; + + lexer->current = LEXER_STATE_KEYWORD_END; +} + +void set_numerical_token(lexer_t *lexer) { + lexer->token_ready = true; + u64 column = lexer->column - dstr_length(lexer->current_string); + + token_t token = dstr_to_numerical_token(lexer->current_string); + + set_token(&(lexer->token), lexer->line, column, token.type, token.value); +} + void lexer_state_machine(lexer_t *lexer, char input) { switch (lexer->current) { case LEXER_STATE_START: