Start implementing using the state transition table in the lexer
This commit is contained in:
		| @@ -4,9 +4,6 @@ | ||||
| #include "aliases.h" | ||||
| #include <stdbool.h> | ||||
|  | ||||
| #define VALID_JSON true | ||||
| #define INVALID_JSON false | ||||
|  | ||||
| typedef const char *str_view_t; | ||||
|  | ||||
| typedef enum { | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| #include "lexer.h" | ||||
| #include "aliases.h" | ||||
| #include "dstring.h" | ||||
| #include "lexer_data.h" | ||||
| #include <assert.h> | ||||
| #include <ctype.h> | ||||
| #include <stdbool.h> | ||||
| @@ -13,43 +14,6 @@ | ||||
| #define MAX_STACK_CAPACITY 1024 | ||||
| #define STRING_BUF_START_CAPACITY 1024 | ||||
|  | ||||
| typedef enum { | ||||
|   // GENERAL STATES | ||||
|   LEXER_STATE_START, | ||||
|   LEXER_STATE_ERROR, | ||||
|   LEXER_STATE_VALUE, | ||||
|   // COLLECTION STATES | ||||
|   LEXER_STATE_OBJECT_START, | ||||
|   LEXER_STATE_OBJECT, | ||||
|   LEXER_STATE_OBJECT_END, | ||||
|   LEXER_STATE_ARRAY_START, | ||||
|   LEXER_STATE_ARRAY, | ||||
|   LEXER_STATE_ARRAY_END, | ||||
|   LEXER_STATE_LAST_COLLECTION, | ||||
|   // OBJECT STATES | ||||
|   LEXER_STATE_KEY, | ||||
|   // NUMBER STATES | ||||
|   LEXER_STATE_DECIMAL, | ||||
|   LEXER_STATE_NUMBER, | ||||
|   LEXER_STATE_FRACTION, | ||||
|   LEXER_STATE_EXPONENT, | ||||
|   LEXER_STATE_EXP_SIGN, | ||||
|   LEXER_STATE_POWER, | ||||
|   LEXER_STATE_NUMBER_END, | ||||
|   // STRING STATES | ||||
|   LEXER_STATE_STRING, | ||||
|   LEXER_STATE_STRING_END, | ||||
|   LEXER_STATE_ESCAPE_SEQUENCE, | ||||
|   LEXER_STATE_UNICODE_HEX, | ||||
|   // KEYWORD STATES | ||||
|   LEXER_STATE_TRUE, | ||||
|   LEXER_STATE_FALSE, | ||||
|   LEXER_STATE_NULL, | ||||
|   LEXER_STATE_KEYWORD_END, | ||||
|  | ||||
|   COUNT_LEXER_STATES, | ||||
| } lexer_state_t; | ||||
|  | ||||
| typedef struct { | ||||
|   lexer_state_t stack[MAX_STACK_CAPACITY]; | ||||
|   u64 size; | ||||
| @@ -84,6 +48,7 @@ struct lexer_s { | ||||
|   u64 text_length; | ||||
|   const char *text; | ||||
|   lexer_state_t current; | ||||
|   lexer_state_t next; | ||||
|   state_stack_t stack; | ||||
|   lexer_string_t keyword; | ||||
|   lexer_string_t codepoint; | ||||
| @@ -95,9 +60,8 @@ struct lexer_s { | ||||
|   dstr_t *error_message; | ||||
| }; | ||||
|  | ||||
| #if 0 | ||||
| INTERNAL lexer_input_t char_type(char input); | ||||
| #endif | ||||
|  | ||||
| INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value); | ||||
| INTERNAL lexer_state_t stack_pop(state_stack_t *stack); | ||||
|  | ||||
| @@ -111,31 +75,13 @@ INTERNAL token_t dstr_to_numerical_token(const dstr_t *str); | ||||
| INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type, | ||||
|                         token_value_t value); | ||||
|  | ||||
| INTERNAL void lexer_state_machine(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_lexer_start(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_last_collection(char input); | ||||
| INTERNAL lexer_state_t handle_collection_end(lexer_t *lexer, char input); | ||||
| INTERNAL void handle_input_after_collection_end(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_object(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_array(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_key(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_value(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_string(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_string_end(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_escape_sequence(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_decimal(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_number(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_fraction(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_exponent(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_exp_sign(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_power(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_number_end(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_keyword(char input); | ||||
| INTERNAL lexer_state_t handle_true(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_false(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_null(lexer_t *lexer, char input); | ||||
| INTERNAL lexer_state_t handle_keyword_end(lexer_t *lexer, char input); | ||||
| INTERNAL void finalise_state_transition(lexer_t *lexer); | ||||
| INTERNAL void post_keyword(lexer_t *lexer); | ||||
| INTERNAL void set_numerical_token(lexer_t *lexer); | ||||
|  | ||||
| INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = { | ||||
| #include "lexer_state_transitions.table" | ||||
| }; | ||||
|  | ||||
| void lexer_init(lexer_t **lexer) { | ||||
|   if (*lexer) { | ||||
| @@ -154,6 +100,7 @@ void lexer_init(lexer_t **lexer) { | ||||
|   (*lexer)->text_length = 0; | ||||
|   (*lexer)->text = ""; | ||||
|   (*lexer)->current = LEXER_STATE_START; | ||||
|   (*lexer)->next = LEXER_STATE_START; | ||||
|   (*lexer)->keyword.type = LEXER_STRING_KEYWORD; | ||||
|   (*lexer)->codepoint.type = LEXER_STRING_UNICODE; | ||||
|   (*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY); | ||||
| @@ -203,9 +150,13 @@ lex_result_t get_next_token(lexer_t *lexer, const char *text) { | ||||
|  | ||||
|     c = lexer->text[(lexer->cursor)++]; | ||||
|  | ||||
|     lexer_state_machine(lexer, c); | ||||
|     lexer_input_t current_input = char_type(c); | ||||
|  | ||||
|     if (c == '\n') { | ||||
|     lexer->next = state_table[lexer->current][current_input]; | ||||
|  | ||||
|     finalise_state_transition(lexer); | ||||
|  | ||||
|     if (current_input == LEXER_INPUT_NEWLINE) { | ||||
|       ++(lexer->line); | ||||
|       lexer->column = 0; | ||||
|     } else { | ||||
| @@ -301,9 +252,10 @@ void print_token(token_t token) { | ||||
|   printf("}\n"); | ||||
| } | ||||
|  | ||||
| #if 0 | ||||
| INTERNAL lexer_input_t char_type(char input) { | ||||
|   if (isspace(input)) { | ||||
|   if (input == '\n') { | ||||
|     return LEXER_INPUT_NEWLINE; | ||||
|   } else if (isspace(input)) { | ||||
|     return LEXER_INPUT_WHITE_SPACE; | ||||
|   } else if (input >= '1' && input <= '9') { | ||||
|     return LEXER_INPUT_NON_ZERO; | ||||
| @@ -376,7 +328,6 @@ INTERNAL lexer_input_t char_type(char input) { | ||||
|     return LEXER_INPUT_OTHER; | ||||
|   } | ||||
| } | ||||
| #endif | ||||
|  | ||||
| void stack_push(state_stack_t *stack, lexer_state_t state) { | ||||
|   if (stack->size + 1 >= MAX_STACK_CAPACITY) { | ||||
| @@ -483,6 +434,104 @@ void set_token(token_t *token, u64 line, u64 column, token_type type, | ||||
|   }; | ||||
| } | ||||
|  | ||||
| void finalise_state_transition(lexer_t *lexer) { | ||||
|   switch (lexer->next) { | ||||
|   case LEXER_STATE_OBJECT_START: | ||||
|     stack_push(&(lexer->stack), LEXER_STATE_OBJECT); | ||||
|  | ||||
|     lexer->next = LEXER_STATE_OBJECT; | ||||
|  | ||||
|     break; | ||||
|   case LEXER_STATE_ARRAY_START: | ||||
|     stack_push(&(lexer->stack), LEXER_STATE_ARRAY); | ||||
|  | ||||
|     lexer->next = LEXER_STATE_ARRAY; | ||||
|  | ||||
|     break; | ||||
|   case LEXER_STATE_TRUE: | ||||
|   case LEXER_STATE_FALSE: | ||||
|   case LEXER_STATE_NULL: | ||||
|     post_keyword(lexer); | ||||
|  | ||||
|     break; | ||||
|   case LEXER_STATE_VALUE_END: | ||||
|     switch (lexer->current) { | ||||
|     case LEXER_STATE_NUMBER: | ||||
|     case LEXER_STATE_FRACTION: | ||||
|     case LEXER_STATE_POWER: | ||||
|     case LEXER_STATE_NUMBER_END: | ||||
|       set_numerical_token(lexer); | ||||
|  | ||||
|       break; | ||||
|     default: | ||||
|       break; | ||||
|     } | ||||
|  | ||||
|     lexer->next = lexer->stack.stack[lexer->stack.size - 1]; | ||||
|  | ||||
|     break; | ||||
|   case LEXER_STATE_NUMBER_END: | ||||
|     set_numerical_token(lexer); | ||||
|  | ||||
|     break; | ||||
|   } | ||||
|  | ||||
|   lexer->current = lexer->next; | ||||
| } | ||||
|  | ||||
| void post_keyword(lexer_t *lexer) { | ||||
|   u64 keyword_char_count; | ||||
|   u64 column; | ||||
|  | ||||
|   token_t *token = &(lexer->token); | ||||
|  | ||||
|   switch (lexer->current) { | ||||
|   case LEXER_STATE_NULL: | ||||
|     keyword_char_count = 4; | ||||
|  | ||||
|     column = lexer->column - keyword_char_count; | ||||
|  | ||||
|     set_token(token, lexer->line, column, TK_NULL, (token_value_t){0}); | ||||
|  | ||||
|     break; | ||||
|   case LEXER_STATE_TRUE: | ||||
|     keyword_char_count = 4; | ||||
|  | ||||
|     column = lexer->column - keyword_char_count; | ||||
|  | ||||
|     set_token(token, lexer->line, column, TK_BOOL, | ||||
|               (token_value_t){.boolean = true}); | ||||
|  | ||||
|     break; | ||||
|   case LEXER_STATE_FALSE: | ||||
|     keyword_char_count = 5; | ||||
|  | ||||
|     column = lexer->column - keyword_char_count; | ||||
|  | ||||
|     set_token(token, lexer->line, column, TK_BOOL, | ||||
|               (token_value_t){.boolean = false}); | ||||
|  | ||||
|     break; | ||||
|   default: | ||||
|     lexer->current = LEXER_STATE_ERROR; | ||||
|  | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   lexer->token_ready = true; | ||||
|  | ||||
|   lexer->current = LEXER_STATE_KEYWORD_END; | ||||
| } | ||||
|  | ||||
| void set_numerical_token(lexer_t *lexer) { | ||||
|   lexer->token_ready = true; | ||||
|   u64 column = lexer->column - dstr_length(lexer->current_string); | ||||
|  | ||||
|   token_t token = dstr_to_numerical_token(lexer->current_string); | ||||
|  | ||||
|   set_token(&(lexer->token), lexer->line, column, token.type, token.value); | ||||
| } | ||||
|  | ||||
| void lexer_state_machine(lexer_t *lexer, char input) { | ||||
|   switch (lexer->current) { | ||||
|   case LEXER_STATE_START: | ||||
|   | ||||
		Reference in New Issue
	
	Block a user