#include "lexer.h"

#include "aliases.h"
#include "dstring.h"
#include "lexer_data.h"

#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_KEYWORD_LENGTH 5
#define UNICODE_LENGTH 4
#define MAX_STACK_CAPACITY 1024
#define STRING_BUF_START_CAPACITY 1024

/* Fixed-capacity stack tracking the nesting of objects/arrays/keys so the
 * lexer knows which state to return to when a collection or string closes. */
typedef struct {
    lexer_state_t stack[MAX_STACK_CAPACITY];
    u64 size;
} state_stack_t;

typedef enum {
    LEXER_STRING_KEYWORD,
    LEXER_STRING_UNICODE,
} lex_str_type;

typedef struct {
    char str[MAX_KEYWORD_LENGTH + 1];
} keyword_t;

typedef struct {
    char codepoint[UNICODE_LENGTH];
} unicode_t;

/* Small scratch buffer for an in-progress keyword (true/false/null) or a
 * \uXXXX unicode escape, tagged by `type`. */
typedef struct {
    lex_str_type type;
    u64 size;
    union {
        keyword_t keyword;
        unicode_t unicode;
    };
} lexer_string_t;

/* Full lexer state: input text + cursor, current/next DFA states, nesting
 * stack, accumulation buffer for the token text, and up to two pending
 * tokens (a closing brace/bracket can complete a number AND emit the
 * close token in the same step, hence `extra_token`). */
struct lexer_s {
    u64 cursor;
    u64 line;
    u64 column;
    u64 text_length;
    const char *text;
    lexer_state_t current;
    lexer_state_t next;
    state_stack_t stack;
    lexer_string_t keyword;
    lexer_string_t codepoint;
    dstr_t *current_string;
    bool token_ready;
    token_t token;
    bool has_extra_token;
    token_t extra_token;
    dstr_t *error_message;
    char current_char;
    lexer_input_t current_input;
};

INTERNAL lexer_input_t char_type(char input);
INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value);
INTERNAL lexer_state_t stack_pop(state_stack_t *stack);
INTERNAL token_t dstr_to_numerical_token(const dstr_t *str);
INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type,
                        token_value_t value);
INTERNAL void finalise_state_transition(lexer_t *lexer);
INTERNAL void flush_pending_number(lexer_t *lexer);
INTERNAL void emit_close_token(lexer_t *lexer, token_type type);
INTERNAL void handle_collection_end(lexer_t *lexer, lexer_state_t expected,
                                    token_type type);
INTERNAL void handle_object_end(lexer_t *lexer);
INTERNAL void handle_array_end(lexer_t *lexer);
INTERNAL void handle_string_end(lexer_t *lexer);
INTERNAL void post_keyword(lexer_t *lexer);
INTERNAL void set_numerical_token(lexer_t *lexer);

/* DFA transition table, generated/maintained out-of-line:
 * state_table[current_state][input_class] -> next_state. */
INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
#include "lexer_state_transitions.table"
};

/*
 * Allocate and initialise a lexer, releasing any lexer already held in
 * *lexer first. On allocation failure *lexer is left NULL.
 */
void lexer_init(lexer_t **lexer) {
    if (*lexer) {
        lexer_free(lexer);
    }
    *lexer = (lexer_t *)malloc(sizeof(lexer_t));
    if (!(*lexer)) {
        return;
    }
    /* malloc does not zero memory: clear the whole struct first so fields
     * that have no explicit initialiser below (notably the state stack and
     * the scratch-buffer sizes) do not hold indeterminate values. */
    **lexer = (lexer_t){0};
    (*lexer)->line = 1;
    (*lexer)->text = "";
    (*lexer)->current = LEXER_STATE_START;
    (*lexer)->next = LEXER_STATE_START;
    (*lexer)->keyword.type = LEXER_STRING_KEYWORD;
    (*lexer)->codepoint.type = LEXER_STRING_UNICODE;
    (*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY);
    (*lexer)->error_message = dstr_with_capacity(STRING_BUF_START_CAPACITY);
    /* Both dynamic strings are required; tear everything down if either
     * allocation failed. */
    if (!(*lexer)->current_string || !(*lexer)->error_message) {
        lexer_free(lexer);
    }
}

/* Release a lexer and everything it owns; *lexer is set to NULL. */
void lexer_free(lexer_t **lexer) {
    if (!(*lexer)) {
        return;
    }
    dstr_free(&((*lexer)->current_string));
    dstr_free(&((*lexer)->error_message));
    free(*lexer);
    *lexer = NULL;
}

/*
 * Produce the next token. Pass `text` non-NULL to start lexing a new
 * buffer; pass NULL to continue where the previous call left off.
 *
 * Returns a zero token (TK_NO_TOKEN) with LEX_ERR_NONE when the input is
 * exhausted, or LEX_ERR_INVALID with a diagnostic message on a bad input.
 *
 * NOTE(review): string-token values point into lexer-owned storage that is
 * cleared on the next call — callers appear to be expected to copy them.
 */
lex_result_t get_next_token(lexer_t *lexer, const char *text) {
    if (text != NULL) {
        lexer->cursor = 0;
        lexer->text = text;
        lexer->text_length = strlen(text);
    }

    /* A closing '}'/']' that also terminated a number leaves a second token
     * pending. Deliver it before touching any other state — checking only
     * inside the loop below would drop it when the brace was the final
     * character of the text. (Extra tokens carry no string payload, so
     * clearing current_string afterwards is safe.) */
    if (lexer->has_extra_token) {
        lexer->has_extra_token = false;
        return (lex_result_t){
            (lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
            lexer->extra_token,
        };
    }

    dstr_clear(lexer->current_string);

    while (lexer->cursor < lexer->text_length) {
        lexer->current_char = lexer->text[(lexer->cursor)++];
        lexer->current_input = char_type(lexer->current_char);
        lexer->next = state_table[lexer->current][lexer->current_input];
        finalise_state_transition(lexer);

        if (lexer->current_input == LEXER_INPUT_NEWLINE) {
            ++(lexer->line);
            lexer->column = 0;
        } else {
            ++(lexer->column);
        }

        if (lexer->current == LEXER_STATE_ERROR) {
            /* Build "(line:column) ..." followed by a short slice of the
             * offending input for context. */
            enum { ERROR_SLICE_LENGTH = 20 };
            char msg[STRING_BUF_START_CAPACITY + 1];
            memset(msg, 0, STRING_BUF_START_CAPACITY + 1);
            char slice[ERROR_SLICE_LENGTH];
            snprintf(slice, ERROR_SLICE_LENGTH, "%s",
                     &(lexer->text[lexer->cursor - 1]));
            snprintf(msg, STRING_BUF_START_CAPACITY,
                     "\n(%llu:%llu) Encountered an error while parsing the "
                     "following:\n%s",
                     (unsigned long long)lexer->line,
                     (unsigned long long)lexer->column, slice);
            dstr_update(&(lexer->error_message), msg);
            return (lex_result_t){
                (lex_err_t){.errno = LEX_ERR_INVALID,
                            .msg = dstr_to_cstr(lexer->error_message)},
                (token_t){0},
            };
        } else if (lexer->token_ready) {
            lexer->token_ready = false;
            return (lex_result_t){
                (lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
                lexer->token,
            };
        }
    }

    /* Input exhausted without completing a token. */
    return (lex_result_t){
        (lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
        (token_t){0},
    };
}

/* Pretty-print a token (debug helper). */
void print_token(token_t token) {
    i32 num_padding = 4;
    printf("{LINE: %*llu, COLUMN: %*llu, TYPE: ", num_padding,
           (unsigned long long)token.line, num_padding,
           (unsigned long long)token.column);
    i32 token_type_padding = 15;
    switch (token.type) {
    case TK_NO_TOKEN:
        break;
    case TK_L_BRACE:
        printf("%*s, VALUE: N/A", token_type_padding, "TK_L_BRACE");
        break;
    case TK_R_BRACE:
        printf("%*s, VALUE: N/A", token_type_padding, "TK_R_BRACE");
        break;
    case TK_L_BRACKET:
        printf("%*s, VALUE: N/A", token_type_padding, "TK_L_BRACKET");
        break;
    case TK_R_BRACKET:
        printf("%*s, VALUE: N/A", token_type_padding, "TK_R_BRACKET");
        break;
    case TK_NULL:
        printf("%*s, VALUE: N/A", token_type_padding, "TK_NULL");
        break;
    case TK_BOOL:
        printf("%*s, VALUE: %s", token_type_padding, "TK_BOOL",
               token.value.boolean ? "true" : "false");
        break;
    case TK_STR_KEY:
        printf("%*s, VALUE: %s", token_type_padding, "TK_STR_KEY",
               token.value.string);
        break;
    case TK_STR_VAL:
        printf("%*s, VALUE: %s", token_type_padding, "TK_STR_VAL",
               token.value.string);
        break;
    case TK_INTEGER:
        printf("%*s, VALUE: %lld", token_type_padding, "TK_INTEGER",
               (long long)token.value.num_int);
        break;
    case TK_DOUBLE:
        printf("%*s, VALUE: %f", token_type_padding, "TK_DOUBLE",
               token.value.num_frac);
        break;
    }
    printf("}\n");
}

/* Classify one input character into the DFA's input alphabet. */
INTERNAL lexer_input_t char_type(char input) {
    if (input == '\n') {
        return LEXER_INPUT_NEWLINE;
    } else if (isspace((unsigned char)input)) {
        /* Cast: passing a plain char that may be negative to isspace() is
         * undefined behaviour. */
        return LEXER_INPUT_WHITE_SPACE;
    } else if (input >= '1' && input <= '9') {
        return LEXER_INPUT_NON_ZERO;
    }
    switch (input) {
    case '{':
        return LEXER_INPUT_OPEN_BRACE;
    case '}':
        return LEXER_INPUT_CLOSE_BRACE;
    case '[':
        return LEXER_INPUT_OPEN_BRACKET;
    case ']':
        return LEXER_INPUT_CLOSE_BRACKET;
    case ',':
        return LEXER_INPUT_COMMA;
    case ':':
        return LEXER_INPUT_COLON;
    case '"':
        return LEXER_INPUT_DOUBLE_QUOTE;
    case '\\':
        return LEXER_INPUT_BACK_SLASH;
    case '/':
        return LEXER_INPUT_FORWARD_SLASH;
    case 'a':
        return LEXER_INPUT_LOWER_A;
    case 'b':
        return LEXER_INPUT_LOWER_B;
    case 'c':
        return LEXER_INPUT_LOWER_C;
    case 'd':
        return LEXER_INPUT_LOWER_D;
    case 'e':
        return LEXER_INPUT_LOWER_E;
    case 'f':
        return LEXER_INPUT_LOWER_F;
    case 'l':
        return LEXER_INPUT_LOWER_L;
    case 'n':
        return LEXER_INPUT_LOWER_N;
    case 'r':
        return LEXER_INPUT_LOWER_R;
    case 's':
        return LEXER_INPUT_LOWER_S;
    case 't':
        return LEXER_INPUT_LOWER_T;
    case 'u':
        return LEXER_INPUT_LOWER_U;
    case 'A':
        return LEXER_INPUT_UPPER_A;
    case 'B':
        return LEXER_INPUT_UPPER_B;
    case 'C':
        return LEXER_INPUT_UPPER_C;
    case 'D':
        return LEXER_INPUT_UPPER_D;
    case 'E':
        return LEXER_INPUT_UPPER_E;
    case 'F':
        return LEXER_INPUT_UPPER_F;
    case '-':
        return LEXER_INPUT_MINUS;
    case '+':
        return LEXER_INPUT_PLUS;
    case '.':
        return LEXER_INPUT_DECIMAL;
    case '0':
        return LEXER_INPUT_ZERO;
    default:
        return LEXER_INPUT_OTHER;
    }
}

/* Push a state; a push onto a full stack is silently dropped (the lexer has
 * no overflow signalling). Note the bound is `>=` on size itself, so all
 * MAX_STACK_CAPACITY slots are usable (the old `size + 1 >=` check wasted
 * the last slot). */
INTERNAL void stack_push(state_stack_t *stack, lexer_state_t state) {
    if (stack->size >= MAX_STACK_CAPACITY) {
        return;
    }
    stack->stack[(stack->size)++] = state;
}

/* Pop the top state; returns LEXER_STATE_ERROR on an empty stack. */
INTERNAL lexer_state_t stack_pop(state_stack_t *stack) {
    if (stack->size == 0) {
        return LEXER_STATE_ERROR;
    }
    return stack->stack[--(stack->size)];
}

/*
 * Convert the accumulated number text into a TK_INTEGER or TK_DOUBLE token.
 * A number containing '.', 'e' or 'E' is fractional — exponent-only forms
 * like "1e3" must not be fed to an integer parser. strtoll is used instead
 * of atol so values are not truncated where long is 32-bit.
 */
INTERNAL token_t dstr_to_numerical_token(const dstr_t *str) {
    token_t token = {0};
    bool is_double = dstr_find(str, ".") != -1 || dstr_find(str, "e") != -1 ||
                     dstr_find(str, "E") != -1;
    token.type = is_double ? TK_DOUBLE : TK_INTEGER;
    if (is_double) {
        token.value.num_frac = strtod(dstr_to_cstr(str), NULL);
    } else {
        token.value.num_int = strtoll(dstr_to_cstr(str), NULL, 10);
    }
    return token;
}

/* Fill in all fields of a token in one place. */
INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type,
                        token_value_t value) {
    *token = (token_t){
        .line = line,
        .column = column,
        .type = type,
        .value = value,
    };
}

/* If the state we are leaving was inside a number and digits have been
 * accumulated, emit the numeric token now (a ',', '}', or ']' terminates a
 * number without a dedicated terminator character). */
INTERNAL void flush_pending_number(lexer_t *lexer) {
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
        if (dstr_length(lexer->current_string) > 0) {
            set_numerical_token(lexer);
        }
        break;
    default:
        break;
    }
}

/*
 * Act on the transition chosen by the state table: emit tokens, maintain
 * the nesting stack, accumulate token text, then commit `next` as the
 * current state.
 */
INTERNAL void finalise_state_transition(lexer_t *lexer) {
    switch (lexer->next) {
    case LEXER_STATE_OBJECT_START:
        lexer->token_ready = true;
        set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
                  (token_value_t){0});
        stack_push(&(lexer->stack), LEXER_STATE_OBJECT);
        lexer->next = LEXER_STATE_OBJECT;
        break;
    case LEXER_STATE_OBJECT_END:
        flush_pending_number(lexer);
        if (lexer->current_input == LEXER_INPUT_CLOSE_BRACE) {
            handle_object_end(lexer);
        }
        break;
    case LEXER_STATE_ARRAY_START:
        lexer->token_ready = true;
        set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
                  (token_value_t){0});
        stack_push(&(lexer->stack), LEXER_STATE_ARRAY);
        lexer->next = LEXER_STATE_ARRAY;
        break;
    case LEXER_STATE_ARRAY_END:
        flush_pending_number(lexer);
        if (lexer->current_input == LEXER_INPUT_CLOSE_BRACKET) {
            handle_array_end(lexer);
        }
        break;
    case LEXER_STATE_KEY:
        if (lexer->current == LEXER_STATE_OBJECT) {
            stack_push(&(lexer->stack), LEXER_STATE_KEY);
            lexer->next = LEXER_STATE_STRING;
        }
        break;
    case LEXER_STATE_KEY_END:
        lexer->next = LEXER_STATE_VALUE;
        break;
    case LEXER_STATE_STRING:
        if (lexer->current == LEXER_STATE_VALUE ||
            lexer->current == LEXER_STATE_ARRAY) {
            /* Opening quote of a string value: remember what the closing
             * quote terminates, and don't record the quote itself. */
            stack_push(&(lexer->stack), LEXER_STATE_VALUE);
            break;
        }
        /* fallthrough: interior string/number characters are accumulated */
    case LEXER_STATE_ESCAPE_SEQUENCE:
    case LEXER_STATE_UNICODE_HEX1:
    case LEXER_STATE_UNICODE_HEX2:
    case LEXER_STATE_UNICODE_HEX3:
    case LEXER_STATE_UNICODE_HEX4:
    case LEXER_STATE_DECIMAL:
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_EXPONENT:
    case LEXER_STATE_EXP_SIGN:
    case LEXER_STATE_POWER:
        dstr_append(&(lexer->current_string), lexer->current_char);
        break;
    case LEXER_STATE_STRING_END:
        if (lexer->current_input == LEXER_INPUT_DOUBLE_QUOTE) {
            handle_string_end(lexer);
        }
        break;
    case LEXER_STATE_TRUE:
    case LEXER_STATE_FALSE:
    case LEXER_STATE_NULL:
        post_keyword(lexer);
        break;
    case LEXER_STATE_VALUE_END:
        flush_pending_number(lexer);
        /* Return to the enclosing collection's state. Guard the peek: an
         * empty stack here means unbalanced input. */
        if (lexer->stack.size > 0) {
            lexer->next = lexer->stack.stack[lexer->stack.size - 1];
        } else {
            lexer->next = LEXER_STATE_ERROR;
        }
        break;
    case LEXER_STATE_NUMBER_END:
        switch (lexer->current) {
        case LEXER_STATE_NUMBER:
        case LEXER_STATE_FRACTION:
        case LEXER_STATE_POWER:
            set_numerical_token(lexer);
            break;
        default:
            break;
        }
        break;
    case LEXER_STATE_ERROR:
    case LEXER_STATE_START:
    case LEXER_STATE_VALUE:
    case LEXER_STATE_OBJECT:
    case LEXER_STATE_ARRAY:
    case LEXER_STATE_LAST_COLLECTION:
    case LEXER_STATE_T:
    case LEXER_STATE_TR:
    case LEXER_STATE_TRU:
    case LEXER_STATE_F:
    case LEXER_STATE_FA:
    case LEXER_STATE_FAL:
    case LEXER_STATE_FALS:
    case LEXER_STATE_N:
    case LEXER_STATE_NU:
    case LEXER_STATE_NUL:
    case LEXER_STATE_KEYWORD_END:
    case COUNT_LEXER_STATES:
        break;
    }
    lexer->current = lexer->next;
}

/* Emit a close token, routing it to `extra_token` when a token (e.g. a just
 * finished number) is already waiting to be returned. */
INTERNAL void emit_close_token(lexer_t *lexer, token_type type) {
    token_t *token;
    if (lexer->token_ready) {
        lexer->has_extra_token = true;
        token = &(lexer->extra_token);
    } else {
        lexer->token_ready = true;
        token = &(lexer->token);
    }
    set_token(token, lexer->line, lexer->column, type, (token_value_t){0});
}

/*
 * Shared close-collection logic for '}' and ']'. The empty-stack case is
 * checked BEFORE peeking at the top element (the previous code indexed
 * stack[size - 1] first, an out-of-bounds read when size == 0) and follows
 * the original else-branch: mark the last collection closed.
 */
INTERNAL void handle_collection_end(lexer_t *lexer, lexer_state_t expected,
                                    token_type type) {
    if (lexer->stack.size == 0) {
        lexer->next = LEXER_STATE_LAST_COLLECTION;
    } else if (lexer->stack.stack[lexer->stack.size - 1] != expected) {
        /* Mismatched close, e.g. ']' while inside an object. */
        lexer->next = LEXER_STATE_ERROR;
        return;
    } else {
        stack_pop(&(lexer->stack));
    }
    emit_close_token(lexer, type);
}

INTERNAL void handle_object_end(lexer_t *lexer) {
    handle_collection_end(lexer, LEXER_STATE_OBJECT, TK_R_BRACE);
}

INTERNAL void handle_array_end(lexer_t *lexer) {
    handle_collection_end(lexer, LEXER_STATE_ARRAY, TK_R_BRACKET);
}

/*
 * Closing quote reached: emit a key or value string token depending on
 * what the opening quote pushed. The token's string points into
 * lexer-owned storage (see get_next_token note). The column is backdated
 * to the start of the string contents.
 */
INTERNAL void handle_string_end(lexer_t *lexer) {
    lexer_state_t string_type = stack_pop(&(lexer->stack));
    lexer->token_ready = true;
    token_t *token = &(lexer->token);
    u64 column = lexer->column - dstr_length(lexer->current_string);
    token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};
    if (string_type == LEXER_STATE_KEY) {
        set_token(token, lexer->line, column, TK_STR_KEY, value);
    } else if (string_type == LEXER_STATE_VALUE) {
        set_token(token, lexer->line, column, TK_STR_VAL, value);
    }
}

/* A keyword (true/false/null) has been fully matched: emit its token with
 * the column backdated to the keyword's first character. */
INTERNAL void post_keyword(lexer_t *lexer) {
    u64 keyword_char_count;
    u64 column;
    token_t *token = &(lexer->token);
    switch (lexer->next) {
    case LEXER_STATE_NULL:
        keyword_char_count = 4; /* "null" */
        column = lexer->column - keyword_char_count;
        set_token(token, lexer->line, column, TK_NULL, (token_value_t){0});
        break;
    case LEXER_STATE_TRUE:
        keyword_char_count = 4; /* "true" */
        column = lexer->column - keyword_char_count;
        set_token(token, lexer->line, column, TK_BOOL,
                  (token_value_t){.boolean = true});
        break;
    case LEXER_STATE_FALSE:
        keyword_char_count = 5; /* "false" */
        column = lexer->column - keyword_char_count;
        set_token(token, lexer->line, column, TK_BOOL,
                  (token_value_t){.boolean = false});
        break;
    default:
        lexer->next = LEXER_STATE_ERROR;
        return;
    }
    lexer->token_ready = true;
    lexer->next = LEXER_STATE_KEYWORD_END;
}

/* Emit the number accumulated in current_string as the pending token, with
 * the column backdated to the number's first character. */
INTERNAL void set_numerical_token(lexer_t *lexer) {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);
    token_t token = dstr_to_numerical_token(lexer->current_string);
    set_token(&(lexer->token), lexer->line, column, token.type, token.value);
}