Refactor the lexer to use the state transitions table
parent 574f771444
commit b59c59e7e1
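The core of the change shows up in the get_next_token() hunk below: instead of dispatching to one handler function per state, each character is classified by char_type() and the next state comes from a single state_table lookup, with finalise_state_transition() performing the per-transition side effects. A minimal sketch of that loop, using only names that appear in this diff (the surrounding loop condition is assumed here for illustration):

/* Sketch of the table-driven core; the real loop is in get_next_token() below,
 * and its exact termination condition is not shown in this hunk. */
while (!lexer->token_ready) {
  lexer->current_char = lexer->text[(lexer->cursor)++];
  lexer->current_input = char_type(lexer->current_char);           /* classify the byte */
  lexer->next = state_table[lexer->current][lexer->current_input]; /* table transition  */
  finalise_state_transition(lexer); /* emit tokens, adjust the stack, update current    */
}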
@@ -59,6 +59,7 @@ struct lexer_s {
  token_t extra_token;
  dstr_t *error_message;
  char current_char;
  lexer_input_t current_input;
};

INTERNAL lexer_input_t char_type(char input);
@@ -66,20 +67,16 @@ INTERNAL lexer_input_t char_type(char input);
INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value);
INTERNAL lexer_state_t stack_pop(state_stack_t *stack);

INTERNAL void append_to_lex_str(lexer_string_t *str, char input);
INTERNAL void clear_lex_str(lexer_string_t *str);
INTERNAL bool strequal(const char *const first, const char *const second);
INTERNAL bool is_valid_hex_char(const char input);
INTERNAL bool ishex(const char input);

INTERNAL token_t dstr_to_numerical_token(const dstr_t *str);
INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type,
                        token_value_t value);

INTERNAL void finalise_state_transition(lexer_t *lexer);
INTERNAL void handle_object_end(lexer_t *lexer);
INTERNAL void handle_array_end(lexer_t *lexer);
INTERNAL void handle_string_end(lexer_t *lexer);
INTERNAL void post_keyword(lexer_t *lexer);
INTERNAL void set_numerical_token(lexer_t *lexer);
INTERNAL void handle_string_end(lexer_t *lexer);

INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
#include "lexer_state_transitions.table"
@@ -150,13 +147,13 @@ lex_result_t get_next_token(lexer_t *lexer, const char *text) {

  lexer->current_char = lexer->text[(lexer->cursor)++];

  lexer_input_t input = char_type(lexer->current_char);
  lexer->current_input = char_type(lexer->current_char);

  lexer->next = state_table[lexer->current][input];
  lexer->next = state_table[lexer->current][lexer->current_input];

  finalise_state_transition(lexer);

  if (input == LEXER_INPUT_NEWLINE) {
  if (lexer->current_input == LEXER_INPUT_NEWLINE) {
    ++(lexer->line);
    lexer->column = 0;
  } else {
@@ -347,67 +344,6 @@ lexer_state_t stack_pop(state_stack_t *stack) {
  return state;
}

void append_to_lex_str(lexer_string_t *lex_str, char input) {
  u64 capacity = 0;
  char *str = NULL;

  switch (lex_str->type) {
  case LEXER_STRING_KEYWORD:
    capacity = MAX_KEYWORD_LENGTH;
    str = lex_str->keyword.str;

    break;
  case LEXER_STRING_UNICODE:
    capacity = UNICODE_LENGTH;
    str = lex_str->unicode.codepoint;

    break;
  }

  if (lex_str->size + 1 > capacity) {
    return;
  }

  assert(str != NULL);

  str[(lex_str->size)++] = input;
}

void clear_lex_str(lexer_string_t *lex_str) {
  u64 capacity = 1;
  char *str = NULL;

  switch (lex_str->type) {
  case LEXER_STRING_KEYWORD:
    capacity += MAX_KEYWORD_LENGTH;
    str = lex_str->keyword.str;

    break;
  case LEXER_STRING_UNICODE:
    capacity += UNICODE_LENGTH;
    str = lex_str->unicode.codepoint;

    break;
  }

  assert(str != NULL);

  memset(str, 0, capacity);
  lex_str->size = 0;
}

bool strequal(const char *const first, const char *const second) {
  return strcmp(first, second) == 0;
}

bool is_valid_hex_char(const char input) {
  return (input >= 'A' && input <= 'F') || (input >= 'a' && input <= 'f');
}

bool ishex(const char input) {
  return isdigit(input) || is_valid_hex_char(input);
}

token_t dstr_to_numerical_token(const dstr_t *str) {
  token_t token = {0};

@@ -447,6 +383,24 @@ void finalise_state_transition(lexer_t *lexer) {

    break;
  case LEXER_STATE_OBJECT_END:
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
      if (dstr_length(lexer->current_string) > 0) {
        set_numerical_token(lexer);
      }

      break;
    default:
      break;
    }

    if (lexer->current_input == LEXER_INPUT_CLOSE_BRACE) {
      handle_object_end(lexer);
    }

    break;
  case LEXER_STATE_ARRAY_START:
    lexer->token_ready = true;
@@ -459,6 +413,24 @@ void finalise_state_transition(lexer_t *lexer) {

    break;
  case LEXER_STATE_ARRAY_END:
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
      if (dstr_length(lexer->current_string) > 0) {
        set_numerical_token(lexer);
      }

      break;
    default:
      break;
    }

    if (lexer->current_input == LEXER_INPUT_CLOSE_BRACKET) {
      handle_array_end(lexer);
    }

    break;
  case LEXER_STATE_KEY:
    if (lexer->current == LEXER_STATE_OBJECT) {
@@ -473,16 +445,31 @@ void finalise_state_transition(lexer_t *lexer) {

    break;
  case LEXER_STATE_STRING:
    if (lexer->current == LEXER_STATE_VALUE ||
        lexer->current == LEXER_STATE_ARRAY) {
      stack_push(&(lexer->stack), LEXER_STATE_VALUE);
      break;
    }

    // break left out intentionally
  case LEXER_STATE_ESCAPE_SEQUENCE:
  case LEXER_STATE_UNICODE_HEX1:
  case LEXER_STATE_UNICODE_HEX2:
  case LEXER_STATE_UNICODE_HEX3:
  case LEXER_STATE_UNICODE_HEX4:
  case LEXER_STATE_DECIMAL:
  case LEXER_STATE_NUMBER:
  case LEXER_STATE_FRACTION:
  case LEXER_STATE_EXPONENT:
  case LEXER_STATE_EXP_SIGN:
  case LEXER_STATE_POWER:
    dstr_append(&(lexer->current_string), lexer->current_char);

    break;
  case LEXER_STATE_STRING_END:
    handle_string_end(lexer);
    if (lexer->current_input == LEXER_INPUT_DOUBLE_QUOTE) {
      handle_string_end(lexer);
    }

    break;
  case LEXER_STATE_TRUE:
@@ -497,7 +484,9 @@ void finalise_state_transition(lexer_t *lexer) {
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
      set_numerical_token(lexer);
      if (dstr_length(lexer->current_string) > 0) {
        set_numerical_token(lexer);
      }

      break;
    default:
@@ -519,19 +508,110 @@ void finalise_state_transition(lexer_t *lexer) {
      break;
    }

    break;
  case LEXER_STATE_ERROR:
  case LEXER_STATE_START:
  case LEXER_STATE_VALUE:
  case LEXER_STATE_OBJECT:
  case LEXER_STATE_ARRAY:
  case LEXER_STATE_LAST_COLLECTION:
  case LEXER_STATE_T:
  case LEXER_STATE_TR:
  case LEXER_STATE_TRU:
  case LEXER_STATE_F:
  case LEXER_STATE_FA:
  case LEXER_STATE_FAL:
  case LEXER_STATE_FALS:
  case LEXER_STATE_N:
  case LEXER_STATE_NU:
  case LEXER_STATE_NUL:
  case LEXER_STATE_KEYWORD_END:
  case COUNT_LEXER_STATES:
    break;
  }

  lexer->current = lexer->next;
}

void handle_object_end(lexer_t *lexer) {
  lexer_state_t last = lexer->stack.stack[lexer->stack.size - 1];

  if (last != LEXER_STATE_OBJECT) {
    lexer->next = LEXER_STATE_ERROR;

    return;
  }

  if (lexer->stack.size > 0) {
    stack_pop(&(lexer->stack));
  } else {
    lexer->next = LEXER_STATE_LAST_COLLECTION;
  }

  token_t *token;

  if (lexer->token_ready) {
    lexer->has_extra_token = true;
    token = &(lexer->extra_token);
  } else {
    lexer->token_ready = true;
    token = &(lexer->token);
  }

  set_token(token, lexer->line, lexer->column, TK_R_BRACE, (token_value_t){0});
}

void handle_array_end(lexer_t *lexer) {
  lexer_state_t last = lexer->stack.stack[lexer->stack.size - 1];

  if (last != LEXER_STATE_ARRAY) {
    lexer->next = LEXER_STATE_ERROR;

    return;
  }

  if (lexer->stack.size > 0) {
    stack_pop(&(lexer->stack));
  } else {
    lexer->next = LEXER_STATE_LAST_COLLECTION;
  }

  token_t *token;

  if (lexer->token_ready) {
    lexer->has_extra_token = true;
    token = &(lexer->extra_token);
  } else {
    lexer->token_ready = true;
    token = &(lexer->token);
  }

  set_token(token, lexer->line, lexer->column, TK_R_BRACKET,
            (token_value_t){0});
}

void handle_string_end(lexer_t *lexer) {
  lexer_state_t string_type = stack_pop(&(lexer->stack));

  lexer->token_ready = true;
  token_t *token = &(lexer->token);
  u64 column = lexer->column - dstr_length(lexer->current_string);
  token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};

  if (string_type == LEXER_STATE_KEY) {
    set_token(token, lexer->line, column, TK_STR_KEY, value);
  } else if (string_type == LEXER_STATE_VALUE) {
    set_token(token, lexer->line, column, TK_STR_VAL, value);
  }
}

void post_keyword(lexer_t *lexer) {
  u64 keyword_char_count;
  u64 column;

  token_t *token = &(lexer->token);

  switch (lexer->current) {
  switch (lexer->next) {
  case LEXER_STATE_NULL:
    keyword_char_count = 4;

@@ -577,645 +657,3 @@ void set_numerical_token(lexer_t *lexer) {

  set_token(&(lexer->token), lexer->line, column, token.type, token.value);
}

void handle_string_end(lexer_t *lexer) {
  lexer_state_t string_type = lexer->stack.stack[lexer->stack.size - 1];

  lexer->token_ready = true;
  token_t *token = &(lexer->token);
  u64 column = lexer->column - dstr_length(lexer->current_string);
  token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};

  if (string_type == LEXER_STATE_KEY) {
    set_token(token, lexer->line, column, TK_STR_KEY, value);
  } else if (string_type == LEXER_STATE_VALUE) {
    set_token(token, lexer->line, column, TK_STR_VAL, value);
  }
}

void lexer_state_machine(lexer_t *lexer, char input) {
  switch (lexer->current) {
  case LEXER_STATE_START:
    lexer->current = handle_lexer_start(lexer, input);
    break;
  case LEXER_STATE_VALUE:
    lexer->current = handle_value(lexer, input);
    break;
  case LEXER_STATE_OBJECT_START:
    stack_push(&(lexer->stack), LEXER_STATE_OBJECT);
    // break is left out intentionally here to utilise the fallthrough behaviour
    // of the switch statement
  case LEXER_STATE_OBJECT:
    lexer->current = handle_object(lexer, input);
    break;
  case LEXER_STATE_ARRAY_START:
    stack_push(&(lexer->stack), LEXER_STATE_ARRAY);
    // break is left out intentionally here to utilise the fallthrough behaviour
    // of the switch statement
  case LEXER_STATE_ARRAY:
    lexer->current = handle_array(lexer, input);
    break;
  case LEXER_STATE_OBJECT_END:
  case LEXER_STATE_ARRAY_END:
    if (lexer->stack.size > 1) {
      stack_pop(&(lexer->stack));

      lexer->current = lexer->stack.stack[lexer->stack.size - 1];
    } else {
      lexer->current = LEXER_STATE_LAST_COLLECTION;
    }

    handle_input_after_collection_end(lexer, input);

    break;
  case LEXER_STATE_KEY:
    lexer->current = handle_key(lexer, input);
    break;
  case LEXER_STATE_DECIMAL:
    lexer->current = handle_decimal(lexer, input);
    break;
  case LEXER_STATE_NUMBER:
    lexer->current = handle_number(lexer, input);
    break;
  case LEXER_STATE_FRACTION:
    lexer->current = handle_fraction(lexer, input);
    break;
  case LEXER_STATE_EXPONENT:
    lexer->current = handle_exponent(lexer, input);
    break;
  case LEXER_STATE_EXP_SIGN:
    lexer->current = handle_exp_sign(lexer, input);
    break;
  case LEXER_STATE_POWER:
    lexer->current = handle_power(lexer, input);
    break;
  case LEXER_STATE_NUMBER_END:
    lexer->current = handle_number_end(lexer, input);
    break;
  case LEXER_STATE_STRING:
    lexer->current = handle_string(lexer, input);
    break;
  case LEXER_STATE_STRING_END:
    lexer->current = handle_string_end(lexer, input);
    break;
  case LEXER_STATE_ESCAPE_SEQUENCE:
    lexer->current = handle_escape_sequence(lexer, input);
    break;
  case LEXER_STATE_UNICODE_HEX:
    lexer->current = handle_unicode_sequence(lexer, input);
    break;
  case LEXER_STATE_TRUE:
    lexer->current = handle_true(lexer, input);
    break;
  case LEXER_STATE_FALSE:
    lexer->current = handle_false(lexer, input);
    break;
  case LEXER_STATE_NULL:
    lexer->current = handle_null(lexer, input);
    break;
  case LEXER_STATE_KEYWORD_END:
    lexer->current = handle_keyword_end(lexer, input);
    break;
  case LEXER_STATE_LAST_COLLECTION:
    lexer->current = handle_last_collection(input);
    break;
  case LEXER_STATE_ERROR:
  case COUNT_LEXER_STATES:
    lexer->current = LEXER_STATE_ERROR;
    break;
  }
}

lexer_state_t handle_lexer_start(lexer_t *lexer, char input) {
  if (isspace(input)) {
    return LEXER_STATE_START;
  }

  switch (input) {
  case '{':
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
              (token_value_t){0});

    return LEXER_STATE_OBJECT_START;
  case '[':
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
              (token_value_t){0});

    return LEXER_STATE_ARRAY_START;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_last_collection(char input) {
  if (isspace(input)) {
    return LEXER_STATE_LAST_COLLECTION;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_collection_end(lexer_t *lexer, char input) {
  // No need to ignore space as this is only called when input is } or ]

  lexer->current = lexer->stack.stack[lexer->stack.size - 1];

  bool object_end = lexer->current == LEXER_STATE_OBJECT && input == '}';

  if (object_end) {
    token_t *token;

    if (lexer->token_ready) {
      lexer->has_extra_token = true;
      token = &(lexer->extra_token);
    } else {
      lexer->token_ready = true;
      token = &(lexer->token);
    }

    set_token(token, lexer->line, lexer->column, TK_R_BRACE,
              (token_value_t){0});

    return LEXER_STATE_OBJECT_END;
  }

  bool array_end = lexer->current == LEXER_STATE_ARRAY && input == ']';

  if (array_end) {
    token_t *token;

    if (lexer->token_ready) {
      lexer->has_extra_token = true;
      token = &(lexer->extra_token);
    } else {
      lexer->token_ready = true;
      token = &(lexer->token);
    }

    set_token(token, lexer->line, lexer->column, TK_R_BRACKET,
              (token_value_t){0});

    return LEXER_STATE_ARRAY_END;
  }

  return LEXER_STATE_ERROR;
}

void handle_input_after_collection_end(lexer_t *lexer, char input) {
  switch (input) {
  case '}':
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACE,
              (token_value_t){0});

    break;
  case ']':
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACKET,
              (token_value_t){0});

    break;
  }
}

lexer_state_t handle_object(lexer_t *lexer, char input) {
  if (isspace(input)) {
    return LEXER_STATE_OBJECT;
  } else if (input == '"') {
    stack_push(&(lexer->stack), LEXER_STATE_KEY);

    return LEXER_STATE_KEY;
  } else if (input == '}') {
    return handle_collection_end(lexer, input);
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_array(lexer_t *lexer, char input) {
  if (isspace(input)) {
    return LEXER_STATE_ARRAY;
  } else if (input == ']') {
    return handle_collection_end(lexer, input);
  }

  return handle_value(lexer, input);
}

lexer_state_t handle_key(lexer_t *lexer, char input) {
  dstr_append(&(lexer->current_string), input);

  return LEXER_STATE_STRING;
}

lexer_state_t handle_value(lexer_t *lexer, char input) {
  if (isspace(input)) {
    return LEXER_STATE_VALUE;
  } else if ((isdigit(input) && input != '0') || input == '-') {
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_NUMBER;
  }

  switch (input) {
  case '"':
    stack_push(&(lexer->stack), LEXER_STATE_VALUE);

    return LEXER_STATE_STRING;
  case '0':
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_DECIMAL;
  case '{':
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
              (token_value_t){0});

    return LEXER_STATE_OBJECT_START;
  case '[':
    lexer->token_ready = true;
    set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
              (token_value_t){0});

    return LEXER_STATE_ARRAY_START;
  case 't':
  case 'f':
  case 'n':
    append_to_lex_str(&(lexer->keyword), input);

    return handle_keyword(input);
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_string(lexer_t *lexer, char input) {
  switch (input) {
  case '\\':
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_ESCAPE_SEQUENCE;
  case '"': {
    lexer_state_t string_type = lexer->stack.stack[lexer->stack.size - 1];

    lexer->token_ready = true;
    token_t *token = &(lexer->token);
    u64 column = lexer->column - dstr_length(lexer->current_string);
    token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};

    if (string_type == LEXER_STATE_KEY) {
      set_token(token, lexer->line, column, TK_STR_KEY, value);
    } else if (string_type == LEXER_STATE_VALUE) {
      set_token(token, lexer->line, column, TK_STR_VAL, value);
    }

    return LEXER_STATE_STRING_END;
  }
  }

  dstr_append(&(lexer->current_string), input);

  return LEXER_STATE_STRING;
}

lexer_state_t handle_string_end(lexer_t *lexer, char input) {
  if (isspace(input)) {
    return LEXER_STATE_STRING_END;
  }

  lexer->current = stack_pop(&(lexer->stack));

  bool key_end = lexer->current == LEXER_STATE_KEY && input == ':';

  if (key_end) {
    return LEXER_STATE_VALUE;
  }

  bool value_end = lexer->current == LEXER_STATE_VALUE && input == ',';

  if (value_end) {
    return lexer->stack.stack[lexer->stack.size - 1];
  }

  bool collection_end = input == '}' || input == ']';

  return collection_end ? handle_collection_end(lexer, input)
                        : LEXER_STATE_ERROR;
}

lexer_state_t handle_escape_sequence(lexer_t *lexer, char input) {
  dstr_append(&(lexer->current_string), input);

  switch (input) {
  case '"':
  case '/':
  case '\\':
  case 'b':
  case 'f':
  case 'n':
  case 'r':
  case 't':
    return LEXER_STATE_STRING;
  case 'u':
    return LEXER_STATE_UNICODE_HEX;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) {
  append_to_lex_str(&(lexer->codepoint), input);
  dstr_append(&(lexer->current_string), input);

  if (!ishex(input)) {
    clear_lex_str(&(lexer->codepoint));

    return LEXER_STATE_ERROR;
  } else if (lexer->codepoint.size == UNICODE_LENGTH) {
    clear_lex_str(&(lexer->codepoint));

    return LEXER_STATE_STRING;
  }

  return LEXER_STATE_UNICODE_HEX;
}

lexer_state_t handle_decimal(lexer_t *lexer, char input) {
  dstr_append(&(lexer->current_string), input);

  if (input == '.') {
    return LEXER_STATE_FRACTION;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_number(lexer_t *lexer, char input) {
  if (isdigit(input)) {
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_NUMBER;
  } else if (input == '.') {
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_FRACTION;
  } else if (input == '}' || input == ']') {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return handle_collection_end(lexer, input);
  } else if (input == ',') {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return lexer->stack.stack[lexer->stack.size - 1];
  } else if (isspace(input)) {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return LEXER_STATE_NUMBER_END;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_fraction(lexer_t *lexer, char input) {
  if (isdigit(input)) {
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_FRACTION;
  } else if (input == '}' || input == ']') {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return handle_collection_end(lexer, input);
  } else if (input == 'e' || input == 'E') {
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_EXPONENT;
  } else if (input == ',') {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return lexer->stack.stack[lexer->stack.size - 1];
  } else if (isspace(input)) {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return LEXER_STATE_NUMBER_END;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_exponent(lexer_t *lexer, char input) {
  dstr_append(&(lexer->current_string), input);

  if (isdigit(input)) {
    return LEXER_STATE_POWER;
  } else if (input == '+' || input == '-') {
    return LEXER_STATE_EXP_SIGN;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_exp_sign(lexer_t *lexer, char input) {
  dstr_append(&(lexer->current_string), input);

  if (isdigit(input)) {
    return LEXER_STATE_POWER;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_power(lexer_t *lexer, char input) {
  if (isdigit(input)) {
    dstr_append(&(lexer->current_string), input);

    return LEXER_STATE_POWER;
  } else if (input == '}' || input == ']') {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return handle_collection_end(lexer, input);
  } else if (input == ',') {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return lexer->stack.stack[lexer->stack.size - 1];
  } else if (isspace(input)) {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return LEXER_STATE_NUMBER_END;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_number_end(lexer_t *lexer, char input) {
  if (isspace(input)) {
    return LEXER_STATE_NUMBER_END;
  } else if (input == ',') {
    lexer->token_ready = true;
    u64 column = lexer->column - dstr_length(lexer->current_string);

    token_t token = dstr_to_numerical_token(lexer->current_string);

    set_token(&(lexer->token), lexer->line, column, token.type, token.value);

    return lexer->stack.stack[lexer->stack.size - 1];
  }

  bool collection_end = input == '}' || input == ']';

  return collection_end ? handle_collection_end(lexer, input)
                        : LEXER_STATE_ERROR;
}

lexer_state_t handle_keyword(char input) {
  switch (input) {
  case 't':
    return LEXER_STATE_TRUE;
  case 'f':
    return LEXER_STATE_FALSE;
  case 'n':
    return LEXER_STATE_NULL;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_true(lexer_t *lexer, char input) {
  char current[MAX_KEYWORD_LENGTH + 1];
  strcpy(current, lexer->keyword.keyword.str);

  append_to_lex_str(&(lexer->keyword), input);

  bool return_state_true = (strequal(current, "t") && input == 'r') ||
                           (strequal(current, "tr") && input == 'u');

  bool return_state_end = strequal(current, "tru") && input == 'e';

  if (return_state_true) {
    return LEXER_STATE_TRUE;
  } else if (return_state_end) {
    return LEXER_STATE_KEYWORD_END;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_false(lexer_t *lexer, char input) {
  char current[MAX_KEYWORD_LENGTH + 1];
  strcpy(current, lexer->keyword.keyword.str);

  append_to_lex_str(&(lexer->keyword), input);

  bool return_state_false = (strequal(current, "f") && input == 'a') ||
                            (strequal(current, "fa") && input == 'l') ||
                            (strequal(current, "fal") && input == 's');

  bool return_state_end = strequal(current, "fals") && input == 'e';

  if (return_state_false) {
    return LEXER_STATE_FALSE;
  } else if (return_state_end) {
    return LEXER_STATE_KEYWORD_END;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_null(lexer_t *lexer, char input) {
  char current[MAX_KEYWORD_LENGTH + 1];
  strcpy(current, lexer->keyword.keyword.str);

  append_to_lex_str(&(lexer->keyword), input);

  bool return_state_null = (strequal(current, "n") && input == 'u') ||
                           (strequal(current, "nu") && input == 'l');

  bool return_state_end = strequal(current, "nul") && input == 'l';

  if (return_state_null) {
    return LEXER_STATE_NULL;
  } else if (return_state_end) {
    return LEXER_STATE_KEYWORD_END;
  }

  return LEXER_STATE_ERROR;
}

lexer_state_t handle_keyword_end(lexer_t *lexer, char input) {
  const char *keyword = lexer->keyword.keyword.str;

  if (lexer->keyword.size > 0) {
    lexer->token_ready = true;
    token_t *token = &(lexer->token);
    u64 column = lexer->column - lexer->keyword.size;

    if (strequal(keyword, "null")) {
      set_token(token, lexer->line, column, TK_NULL, (token_value_t){0});
    } else if (strequal(keyword, "true")) {
      set_token(token, lexer->line, column, TK_BOOL,
                (token_value_t){.boolean = true});
    } else if (strequal(keyword, "false")) {
      set_token(token, lexer->line, column, TK_BOOL,
                (token_value_t){.boolean = false});
    }

    clear_lex_str(&(lexer->keyword));
  }

  if (isspace(input)) {
    return LEXER_STATE_KEYWORD_END;
  } else if (input == ',') {
    return lexer->stack.stack[lexer->stack.size - 1];
  }

  bool collection_end = input == '}' || input == ']';

  return collection_end ? handle_collection_end(lexer, input)
                        : LEXER_STATE_ERROR;
}
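The lexer_state_transitions.table file pulled into the state_table initialiser is not part of this diff, so its exact layout is not shown here. Conceptually it has to supply one row per lexer_state_t with one next state per lexer_input_t; a hypothetical fragment is sketched below, where every input name other than the ones visible in this diff (NEWLINE, CLOSE_BRACE, CLOSE_BRACKET, DOUBLE_QUOTE) is an assumption:

/* Hypothetical excerpt of lexer_state_transitions.table (the real file is not in this commit). */
[LEXER_STATE_START] = {
  [LEXER_INPUT_OPEN_BRACE] = LEXER_STATE_OBJECT_START,   /* assumed input name */
  [LEXER_INPUT_OPEN_BRACKET] = LEXER_STATE_ARRAY_START,  /* assumed input name */
  [LEXER_INPUT_WHITESPACE] = LEXER_STATE_START,          /* assumed input name */
  /* entries left unspecified default to enumerator 0, so LEXER_STATE_ERROR would
     either need to be enumerator 0 or be written out for the remaining inputs */
},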