helloJSON/src/lexer/lexer.c

#include "lexer.h"
#include "aliases.h"
#include "dstring.h"
#include "lexer_data.h"
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_KEYWORD_LENGTH 5
#define UNICODE_LENGTH 4
#define MAX_STACK_CAPACITY 1024
#define STRING_BUF_START_CAPACITY 1024
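// Bounded stack of lexer states. It records the nesting of objects and
// arrays and, while inside a string, whether that string is a key or a value.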
typedef struct {
lexer_state_t stack[MAX_STACK_CAPACITY];
u64 size;
} state_stack_t;
typedef enum {
LEXER_STRING_KEYWORD,
LEXER_STRING_UNICODE,
} lex_str_type;
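// Small fixed-size scratch buffers: one for the literals "true", "false" and
// "null", one for the four hex digits of a \uXXXX escape.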
typedef struct {
char str[MAX_KEYWORD_LENGTH + 1];
} keyword_t;
typedef struct {
char codepoint[UNICODE_LENGTH];
} unicode_t;
typedef struct {
lex_str_type type;
u64 size;
union {
keyword_t keyword;
unicode_t unicode;
};
} lexer_string_t;
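// Full lexer context: position tracking (cursor/line/column), the current and
// pending states of the transition-table machine, the nesting stack, scratch
// buffers, the string being accumulated, and up to two staged tokens (a
// closing brace or bracket can become ready while another token is still
// pending, hence extra_token).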
struct lexer_s {
u64 cursor;
u64 line;
u64 column;
u64 text_length;
const char *text;
lexer_state_t current;
lexer_state_t next;
state_stack_t stack;
lexer_string_t keyword;
lexer_string_t codepoint;
dstr_t *current_string;
bool token_ready;
token_t token;
bool has_extra_token;
token_t extra_token;
dstr_t *error_message;
char current_char;
};
INTERNAL lexer_input_t char_type(char input);
INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value);
INTERNAL lexer_state_t stack_pop(state_stack_t *stack);
INTERNAL void append_to_lex_str(lexer_string_t *str, char input);
INTERNAL void clear_lex_str(lexer_string_t *str);
INTERNAL bool strequal(const char *const first, const char *const second);
INTERNAL bool is_valid_hex_char(const char input);
INTERNAL bool ishex(const char input);
INTERNAL token_t dstr_to_numerical_token(const dstr_t *str);
INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type,
token_value_t value);
INTERNAL void finalise_state_transition(lexer_t *lexer);
INTERNAL void post_keyword(lexer_t *lexer);
INTERNAL void set_numerical_token(lexer_t *lexer);
INTERNAL void emit_string_token(lexer_t *lexer);
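// current-state x input-class -> next-state. The table body is kept in a
// separate include so it can be maintained apart from the code.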
INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
#include "lexer_state_transitions.table"
};
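// Allocates and zero-initialises a lexer. A previously initialised lexer is
// freed first; on allocation failure *lexer is left NULL.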
void lexer_init(lexer_t **lexer) {
if (*lexer) {
lexer_free(lexer);
}
*lexer = (lexer_t *)malloc(sizeof(lexer_t));
if (!(*lexer)) {
return;
}
(*lexer)->cursor = 0;
(*lexer)->line = 1;
(*lexer)->column = 0;
(*lexer)->text_length = 0;
(*lexer)->text = "";
(*lexer)->current = LEXER_STATE_START;
(*lexer)->next = LEXER_STATE_START;
(*lexer)->keyword.type = LEXER_STRING_KEYWORD;
(*lexer)->codepoint.type = LEXER_STRING_UNICODE;
(*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY);
(*lexer)->error_message = dstr_with_capacity(STRING_BUF_START_CAPACITY);
(*lexer)->token_ready = false;
(*lexer)->token = (token_t){0};
(*lexer)->has_extra_token = false;
(*lexer)->extra_token = (token_t){0};
if (!((*lexer)->current_string) || !((*lexer)->error_message)) {
lexer_free(lexer);
}
}
void lexer_free(lexer_t **lexer) {
if (!(*lexer)) {
return;
}
dstr_free(&((*lexer)->current_string));
dstr_free(&((*lexer)->error_message));
free(*lexer);
*lexer = NULL;
}
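// Returns the next token from `text`. Pass the JSON text on the first call;
// pass NULL on later calls to keep lexing the same buffer. A typical driver
// loop might look like this sketch (the lex_result_t field names err/token
// and TK_NO_TOKEN being the zero token type are assumptions; see lexer.h for
// the exact definitions):
//
//   lexer_t *lexer = NULL;
//   lexer_init(&lexer);
//   lex_result_t result = get_next_token(lexer, json_text);
//   while (result.err.errno == LEX_ERR_NONE && result.token.type != TK_NO_TOKEN) {
//     print_token(result.token);
//     result = get_next_token(lexer, NULL);
//   }
//   if (result.err.errno != LEX_ERR_NONE) {
//     fprintf(stderr, "%s\n", result.err.msg);
//   }
//   lexer_free(&lexer);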
lex_result_t get_next_token(lexer_t *lexer, const char *text) {
if (text != NULL) {
lexer->cursor = 0;
lexer->text = text;
lexer->text_length = strlen(text);
}
dstr_clear(lexer->current_string);
// Hand out a token staged on the previous call (e.g. a closing brace seen
// right after another token) before consuming any more input.
if (lexer->has_extra_token) {
lexer->has_extra_token = false;
return (lex_result_t){
(lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
lexer->extra_token,
};
}
while (lexer->cursor < lexer->text_length) {
lexer->current_char = lexer->text[(lexer->cursor)++];
lexer_input_t input = char_type(lexer->current_char);
lexer->next = state_table[lexer->current][input];
finalise_state_transition(lexer);
if (input == LEXER_INPUT_NEWLINE) {
++(lexer->line);
lexer->column = 0;
} else {
++(lexer->column);
}
if (lexer->current == LEXER_STATE_ERROR) {
char msg[STRING_BUF_START_CAPACITY + 1];
memset(msg, 0, STRING_BUF_START_CAPACITY + 1);
u64 slice_length = 20;
char slice[slice_length];
snprintf(slice, slice_length, "%s", &(lexer->text[lexer->cursor - 1]));
snprintf(
msg, STRING_BUF_START_CAPACITY,
"\n(%llu:%llu) Encountered an error while parsing the following:\n%s",
(unsigned long long)lexer->line, (unsigned long long)lexer->column,
slice);
dstr_update(&(lexer->error_message), msg);
return (lex_result_t){
(lex_err_t){.errno = LEX_ERR_INVALID,
.msg = dstr_to_cstr(lexer->error_message)},
(token_t){0},
};
} else if (lexer->token_ready) {
lexer->token_ready = false;
return (lex_result_t){
(lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
lexer->token,
};
}
}
return (lex_result_t){
(lex_err_t){.errno = LEX_ERR_NONE, .msg = ""},
(token_t){0},
};
}
void print_token(token_t token) {
i32 num_padding = 4;
printf("{LINE: %*llu, COLUMN: %*llu, TYPE: ", num_padding,
(unsigned long long)token.line, num_padding,
(unsigned long long)token.column);
i32 token_type_padding = 15;
switch (token.type) {
case TK_NO_TOKEN:
printf("%*s, VALUE: N/A", token_type_padding, "TK_NO_TOKEN");
break;
case TK_L_BRACE:
printf("%*s, VALUE: N/A", token_type_padding, "TK_L_BRACE");
break;
case TK_R_BRACE:
printf("%*s, VALUE: N/A", token_type_padding, "TK_R_BRACE");
break;
case TK_L_BRACKET:
printf("%*s, VALUE: N/A", token_type_padding, "TK_L_BRACKET");
break;
case TK_R_BRACKET:
printf("%*s, VALUE: N/A", token_type_padding, "TK_R_BRACKET");
break;
case TK_NULL:
printf("%*s, VALUE: N/A", token_type_padding, "TK_NULL");
break;
case TK_BOOL:
printf("%*s, VALUE: %s", token_type_padding, "TK_BOOL",
token.value.boolean ? "true" : "false");
break;
case TK_STR_KEY:
printf("%*s, VALUE: %s", token_type_padding, "TK_STR_KEY",
token.value.string);
break;
case TK_STR_VAL:
printf("%*s, VALUE: %s", token_type_padding, "TK_STR_VAL",
token.value.string);
break;
case TK_INTEGER:
printf("%*s, VALUE: %lld", token_type_padding, "TK_INTEGER",
(long long)token.value.num_int);
break;
case TK_DOUBLE:
printf("%*s, VALUE: %f", token_type_padding, "TK_DOUBLE",
token.value.num_frac);
break;
}
printf("}\n");
}
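// Classifies a raw character into the input class used to index the
// transition table. Structural characters, digits, sign/exponent characters
// and the letters that appear in keywords and escapes each get their own
// class; everything else is LEXER_INPUT_OTHER.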
INTERNAL lexer_input_t char_type(char input) {
if (input == '\n') {
return LEXER_INPUT_NEWLINE;
} else if (isspace(input)) {
return LEXER_INPUT_WHITE_SPACE;
} else if (input >= '1' && input <= '9') {
return LEXER_INPUT_NON_ZERO;
}
switch (input) {
case '{':
return LEXER_INPUT_OPEN_BRACE;
case '}':
return LEXER_INPUT_CLOSE_BRACE;
case '[':
return LEXER_INPUT_OPEN_BRACKET;
case ']':
return LEXER_INPUT_CLOSE_BRACKET;
case ',':
return LEXER_INPUT_COMMA;
case ':':
return LEXER_INPUT_COLON;
case '"':
return LEXER_INPUT_DOUBLE_QUOTE;
case '\\':
return LEXER_INPUT_BACK_SLASH;
case '/':
return LEXER_INPUT_FORWARD_SLASH;
case 'a':
return LEXER_INPUT_LOWER_A;
case 'b':
return LEXER_INPUT_LOWER_B;
case 'c':
return LEXER_INPUT_LOWER_C;
case 'd':
return LEXER_INPUT_LOWER_D;
case 'e':
return LEXER_INPUT_LOWER_E;
case 'f':
return LEXER_INPUT_LOWER_F;
case 'l':
return LEXER_INPUT_LOWER_L;
case 'n':
return LEXER_INPUT_LOWER_N;
case 'r':
return LEXER_INPUT_LOWER_R;
case 's':
return LEXER_INPUT_LOWER_S;
case 't':
return LEXER_INPUT_LOWER_T;
case 'u':
return LEXER_INPUT_LOWER_U;
case 'A':
return LEXER_INPUT_UPPER_A;
case 'B':
return LEXER_INPUT_UPPER_B;
case 'C':
return LEXER_INPUT_UPPER_C;
case 'D':
return LEXER_INPUT_UPPER_D;
case 'E':
return LEXER_INPUT_UPPER_E;
case 'F':
return LEXER_INPUT_UPPER_F;
case '-':
return LEXER_INPUT_MINUS;
case '+':
return LEXER_INPUT_PLUS;
case '.':
return LEXER_INPUT_DECIMAL;
case '0':
return LEXER_INPUT_ZERO;
default:
return LEXER_INPUT_OTHER;
}
}
void stack_push(state_stack_t *stack, lexer_state_t state) {
if (stack->size >= MAX_STACK_CAPACITY) {
return;
}
stack->stack[(stack->size)++] = state;
}
lexer_state_t stack_pop(state_stack_t *stack) {
if (stack->size == 0) {
return LEXER_STATE_ERROR;
}
lexer_state_t state = stack->stack[--(stack->size)];
return state;
}
void append_to_lex_str(lexer_string_t *lex_str, char input) {
u64 capacity = 0;
char *str = NULL;
switch (lex_str->type) {
case LEXER_STRING_KEYWORD:
capacity = MAX_KEYWORD_LENGTH;
str = lex_str->keyword.str;
break;
case LEXER_STRING_UNICODE:
capacity = UNICODE_LENGTH;
str = lex_str->unicode.codepoint;
break;
}
if (lex_str->size + 1 > capacity) {
return;
}
assert(str != NULL);
str[(lex_str->size)++] = input;
}
void clear_lex_str(lexer_string_t *lex_str) {
u64 capacity = 0;
char *str = NULL;
switch (lex_str->type) {
case LEXER_STRING_KEYWORD:
// keyword.str has room for a terminating NUL, so zero the whole buffer
capacity = MAX_KEYWORD_LENGTH + 1;
str = lex_str->keyword.str;
break;
case LEXER_STRING_UNICODE:
// codepoint is exactly UNICODE_LENGTH bytes; zeroing one byte more would
// write past the end of the array
capacity = UNICODE_LENGTH;
str = lex_str->unicode.codepoint;
break;
}
assert(str != NULL);
memset(str, 0, capacity);
lex_str->size = 0;
}
bool strequal(const char *const first, const char *const second) {
return strcmp(first, second) == 0;
}
bool is_valid_hex_char(const char input) {
return (input >= 'A' && input <= 'F') || (input >= 'a' && input <= 'f');
}
bool ishex(const char input) {
return isdigit(input) || is_valid_hex_char(input);
}
token_t dstr_to_numerical_token(const dstr_t *str) {
token_t token = {0};
// A literal is fractional if it contains a decimal point or an exponent:
// "2e10" has no '.' but must still be parsed as a double.
bool is_double = dstr_find(str, ".") != -1 || dstr_find(str, "e") != -1 ||
dstr_find(str, "E") != -1;
token.type = is_double ? TK_DOUBLE : TK_INTEGER;
if (is_double) {
token.value.num_frac = strtod(dstr_to_cstr(str), NULL);
} else {
// strtoll keeps the full 64-bit range; atol truncates where long is 32 bits
token.value.num_int = strtoll(dstr_to_cstr(str), NULL, 10);
}
return token;
}
void set_token(token_t *token, u64 line, u64 column, token_type type,
token_value_t value) {
*token = (token_t){
.line = line,
.column = column,
.type = type,
.value = value,
};
}
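// Applies the side effects of the transition picked from state_table:
// emitting tokens, pushing nesting states onto the stack, and accumulating
// string characters, then commits lexer->next into lexer->current.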
void finalise_state_transition(lexer_t *lexer) {
switch (lexer->next) {
case LEXER_STATE_OBJECT_START:
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
(token_value_t){0});
stack_push(&(lexer->stack), LEXER_STATE_OBJECT);
lexer->next = LEXER_STATE_OBJECT;
break;
case LEXER_STATE_OBJECT_END:
break;
case LEXER_STATE_ARRAY_START:
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
(token_value_t){0});
stack_push(&(lexer->stack), LEXER_STATE_ARRAY);
lexer->next = LEXER_STATE_ARRAY;
break;
case LEXER_STATE_ARRAY_END:
break;
case LEXER_STATE_KEY:
if (lexer->current == LEXER_STATE_OBJECT) {
stack_push(&(lexer->stack), LEXER_STATE_KEY);
lexer->next = LEXER_STATE_STRING;
}
break;
case LEXER_STATE_KEY_END:
lexer->next = LEXER_STATE_VALUE;
break;
case LEXER_STATE_STRING:
case LEXER_STATE_ESCAPE_SEQUENCE:
case LEXER_STATE_UNICODE_HEX1:
case LEXER_STATE_UNICODE_HEX2:
case LEXER_STATE_UNICODE_HEX3:
case LEXER_STATE_UNICODE_HEX4:
dstr_append(&(lexer->current_string), lexer->current_char);
break;
case LEXER_STATE_STRING_END:
emit_string_token(lexer);
break;
case LEXER_STATE_TRUE:
case LEXER_STATE_FALSE:
case LEXER_STATE_NULL:
post_keyword(lexer);
break;
case LEXER_STATE_VALUE_END:
switch (lexer->current) {
case LEXER_STATE_NUMBER:
case LEXER_STATE_FRACTION:
case LEXER_STATE_POWER:
case LEXER_STATE_NUMBER_END:
set_numerical_token(lexer);
break;
default:
break;
}
lexer->next = lexer->stack.stack[lexer->stack.size - 1];
break;
case LEXER_STATE_NUMBER_END:
switch (lexer->current) {
case LEXER_STATE_NUMBER:
case LEXER_STATE_FRACTION:
case LEXER_STATE_POWER:
set_numerical_token(lexer);
break;
default:
break;
}
break;
}
lexer->current = lexer->next;
}
void post_keyword(lexer_t *lexer) {
u64 keyword_char_count;
u64 column;
token_t *token = &(lexer->token);
switch (lexer->current) {
case LEXER_STATE_NULL:
keyword_char_count = 4;
column = lexer->column - keyword_char_count;
set_token(token, lexer->line, column, TK_NULL, (token_value_t){0});
break;
case LEXER_STATE_TRUE:
keyword_char_count = 4;
column = lexer->column - keyword_char_count;
set_token(token, lexer->line, column, TK_BOOL,
(token_value_t){.boolean = true});
break;
case LEXER_STATE_FALSE:
keyword_char_count = 5;
column = lexer->column - keyword_char_count;
set_token(token, lexer->line, column, TK_BOOL,
(token_value_t){.boolean = false});
break;
default:
lexer->next = LEXER_STATE_ERROR;
return;
}
lexer->token_ready = true;
lexer->next = LEXER_STATE_KEYWORD_END;
}
void set_numerical_token(lexer_t *lexer) {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
}
// Emits the TK_STR_KEY or TK_STR_VAL token for the string that has just been
// closed (table-driven path).
void emit_string_token(lexer_t *lexer) {
lexer_state_t string_type = lexer->stack.stack[lexer->stack.size - 1];
lexer->token_ready = true;
token_t *token = &(lexer->token);
u64 column = lexer->column - dstr_length(lexer->current_string);
token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};
if (string_type == LEXER_STATE_KEY) {
set_token(token, lexer->line, column, TK_STR_KEY, value);
} else if (string_type == LEXER_STATE_VALUE) {
set_token(token, lexer->line, column, TK_STR_VAL, value);
}
}
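// Hand-written dispatch variant of the state machine: each state has its own
// handle_* function that consumes one character and returns the next state.
// get_next_token above drives the table-driven path instead.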
void lexer_state_machine(lexer_t *lexer, char input) {
switch (lexer->current) {
case LEXER_STATE_START:
lexer->current = handle_lexer_start(lexer, input);
break;
case LEXER_STATE_VALUE:
lexer->current = handle_value(lexer, input);
break;
case LEXER_STATE_OBJECT_START:
stack_push(&(lexer->stack), LEXER_STATE_OBJECT);
// break is left out intentionally here to utilise the fallthrough behaviour
// of the switch statement
case LEXER_STATE_OBJECT:
lexer->current = handle_object(lexer, input);
break;
case LEXER_STATE_ARRAY_START:
stack_push(&(lexer->stack), LEXER_STATE_ARRAY);
// break is left out intentionally here to utilise the fallthrough behaviour
// of the switch statement
case LEXER_STATE_ARRAY:
lexer->current = handle_array(lexer, input);
break;
case LEXER_STATE_OBJECT_END:
case LEXER_STATE_ARRAY_END:
if (lexer->stack.size > 1) {
stack_pop(&(lexer->stack));
lexer->current = lexer->stack.stack[lexer->stack.size - 1];
} else {
lexer->current = LEXER_STATE_LAST_COLLECTION;
}
handle_input_after_collection_end(lexer, input);
break;
case LEXER_STATE_KEY:
lexer->current = handle_key(lexer, input);
break;
case LEXER_STATE_DECIMAL:
lexer->current = handle_decimal(lexer, input);
break;
case LEXER_STATE_NUMBER:
lexer->current = handle_number(lexer, input);
break;
case LEXER_STATE_FRACTION:
lexer->current = handle_fraction(lexer, input);
break;
case LEXER_STATE_EXPONENT:
lexer->current = handle_exponent(lexer, input);
break;
case LEXER_STATE_EXP_SIGN:
lexer->current = handle_exp_sign(lexer, input);
break;
case LEXER_STATE_POWER:
lexer->current = handle_power(lexer, input);
break;
case LEXER_STATE_NUMBER_END:
lexer->current = handle_number_end(lexer, input);
break;
case LEXER_STATE_STRING:
lexer->current = handle_string(lexer, input);
break;
case LEXER_STATE_STRING_END:
lexer->current = handle_string_end(lexer, input);
break;
case LEXER_STATE_ESCAPE_SEQUENCE:
lexer->current = handle_escape_sequence(lexer, input);
break;
case LEXER_STATE_UNICODE_HEX:
lexer->current = handle_unicode_sequence(lexer, input);
break;
case LEXER_STATE_TRUE:
lexer->current = handle_true(lexer, input);
break;
case LEXER_STATE_FALSE:
lexer->current = handle_false(lexer, input);
break;
case LEXER_STATE_NULL:
lexer->current = handle_null(lexer, input);
break;
case LEXER_STATE_KEYWORD_END:
lexer->current = handle_keyword_end(lexer, input);
break;
case LEXER_STATE_LAST_COLLECTION:
lexer->current = handle_last_collection(input);
break;
case LEXER_STATE_ERROR:
case COUNT_LEXER_STATES:
lexer->current = LEXER_STATE_ERROR;
break;
}
}
lexer_state_t handle_lexer_start(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_START;
}
switch (input) {
case '{':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
(token_value_t){0});
return LEXER_STATE_OBJECT_START;
case '[':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
(token_value_t){0});
return LEXER_STATE_ARRAY_START;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_last_collection(char input) {
if (isspace(input)) {
return LEXER_STATE_LAST_COLLECTION;
}
return LEXER_STATE_ERROR;
}
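// Called when '}' or ']' closes the collection on top of the stack. If a
// token is already staged (e.g. the number that the brace terminates), the
// closing token is parked in extra_token and handed out on the next call.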
lexer_state_t handle_collection_end(lexer_t *lexer, char input) {
// No need to ignore space as this is only called when input is } or ]
lexer->current = lexer->stack.stack[lexer->stack.size - 1];
bool object_end = lexer->current == LEXER_STATE_OBJECT && input == '}';
if (object_end) {
token_t *token;
if (lexer->token_ready) {
lexer->has_extra_token = true;
token = &(lexer->extra_token);
} else {
lexer->token_ready = true;
token = &(lexer->token);
}
set_token(token, lexer->line, lexer->column, TK_R_BRACE,
(token_value_t){0});
return LEXER_STATE_OBJECT_END;
}
bool array_end = lexer->current == LEXER_STATE_ARRAY && input == ']';
if (array_end) {
token_t *token;
if (lexer->token_ready) {
lexer->has_extra_token = true;
token = &(lexer->extra_token);
} else {
lexer->token_ready = true;
token = &(lexer->token);
}
set_token(token, lexer->line, lexer->column, TK_R_BRACKET,
(token_value_t){0});
return LEXER_STATE_ARRAY_END;
}
return LEXER_STATE_ERROR;
}
void handle_input_after_collection_end(lexer_t *lexer, char input) {
switch (input) {
case '}':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACE,
(token_value_t){0});
break;
case ']':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACKET,
(token_value_t){0});
break;
}
}
lexer_state_t handle_object(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_OBJECT;
} else if (input == '"') {
stack_push(&(lexer->stack), LEXER_STATE_KEY);
return LEXER_STATE_KEY;
} else if (input == '}') {
return handle_collection_end(lexer, input);
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_array(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_ARRAY;
} else if (input == ']') {
return handle_collection_end(lexer, input);
}
return handle_value(lexer, input);
}
lexer_state_t handle_key(lexer_t *lexer, char input) {
// Delegate to handle_string so that an immediate closing quote (empty key)
// or a leading escape sequence is handled the same way as in any string.
return handle_string(lexer, input);
}
lexer_state_t handle_value(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_VALUE;
} else if ((isdigit(input) && input != '0') || input == '-') {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_NUMBER;
}
switch (input) {
case '"':
stack_push(&(lexer->stack), LEXER_STATE_VALUE);
return LEXER_STATE_STRING;
case '0':
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_DECIMAL;
case '{':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
(token_value_t){0});
return LEXER_STATE_OBJECT_START;
case '[':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
(token_value_t){0});
return LEXER_STATE_ARRAY_START;
case 't':
case 'f':
case 'n':
append_to_lex_str(&(lexer->keyword), input);
return handle_keyword(input);
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_string(lexer_t *lexer, char input) {
switch (input) {
case '\\':
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_ESCAPE_SEQUENCE;
case '"': {
lexer_state_t string_type = lexer->stack.stack[lexer->stack.size - 1];
lexer->token_ready = true;
token_t *token = &(lexer->token);
u64 column = lexer->column - dstr_length(lexer->current_string);
token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};
if (string_type == LEXER_STATE_KEY) {
set_token(token, lexer->line, column, TK_STR_KEY, value);
} else if (string_type == LEXER_STATE_VALUE) {
set_token(token, lexer->line, column, TK_STR_VAL, value);
}
return LEXER_STATE_STRING_END;
}
}
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_STRING;
}
lexer_state_t handle_string_end(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_STRING_END;
}
lexer->current = stack_pop(&(lexer->stack));
bool key_end = lexer->current == LEXER_STATE_KEY && input == ':';
if (key_end) {
return LEXER_STATE_VALUE;
}
bool value_end = lexer->current == LEXER_STATE_VALUE && input == ',';
if (value_end) {
return lexer->stack.stack[lexer->stack.size - 1];
}
bool collection_end = input == '}' || input == ']';
return collection_end ? handle_collection_end(lexer, input)
: LEXER_STATE_ERROR;
}
lexer_state_t handle_escape_sequence(lexer_t *lexer, char input) {
dstr_append(&(lexer->current_string), input);
switch (input) {
case '"':
case '/':
case '\\':
case 'b':
case 'f':
case 'n':
case 'r':
case 't':
return LEXER_STATE_STRING;
case 'u':
return LEXER_STATE_UNICODE_HEX;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) {
append_to_lex_str(&(lexer->codepoint), input);
dstr_append(&(lexer->current_string), input);
if (!ishex(input)) {
clear_lex_str(&(lexer->codepoint));
return LEXER_STATE_ERROR;
} else if (lexer->codepoint.size == UNICODE_LENGTH) {
clear_lex_str(&(lexer->codepoint));
return LEXER_STATE_STRING;
}
return LEXER_STATE_UNICODE_HEX;
}
lexer_state_t handle_decimal(lexer_t *lexer, char input) {
if (input == '.') {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_FRACTION;
} else if (input == 'e' || input == 'E') {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_EXPONENT;
} else if (isdigit(input)) {
return LEXER_STATE_ERROR; // "01", "007", ... are not valid JSON numbers
}
// a bare 0 terminated by ',', '}', ']' or whitespace: let handle_number emit it
return handle_number(lexer, input);
}
lexer_state_t handle_number(lexer_t *lexer, char input) {
if (isdigit(input)) {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_NUMBER;
} else if (input == '.') {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_FRACTION;
} else if (input == '}' || input == ']') {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return handle_collection_end(lexer, input);
} else if (input == ',') {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return LEXER_STATE_NUMBER_END;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_fraction(lexer_t *lexer, char input) {
if (isdigit(input)) {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_FRACTION;
} else if (input == '}' || input == ']') {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return handle_collection_end(lexer, input);
} else if (input == 'e' || input == 'E') {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_EXPONENT;
} else if (input == ',') {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return LEXER_STATE_NUMBER_END;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_exponent(lexer_t *lexer, char input) {
dstr_append(&(lexer->current_string), input);
if (isdigit(input)) {
return LEXER_STATE_POWER;
} else if (input == '+' || input == '-') {
return LEXER_STATE_EXP_SIGN;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_exp_sign(lexer_t *lexer, char input) {
dstr_append(&(lexer->current_string), input);
if (isdigit(input)) {
return LEXER_STATE_POWER;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_power(lexer_t *lexer, char input) {
if (isdigit(input)) {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_POWER;
} else if (input == '}' || input == ']') {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return handle_collection_end(lexer, input);
} else if (input == ',') {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return LEXER_STATE_NUMBER_END;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_number_end(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_NUMBER_END;
} else if (input == ',') {
lexer->token_ready = true;
u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1];
}
bool collection_end = input == '}' || input == ']';
return collection_end ? handle_collection_end(lexer, input)
: LEXER_STATE_ERROR;
}
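// The keyword handlers below spell out "true", "false" and "null" one
// character at a time, using the keyword scratch buffer to track progress
// through the literal.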
lexer_state_t handle_keyword(char input) {
switch (input) {
case 't':
return LEXER_STATE_TRUE;
case 'f':
return LEXER_STATE_FALSE;
case 'n':
return LEXER_STATE_NULL;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_true(lexer_t *lexer, char input) {
char current[MAX_KEYWORD_LENGTH + 1];
strcpy(current, lexer->keyword.keyword.str);
append_to_lex_str(&(lexer->keyword), input);
bool return_state_true = (strequal(current, "t") && input == 'r') ||
(strequal(current, "tr") && input == 'u');
bool return_state_end = strequal(current, "tru") && input == 'e';
if (return_state_true) {
return LEXER_STATE_TRUE;
} else if (return_state_end) {
return LEXER_STATE_KEYWORD_END;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_false(lexer_t *lexer, char input) {
char current[MAX_KEYWORD_LENGTH + 1];
strcpy(current, lexer->keyword.keyword.str);
append_to_lex_str(&(lexer->keyword), input);
bool return_state_false = (strequal(current, "f") && input == 'a') ||
(strequal(current, "fa") && input == 'l') ||
(strequal(current, "fal") && input == 's');
bool return_state_end = strequal(current, "fals") && input == 'e';
if (return_state_false) {
return LEXER_STATE_FALSE;
} else if (return_state_end) {
return LEXER_STATE_KEYWORD_END;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_null(lexer_t *lexer, char input) {
char current[MAX_KEYWORD_LENGTH + 1];
strcpy(current, lexer->keyword.keyword.str);
append_to_lex_str(&(lexer->keyword), input);
bool return_state_null = (strequal(current, "n") && input == 'u') ||
(strequal(current, "nu") && input == 'l');
bool return_state_end = strequal(current, "nul") && input == 'l';
if (return_state_null) {
return LEXER_STATE_NULL;
} else if (return_state_end) {
return LEXER_STATE_KEYWORD_END;
}
return LEXER_STATE_ERROR;
}
lexer_state_t handle_keyword_end(lexer_t *lexer, char input) {
const char *keyword = lexer->keyword.keyword.str;
if (lexer->keyword.size > 0) {
lexer->token_ready = true;
token_t *token = &(lexer->token);
u64 column = lexer->column - lexer->keyword.size;
if (strequal(keyword, "null")) {
set_token(token, lexer->line, column, TK_NULL, (token_value_t){0});
} else if (strequal(keyword, "true")) {
set_token(token, lexer->line, column, TK_BOOL,
(token_value_t){.boolean = true});
} else if (strequal(keyword, "false")) {
set_token(token, lexer->line, column, TK_BOOL,
(token_value_t){.boolean = false});
}
clear_lex_str(&(lexer->keyword));
}
if (isspace(input)) {
return LEXER_STATE_KEYWORD_END;
} else if (input == ',') {
return lexer->stack.stack[lexer->stack.size - 1];
}
bool collection_end = input == '}' || input == ']';
return collection_end ? handle_collection_end(lexer, input)
: LEXER_STATE_ERROR;
}