#include "lexer.h"
#include "aliases.h"
#include "dstring.h"
// NOTE(review): the names of the system headers were lost from this file
// (six bare `#include` directives remained). The five below are the ones the
// code in this file demonstrably needs; the sixth could not be identified and
// should be restored from version control if anything else turns out to be
// missing.
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// Longest keyword the lexer recognises ("false")
#define MAX_KEYWORD_LENGTH 5
// Number of hexadecimal digits that follow a \u escape
#define UNICODE_LENGTH 4
// Maximum number of entries the state stack can hold
#define MAX_STACK_CAPACITY 1024
// Initial capacity requested for the dynamic string that collects the text of
// the token currently being lexed
#define STRING_BUF_START_CAPACITY 1024

// All the states of the lexer state machine
typedef enum {
  // GENERAL STATES
  LEXER_STATE_START,
  LEXER_STATE_ERROR,
  LEXER_STATE_VALUE,
  // COLLECTION STATES
  LEXER_STATE_OBJECT_START,
  LEXER_STATE_OBJECT,
  LEXER_STATE_OBJECT_END,
  LEXER_STATE_ARRAY_START,
  LEXER_STATE_ARRAY,
  LEXER_STATE_ARRAY_END,
  LEXER_STATE_LAST_COLLECTION,
  // OBJECT STATES
  LEXER_STATE_KEY,
  // NUMBER STATES
  LEXER_STATE_DECIMAL,
  LEXER_STATE_NUMBER,
  LEXER_STATE_FRACTION,
  LEXER_STATE_EXPONENT,
  LEXER_STATE_EXP_SIGN,
  LEXER_STATE_POWER,
  LEXER_STATE_NUMBER_END,
  // STRING STATES
  LEXER_STATE_STRING,
  LEXER_STATE_STRING_END,
  LEXER_STATE_ESCAPE_SEQUENCE,
  LEXER_STATE_UNICODE_HEX,
  // KEYWORD STATES
  LEXER_STATE_TRUE,
  LEXER_STATE_FALSE,
  LEXER_STATE_NULL,
  LEXER_STATE_KEYWORD_END,

  COUNT_LEXER_STATES,
} lexer_state_t;

// Fixed-capacity stack of lexer states. It records the collections that are
// currently open as well as whether the string being lexed is a key or a value
typedef struct {
  lexer_state_t stack[MAX_STACK_CAPACITY];
  u64 size;
} state_stack_t;

// Selects which member of the union in lexer_string_t is in use
typedef enum {
  LEXER_STRING_KEYWORD,
  LEXER_STRING_UNICODE,
} lex_str_type;

// Keyword buffer. The extra byte keeps the text null-terminated
typedef struct {
  char str[MAX_KEYWORD_LENGTH + 1];
} keyword_t;

// Hex digits of a \u escape. This buffer is NOT null-terminated
typedef struct {
  char codepoint[UNICODE_LENGTH];
} unicode_t;

// Small fixed-size string used for keywords and unicode escapes
typedef struct {
  lex_str_type type;
  // Number of characters currently stored
  u64 size;
  union {
    keyword_t keyword;
    unicode_t unicode;
  };
} lexer_string_t;

struct lexer {
  // Position in the input text
  u64 line;
  u64 column;
  // Current state of the state machine
  lexer_state_t current;
  state_stack_t stack;
  lexer_string_t keyword;
  lexer_string_t codepoint;
  // Text of the string or number currently being lexed
  dstr_t *current_string;
};

void stack_push(state_stack_t *stack, lexer_state_t state);
lexer_state_t stack_pop(state_stack_t *stack);

void append_to_lex_str(lexer_string_t *str, char input);
void clear_lex_str(lexer_string_t *str);
bool strequal(const char *first, const char *second);
bool is_valid_hex_char(const char input);
bool ishex(const char input);

void lexer_state_machine(lexer_t *lexer, char input);
lexer_state_t handle_lexer_start(char input);
lexer_state_t handle_last_collection(char input);
lexer_state_t handle_collection_end(lexer_t *lexer, char input);
lexer_state_t handle_object(lexer_t *lexer, char input);
lexer_state_t handle_array(lexer_t *lexer, char input);
lexer_state_t handle_key(lexer_t *lexer, char input);
lexer_state_t handle_value(lexer_t *lexer, char input);
lexer_state_t handle_string(lexer_t *lexer, char input);
lexer_state_t handle_string_end(lexer_t *lexer, char input);
lexer_state_t handle_escape_sequence(lexer_t *lexer, char input);
lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input);
lexer_state_t handle_decimal(lexer_t *lexer, char input);
lexer_state_t handle_number(lexer_t *lexer, char input);
lexer_state_t handle_fraction(lexer_t *lexer, char input);
lexer_state_t handle_exponent(lexer_t *lexer, char input);
lexer_state_t handle_exp_sign(lexer_t *lexer, char input);
lexer_state_t handle_power(lexer_t *lexer, char input);
lexer_state_t handle_number_end(lexer_t *lexer, char input);
lexer_state_t handle_keyword(char input);
lexer_state_t handle_true(lexer_t *lexer, char input);
lexer_state_t handle_false(lexer_t *lexer, char input);
lexer_state_t handle_null(lexer_t *lexer, char input);
lexer_state_t handle_keyword_end(lexer_t *lexer, char input);

// TODO (Abdelrahman): The printf calls in the state handlers are the exit
// points for the tokenisation function. Replace them once ready.
bool validate_json(char *json) { lexer_t lexer = {0}; lexer.line = 1; lexer.column = 0; lexer.current = LEXER_STATE_START; lexer.keyword.type = LEXER_STRING_KEYWORD; lexer.codepoint.type = LEXER_STRING_UNICODE; lexer.current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY); if (!lexer.current_string) { // TODO (Abdelrahman): This is fine for now, but it doesn't make sense to // return INVALID_JSON if string allocation fails return INVALID_JSON; } for (char *c = json; *c != '\0'; ++c) { lexer_state_machine(&lexer, *c); // Track the position in the text ++(lexer.column); if (*c == '\n') { ++(lexer.line); lexer.column = 0; } if (lexer.current == LEXER_STATE_ERROR) { return INVALID_JSON; } } return lexer.current == LEXER_STATE_LAST_COLLECTION || lexer.stack.size == 0; } void stack_push(state_stack_t *stack, lexer_state_t state) { if (stack->size + 1 >= MAX_STACK_CAPACITY) { return; } stack->stack[(stack->size)++] = state; } lexer_state_t stack_pop(state_stack_t *stack) { if (stack->size == 0) { return LEXER_STATE_ERROR; } lexer_state_t state = stack->stack[--(stack->size)]; return state; } void append_to_lex_str(lexer_string_t *lex_str, char input) { u64 capacity = 0; char *str = NULL; switch (lex_str->type) { case LEXER_STRING_KEYWORD: capacity = MAX_KEYWORD_LENGTH; str = lex_str->keyword.str; break; case LEXER_STRING_UNICODE: capacity = UNICODE_LENGTH; str = lex_str->unicode.codepoint; break; } if (lex_str->size + 1 > capacity) { return; } assert(str != NULL); str[(lex_str->size)++] = input; } void clear_lex_str(lexer_string_t *lex_str) { u64 capacity = 1; char *str = NULL; switch (lex_str->type) { case LEXER_STRING_KEYWORD: capacity += MAX_KEYWORD_LENGTH; str = lex_str->keyword.str; break; case LEXER_STRING_UNICODE: capacity += UNICODE_LENGTH; str = lex_str->unicode.codepoint; break; } assert(str != NULL); memset(str, 0, capacity); lex_str->size = 0; } bool strequal(const char *first, const char *second) { return strcmp(first, second) == 0; } bool 
is_valid_hex_char(const char input) { switch (input) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': return true; } return false; } bool ishex(const char input) { return isdigit(input) || is_valid_hex_char(input); } void lexer_state_machine(lexer_t *lexer, char input) { switch (lexer->current) { case LEXER_STATE_START: lexer->current = handle_lexer_start(input); break; case LEXER_STATE_VALUE: lexer->current = handle_value(lexer, input); break; case LEXER_STATE_OBJECT_START: stack_push(&(lexer->stack), LEXER_STATE_OBJECT); // break is left out intentionally here to utilise the fallthrough behaviour // of the switch statement case LEXER_STATE_OBJECT: lexer->current = handle_object(lexer, input); break; case LEXER_STATE_ARRAY_START: stack_push(&(lexer->stack), LEXER_STATE_ARRAY); // break is left out intentionally here to utilise the fallthrough behaviour // of the switch statement case LEXER_STATE_ARRAY: lexer->current = handle_array(lexer, input); break; case LEXER_STATE_OBJECT_END: case LEXER_STATE_ARRAY_END: if (lexer->stack.size > 1) { stack_pop(&(lexer->stack)); lexer->current = lexer->stack.stack[lexer->stack.size - 1]; } else { lexer->current = LEXER_STATE_LAST_COLLECTION; } break; case LEXER_STATE_KEY: lexer->current = handle_key(lexer, input); break; case LEXER_STATE_DECIMAL: lexer->current = handle_decimal(lexer, input); break; case LEXER_STATE_NUMBER: lexer->current = handle_number(lexer, input); break; case LEXER_STATE_FRACTION: lexer->current = handle_fraction(lexer, input); break; case LEXER_STATE_EXPONENT: lexer->current = handle_exponent(lexer, input); break; case LEXER_STATE_EXP_SIGN: lexer->current = handle_exp_sign(lexer, input); break; case LEXER_STATE_POWER: lexer->current = handle_power(lexer, input); break; case LEXER_STATE_NUMBER_END: lexer->current = handle_number_end(lexer, input); break; case LEXER_STATE_STRING: lexer->current = handle_string(lexer, input); break; 
case LEXER_STATE_STRING_END: lexer->current = handle_string_end(lexer, input); break; case LEXER_STATE_ESCAPE_SEQUENCE: lexer->current = handle_escape_sequence(lexer, input); break; case LEXER_STATE_UNICODE_HEX: lexer->current = handle_unicode_sequence(lexer, input); break; case LEXER_STATE_TRUE: lexer->current = handle_true(lexer, input); break; case LEXER_STATE_FALSE: lexer->current = handle_false(lexer, input); break; case LEXER_STATE_NULL: lexer->current = handle_null(lexer, input); break; case LEXER_STATE_KEYWORD_END: lexer->current = handle_keyword_end(lexer, input); break; case LEXER_STATE_LAST_COLLECTION: lexer->current = handle_last_collection(input); break; case LEXER_STATE_ERROR: case COUNT_LEXER_STATES: lexer->current = LEXER_STATE_ERROR; break; } } lexer_state_t handle_lexer_start(char input) { if (isspace(input)) { return LEXER_STATE_START; } switch (input) { case '{': printf("TK_L_BRACE\n"); return LEXER_STATE_OBJECT_START; case '[': printf("TK_L_BRACKET\n"); return LEXER_STATE_ARRAY_START; } return LEXER_STATE_ERROR; } lexer_state_t handle_last_collection(char input) { if (isspace(input)) { return LEXER_STATE_LAST_COLLECTION; } return LEXER_STATE_ERROR; } lexer_state_t handle_collection_end(lexer_t *lexer, char input) { // No need to ignore space as this is only called when input is } or ] lexer->current = lexer->stack.stack[lexer->stack.size - 1]; bool object_end = lexer->current == LEXER_STATE_OBJECT && input == '}'; if (object_end) { printf("TK_R_BRACE\n"); return LEXER_STATE_OBJECT_END; } bool array_end = lexer->current == LEXER_STATE_ARRAY && input == ']'; if (array_end) { printf("TK_R_BRACKET\n"); return LEXER_STATE_ARRAY_END; } return LEXER_STATE_ERROR; } lexer_state_t handle_object(lexer_t *lexer, char input) { if (isspace(input)) { return LEXER_STATE_OBJECT; } else if (input == '"') { stack_push(&(lexer->stack), LEXER_STATE_KEY); return LEXER_STATE_KEY; } else if (input == '}') { printf("TK_R_BRACE\n"); return handle_collection_end(lexer, 
input); } return LEXER_STATE_ERROR; } lexer_state_t handle_array(lexer_t *lexer, char input) { if (isspace(input)) { return LEXER_STATE_ARRAY; } else if (input == ']') { printf("TK_R_BRACKET\n"); return handle_collection_end(lexer, input); } return handle_value(lexer, input); } lexer_state_t handle_key(lexer_t *lexer, char input) { dstr_append(&(lexer->current_string), input); return LEXER_STATE_STRING; } lexer_state_t handle_value(lexer_t *lexer, char input) { if (isspace(input)) { return LEXER_STATE_VALUE; } else if (isdigit(input) && input != '0') { dstr_append(&(lexer->current_string), input); return LEXER_STATE_NUMBER; } switch (input) { case '"': stack_push(&(lexer->stack), LEXER_STATE_VALUE); return LEXER_STATE_STRING; case '0': dstr_append(&(lexer->current_string), input); return LEXER_STATE_DECIMAL; case '{': printf("TK_L_BRACE\n"); return LEXER_STATE_OBJECT_START; case '[': printf("TK_L_BRACKET\n"); return LEXER_STATE_ARRAY_START; case 't': case 'f': case 'n': append_to_lex_str(&(lexer->keyword), input); return handle_keyword(input); } return LEXER_STATE_ERROR; } lexer_state_t handle_string(lexer_t *lexer, char input) { switch (input) { case '\\': dstr_append(&(lexer->current_string), input); return LEXER_STATE_ESCAPE_SEQUENCE; case '"': printf("TK_STRING: %s\n", dstr_to_cstr(lexer->current_string)); return LEXER_STATE_STRING_END; } dstr_append(&(lexer->current_string), input); return LEXER_STATE_STRING; } lexer_state_t handle_string_end(lexer_t *lexer, char input) { if (isspace(input)) { return LEXER_STATE_STRING_END; } dstr_clear(lexer->current_string); lexer->current = stack_pop(&(lexer->stack)); bool key_end = lexer->current == LEXER_STATE_KEY && input == ':'; if (key_end) { printf("TK_COLON\n"); return LEXER_STATE_VALUE; } bool value_end = lexer->current == LEXER_STATE_VALUE && input == ','; if (value_end) { printf("TK_COMMA\n"); return lexer->stack.stack[lexer->stack.size - 1]; } bool collection_end = input == '}' || input == ']'; return 
collection_end ? handle_collection_end(lexer, input) : LEXER_STATE_ERROR; } lexer_state_t handle_escape_sequence(lexer_t *lexer, char input) { dstr_append(&(lexer->current_string), input); switch (input) { case '"': case '/': case '\\': case 'b': case 'f': case 'n': case 'r': case 't': return LEXER_STATE_STRING; case 'u': return LEXER_STATE_UNICODE_HEX; } return LEXER_STATE_ERROR; } lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) { append_to_lex_str(&(lexer->codepoint), input); dstr_append(&(lexer->current_string), input); if (!ishex(input)) { clear_lex_str(&(lexer->codepoint)); return LEXER_STATE_ERROR; } else if (lexer->codepoint.size == UNICODE_LENGTH) { clear_lex_str(&(lexer->codepoint)); return LEXER_STATE_STRING; } return LEXER_STATE_UNICODE_HEX; } lexer_state_t handle_decimal(lexer_t *lexer, char input) { dstr_append(&(lexer->current_string), input); if (input == '.') { return LEXER_STATE_FRACTION; } return LEXER_STATE_ERROR; } lexer_state_t handle_number(lexer_t *lexer, char input) { if (isdigit(input)) { dstr_append(&(lexer->current_string), input); return LEXER_STATE_NUMBER; } else if (input == '.') { dstr_append(&(lexer->current_string), input); return LEXER_STATE_FRACTION; } else if (input == '}' || input == ']') { printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string)); dstr_clear(lexer->current_string); return handle_collection_end(lexer, input); } else if (input == ',') { printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string)); dstr_clear(lexer->current_string); return lexer->stack.stack[lexer->stack.size - 1]; } else if (isspace(input)) { return LEXER_STATE_NUMBER_END; } return LEXER_STATE_ERROR; } lexer_state_t handle_fraction(lexer_t *lexer, char input) { if (isdigit(input)) { dstr_append(&(lexer->current_string), input); return LEXER_STATE_FRACTION; } else if (input == '}' || input == ']') { printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string)); dstr_clear(lexer->current_string); return 
handle_collection_end(lexer, input); } else if (input == 'e' || input == 'E') { dstr_append(&(lexer->current_string), input); return LEXER_STATE_EXPONENT; } else if (input == ',') { printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string)); dstr_clear(lexer->current_string); return lexer->stack.stack[lexer->stack.size - 1]; } else if (isspace(input)) { return LEXER_STATE_NUMBER_END; } return LEXER_STATE_ERROR; } lexer_state_t handle_exponent(lexer_t *lexer, char input) { dstr_append(&(lexer->current_string), input); if (isdigit(input)) { return LEXER_STATE_POWER; } else if (input == '+' || input == '-') { return LEXER_STATE_EXP_SIGN; } return LEXER_STATE_ERROR; } lexer_state_t handle_exp_sign(lexer_t *lexer, char input) { dstr_append(&(lexer->current_string), input); if (isdigit(input)) { return LEXER_STATE_POWER; } return LEXER_STATE_ERROR; } lexer_state_t handle_power(lexer_t *lexer, char input) { if (isdigit(input)) { dstr_append(&(lexer->current_string), input); return LEXER_STATE_POWER; } else if (input == '}' || input == ']') { printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string)); dstr_clear(lexer->current_string); return handle_collection_end(lexer, input); } else if (input == ',') { printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string)); dstr_clear(lexer->current_string); return lexer->stack.stack[lexer->stack.size - 1]; } else if (isspace(input)) { return LEXER_STATE_NUMBER_END; } return LEXER_STATE_ERROR; } lexer_state_t handle_number_end(lexer_t *lexer, char input) { printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string)); dstr_clear(lexer->current_string); if (isspace(input)) { return LEXER_STATE_NUMBER_END; } else if (input == ',') { return lexer->stack.stack[lexer->stack.size - 1]; } bool collection_end = input == '}' || input == ']'; return collection_end ? 
handle_collection_end(lexer, input) : LEXER_STATE_ERROR; } lexer_state_t handle_keyword(char input) { switch (input) { case 't': return LEXER_STATE_TRUE; case 'f': return LEXER_STATE_FALSE; case 'n': return LEXER_STATE_NULL; } return LEXER_STATE_ERROR; } lexer_state_t handle_true(lexer_t *lexer, char input) { char current[MAX_KEYWORD_LENGTH + 1]; strcpy(current, lexer->keyword.keyword.str); append_to_lex_str(&(lexer->keyword), input); bool return_state_true = (strequal(current, "t") && input == 'r') || (strequal(current, "tr") && input == 'u'); bool return_state_end = strequal(current, "tru") && input == 'e'; if (return_state_true) { return LEXER_STATE_TRUE; } else if (return_state_end) { return LEXER_STATE_KEYWORD_END; } return LEXER_STATE_ERROR; } lexer_state_t handle_false(lexer_t *lexer, char input) { char current[MAX_KEYWORD_LENGTH + 1]; strcpy(current, lexer->keyword.keyword.str); append_to_lex_str(&(lexer->keyword), input); bool return_state_false = (strequal(current, "f") && input == 'a') || (strequal(current, "fa") && input == 'l') || (strequal(current, "fal") && input == 's'); bool return_state_end = strequal(current, "fals") && input == 'e'; if (return_state_false) { return LEXER_STATE_FALSE; } else if (return_state_end) { return LEXER_STATE_KEYWORD_END; } return LEXER_STATE_ERROR; } lexer_state_t handle_null(lexer_t *lexer, char input) { char current[MAX_KEYWORD_LENGTH + 1]; strcpy(current, lexer->keyword.keyword.str); append_to_lex_str(&(lexer->keyword), input); bool return_state_null = (strequal(current, "n") && input == 'u') || (strequal(current, "nu") && input == 'l'); bool return_state_end = strequal(current, "nul") && input == 'l'; if (return_state_null) { return LEXER_STATE_NULL; } else if (return_state_end) { return LEXER_STATE_KEYWORD_END; } return LEXER_STATE_ERROR; } lexer_state_t handle_keyword_end(lexer_t *lexer, char input) { printf("TK_KEYWORD: %s\n", lexer->keyword.keyword.str); clear_lex_str(&(lexer->keyword)); if (isspace(input)) { 
return LEXER_STATE_KEYWORD_END; } else if (input == ',') { return lexer->stack.stack[lexer->stack.size - 1]; } bool collection_end = input == '}' || input == ']'; return collection_end ? handle_collection_end(lexer, input) : LEXER_STATE_ERROR; }