From ed9b5fb638a49a4749ee3b82f4d3b15b198fbf04 Mon Sep 17 00:00:00 2001
From: Abdelrahman Said
Date: Sat, 17 Jun 2023 22:10:48 +0100
Subject: [PATCH] Add keyword support

Teach the lexer state machine to recognise the literals `true`,
`false` and `null`.

* Replace the single placeholder state LEXER_STATE_KEYWORD with one
  state per literal (LEXER_STATE_TRUE, LEXER_STATE_FALSE,
  LEXER_STATE_NULL) plus LEXER_STATE_KEYWORD_END.
* Replace the raw `current_keyword` char array in `struct lexer` with
  a `keyword_t` (length + NUL-terminated buffer) and add the helpers
  append_to_keyword(), clear_keyword() and strequal().
  append_to_keyword() silently ignores input once MAX_KEYWORD_LENGTH
  characters are stored, so the terminating NUL is never overwritten.
* handle_value() stores the first character ('t', 'f' or 'n') and
  dispatches through handle_keyword() to the matching literal state.
* handle_true()/handle_false()/handle_null() compare the prefix that
  was collected *before* the current character against the expected
  prefix and check the current character; anything else yields
  LEXER_STATE_ERROR.
* handle_keyword_end() clears the keyword buffer, skips whitespace,
  returns to the state on top of the collection stack on ',' and
  delegates '}' / ']' to handle_collection_end().  The character is
  cast to unsigned char before being passed to isspace(), because
  passing a negative plain char to a <ctype.h> function is undefined
  behaviour.
* Drop the clang-format off/on guards around the state enum and
  reword the intentional-fallthrough comments.

compile_commands.json only picks up regenerated temporary object
file names.
---
 compile_commands.json    |   8 +-
 src/lexer/lexer_states.c | 192 +++++++++++++++++++++++++++++++++------
 2 files changed, 168 insertions(+), 32 deletions(-)

diff --git a/compile_commands.json b/compile_commands.json
index 9883399..674238b 100644
--- a/compile_commands.json
+++ b/compile_commands.json
@@ -95,12 +95,12 @@
       "-x",
       "c",
       "-o",
-      "/tmp/main-d954cc.o",
+      "/tmp/main-69d465.o",
       "src/main.c"
     ],
     "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
     "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/main.c",
-    "output": "/tmp/main-d954cc.o"
+    "output": "/tmp/main-69d465.o"
   },
   {
     "arguments": [
@@ -162,11 +162,11 @@
       "-x",
       "c",
       "-o",
-      "/tmp/lexer_states-f7dff6.o",
+      "/tmp/lexer_states-ad0df4.o",
       "src/lexer/lexer_states.c"
     ],
     "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
     "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/lexer/lexer_states.c",
-    "output": "/tmp/lexer_states-f7dff6.o"
+    "output": "/tmp/lexer_states-ad0df4.o"
   }
 ]
diff --git a/src/lexer/lexer_states.c b/src/lexer/lexer_states.c
index e62b358..604aeda 100644
--- a/src/lexer/lexer_states.c
+++ b/src/lexer/lexer_states.c
@@ -3,28 +3,28 @@
 #include
 #include
 #include
+#include
+#include
 
 #define MAX_KEYWORD_LENGTH 5
 #define MAX_STACK_CAPACITY 1024
 
-// clang-format off
 typedef enum {
-    // GENERAL STATES
+  // GENERAL STATES
   LEXER_STATE_START,
   LEXER_STATE_ERROR,
   LEXER_STATE_VALUE,
-    LEXER_STATE_KEYWORD,
-    // COLLECTION STATES
-    LEXER_STATE_OBJECT_START,
-    LEXER_STATE_OBJECT,
-    LEXER_STATE_OBJECT_END,
-    LEXER_STATE_ARRAY_START,
-    LEXER_STATE_ARRAY,
-    LEXER_STATE_ARRAY_END,
-    LEXER_STATE_LAST_COLLECTION,
-    // OBJECT STATES
-    LEXER_STATE_KEY,
-    // NUMBER STATES
+  // COLLECTION STATES
+  LEXER_STATE_OBJECT_START,
+  LEXER_STATE_OBJECT,
+  LEXER_STATE_OBJECT_END,
+  LEXER_STATE_ARRAY_START,
+  LEXER_STATE_ARRAY,
+  LEXER_STATE_ARRAY_END,
+  LEXER_STATE_LAST_COLLECTION,
+  // OBJECT STATES
+  LEXER_STATE_KEY,
+  // NUMBER STATES
   LEXER_STATE_DECIMAL,
   LEXER_STATE_NUMBER,
   LEXER_STATE_FRACTION,
@@ -32,32 +32,45 @@ typedef enum {
   LEXER_STATE_EXP_SIGN,
   LEXER_STATE_POWER,
   LEXER_STATE_NUMBER_END,
-    // STRING STATES
-    LEXER_STATE_STRING,
-    LEXER_STATE_STRING_END,
-    LEXER_STATE_ESCAPE_SEQUENCE,
-    LEXER_STATE_UNICODE_HEX,
+  // STRING STATES
+  LEXER_STATE_STRING,
+  LEXER_STATE_STRING_END,
+  LEXER_STATE_ESCAPE_SEQUENCE,
+  LEXER_STATE_UNICODE_HEX,
+  // KEYWORD STATES
+  LEXER_STATE_TRUE,
+  LEXER_STATE_FALSE,
+  LEXER_STATE_NULL,
+  LEXER_STATE_KEYWORD_END,
 
   COUNT_LEXER_STATES,
 } lexer_state_t;
-// clang-format on
 
 typedef struct {
   lexer_state_t stack[MAX_STACK_CAPACITY];
   u64 size;
 } state_stack_t;
 
+typedef struct {
+  u64 size;
+  char str[MAX_KEYWORD_LENGTH + 1];
+} keyword_t;
+
 struct lexer {
   lexer_state_t current;
   state_stack_t stack;
   u64 line;
   u64 column;
-  char current_keyword[MAX_KEYWORD_LENGTH + 1];
+  keyword_t keyword;
 };
 
 void stack_push(state_stack_t *stack, lexer_state_t value);
 lexer_state_t stack_pop(state_stack_t *stack);
 
+void append_to_keyword(keyword_t *kw, char input);
+void clear_keyword(keyword_t *kw);
+bool strequal(const char *first, const char *second);
+
 void lexer_state_machine(lexer_t *lexer, char input);
 lexer_state_t handle_lexer_start(char input);
 lexer_state_t handle_last_collection(char input);
@@ -76,6 +89,11 @@ lexer_state_t handle_exponent(char input);
 lexer_state_t handle_exp_sign(char input);
 lexer_state_t handle_power(lexer_t *lexer, char input);
 lexer_state_t handle_number_end(lexer_t *lexer, char input);
+lexer_state_t handle_keyword(char input);
+lexer_state_t handle_true(lexer_t *lexer, char input);
+lexer_state_t handle_false(lexer_t *lexer, char input);
+lexer_state_t handle_null(lexer_t *lexer, char input);
+lexer_state_t handle_keyword_end(lexer_t *lexer, char input);
 
 bool validate_json(char *json) {
   lexer_t lexer = {0};
@@ -112,6 +130,23 @@ lexer_state_t stack_pop(state_stack_t *stack) {
   return state;
 }
 
+void append_to_keyword(keyword_t *kw, char input) {
+  if (kw->size + 1 > MAX_KEYWORD_LENGTH) {
+    return;
+  }
+
+  kw->str[(kw->size)++] = input;
+}
+
+void clear_keyword(keyword_t *kw) {
+  memset(kw->str, 0, MAX_KEYWORD_LENGTH + 1);
+  kw->size = 0;
+}
+
+bool strequal(const char *first, const char *second) {
+  return strcmp(first, second) == 0;
+}
+
 void lexer_state_machine(lexer_t *lexer, char input) {
   switch (lexer->current) {
   case LEXER_STATE_START:
@@ -120,19 +155,17 @@ void lexer_state_machine(lexer_t *lexer, char input) {
   case LEXER_STATE_VALUE:
     lexer->current = handle_value(lexer, input);
     break;
-  case LEXER_STATE_KEYWORD:
-    break;
   case LEXER_STATE_OBJECT_START:
     stack_push(&(lexer->stack), LEXER_STATE_OBJECT);
-    // break is left intentionally here to utilise the fallthrough behaviour of
-    // the switch statement
+    // break is left out intentionally here to utilise the fallthrough behaviour
+    // of the switch statement
   case LEXER_STATE_OBJECT:
     lexer->current = handle_object(lexer, input);
     break;
   case LEXER_STATE_ARRAY_START:
     stack_push(&(lexer->stack), LEXER_STATE_ARRAY);
-    // break is left intentionally here to utilise the fallthrough behaviour of
-    // the switch statement
+    // break is left out intentionally here to utilise the fallthrough behaviour
+    // of the switch statement
   case LEXER_STATE_ARRAY:
     lexer->current = handle_array(lexer, input);
     break;
@@ -182,6 +215,18 @@ void lexer_state_machine(lexer_t *lexer, char input) {
     break;
   case LEXER_STATE_UNICODE_HEX:
     break;
+  case LEXER_STATE_TRUE:
+    lexer->current = handle_true(lexer, input);
+    break;
+  case LEXER_STATE_FALSE:
+    lexer->current = handle_false(lexer, input);
+    break;
+  case LEXER_STATE_NULL:
+    lexer->current = handle_null(lexer, input);
+    break;
+  case LEXER_STATE_KEYWORD_END:
+    lexer->current = handle_keyword_end(lexer, input);
+    break;
   case LEXER_STATE_LAST_COLLECTION:
     lexer->current = handle_last_collection(input);
     break;
@@ -282,7 +327,9 @@ lexer_state_t handle_value(lexer_t *lexer, char input) {
   case 't':
   case 'f':
   case 'n':
-    return LEXER_STATE_KEYWORD;
+    append_to_keyword(&(lexer->keyword), input);
+
+    return handle_keyword(input);
   }
 
   return LEXER_STATE_ERROR;
@@ -426,3 +473,92 @@ lexer_state_t handle_number_end(lexer_t *lexer, char input) {
   return collection_end ? handle_collection_end(lexer, input)
                         : LEXER_STATE_ERROR;
 }
+
+lexer_state_t handle_keyword(char input) {
+  switch (input) {
+  case 't':
+    return LEXER_STATE_TRUE;
+  case 'f':
+    return LEXER_STATE_FALSE;
+  case 'n':
+    return LEXER_STATE_NULL;
+  }
+
+  return LEXER_STATE_ERROR;
+}
+
+lexer_state_t handle_true(lexer_t *lexer, char input) {
+  char current[MAX_KEYWORD_LENGTH + 1];
+  strcpy(current, lexer->keyword.str);
+
+  append_to_keyword(&(lexer->keyword), input);
+
+  bool return_state_true = (strequal(current, "t") && input == 'r') ||
+                           (strequal(current, "tr") && input == 'u');
+
+  bool return_state_end = strequal(current, "tru") && input == 'e';
+
+  if (return_state_true) {
+    return LEXER_STATE_TRUE;
+  } else if (return_state_end) {
+    return LEXER_STATE_KEYWORD_END;
+  }
+
+  return LEXER_STATE_ERROR;
+}
+
+lexer_state_t handle_false(lexer_t *lexer, char input) {
+  char current[MAX_KEYWORD_LENGTH + 1];
+  strcpy(current, lexer->keyword.str);
+
+  append_to_keyword(&(lexer->keyword), input);
+
+  bool return_state_false = (strequal(current, "f") && input == 'a') ||
+                            (strequal(current, "fa") && input == 'l') ||
+                            (strequal(current, "fal") && input == 's');
+
+  bool return_state_end = strequal(current, "fals") && input == 'e';
+
+  if (return_state_false) {
+    return LEXER_STATE_FALSE;
+  } else if (return_state_end) {
+    return LEXER_STATE_KEYWORD_END;
+  }
+
+  return LEXER_STATE_ERROR;
+}
+
+lexer_state_t handle_null(lexer_t *lexer, char input) {
+  char current[MAX_KEYWORD_LENGTH + 1];
+  strcpy(current, lexer->keyword.str);
+
+  append_to_keyword(&(lexer->keyword), input);
+
+  bool return_state_null = (strequal(current, "n") && input == 'u') ||
+                           (strequal(current, "nu") && input == 'l');
+
+  bool return_state_end = strequal(current, "nul") && input == 'l';
+
+  if (return_state_null) {
+    return LEXER_STATE_NULL;
+  } else if (return_state_end) {
+    return LEXER_STATE_KEYWORD_END;
+  }
+
+  return LEXER_STATE_ERROR;
+}
+
+lexer_state_t handle_keyword_end(lexer_t *lexer, char input) {
+  clear_keyword(&(lexer->keyword));
+
+  if (isspace((unsigned char)input)) {
+    return LEXER_STATE_KEYWORD_END;
+  } else if (input == ',') {
+    return lexer->stack.stack[lexer->stack.size - 1];
+  }
+
+  bool collection_end = input == '}' || input == ']';
+
+  return collection_end ? handle_collection_end(lexer, input)
+                        : LEXER_STATE_ERROR;
+}