From 386dfe72dbe117e900da36ebd2b02ced34ab6a5c Mon Sep 17 00:00:00 2001 From: Abdelrahman Said Date: Sun, 18 Jun 2023 00:03:31 +0100 Subject: [PATCH] Added string type that works for both keywords and unicode sequences --- compile_commands.json | 8 ++-- src/lexer/lexer_states.c | 92 ++++++++++++++++++++++++++++++++-------- 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/compile_commands.json b/compile_commands.json index 674238b..ea60dd1 100644 --- a/compile_commands.json +++ b/compile_commands.json @@ -95,12 +95,12 @@ "-x", "c", "-o", - "/tmp/main-69d465.o", + "/tmp/main-977e60.o", "src/main.c" ], "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/main.c", - "output": "/tmp/main-69d465.o" + "output": "/tmp/main-977e60.o" }, { "arguments": [ @@ -162,11 +162,11 @@ "-x", "c", "-o", - "/tmp/lexer_states-ad0df4.o", + "/tmp/lexer_states-04f606.o", "src/lexer/lexer_states.c" ], "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/lexer/lexer_states.c", - "output": "/tmp/lexer_states-ad0df4.o" + "output": "/tmp/lexer_states-04f606.o" } ] diff --git a/src/lexer/lexer_states.c b/src/lexer/lexer_states.c index 604aeda..e697097 100644 --- a/src/lexer/lexer_states.c +++ b/src/lexer/lexer_states.c @@ -1,5 +1,6 @@ #include "lexer_states.h" #include "aliases.h" +#include #include #include #include @@ -7,6 +8,7 @@ #include #define MAX_KEYWORD_LENGTH 5 +#define UNICODE_LENGTH 4 #define MAX_STACK_CAPACITY 1024 typedef enum { @@ -51,24 +53,42 @@ typedef struct { u64 size; } state_stack_t; +typedef enum { + LEXER_STRING_KEYWORD, + LEXER_STRING_UNICODE, +} lex_str_type; + typedef struct { - u64 size; char str[MAX_KEYWORD_LENGTH + 1]; } keyword_t; +typedef struct { + char codepoint[UNICODE_LENGTH]; +} unicode_t; + +typedef struct { + lex_str_type type; + u64 size; + union { + keyword_t keyword; + unicode_t unicode; + }; +} lexer_string_t; + struct lexer { lexer_state_t current; state_stack_t stack; u64 line; u64 column; - keyword_t keyword; + lexer_string_t keyword; + lexer_string_t codepoint; }; void stack_push(state_stack_t *stack, lexer_state_t value); lexer_state_t stack_pop(state_stack_t *stack); -void append_to_keyword(keyword_t *kw, char input); -void clear_keyword(keyword_t *kw); +void append_to_string(lexer_string_t *str, char input); +void clear_string(lexer_string_t *str); bool strequal(const char *first, const char *second); void lexer_state_machine(lexer_t *lexer, char input); @@ -98,6 +118,8 @@ lexer_state_t handle_keyword_end(lexer_t *lexer, char input); bool validate_json(char *json) { lexer_t lexer = {0}; lexer.current = LEXER_STATE_START; + lexer.keyword.type = LEXER_STRING_KEYWORD; + lexer.codepoint.type = LEXER_STRING_UNICODE; for (char *c = json; *c != '\0'; ++c) { // printf("\nINPUT=>%s\n", c); @@ -130,17 +152,51 @@ lexer_state_t stack_pop(state_stack_t *stack) { return state; } -void append_to_keyword(keyword_t *kw, char input) { - if (kw->size + 1 > MAX_KEYWORD_LENGTH) { +void append_to_string(lexer_string_t *lex_str, char input) { + u64 capacity = 0; + char *str = NULL; + + switch (lex_str->type) { + case LEXER_STRING_KEYWORD: + capacity = MAX_KEYWORD_LENGTH; + str = lex_str->keyword.str; + + break; + case LEXER_STRING_UNICODE: + capacity = UNICODE_LENGTH; + str = lex_str->unicode.codepoint; + + break; + } + + if (lex_str->size + 1 > capacity) { return; } - kw->str[(kw->size)++] = input; + str[(lex_str->size)++] = input; } -void clear_keyword(keyword_t *kw) { - memset(kw->str, 0, MAX_KEYWORD_LENGTH + 1); - kw->size = 0; +void clear_string(lexer_string_t *lex_str) { + u64 capacity = 1; + char *str = NULL; + + switch (lex_str->type) { + case LEXER_STRING_KEYWORD: + capacity += MAX_KEYWORD_LENGTH; + str = lex_str->keyword.str; + + break; + case LEXER_STRING_UNICODE: + capacity += UNICODE_LENGTH; + str = lex_str->unicode.codepoint; + + break; + } + + assert(str != NULL); + + memset(str, 0, capacity); + lex_str->size = 0; } bool strequal(const char *first, const char *second) { @@ -327,7 +383,7 @@ lexer_state_t handle_value(lexer_t *lexer, char input) { case 't': case 'f': case 'n': - append_to_keyword(&(lexer->keyword), input); + append_to_string(&(lexer->keyword), input); return handle_keyword(input); } @@ -489,9 +545,9 @@ lexer_state_t handle_keyword(char input) { lexer_state_t handle_true(lexer_t *lexer, char input) { char current[MAX_KEYWORD_LENGTH + 1]; - strcpy(current, lexer->keyword.str); + strcpy(current, lexer->keyword.keyword.str); - append_to_keyword(&(lexer->keyword), input); + append_to_string(&(lexer->keyword), input); bool return_state_true = (strequal(current, "t") && input == 'r') || (strequal(current, "tr") && input == 'u'); @@ -509,9 +565,9 @@ lexer_state_t handle_true(lexer_t *lexer, char input) { lexer_state_t handle_false(lexer_t *lexer, char input) { char current[MAX_KEYWORD_LENGTH + 1]; - strcpy(current, lexer->keyword.str); + strcpy(current, lexer->keyword.keyword.str); - append_to_keyword(&(lexer->keyword), input); + append_to_string(&(lexer->keyword), input); bool return_state_false = (strequal(current, "f") && input == 'a') || (strequal(current, "fa") && input == 'l') || @@ -530,9 +586,9 @@ lexer_state_t handle_false(lexer_t *lexer, char input) { lexer_state_t handle_null(lexer_t *lexer, char input) { char current[MAX_KEYWORD_LENGTH + 1]; - strcpy(current, lexer->keyword.str); + strcpy(current, lexer->keyword.keyword.str); - append_to_keyword(&(lexer->keyword), input); + append_to_string(&(lexer->keyword), input); bool return_state_null = (strequal(current, "n") && input == 'u') || (strequal(current, "nu") && input == 'l'); @@ -549,7 +605,7 @@ lexer_state_t handle_null(lexer_t *lexer, char input) { } lexer_state_t handle_keyword_end(lexer_t *lexer, char input) { - clear_keyword(&(lexer->keyword)); + clear_string(&(lexer->keyword)); if (isspace(input)) { return LEXER_STATE_KEYWORD_END;