From 31e19a50fc25e5d3ad2d7adca4dc425d603b0c14 Mon Sep 17 00:00:00 2001 From: Abdelrahman Said Date: Sun, 18 Jun 2023 22:57:08 +0100 Subject: [PATCH] Save string and number values for tokenisation --- .vscode/launch.json | 2 +- compile_commands.json | 12 +++--- src/lexer/lexer.c | 94 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 81 insertions(+), 27 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 4cee826..894994f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,7 +10,7 @@ "request": "launch", "program": "${workspaceFolder}/main", "args": [ - "${workspaceFolder}/test_files/webapp.json" + "${workspaceFolder}/test_files/menu.json" ], "stopAtEntry": false, "cwd": "${workspaceFolder}", diff --git a/compile_commands.json b/compile_commands.json index 2b87a57..cc00ec3 100644 --- a/compile_commands.json +++ b/compile_commands.json @@ -118,12 +118,12 @@ "-x", "c", "-o", - "/tmp/main-c4d09c.o", + "/tmp/main-e1ef59.o", "src/main.c" ], "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/main.c", - "output": "/tmp/main-c4d09c.o" + "output": "/tmp/main-e1ef59.o" }, { "arguments": [ @@ -187,12 +187,12 @@ "-x", "c", "-o", - "/tmp/dstring-9f956a.o", + "/tmp/dstring-b2eb78.o", "src/dstring/dstring.c" ], "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/dstring/dstring.c", - "output": "/tmp/dstring-9f956a.o" + "output": "/tmp/dstring-b2eb78.o" }, { "arguments": [ @@ -256,11 +256,11 @@ "-x", "c", "-o", - "/tmp/lexer-7622c3.o", + "/tmp/lexer-b0ee1f.o", "src/lexer/lexer.c" ], "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/lexer/lexer.c", - "output": "/tmp/lexer-7622c3.o" + "output": "/tmp/lexer-b0ee1f.o" } ] diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 6c9674a..fbd3aea 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -102,17 +102,17 @@ lexer_state_t handle_last_collection(char input); lexer_state_t handle_collection_end(lexer_t *lexer, char input); lexer_state_t handle_object(lexer_t *lexer, char input); lexer_state_t handle_array(lexer_t *lexer, char input); -lexer_state_t handle_key(lexer_t *lexer); +lexer_state_t handle_key(lexer_t *lexer, char input); lexer_state_t handle_value(lexer_t *lexer, char input); -lexer_state_t handle_string(char input); +lexer_state_t handle_string(lexer_t *lexer, char input); lexer_state_t handle_string_end(lexer_t *lexer, char input); -lexer_state_t handle_escape_sequence(char input); +lexer_state_t handle_escape_sequence(lexer_t *lexer, char input); lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input); -lexer_state_t handle_decimal(char input); +lexer_state_t handle_decimal(lexer_t *lexer, char input); lexer_state_t handle_number(lexer_t *lexer, char input); lexer_state_t handle_fraction(lexer_t *lexer, char input); -lexer_state_t handle_exponent(char input); -lexer_state_t handle_exp_sign(char input); +lexer_state_t handle_exponent(lexer_t *lexer, char input); +lexer_state_t handle_exp_sign(lexer_t *lexer, char input); lexer_state_t handle_power(lexer_t *lexer, char input); lexer_state_t handle_number_end(lexer_t *lexer, char input); lexer_state_t handle_keyword(char input); @@ -123,6 +123,8 @@ lexer_state_t handle_keyword_end(lexer_t *lexer, char input); bool validate_json(char *json) { lexer_t lexer = {0}; + lexer.line = 1; + lexer.column = 0; lexer.current = LEXER_STATE_START; lexer.keyword.type = LEXER_STRING_KEYWORD; lexer.codepoint.type = LEXER_STRING_UNICODE; @@ -135,10 +137,15 @@ bool validate_json(char *json) { } for (char *c = json; *c != '\0'; ++c) { - // printf("\nINPUT=>%s\n", c); - // printf("STACK SIZE: %zu\n", lexer.stack.size); lexer_state_machine(&lexer, *c); + // Track the position in the text + ++(lexer.column); + if (*c == '\n') { + ++(lexer.line); + lexer.column = 0; + } + if (lexer.current == LEXER_STATE_ERROR) { return INVALID_JSON; } @@ -276,10 +283,10 @@ void lexer_state_machine(lexer_t *lexer, char input) { break; case LEXER_STATE_KEY: - lexer->current = handle_key(lexer); + lexer->current = handle_key(lexer, input); break; case LEXER_STATE_DECIMAL: - lexer->current = handle_decimal(input); + lexer->current = handle_decimal(lexer, input); break; case LEXER_STATE_NUMBER: lexer->current = handle_number(lexer, input); @@ -288,10 +295,10 @@ void lexer_state_machine(lexer_t *lexer, char input) { lexer->current = handle_fraction(lexer, input); break; case LEXER_STATE_EXPONENT: - lexer->current = handle_exponent(input); + lexer->current = handle_exponent(lexer, input); break; case LEXER_STATE_EXP_SIGN: - lexer->current = handle_exp_sign(input); + lexer->current = handle_exp_sign(lexer, input); break; case LEXER_STATE_POWER: lexer->current = handle_power(lexer, input); @@ -300,13 +307,13 @@ void lexer_state_machine(lexer_t *lexer, char input) { lexer->current = handle_number_end(lexer, input); break; case LEXER_STATE_STRING: - lexer->current = handle_string(input); + lexer->current = handle_string(lexer, input); break; case LEXER_STATE_STRING_END: lexer->current = handle_string_end(lexer, input); break; case LEXER_STATE_ESCAPE_SEQUENCE: - lexer->current = handle_escape_sequence(input); + lexer->current = handle_escape_sequence(lexer, input); break; case LEXER_STATE_UNICODE_HEX: lexer->current = handle_unicode_sequence(lexer, input); @@ -400,12 +407,18 @@ lexer_state_t handle_array(lexer_t *lexer, char input) { return handle_value(lexer, input); } -lexer_state_t handle_key(lexer_t *lexer) { return LEXER_STATE_STRING; } +lexer_state_t handle_key(lexer_t *lexer, char input) { + append_to_dstr(&(lexer->current_string), input); + + return LEXER_STATE_STRING; +} lexer_state_t handle_value(lexer_t *lexer, char input) { if (isspace(input)) { return LEXER_STATE_VALUE; } else if (isdigit(input) && input != '0') { + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_NUMBER; } @@ -415,6 +428,8 @@ lexer_state_t handle_value(lexer_t *lexer, char input) { return LEXER_STATE_STRING; case '0': + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_DECIMAL; case '{': return LEXER_STATE_OBJECT_START; @@ -431,14 +446,18 @@ lexer_state_t handle_value(lexer_t *lexer, char input) { return LEXER_STATE_ERROR; } -lexer_state_t handle_string(char input) { +lexer_state_t handle_string(lexer_t *lexer, char input) { switch (input) { case '\\': + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_ESCAPE_SEQUENCE; case '"': return LEXER_STATE_STRING_END; } + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_STRING; } @@ -447,6 +466,8 @@ lexer_state_t handle_string_end(lexer_t *lexer, char input) { return LEXER_STATE_STRING_END; } + empty_dstr(lexer->current_string); + lexer->current = stack_pop(&(lexer->stack)); bool key_end = lexer->current == LEXER_STATE_KEY && input == ':'; @@ -467,7 +488,9 @@ lexer_state_t handle_string_end(lexer_t *lexer, char input) { : LEXER_STATE_ERROR; } -lexer_state_t handle_escape_sequence(char input) { +lexer_state_t handle_escape_sequence(lexer_t *lexer, char input) { + append_to_dstr(&(lexer->current_string), input); + switch (input) { case '"': case '/': @@ -487,6 +510,7 @@ lexer_state_t handle_escape_sequence(char input) { lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) { append_to_lex_str(&(lexer->codepoint), input); + append_to_dstr(&(lexer->current_string), input); if (!ishex(input)) { clear_lex_str(&(lexer->codepoint)); @@ -501,7 +525,9 @@ lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) { return LEXER_STATE_UNICODE_HEX; } -lexer_state_t handle_decimal(char input) { +lexer_state_t handle_decimal(lexer_t *lexer, char input) { + append_to_dstr(&(lexer->current_string), input); + if (input == '.') { return LEXER_STATE_FRACTION; } @@ -511,12 +537,20 @@ lexer_state_t handle_decimal(char input) { lexer_state_t handle_number(lexer_t *lexer, char input) { if (isdigit(input)) { + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_NUMBER; } else if (input == '.') { + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_FRACTION; } else if (input == '}' || input == ']') { + empty_dstr(lexer->current_string); + return handle_collection_end(lexer, input); } else if (input == ',') { + empty_dstr(lexer->current_string); + return lexer->stack.stack[lexer->stack.size - 1]; } else if (isspace(input)) { return LEXER_STATE_NUMBER_END; @@ -527,12 +561,20 @@ lexer_state_t handle_number(lexer_t *lexer, char input) { lexer_state_t handle_fraction(lexer_t *lexer, char input) { if (isdigit(input)) { + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_FRACTION; } else if (input == '}' || input == ']') { + empty_dstr(lexer->current_string); + return handle_collection_end(lexer, input); } else if (input == 'e' || input == 'E') { + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_EXPONENT; } else if (input == ',') { + empty_dstr(lexer->current_string); + return lexer->stack.stack[lexer->stack.size - 1]; } else if (isspace(input)) { return LEXER_STATE_NUMBER_END; @@ -541,7 +583,9 @@ lexer_state_t handle_fraction(lexer_t *lexer, char input) { return LEXER_STATE_ERROR; } -lexer_state_t handle_exponent(char input) { +lexer_state_t handle_exponent(lexer_t *lexer, char input) { + append_to_dstr(&(lexer->current_string), input); + if (isdigit(input)) { return LEXER_STATE_POWER; } else if (input == '+' || input == '-') { @@ -551,7 +595,9 @@ lexer_state_t handle_exponent(char input) { return LEXER_STATE_ERROR; } -lexer_state_t handle_exp_sign(char input) { +lexer_state_t handle_exp_sign(lexer_t *lexer, char input) { + append_to_dstr(&(lexer->current_string), input); + if (isdigit(input)) { return LEXER_STATE_POWER; } @@ -561,10 +607,16 @@ lexer_state_t handle_exp_sign(char input) { lexer_state_t handle_power(lexer_t *lexer, char input) { if (isdigit(input)) { + append_to_dstr(&(lexer->current_string), input); + return LEXER_STATE_POWER; } else if (input == '}' || input == ']') { + empty_dstr(lexer->current_string); + return handle_collection_end(lexer, input); } else if (input == ',') { + empty_dstr(lexer->current_string); + return lexer->stack.stack[lexer->stack.size - 1]; } else if (isspace(input)) { return LEXER_STATE_NUMBER_END; @@ -574,6 +626,8 @@ lexer_state_t handle_power(lexer_t *lexer, char input) { } lexer_state_t handle_number_end(lexer_t *lexer, char input) { + empty_dstr(lexer->current_string); + if (isspace(input)) { return LEXER_STATE_NUMBER_END; } else if (input == ',') {