Save string and number values for tokenisation

This commit is contained in:
Abdelrahman Said 2023-06-18 22:57:08 +01:00
parent f0043a691c
commit 31e19a50fc
3 changed files with 81 additions and 27 deletions

2
.vscode/launch.json vendored
View File

@ -10,7 +10,7 @@
"request": "launch", "request": "launch",
"program": "${workspaceFolder}/main", "program": "${workspaceFolder}/main",
"args": [ "args": [
"${workspaceFolder}/test_files/webapp.json" "${workspaceFolder}/test_files/menu.json"
], ],
"stopAtEntry": false, "stopAtEntry": false,
"cwd": "${workspaceFolder}", "cwd": "${workspaceFolder}",

View File

@ -118,12 +118,12 @@
"-x", "-x",
"c", "c",
"-o", "-o",
"/tmp/main-c4d09c.o", "/tmp/main-e1ef59.o",
"src/main.c" "src/main.c"
], ],
"directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
"file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/main.c", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/main.c",
"output": "/tmp/main-c4d09c.o" "output": "/tmp/main-e1ef59.o"
}, },
{ {
"arguments": [ "arguments": [
@ -187,12 +187,12 @@
"-x", "-x",
"c", "c",
"-o", "-o",
"/tmp/dstring-9f956a.o", "/tmp/dstring-b2eb78.o",
"src/dstring/dstring.c" "src/dstring/dstring.c"
], ],
"directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
"file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/dstring/dstring.c", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/dstring/dstring.c",
"output": "/tmp/dstring-9f956a.o" "output": "/tmp/dstring-b2eb78.o"
}, },
{ {
"arguments": [ "arguments": [
@ -256,11 +256,11 @@
"-x", "-x",
"c", "c",
"-o", "-o",
"/tmp/lexer-7622c3.o", "/tmp/lexer-b0ee1f.o",
"src/lexer/lexer.c" "src/lexer/lexer.c"
], ],
"directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
"file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/lexer/lexer.c", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/lexer/lexer.c",
"output": "/tmp/lexer-7622c3.o" "output": "/tmp/lexer-b0ee1f.o"
} }
] ]

View File

@ -102,17 +102,17 @@ lexer_state_t handle_last_collection(char input);
lexer_state_t handle_collection_end(lexer_t *lexer, char input); lexer_state_t handle_collection_end(lexer_t *lexer, char input);
lexer_state_t handle_object(lexer_t *lexer, char input); lexer_state_t handle_object(lexer_t *lexer, char input);
lexer_state_t handle_array(lexer_t *lexer, char input); lexer_state_t handle_array(lexer_t *lexer, char input);
lexer_state_t handle_key(lexer_t *lexer); lexer_state_t handle_key(lexer_t *lexer, char input);
lexer_state_t handle_value(lexer_t *lexer, char input); lexer_state_t handle_value(lexer_t *lexer, char input);
lexer_state_t handle_string(char input); lexer_state_t handle_string(lexer_t *lexer, char input);
lexer_state_t handle_string_end(lexer_t *lexer, char input); lexer_state_t handle_string_end(lexer_t *lexer, char input);
lexer_state_t handle_escape_sequence(char input); lexer_state_t handle_escape_sequence(lexer_t *lexer, char input);
lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input); lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input);
lexer_state_t handle_decimal(char input); lexer_state_t handle_decimal(lexer_t *lexer, char input);
lexer_state_t handle_number(lexer_t *lexer, char input); lexer_state_t handle_number(lexer_t *lexer, char input);
lexer_state_t handle_fraction(lexer_t *lexer, char input); lexer_state_t handle_fraction(lexer_t *lexer, char input);
lexer_state_t handle_exponent(char input); lexer_state_t handle_exponent(lexer_t *lexer, char input);
lexer_state_t handle_exp_sign(char input); lexer_state_t handle_exp_sign(lexer_t *lexer, char input);
lexer_state_t handle_power(lexer_t *lexer, char input); lexer_state_t handle_power(lexer_t *lexer, char input);
lexer_state_t handle_number_end(lexer_t *lexer, char input); lexer_state_t handle_number_end(lexer_t *lexer, char input);
lexer_state_t handle_keyword(char input); lexer_state_t handle_keyword(char input);
@ -123,6 +123,8 @@ lexer_state_t handle_keyword_end(lexer_t *lexer, char input);
bool validate_json(char *json) { bool validate_json(char *json) {
lexer_t lexer = {0}; lexer_t lexer = {0};
lexer.line = 1;
lexer.column = 0;
lexer.current = LEXER_STATE_START; lexer.current = LEXER_STATE_START;
lexer.keyword.type = LEXER_STRING_KEYWORD; lexer.keyword.type = LEXER_STRING_KEYWORD;
lexer.codepoint.type = LEXER_STRING_UNICODE; lexer.codepoint.type = LEXER_STRING_UNICODE;
@ -135,10 +137,15 @@ bool validate_json(char *json) {
} }
for (char *c = json; *c != '\0'; ++c) { for (char *c = json; *c != '\0'; ++c) {
// printf("\nINPUT=>%s\n", c);
// printf("STACK SIZE: %zu\n", lexer.stack.size);
lexer_state_machine(&lexer, *c); lexer_state_machine(&lexer, *c);
// Track the position in the text
++(lexer.column);
if (*c == '\n') {
++(lexer.line);
lexer.column = 0;
}
if (lexer.current == LEXER_STATE_ERROR) { if (lexer.current == LEXER_STATE_ERROR) {
return INVALID_JSON; return INVALID_JSON;
} }
@ -276,10 +283,10 @@ void lexer_state_machine(lexer_t *lexer, char input) {
break; break;
case LEXER_STATE_KEY: case LEXER_STATE_KEY:
lexer->current = handle_key(lexer); lexer->current = handle_key(lexer, input);
break; break;
case LEXER_STATE_DECIMAL: case LEXER_STATE_DECIMAL:
lexer->current = handle_decimal(input); lexer->current = handle_decimal(lexer, input);
break; break;
case LEXER_STATE_NUMBER: case LEXER_STATE_NUMBER:
lexer->current = handle_number(lexer, input); lexer->current = handle_number(lexer, input);
@ -288,10 +295,10 @@ void lexer_state_machine(lexer_t *lexer, char input) {
lexer->current = handle_fraction(lexer, input); lexer->current = handle_fraction(lexer, input);
break; break;
case LEXER_STATE_EXPONENT: case LEXER_STATE_EXPONENT:
lexer->current = handle_exponent(input); lexer->current = handle_exponent(lexer, input);
break; break;
case LEXER_STATE_EXP_SIGN: case LEXER_STATE_EXP_SIGN:
lexer->current = handle_exp_sign(input); lexer->current = handle_exp_sign(lexer, input);
break; break;
case LEXER_STATE_POWER: case LEXER_STATE_POWER:
lexer->current = handle_power(lexer, input); lexer->current = handle_power(lexer, input);
@ -300,13 +307,13 @@ void lexer_state_machine(lexer_t *lexer, char input) {
lexer->current = handle_number_end(lexer, input); lexer->current = handle_number_end(lexer, input);
break; break;
case LEXER_STATE_STRING: case LEXER_STATE_STRING:
lexer->current = handle_string(input); lexer->current = handle_string(lexer, input);
break; break;
case LEXER_STATE_STRING_END: case LEXER_STATE_STRING_END:
lexer->current = handle_string_end(lexer, input); lexer->current = handle_string_end(lexer, input);
break; break;
case LEXER_STATE_ESCAPE_SEQUENCE: case LEXER_STATE_ESCAPE_SEQUENCE:
lexer->current = handle_escape_sequence(input); lexer->current = handle_escape_sequence(lexer, input);
break; break;
case LEXER_STATE_UNICODE_HEX: case LEXER_STATE_UNICODE_HEX:
lexer->current = handle_unicode_sequence(lexer, input); lexer->current = handle_unicode_sequence(lexer, input);
@ -400,12 +407,18 @@ lexer_state_t handle_array(lexer_t *lexer, char input) {
return handle_value(lexer, input); return handle_value(lexer, input);
} }
lexer_state_t handle_key(lexer_t *lexer) { return LEXER_STATE_STRING; } lexer_state_t handle_key(lexer_t *lexer, char input) {
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_STRING;
}
lexer_state_t handle_value(lexer_t *lexer, char input) { lexer_state_t handle_value(lexer_t *lexer, char input) {
if (isspace(input)) { if (isspace(input)) {
return LEXER_STATE_VALUE; return LEXER_STATE_VALUE;
} else if (isdigit(input) && input != '0') { } else if (isdigit(input) && input != '0') {
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_NUMBER; return LEXER_STATE_NUMBER;
} }
@ -415,6 +428,8 @@ lexer_state_t handle_value(lexer_t *lexer, char input) {
return LEXER_STATE_STRING; return LEXER_STATE_STRING;
case '0': case '0':
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_DECIMAL; return LEXER_STATE_DECIMAL;
case '{': case '{':
return LEXER_STATE_OBJECT_START; return LEXER_STATE_OBJECT_START;
@ -431,14 +446,18 @@ lexer_state_t handle_value(lexer_t *lexer, char input) {
return LEXER_STATE_ERROR; return LEXER_STATE_ERROR;
} }
lexer_state_t handle_string(char input) { lexer_state_t handle_string(lexer_t *lexer, char input) {
switch (input) { switch (input) {
case '\\': case '\\':
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_ESCAPE_SEQUENCE; return LEXER_STATE_ESCAPE_SEQUENCE;
case '"': case '"':
return LEXER_STATE_STRING_END; return LEXER_STATE_STRING_END;
} }
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_STRING; return LEXER_STATE_STRING;
} }
@ -447,6 +466,8 @@ lexer_state_t handle_string_end(lexer_t *lexer, char input) {
return LEXER_STATE_STRING_END; return LEXER_STATE_STRING_END;
} }
empty_dstr(lexer->current_string);
lexer->current = stack_pop(&(lexer->stack)); lexer->current = stack_pop(&(lexer->stack));
bool key_end = lexer->current == LEXER_STATE_KEY && input == ':'; bool key_end = lexer->current == LEXER_STATE_KEY && input == ':';
@ -467,7 +488,9 @@ lexer_state_t handle_string_end(lexer_t *lexer, char input) {
: LEXER_STATE_ERROR; : LEXER_STATE_ERROR;
} }
lexer_state_t handle_escape_sequence(char input) { lexer_state_t handle_escape_sequence(lexer_t *lexer, char input) {
append_to_dstr(&(lexer->current_string), input);
switch (input) { switch (input) {
case '"': case '"':
case '/': case '/':
@ -487,6 +510,7 @@ lexer_state_t handle_escape_sequence(char input) {
lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) { lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) {
append_to_lex_str(&(lexer->codepoint), input); append_to_lex_str(&(lexer->codepoint), input);
append_to_dstr(&(lexer->current_string), input);
if (!ishex(input)) { if (!ishex(input)) {
clear_lex_str(&(lexer->codepoint)); clear_lex_str(&(lexer->codepoint));
@ -501,7 +525,9 @@ lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input) {
return LEXER_STATE_UNICODE_HEX; return LEXER_STATE_UNICODE_HEX;
} }
lexer_state_t handle_decimal(char input) { lexer_state_t handle_decimal(lexer_t *lexer, char input) {
append_to_dstr(&(lexer->current_string), input);
if (input == '.') { if (input == '.') {
return LEXER_STATE_FRACTION; return LEXER_STATE_FRACTION;
} }
@ -511,12 +537,20 @@ lexer_state_t handle_decimal(char input) {
lexer_state_t handle_number(lexer_t *lexer, char input) { lexer_state_t handle_number(lexer_t *lexer, char input) {
if (isdigit(input)) { if (isdigit(input)) {
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_NUMBER; return LEXER_STATE_NUMBER;
} else if (input == '.') { } else if (input == '.') {
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_FRACTION; return LEXER_STATE_FRACTION;
} else if (input == '}' || input == ']') { } else if (input == '}' || input == ']') {
empty_dstr(lexer->current_string);
return handle_collection_end(lexer, input); return handle_collection_end(lexer, input);
} else if (input == ',') { } else if (input == ',') {
empty_dstr(lexer->current_string);
return lexer->stack.stack[lexer->stack.size - 1]; return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) { } else if (isspace(input)) {
return LEXER_STATE_NUMBER_END; return LEXER_STATE_NUMBER_END;
@ -527,12 +561,20 @@ lexer_state_t handle_number(lexer_t *lexer, char input) {
lexer_state_t handle_fraction(lexer_t *lexer, char input) { lexer_state_t handle_fraction(lexer_t *lexer, char input) {
if (isdigit(input)) { if (isdigit(input)) {
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_FRACTION; return LEXER_STATE_FRACTION;
} else if (input == '}' || input == ']') { } else if (input == '}' || input == ']') {
empty_dstr(lexer->current_string);
return handle_collection_end(lexer, input); return handle_collection_end(lexer, input);
} else if (input == 'e' || input == 'E') { } else if (input == 'e' || input == 'E') {
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_EXPONENT; return LEXER_STATE_EXPONENT;
} else if (input == ',') { } else if (input == ',') {
empty_dstr(lexer->current_string);
return lexer->stack.stack[lexer->stack.size - 1]; return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) { } else if (isspace(input)) {
return LEXER_STATE_NUMBER_END; return LEXER_STATE_NUMBER_END;
@ -541,7 +583,9 @@ lexer_state_t handle_fraction(lexer_t *lexer, char input) {
return LEXER_STATE_ERROR; return LEXER_STATE_ERROR;
} }
lexer_state_t handle_exponent(char input) { lexer_state_t handle_exponent(lexer_t *lexer, char input) {
append_to_dstr(&(lexer->current_string), input);
if (isdigit(input)) { if (isdigit(input)) {
return LEXER_STATE_POWER; return LEXER_STATE_POWER;
} else if (input == '+' || input == '-') { } else if (input == '+' || input == '-') {
@ -551,7 +595,9 @@ lexer_state_t handle_exponent(char input) {
return LEXER_STATE_ERROR; return LEXER_STATE_ERROR;
} }
lexer_state_t handle_exp_sign(char input) { lexer_state_t handle_exp_sign(lexer_t *lexer, char input) {
append_to_dstr(&(lexer->current_string), input);
if (isdigit(input)) { if (isdigit(input)) {
return LEXER_STATE_POWER; return LEXER_STATE_POWER;
} }
@ -561,10 +607,16 @@ lexer_state_t handle_exp_sign(char input) {
lexer_state_t handle_power(lexer_t *lexer, char input) { lexer_state_t handle_power(lexer_t *lexer, char input) {
if (isdigit(input)) { if (isdigit(input)) {
append_to_dstr(&(lexer->current_string), input);
return LEXER_STATE_POWER; return LEXER_STATE_POWER;
} else if (input == '}' || input == ']') { } else if (input == '}' || input == ']') {
empty_dstr(lexer->current_string);
return handle_collection_end(lexer, input); return handle_collection_end(lexer, input);
} else if (input == ',') { } else if (input == ',') {
empty_dstr(lexer->current_string);
return lexer->stack.stack[lexer->stack.size - 1]; return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) { } else if (isspace(input)) {
return LEXER_STATE_NUMBER_END; return LEXER_STATE_NUMBER_END;
@ -574,6 +626,8 @@ lexer_state_t handle_power(lexer_t *lexer, char input) {
} }
lexer_state_t handle_number_end(lexer_t *lexer, char input) { lexer_state_t handle_number_end(lexer_t *lexer, char input) {
empty_dstr(lexer->current_string);
if (isspace(input)) { if (isspace(input)) {
return LEXER_STATE_NUMBER_END; return LEXER_STATE_NUMBER_END;
} else if (input == ',') { } else if (input == ',') {