Ensure lexer returns correct token for numerical values

This commit is contained in:
Abdelrahman Said 2023-06-24 03:41:22 +01:00
parent 7a326421b8
commit 84533ad643
2 changed files with 170 additions and 61 deletions

View File

@ -118,12 +118,12 @@
"-x", "-x",
"c", "c",
"-o", "-o",
"/tmp/main-92ad35.o", "/tmp/main-4f432a.o",
"src/main.c" "src/main.c"
], ],
"directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
"file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/main.c", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/main.c",
"output": "/tmp/main-92ad35.o" "output": "/tmp/main-4f432a.o"
}, },
{ {
"arguments": [ "arguments": [
@ -187,12 +187,12 @@
"-x", "-x",
"c", "c",
"-o", "-o",
"/tmp/dstring-0fd2b9.o", "/tmp/dstring-841d2e.o",
"src/dstring/dstring.c" "src/dstring/dstring.c"
], ],
"directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
"file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/dstring/dstring.c", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/dstring/dstring.c",
"output": "/tmp/dstring-0fd2b9.o" "output": "/tmp/dstring-841d2e.o"
}, },
{ {
"arguments": [ "arguments": [
@ -256,11 +256,11 @@
"-x", "-x",
"c", "c",
"-o", "-o",
"/tmp/lexer-77effd.o", "/tmp/lexer-5cd579.o",
"src/lexer/lexer.c" "src/lexer/lexer.c"
], ],
"directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json", "directory": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json",
"file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/lexer/lexer.c", "file": "/mnt/3A5CDF785CDF2CFF/Users/abdoo/dev/say_it_in_json/src/lexer/lexer.c",
"output": "/tmp/lexer-77effd.o" "output": "/tmp/lexer-5cd579.o"
} }
] ]

View File

@ -90,6 +90,8 @@ struct lexer {
dstr_t *current_string; dstr_t *current_string;
bool token_ready; bool token_ready;
token_t token; token_t token;
bool has_extra_token;
token_t extra_token;
}; };
void stack_push(state_stack_t *stack, lexer_state_t value); void stack_push(state_stack_t *stack, lexer_state_t value);
@ -97,11 +99,13 @@ lexer_state_t stack_pop(state_stack_t *stack);
void append_to_lex_str(lexer_string_t *str, char input); void append_to_lex_str(lexer_string_t *str, char input);
void clear_lex_str(lexer_string_t *str); void clear_lex_str(lexer_string_t *str);
bool strequal(const char *first, const char *second); bool strequal(const char *const first, const char *const second);
bool is_valid_hex_char(const char input); bool is_valid_hex_char(const char input);
bool ishex(const char input); bool ishex(const char input);
void set_token(lexer_t *lexer, token_type_t type, token_value_t value); token_t dstr_to_numerical_token(const dstr_t *str);
void set_token(token_t *token, u64 line, u64 column, token_type_t type,
token_value_t value);
void lexer_state_machine(lexer_t *lexer, char input); void lexer_state_machine(lexer_t *lexer, char input);
lexer_state_t handle_lexer_start(lexer_t *lexer, char input); lexer_state_t handle_lexer_start(lexer_t *lexer, char input);
@ -151,6 +155,8 @@ void lexer_init(lexer_t **lexer) {
(*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY); (*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY);
(*lexer)->token_ready = false; (*lexer)->token_ready = false;
(*lexer)->token = (token_t){0}; (*lexer)->token = (token_t){0};
(*lexer)->has_extra_token = false;
(*lexer)->extra_token = (token_t){0};
if (!((*lexer)->current_string)) { if (!((*lexer)->current_string)) {
lexer_free(lexer); lexer_free(lexer);
@ -178,18 +184,22 @@ token_t get_next_token(lexer_t *lexer, const char *text) {
char c; char c;
while (lexer->cursor < lexer->text_length) { while (lexer->cursor < lexer->text_length) {
if (lexer->has_extra_token) {
lexer->has_extra_token = false;
return lexer->extra_token;
}
c = lexer->text[(lexer->cursor)++]; c = lexer->text[(lexer->cursor)++];
lexer_state_machine(lexer, c);
if (c == '\n') { if (c == '\n') {
++(lexer->line); ++(lexer->line);
lexer->column = 0; lexer->column = 0;
continue; } else {
}
lexer_state_machine(lexer, c);
// Track the position in the text
++(lexer->column); ++(lexer->column);
}
if (lexer->current == LEXER_STATE_ERROR) { if (lexer->current == LEXER_STATE_ERROR) {
} else if (lexer->token_ready) { } else if (lexer->token_ready) {
@ -269,9 +279,10 @@ void print_token(token_t token) {
printf("%15s, VALUE: %s", "TK_STR_VAL", token.value.string); printf("%15s, VALUE: %s", "TK_STR_VAL", token.value.string);
break; break;
case TK_INTEGER: case TK_INTEGER:
printf("%15s, VALUE: %ld", "TK_INTEGER", token.value.num_int);
break; break;
case TK_DOUBLE: case TK_DOUBLE:
printf("%15s, VALUE: %s", "TK_DOUBLE", token.value.string); printf("%15s, VALUE: %f", "TK_DOUBLE", token.value.num_frac);
break; break;
} }
@ -345,7 +356,7 @@ void clear_lex_str(lexer_string_t *lex_str) {
lex_str->size = 0; lex_str->size = 0;
} }
bool strequal(const char *first, const char *second) { bool strequal(const char *const first, const char *const second) {
return strcmp(first, second) == 0; return strcmp(first, second) == 0;
} }
@ -373,12 +384,27 @@ bool ishex(const char input) {
return isdigit(input) || is_valid_hex_char(input); return isdigit(input) || is_valid_hex_char(input);
} }
void set_token(lexer_t *lexer, token_type_t type, token_value_t value) { token_t dstr_to_numerical_token(const dstr_t *str) {
lexer->token_ready = true; token_t token = {0};
lexer->token = (token_t){ bool is_double = dstr_find(str, ".") != -1;
.line = lexer->line,
.column = lexer->column, token.type = is_double ? TK_DOUBLE : TK_INTEGER;
if (is_double) {
token.value.num_frac = strtod(dstr_to_cstr(str), NULL);
} else {
token.value.num_int = atol(dstr_to_cstr(str));
}
return token;
}
void set_token(token_t *token, u64 line, u64 column, token_type_t type,
token_value_t value) {
*token = (token_t){
.line = line,
.column = column,
.type = type, .type = type,
.value = value, .value = value,
}; };
@ -484,11 +510,15 @@ lexer_state_t handle_lexer_start(lexer_t *lexer, char input) {
switch (input) { switch (input) {
case '{': case '{':
set_token(lexer, TK_L_BRACE, (token_value_t){0}); lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
(token_value_t){0});
return LEXER_STATE_OBJECT_START; return LEXER_STATE_OBJECT_START;
case '[': case '[':
set_token(lexer, TK_L_BRACKET, (token_value_t){0}); lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
(token_value_t){0});
return LEXER_STATE_ARRAY_START; return LEXER_STATE_ARRAY_START;
} }
@ -512,7 +542,18 @@ lexer_state_t handle_collection_end(lexer_t *lexer, char input) {
bool object_end = lexer->current == LEXER_STATE_OBJECT && input == '}'; bool object_end = lexer->current == LEXER_STATE_OBJECT && input == '}';
if (object_end) { if (object_end) {
set_token(lexer, TK_R_BRACE, (token_value_t){0}); token_t *token;
if (lexer->token_ready) {
lexer->has_extra_token = true;
token = &(lexer->extra_token);
} else {
lexer->token_ready = true;
token = &(lexer->token);
}
set_token(token, lexer->line, lexer->column, TK_R_BRACE,
(token_value_t){0});
return LEXER_STATE_OBJECT_END; return LEXER_STATE_OBJECT_END;
} }
@ -520,7 +561,18 @@ lexer_state_t handle_collection_end(lexer_t *lexer, char input) {
bool array_end = lexer->current == LEXER_STATE_ARRAY && input == ']'; bool array_end = lexer->current == LEXER_STATE_ARRAY && input == ']';
if (array_end) { if (array_end) {
set_token(lexer, TK_R_BRACKET, (token_value_t){0}); token_t *token;
if (lexer->token_ready) {
lexer->has_extra_token = true;
token = &(lexer->extra_token);
} else {
lexer->token_ready = true;
token = &(lexer->token);
}
set_token(token, lexer->line, lexer->column, TK_R_BRACKET,
(token_value_t){0});
return LEXER_STATE_ARRAY_END; return LEXER_STATE_ARRAY_END;
} }
@ -531,11 +583,15 @@ lexer_state_t handle_collection_end(lexer_t *lexer, char input) {
void handle_input_after_collection_end(lexer_t *lexer, char input) { void handle_input_after_collection_end(lexer_t *lexer, char input) {
switch (input) { switch (input) {
case '}': case '}':
set_token(lexer, TK_R_BRACE, (token_value_t){0}); lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACE,
(token_value_t){0});
break; break;
case ']': case ']':
set_token(lexer, TK_R_BRACKET, (token_value_t){0}); lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACKET,
(token_value_t){0});
break; break;
} }
@ -590,11 +646,15 @@ lexer_state_t handle_value(lexer_t *lexer, char input) {
return LEXER_STATE_DECIMAL; return LEXER_STATE_DECIMAL;
case '{': case '{':
set_token(lexer, TK_L_BRACE, (token_value_t){0}); lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACE,
(token_value_t){0});
return LEXER_STATE_OBJECT_START; return LEXER_STATE_OBJECT_START;
case '[': case '[':
set_token(lexer, TK_L_BRACKET, (token_value_t){0}); lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_L_BRACKET,
(token_value_t){0});
return LEXER_STATE_ARRAY_START; return LEXER_STATE_ARRAY_START;
case 't': case 't':
@ -617,12 +677,15 @@ lexer_state_t handle_string(lexer_t *lexer, char input) {
case '"': { case '"': {
lexer_state_t string_type = lexer->stack.stack[lexer->stack.size - 1]; lexer_state_t string_type = lexer->stack.stack[lexer->stack.size - 1];
lexer->token_ready = true;
token_t *token = &(lexer->token);
u64 column = lexer->column - dstr_length(lexer->current_string);
token_value_t value = {.string = dstr_to_cstr(lexer->current_string)};
if (string_type == LEXER_STATE_KEY) { if (string_type == LEXER_STATE_KEY) {
set_token(lexer, TK_STR_KEY, set_token(token, lexer->line, column, TK_STR_KEY, value);
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
} else if (string_type == LEXER_STATE_VALUE) { } else if (string_type == LEXER_STATE_VALUE) {
set_token(lexer, TK_STR_VAL, set_token(token, lexer->line, column, TK_STR_VAL, value);
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
} }
return LEXER_STATE_STRING_END; return LEXER_STATE_STRING_END;
@ -718,22 +781,34 @@ lexer_state_t handle_number(lexer_t *lexer, char input) {
} else if (input == '}' || input == ']') { } else if (input == '}' || input == ']') {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return handle_collection_end(lexer, input); return handle_collection_end(lexer, input);
} else if (input == ',') { } else if (input == ',') {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1]; return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) { } else if (isspace(input)) {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return LEXER_STATE_NUMBER_END; return LEXER_STATE_NUMBER_END;
} }
@ -749,8 +824,12 @@ lexer_state_t handle_fraction(lexer_t *lexer, char input) {
} else if (input == '}' || input == ']') { } else if (input == '}' || input == ']') {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return handle_collection_end(lexer, input); return handle_collection_end(lexer, input);
} else if (input == 'e' || input == 'E') { } else if (input == 'e' || input == 'E') {
@ -760,15 +839,23 @@ lexer_state_t handle_fraction(lexer_t *lexer, char input) {
} else if (input == ',') { } else if (input == ',') {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1]; return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) { } else if (isspace(input)) {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return LEXER_STATE_NUMBER_END; return LEXER_STATE_NUMBER_END;
} }
@ -806,22 +893,34 @@ lexer_state_t handle_power(lexer_t *lexer, char input) {
} else if (input == '}' || input == ']') { } else if (input == '}' || input == ']') {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return handle_collection_end(lexer, input); return handle_collection_end(lexer, input);
} else if (input == ',') { } else if (input == ',') {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1]; return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) { } else if (isspace(input)) {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return LEXER_STATE_NUMBER_END; return LEXER_STATE_NUMBER_END;
} }
@ -835,8 +934,12 @@ lexer_state_t handle_number_end(lexer_t *lexer, char input) {
} else if (input == ',') { } else if (input == ',') {
// TODO (Abdelrahman): Set the token type correctly based on whether the // TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double // number is an integer or a double
set_token(lexer, TK_DOUBLE, lexer->token_ready = true;
(token_value_t){.string = dstr_to_cstr(lexer->current_string)}); u64 column = lexer->column - dstr_length(lexer->current_string);
token_t token = dstr_to_numerical_token(lexer->current_string);
set_token(&(lexer->token), lexer->line, column, token.type, token.value);
return lexer->stack.stack[lexer->stack.size - 1]; return lexer->stack.stack[lexer->stack.size - 1];
} }
@ -924,15 +1027,21 @@ lexer_state_t handle_null(lexer_t *lexer, char input) {
lexer_state_t handle_keyword_end(lexer_t *lexer, char input) { lexer_state_t handle_keyword_end(lexer_t *lexer, char input) {
const char *keyword = lexer->keyword.keyword.str; const char *keyword = lexer->keyword.keyword.str;
if (lexer->keyword.size > 0) {
lexer->token_ready = true;
token_t *token = &(lexer->token);
u64 column = lexer->column - lexer->keyword.size;
if (strequal(keyword, "null")) { if (strequal(keyword, "null")) {
set_token(lexer, TK_NULL, (token_value_t){0}); set_token(token, lexer->line, column, TK_NULL, (token_value_t){0});
} else if (strequal(keyword, "true")) { } else if (strequal(keyword, "true")) {
set_token(lexer, TK_TRUE, (token_value_t){0}); set_token(token, lexer->line, column, TK_TRUE, (token_value_t){0});
} else if (strequal(keyword, "false")) { } else if (strequal(keyword, "false")) {
set_token(lexer, TK_FALSE, (token_value_t){0}); set_token(token, lexer->line, column, TK_FALSE, (token_value_t){0});
} }
clear_lex_str(&(lexer->keyword)); clear_lex_str(&(lexer->keyword));
}
if (isspace(input)) { if (isspace(input)) {
return LEXER_STATE_KEYWORD_END; return LEXER_STATE_KEYWORD_END;