Add the get_next_token function

This commit is contained in:
Abdelrahman Said
2023-06-22 00:13:05 +01:00
parent 7845ad4b06
commit efe07a3c60
4 changed files with 184 additions and 258 deletions

View File

@@ -81,13 +81,15 @@ struct lexer {
u64 cursor;
u64 line;
u64 column;
const char *text;
u64 text_length;
const char *text;
lexer_state_t current;
state_stack_t stack;
lexer_string_t keyword;
lexer_string_t codepoint;
dstr_t *current_string;
bool token_ready;
token_t token;
};
void stack_push(state_stack_t *stack, lexer_state_t value);
@@ -99,10 +101,13 @@ bool strequal(const char *first, const char *second);
bool is_valid_hex_char(const char input);
bool ishex(const char input);
void set_token(lexer_t *lexer, token_type_t type, token_value_t value);
void lexer_state_machine(lexer_t *lexer, char input);
lexer_state_t handle_lexer_start(char input);
lexer_state_t handle_lexer_start(lexer_t *lexer, char input);
lexer_state_t handle_last_collection(char input);
lexer_state_t handle_collection_end(lexer_t *lexer, char input);
void handle_input_after_collection_end(lexer_t *lexer, char input);
lexer_state_t handle_object(lexer_t *lexer, char input);
lexer_state_t handle_array(lexer_t *lexer, char input);
lexer_state_t handle_key(lexer_t *lexer, char input);
@@ -124,9 +129,6 @@ lexer_state_t handle_false(lexer_t *lexer, char input);
lexer_state_t handle_null(lexer_t *lexer, char input);
lexer_state_t handle_keyword_end(lexer_t *lexer, char input);
// TODO (Abdelrahman): The printf functions in the state handlers are the exit
// points for the tokenisation function. Replace them once ready.
void lexer_init(lexer_t **lexer) {
if (*lexer) {
lexer_free(lexer);
@@ -141,10 +143,14 @@ void lexer_init(lexer_t **lexer) {
(*lexer)->cursor = 0;
(*lexer)->line = 1;
(*lexer)->column = 0;
(*lexer)->text_length = 0;
(*lexer)->text = "";
(*lexer)->current = LEXER_STATE_START;
(*lexer)->keyword.type = LEXER_STRING_KEYWORD;
(*lexer)->codepoint.type = LEXER_STRING_UNICODE;
(*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY);
(*lexer)->token_ready = false;
(*lexer)->token = (token_t){0};
if (!((*lexer)->current_string)) {
lexer_free(lexer);
@@ -167,10 +173,12 @@ token_t get_next_token(lexer_t *lexer, const char *text) {
lexer->text_length = strlen(text);
}
dstr_clear(lexer->current_string);
char c;
for (; lexer->cursor < lexer->text_length; ++(lexer->cursor)) {
c = lexer->text[lexer->cursor];
while (lexer->cursor < lexer->text_length) {
c = lexer->text[(lexer->cursor)++];
if (c == '\n') {
++(lexer->line);
@@ -184,6 +192,10 @@ token_t get_next_token(lexer_t *lexer, const char *text) {
++(lexer->column);
if (lexer->current == LEXER_STATE_ERROR) {
} else if (lexer->token_ready) {
lexer->token_ready = false;
return lexer->token;
}
}
@@ -223,6 +235,49 @@ bool validate_json(char *json) {
return lexer.current == LEXER_STATE_LAST_COLLECTION || lexer.stack.size == 0;
}
/**
 * Print one token to stdout on a single line, in the form
 * `{LINE: <line>, COLUMN: <column>, TYPE: <name>, VALUE: <value>}`.
 *
 * Tokens without a payload print `N/A` as their value; string keys, string
 * values and doubles print `token.value.string`. TK_NO_TOKEN and TK_INTEGER
 * print nothing after `TYPE: `, so their line ends with `TYPE: }`.
 *
 * @param token The token to print.
 */
void print_token(token_t token) {
  // Cast explicitly so the arguments always match %llu, whatever integer type
  // the line/column fields are declared with.
  printf("{LINE: %4llu, COLUMN: %4llu, TYPE: ",
         (unsigned long long)token.line, (unsigned long long)token.column);

  // A NULL name means "print neither the type nor the value".
  const char *name = NULL;
  const char *value = "N/A";

  switch (token.type) {
  case TK_L_BRACE:
    name = "TK_L_BRACE";
    break;
  case TK_R_BRACE:
    name = "TK_R_BRACE";
    break;
  case TK_L_BRACKET:
    name = "TK_L_BRACKET";
    break;
  case TK_R_BRACKET:
    name = "TK_R_BRACKET";
    break;
  case TK_NULL:
    name = "TK_NULL";
    break;
  case TK_TRUE:
    name = "TK_TRUE";
    break;
  case TK_FALSE:
    name = "TK_FALSE";
    break;
  case TK_STR_KEY:
    name = "TK_STR_KEY";
    value = token.value.string;
    break;
  case TK_STR_VAL:
    name = "TK_STR_VAL";
    value = token.value.string;
    break;
  case TK_DOUBLE:
    name = "TK_DOUBLE";
    value = token.value.string;
    break;
  case TK_NO_TOKEN:
  case TK_INTEGER: // Integers have no printed representation yet.
  default:
    break;
  }

  if (name) {
    printf("%15s, VALUE: %s", name, value);
  }

  printf("}\n");
}
void stack_push(state_stack_t *stack, lexer_state_t state) {
if (stack->size + 1 >= MAX_STACK_CAPACITY) {
return;
@@ -318,10 +373,21 @@ bool ishex(const char input) {
return isdigit(input) || is_valid_hex_char(input);
}
/**
 * Store a freshly recognised token on the lexer and flag it as ready.
 *
 * The token is stamped with the lexer's current line and column at the time
 * of the call.
 *
 * @param lexer The lexer whose `token` and `token_ready` fields are updated.
 * @param type  The type of the token.
 * @param value The payload of the token.
 */
void set_token(lexer_t *lexer, token_type_t type, token_value_t value) {
  // Build the token as a whole so any field not listed here is zeroed.
  token_t ready = {
      .type = type,
      .value = value,
      .line = lexer->line,
      .column = lexer->column,
  };

  lexer->token = ready;
  lexer->token_ready = true;
}
void lexer_state_machine(lexer_t *lexer, char input) {
switch (lexer->current) {
case LEXER_STATE_START:
lexer->current = handle_lexer_start(input);
lexer->current = handle_lexer_start(lexer, input);
break;
case LEXER_STATE_VALUE:
lexer->current = handle_value(lexer, input);
@@ -350,6 +416,8 @@ void lexer_state_machine(lexer_t *lexer, char input) {
lexer->current = LEXER_STATE_LAST_COLLECTION;
}
handle_input_after_collection_end(lexer, input);
break;
case LEXER_STATE_KEY:
lexer->current = handle_key(lexer, input);
@@ -409,17 +477,19 @@ void lexer_state_machine(lexer_t *lexer, char input) {
}
}
lexer_state_t handle_lexer_start(char input) {
lexer_state_t handle_lexer_start(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_START;
}
switch (input) {
case '{':
printf("TK_L_BRACE\n");
set_token(lexer, TK_L_BRACE, (token_value_t){0});
return LEXER_STATE_OBJECT_START;
case '[':
printf("TK_L_BRACKET\n");
set_token(lexer, TK_L_BRACKET, (token_value_t){0});
return LEXER_STATE_ARRAY_START;
}
@@ -442,20 +512,35 @@ lexer_state_t handle_collection_end(lexer_t *lexer, char input) {
bool object_end = lexer->current == LEXER_STATE_OBJECT && input == '}';
if (object_end) {
printf("TK_R_BRACE\n");
set_token(lexer, TK_R_BRACE, (token_value_t){0});
return LEXER_STATE_OBJECT_END;
}
bool array_end = lexer->current == LEXER_STATE_ARRAY && input == ']';
if (array_end) {
printf("TK_R_BRACKET\n");
set_token(lexer, TK_R_BRACKET, (token_value_t){0});
return LEXER_STATE_ARRAY_END;
}
return LEXER_STATE_ERROR;
}
/**
 * Set a closing-delimiter token for the given input character.
 *
 * A '}' sets a TK_R_BRACE token and a ']' sets a TK_R_BRACKET token, both
 * with a zeroed value. Any other character leaves the lexer untouched.
 *
 * @param lexer The lexer that receives the token.
 * @param input The character being examined.
 */
void handle_input_after_collection_end(lexer_t *lexer, char input) {
  if (input == '}') {
    set_token(lexer, TK_R_BRACE, (token_value_t){0});
  } else if (input == ']') {
    set_token(lexer, TK_R_BRACKET, (token_value_t){0});
  }
}
lexer_state_t handle_object(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_OBJECT;
@@ -464,7 +549,6 @@ lexer_state_t handle_object(lexer_t *lexer, char input) {
return LEXER_STATE_KEY;
} else if (input == '}') {
printf("TK_R_BRACE\n");
return handle_collection_end(lexer, input);
}
@@ -475,7 +559,6 @@ lexer_state_t handle_array(lexer_t *lexer, char input) {
if (isspace(input)) {
return LEXER_STATE_ARRAY;
} else if (input == ']') {
printf("TK_R_BRACKET\n");
return handle_collection_end(lexer, input);
}
@@ -507,10 +590,12 @@ lexer_state_t handle_value(lexer_t *lexer, char input) {
return LEXER_STATE_DECIMAL;
case '{':
printf("TK_L_BRACE\n");
set_token(lexer, TK_L_BRACE, (token_value_t){0});
return LEXER_STATE_OBJECT_START;
case '[':
printf("TK_L_BRACKET\n");
set_token(lexer, TK_L_BRACKET, (token_value_t){0});
return LEXER_STATE_ARRAY_START;
case 't':
case 'f':
@@ -529,10 +614,20 @@ lexer_state_t handle_string(lexer_t *lexer, char input) {
dstr_append(&(lexer->current_string), input);
return LEXER_STATE_ESCAPE_SEQUENCE;
case '"':
printf("TK_STRING: %s\n", dstr_to_cstr(lexer->current_string));
case '"': {
lexer_state_t string_type = lexer->stack.stack[lexer->stack.size - 1];
if (string_type == LEXER_STATE_KEY) {
set_token(lexer, TK_STR_KEY,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
} else if (string_type == LEXER_STATE_VALUE) {
set_token(lexer, TK_STR_VAL,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
}
return LEXER_STATE_STRING_END;
}
}
dstr_append(&(lexer->current_string), input);
@@ -544,21 +639,17 @@ lexer_state_t handle_string_end(lexer_t *lexer, char input) {
return LEXER_STATE_STRING_END;
}
dstr_clear(lexer->current_string);
lexer->current = stack_pop(&(lexer->stack));
bool key_end = lexer->current == LEXER_STATE_KEY && input == ':';
if (key_end) {
printf("TK_COLON\n");
return LEXER_STATE_VALUE;
}
bool value_end = lexer->current == LEXER_STATE_VALUE && input == ',';
if (value_end) {
printf("TK_COMMA\n");
return lexer->stack.stack[lexer->stack.size - 1];
}
@@ -625,16 +716,25 @@ lexer_state_t handle_number(lexer_t *lexer, char input) {
return LEXER_STATE_FRACTION;
} else if (input == '}' || input == ']') {
printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string));
dstr_clear(lexer->current_string);
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return handle_collection_end(lexer, input);
} else if (input == ',') {
printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string));
dstr_clear(lexer->current_string);
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) {
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return LEXER_STATE_NUMBER_END;
}
@@ -647,8 +747,10 @@ lexer_state_t handle_fraction(lexer_t *lexer, char input) {
return LEXER_STATE_FRACTION;
} else if (input == '}' || input == ']') {
printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string));
dstr_clear(lexer->current_string);
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return handle_collection_end(lexer, input);
} else if (input == 'e' || input == 'E') {
@@ -656,11 +758,18 @@ lexer_state_t handle_fraction(lexer_t *lexer, char input) {
return LEXER_STATE_EXPONENT;
} else if (input == ',') {
printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string));
dstr_clear(lexer->current_string);
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) {
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return LEXER_STATE_NUMBER_END;
}
@@ -695,16 +804,25 @@ lexer_state_t handle_power(lexer_t *lexer, char input) {
return LEXER_STATE_POWER;
} else if (input == '}' || input == ']') {
printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string));
dstr_clear(lexer->current_string);
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return handle_collection_end(lexer, input);
} else if (input == ',') {
printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string));
dstr_clear(lexer->current_string);
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return lexer->stack.stack[lexer->stack.size - 1];
} else if (isspace(input)) {
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return LEXER_STATE_NUMBER_END;
}
@@ -712,12 +830,14 @@ lexer_state_t handle_power(lexer_t *lexer, char input) {
}
lexer_state_t handle_number_end(lexer_t *lexer, char input) {
printf("TK_NUMBER: %s\n", dstr_to_cstr(lexer->current_string));
dstr_clear(lexer->current_string);
if (isspace(input)) {
return LEXER_STATE_NUMBER_END;
} else if (input == ',') {
// TODO (Abdelrahman): Set the token type correctly based on whether the
// number is an integer or a double
set_token(lexer, TK_DOUBLE,
(token_value_t){.string = dstr_to_cstr(lexer->current_string)});
return lexer->stack.stack[lexer->stack.size - 1];
}
@@ -802,7 +922,16 @@ lexer_state_t handle_null(lexer_t *lexer, char input) {
}
lexer_state_t handle_keyword_end(lexer_t *lexer, char input) {
printf("TK_KEYWORD: %s\n", lexer->keyword.keyword.str);
const char *keyword = lexer->keyword.keyword.str;
if (strequal(keyword, "null")) {
set_token(lexer, TK_NULL, (token_value_t){0});
} else if (strequal(keyword, "true")) {
set_token(lexer, TK_TRUE, (token_value_t){0});
} else if (strequal(keyword, "false")) {
set_token(lexer, TK_FALSE, (token_value_t){0});
}
clear_lex_str(&(lexer->keyword));
if (isspace(input)) {

View File

@@ -35,7 +35,12 @@ int main(int argc, char *argv[]) {
return EXIT_FAILURE;
}
get_next_token(lexer, json);
token_t token = get_next_token(lexer, json);
while (token.type != TK_NO_TOKEN) {
print_token(token);
token = get_next_token(lexer, NULL);
}
lexer_free(&lexer);