diff --git a/generate_state_table.c b/generate_state_table.c
new file mode 100644
index 0000000..cf67e07
--- /dev/null
+++ b/generate_state_table.c
@@ -0,0 +1,762 @@
+#include "aliases.h"
+#include "lexer_data.h"
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Generator tool: writes state_table to the file named by `filename` as rows
+// of the form "{ v0, v1, ... }," with one row per lexer state.
+//
+// NOTE(review): nothing in this file stores into state_table, and main() never
+// calls lexer_state_machine() or char_type(), so every emitted entry is 0.
+// Presumably the table is meant to be filled by running the state machine over
+// every (state, input) pair before it is written - TODO confirm.
+
+INTERNAL void write_table(void);
+INTERNAL void clear_file(void);
+INTERNAL lexer_input_t char_type(char input);
+INTERNAL void lexer_state_machine(lexer_input_t input);
+INTERNAL lexer_state_t handle_lexer_start(lexer_input_t input);
+INTERNAL lexer_state_t handle_last_collection(lexer_input_t input);
+INTERNAL lexer_state_t handle_collection_end(lexer_input_t input);
+INTERNAL void handle_input_after_collection_end(lexer_input_t input);
+INTERNAL lexer_state_t handle_object(lexer_input_t input);
+INTERNAL lexer_state_t handle_array(lexer_input_t input);
+INTERNAL lexer_state_t handle_key(lexer_input_t input);
+INTERNAL lexer_state_t handle_value(lexer_input_t input);
+INTERNAL lexer_state_t handle_string(lexer_input_t input);
+INTERNAL lexer_state_t handle_string_end(lexer_input_t input);
+INTERNAL lexer_state_t handle_escape_sequence(lexer_input_t input);
+INTERNAL lexer_state_t handle_unicode_hex(lexer_input_t input,
+                                          lexer_state_t return_state);
+INTERNAL lexer_state_t handle_decimal(lexer_input_t input);
+INTERNAL lexer_state_t handle_number(lexer_input_t input);
+INTERNAL lexer_state_t handle_fraction(lexer_input_t input);
+INTERNAL lexer_state_t handle_exponent(lexer_input_t input);
+INTERNAL lexer_state_t handle_exp_sign(lexer_input_t input);
+INTERNAL lexer_state_t handle_power(lexer_input_t input);
+INTERNAL lexer_state_t handle_number_end(lexer_input_t input);
+INTERNAL lexer_state_t handle_true(lexer_input_t input,
+                                   lexer_state_t start_state);
+INTERNAL lexer_state_t handle_false(lexer_input_t input,
+                                    lexer_state_t start_state);
+INTERNAL lexer_state_t handle_null(lexer_input_t input,
+                                   lexer_state_t start_state);
+INTERNAL lexer_state_t handle_keyword_end(lexer_input_t input);
+
+// State the machine is currently in; advanced by lexer_state_machine().
+INTERNAL lexer_state_t current_state = LEXER_STATE_START;
+// Transition table indexed as [state][input]; zero-initialised.
+INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
+    0};
+
+// Path of the generated file, relative to the working directory.
+INTERNAL const char *filename = "./include/lexer/lexer_state_transitions.table";
+
+// Entry point: truncates the output file, then writes the table to it.
+int main(void) {
+  clear_file();
+
+  write_table();
+
+  return EXIT_SUCCESS;
+}
+
+// Truncates the output file (creating it if needed) by opening it in "w" mode
+// and closing it immediately. Prints a message and returns on open failure.
+void clear_file(void) {
+  FILE *fp = fopen(filename, "w");
+  if (!fp) {
+    printf("Failed to open file\n");
+
+    return;
+  }
+
+  fclose(fp);
+}
+
+// Appends one "{ v0, v1, ... }," line per state to the output file, where the
+// values are that state's row of state_table printed as unsigned integers.
+void write_table(void) {
+  FILE *fp = fopen(filename, "a");
+
+  if (!fp) {
+    printf("Failed to open file\n");
+
+    return;
+  }
+
+  for (u64 i = 0; i < COUNT_LEXER_STATES; ++i) {
+    fputs("{ ", fp);
+
+    for (u64 j = 0; j < COUNT_LEXER_INPUTS; ++j) {
+      // The cast pins the argument type so that it always matches %llu.
+      fprintf(fp, "%llu, ", (unsigned long long)state_table[i][j]);
+    }
+
+    fputs("},\n", fp);
+  }
+
+  // fclose() flushes buffered output, so its result is checked together with
+  // the stream's error flag to detect writes that did not reach the file.
+  int write_failed = ferror(fp);
+  if (fclose(fp) != 0 || write_failed) {
+    printf("Failed to write file\n");
+  }
+}
+
+// Maps a raw character to the lexer_input_t class handled by the state
+// machine; anything without a dedicated class becomes LEXER_INPUT_OTHER.
+INTERNAL lexer_input_t char_type(char input) {
+  // <ctype.h> classifiers need an unsigned char value (or EOF); a negative
+  // plain char would be undefined behaviour.
+  if (isspace((unsigned char)input)) {
+    return LEXER_INPUT_WHITE_SPACE;
+  }
+
+  if (input >= '1' && input <= '9') {
+    return LEXER_INPUT_NON_ZERO;
+  }
+
+  switch (input) {
+  case '{':
+    return LEXER_INPUT_OPEN_BRACE;
+  case '}':
+    return LEXER_INPUT_CLOSE_BRACE;
+  case '[':
+    return LEXER_INPUT_OPEN_BRACKET;
+  case ']':
+    return LEXER_INPUT_CLOSE_BRACKET;
+  case ',':
+    return LEXER_INPUT_COMMA;
+  case ':':
+    return LEXER_INPUT_COLON;
+  case '"':
+    return LEXER_INPUT_DOUBLE_QUOTE;
+  case '\\':
+    return LEXER_INPUT_BACK_SLASH;
+  case '/':
+    return LEXER_INPUT_FORWARD_SLASH;
+  case 'a':
+    return LEXER_INPUT_LOWER_A;
+  case 'b':
+    return LEXER_INPUT_LOWER_B;
+  case 'c':
+    return LEXER_INPUT_LOWER_C;
+  case 'd':
+    return LEXER_INPUT_LOWER_D;
+  case 'e':
+    return LEXER_INPUT_LOWER_E;
+  case 'f':
+    return LEXER_INPUT_LOWER_F;
+  case 'l':
+    return LEXER_INPUT_LOWER_L;
+  case 'n':
+    return LEXER_INPUT_LOWER_N;
+  case 'r':
+    return LEXER_INPUT_LOWER_R;
+  case 's':
+    return LEXER_INPUT_LOWER_S;
+  case 't':
+    return LEXER_INPUT_LOWER_T;
+  case 'u':
+    return LEXER_INPUT_LOWER_U;
+  case 'A':
+    return LEXER_INPUT_UPPER_A;
+  case 'B':
+    return LEXER_INPUT_UPPER_B;
+  case 'C':
+    return LEXER_INPUT_UPPER_C;
+  case 'D':
+    return LEXER_INPUT_UPPER_D;
+  case 'E':
+    return LEXER_INPUT_UPPER_E;
+  case 'F':
+    return LEXER_INPUT_UPPER_F;
+  case '-':
+    return LEXER_INPUT_MINUS;
+  case '+':
+    return LEXER_INPUT_PLUS;
+  case '.':
+    return LEXER_INPUT_DECIMAL;
+  case '0':
+    return LEXER_INPUT_ZERO;
+  default:
+    return LEXER_INPUT_OTHER;
+  }
+}
+
+// Advances current_state by dispatching `input` to the handler of the state the
+// machine is currently in.
+void lexer_state_machine(lexer_input_t input) {
+  switch (current_state) {
+  case LEXER_STATE_START:
+    current_state = handle_lexer_start(input);
+    break;
+  case LEXER_STATE_VALUE:
+    current_state = handle_value(input);
+    break;
+  case LEXER_STATE_OBJECT_START:
+  case LEXER_STATE_OBJECT:
+    current_state = handle_object(input);
+    break;
+  case LEXER_STATE_ARRAY_START:
+  case LEXER_STATE_ARRAY:
+    current_state = handle_array(input);
+    break;
+  case LEXER_STATE_OBJECT_END:
+  case LEXER_STATE_ARRAY_END:
+    // This handler returns nothing, so current_state is left unchanged here.
+    handle_input_after_collection_end(input);
+    break;
+  case LEXER_STATE_KEY:
+    current_state = handle_key(input);
+    break;
+  case LEXER_STATE_DECIMAL:
+    current_state = handle_decimal(input);
+    break;
+  case LEXER_STATE_NUMBER:
+    current_state = handle_number(input);
+    break;
+  case LEXER_STATE_FRACTION:
+    current_state = handle_fraction(input);
+    break;
+  case LEXER_STATE_EXPONENT:
+    current_state = handle_exponent(input);
+    break;
+  case LEXER_STATE_EXP_SIGN:
+    current_state = handle_exp_sign(input);
+    break;
+  case LEXER_STATE_POWER:
+    current_state = handle_power(input);
+    break;
+  case LEXER_STATE_NUMBER_END:
+    current_state = handle_number_end(input);
+    break;
+  case LEXER_STATE_STRING:
+    current_state = handle_string(input);
+    break;
+  case LEXER_STATE_STRING_END:
+    current_state = handle_string_end(input);
+    break;
+  case LEXER_STATE_ESCAPE_SEQUENCE:
+    current_state = handle_escape_sequence(input);
+    break;
+  case LEXER_STATE_UNICODE_HEX1:
+    current_state = handle_unicode_hex(input, LEXER_STATE_UNICODE_HEX2);
+    break;
+  case LEXER_STATE_UNICODE_HEX2:
+    current_state = handle_unicode_hex(input, LEXER_STATE_UNICODE_HEX3);
+    break;
+  case LEXER_STATE_UNICODE_HEX3:
+    current_state = handle_unicode_hex(input, LEXER_STATE_UNICODE_HEX4);
+    break;
+  case LEXER_STATE_UNICODE_HEX4:
+    current_state = handle_unicode_hex(input, LEXER_STATE_STRING);
+    break;
+  case LEXER_STATE_T:
+    current_state = handle_true(input, LEXER_STATE_T);
+    break;
+  case LEXER_STATE_TR:
+    current_state = handle_true(input, LEXER_STATE_TR);
+    break;
+  case LEXER_STATE_TRU:
+    current_state = handle_true(input, LEXER_STATE_TRU);
+    break;
+  case LEXER_STATE_TRUE:
+    current_state = handle_true(input, LEXER_STATE_TRUE);
+    break;
+  case LEXER_STATE_F:
+    current_state = handle_false(input, LEXER_STATE_F);
+    break;
+  case LEXER_STATE_FA:
+    current_state = handle_false(input, LEXER_STATE_FA);
+    break;
+  case LEXER_STATE_FAL:
+    current_state = handle_false(input, LEXER_STATE_FAL);
+    break;
+  case LEXER_STATE_FALS:
+    current_state = handle_false(input, LEXER_STATE_FALS);
+    break;
+  case LEXER_STATE_FALSE:
+    current_state = handle_false(input, LEXER_STATE_FALSE);
+    break;
+  case LEXER_STATE_N:
+    current_state = handle_null(input, LEXER_STATE_N);
+    break;
+  case LEXER_STATE_NU:
+    current_state = handle_null(input, LEXER_STATE_NU);
+    break;
+  case LEXER_STATE_NUL:
+    current_state = handle_null(input, LEXER_STATE_NUL);
+    break;
+  case LEXER_STATE_NULL:
+    current_state = handle_null(input, LEXER_STATE_NULL);
+    break;
+  case LEXER_STATE_KEYWORD_END:
+    current_state = handle_keyword_end(input);
+    break;
+  case LEXER_STATE_LAST_COLLECTION:
+    current_state = handle_last_collection(input);
+    break;
+  case LEXER_STATE_ERROR:
+  case COUNT_LEXER_STATES:
+    current_state = LEXER_STATE_ERROR;
+    break;
+  }
+}
+
+// START state: skips white space and only accepts the opening of an object or
+// an array.
+lexer_state_t handle_lexer_start(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_START;
+  case LEXER_INPUT_OPEN_BRACE:
+    return LEXER_STATE_OBJECT_START;
+  case LEXER_INPUT_OPEN_BRACKET:
+    return LEXER_STATE_ARRAY_START;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// LAST_COLLECTION state: only white space is accepted.
+// Takes lexer_input_t, like every other handler, because it is compared
+// against a LEXER_INPUT_* constant rather than a raw character.
+lexer_state_t handle_last_collection(lexer_input_t input) {
+  if (input == LEXER_INPUT_WHITE_SPACE) {
+    return LEXER_STATE_LAST_COLLECTION;
+  }
+
+  return LEXER_STATE_ERROR;
+}
+
+// TODO (Abdelrahman): Figure out how to handle this
+// NOTE(review): `lexer` (and set_token/token_t used below) is not declared in
+// this file; unless a project header provides it this cannot compile, and the
+// table generator has no runtime lexer to consult - TODO resolve.
+//
+// Reads the state on top of the lexer stack and, when the closing input
+// matches it, passes a TK_R_BRACE / TK_R_BRACKET token to set_token() and
+// returns the corresponding *_END state; otherwise returns LEXER_STATE_ERROR.
+lexer_state_t handle_collection_end(lexer_input_t input) {
+  // No need to ignore space as this is only called when input is } or ]
+
+  lexer->current = lexer->stack.stack[lexer->stack.size - 1];
+
+  // Compare against the input classes, not the raw '}' / ']' characters.
+  bool object_end = lexer->current == LEXER_STATE_OBJECT &&
+                    input == LEXER_INPUT_CLOSE_BRACE;
+
+  if (object_end) {
+    token_t *token;
+
+    if (lexer->token_ready) {
+      lexer->has_extra_token = true;
+      token = &(lexer->extra_token);
+    } else {
+      lexer->token_ready = true;
+      token = &(lexer->token);
+    }
+
+    set_token(token, lexer->line, lexer->column, TK_R_BRACE,
+              (token_value_t){0});
+
+    return LEXER_STATE_OBJECT_END;
+  }
+
+  bool array_end = lexer->current == LEXER_STATE_ARRAY &&
+                   input == LEXER_INPUT_CLOSE_BRACKET;
+
+  if (array_end) {
+    token_t *token;
+
+    if (lexer->token_ready) {
+      lexer->has_extra_token = true;
+      token = &(lexer->extra_token);
+    } else {
+      lexer->token_ready = true;
+      token = &(lexer->token);
+    }
+
+    set_token(token, lexer->line, lexer->column, TK_R_BRACKET,
+              (token_value_t){0});
+
+    return LEXER_STATE_ARRAY_END;
+  }
+
+  return LEXER_STATE_ERROR;
+}
+
+// TODO (Abdelrahman): Figure out how to handle this
+// NOTE(review): depends on the undeclared `lexer`, like handle_collection_end.
+//
+// After a collection has ended, a further closing brace/bracket marks a token
+// as ready and passes the matching closing token to set_token(); every other
+// input is ignored.
+void handle_input_after_collection_end(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_CLOSE_BRACE:
+    lexer->token_ready = true;
+    set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACE,
+              (token_value_t){0});
+
+    break;
+  case LEXER_INPUT_CLOSE_BRACKET:
+    lexer->token_ready = true;
+    set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACKET,
+              (token_value_t){0});
+
+    break;
+  default:
+    break;
+  }
+}
+
+// OBJECT / OBJECT_START state: skips white space, starts a key on '"' and
+// closes the object on '}'.
+lexer_state_t handle_object(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_OBJECT;
+  case LEXER_INPUT_DOUBLE_QUOTE:
+    return LEXER_STATE_KEY;
+  case LEXER_INPUT_CLOSE_BRACE:
+    return handle_collection_end(input);
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// ARRAY / ARRAY_START state: skips white space, closes the array on ']' and
+// treats any other input as the start of a value.
+lexer_state_t handle_array(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_ARRAY;
+  case LEXER_INPUT_CLOSE_BRACKET:
+    return handle_collection_end(input);
+  default:
+    return handle_value(input);
+  }
+}
+
+// KEY state: every input moves on to STRING, so the argument is unused.
+lexer_state_t handle_key(lexer_input_t input) {
+  (void)input;
+
+  return LEXER_STATE_STRING;
+}
+
+// VALUE state: skips white space and picks the state matching the first
+// character of a number, string, object, array or keyword.
+lexer_state_t handle_value(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_VALUE;
+  case LEXER_INPUT_NON_ZERO:
+  case LEXER_INPUT_MINUS:
+    return LEXER_STATE_NUMBER;
+  case LEXER_INPUT_ZERO:
+    return LEXER_STATE_DECIMAL;
+  case LEXER_INPUT_DOUBLE_QUOTE:
+    return LEXER_STATE_STRING;
+  case LEXER_INPUT_OPEN_BRACE:
+    return LEXER_STATE_OBJECT_START;
+  case LEXER_INPUT_OPEN_BRACKET:
+    return LEXER_STATE_ARRAY_START;
+  case LEXER_INPUT_LOWER_T:
+    return LEXER_STATE_T;
+  case LEXER_INPUT_LOWER_F:
+    return LEXER_STATE_F;
+  case LEXER_INPUT_LOWER_N:
+    return LEXER_STATE_N;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// STRING state: '\' starts an escape sequence, '"' ends the string and any
+// other input stays inside the string.
+lexer_state_t handle_string(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_BACK_SLASH:
+    return LEXER_STATE_ESCAPE_SEQUENCE;
+  case LEXER_INPUT_DOUBLE_QUOTE:
+    return LEXER_STATE_STRING_END;
+  default:
+    return LEXER_STATE_STRING;
+  }
+}
+
+// TODO (Abdelrahman): Figure out how to handle this
+// NOTE(review): depends on the undeclared `lexer` and on stack_pop().
+//
+// STRING_END state: skips white space, then pops the lexer stack. A key
+// followed by ':' moves to VALUE, a value followed by ',' returns the state on
+// top of the stack, a closing brace/bracket ends the collection; anything else
+// is an error.
+lexer_state_t handle_string_end(lexer_input_t input) {
+  if (input == LEXER_INPUT_WHITE_SPACE) {
+    return LEXER_STATE_STRING_END;
+  }
+
+  lexer->current = stack_pop(&(lexer->stack));
+
+  // Compare against the input classes, not the raw ':' / ',' characters.
+  bool key_end =
+      lexer->current == LEXER_STATE_KEY && input == LEXER_INPUT_COLON;
+
+  if (key_end) {
+    return LEXER_STATE_VALUE;
+  }
+
+  bool value_end =
+      lexer->current == LEXER_STATE_VALUE && input == LEXER_INPUT_COMMA;
+
+  if (value_end) {
+    return lexer->stack.stack[lexer->stack.size - 1];
+  }
+
+  bool collection_end = input == LEXER_INPUT_CLOSE_BRACE ||
+                        input == LEXER_INPUT_CLOSE_BRACKET;
+
+  return collection_end ? handle_collection_end(input) : LEXER_STATE_ERROR;
+}
+
+// ESCAPE_SEQUENCE state: accepts the single-character escapes and returns to
+// STRING, or starts a four-digit unicode escape on 'u'.
+lexer_state_t handle_escape_sequence(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_DOUBLE_QUOTE:
+  case LEXER_INPUT_FORWARD_SLASH:
+  case LEXER_INPUT_BACK_SLASH:
+  case LEXER_INPUT_LOWER_B:
+  case LEXER_INPUT_LOWER_F:
+  case LEXER_INPUT_LOWER_N:
+  case LEXER_INPUT_LOWER_R:
+  case LEXER_INPUT_LOWER_T:
+    return LEXER_STATE_STRING;
+  case LEXER_INPUT_LOWER_U:
+    return LEXER_STATE_UNICODE_HEX1;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// UNICODE_HEX1..4 states: a hexadecimal digit advances to return_state, any
+// other input is an error.
+INTERNAL lexer_state_t handle_unicode_hex(lexer_input_t input,
+                                          lexer_state_t return_state) {
+  switch (input) {
+  case LEXER_INPUT_LOWER_A:
+  case LEXER_INPUT_LOWER_B:
+  case LEXER_INPUT_LOWER_C:
+  case LEXER_INPUT_LOWER_D:
+  case LEXER_INPUT_LOWER_E:
+  case LEXER_INPUT_LOWER_F:
+  case LEXER_INPUT_UPPER_A:
+  case LEXER_INPUT_UPPER_B:
+  case LEXER_INPUT_UPPER_C:
+  case LEXER_INPUT_UPPER_D:
+  case LEXER_INPUT_UPPER_E:
+  case LEXER_INPUT_UPPER_F:
+  case LEXER_INPUT_ZERO:
+  case LEXER_INPUT_NON_ZERO:
+    return return_state;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// DECIMAL state (entered after a leading '0'): only '.' is accepted.
+// NOTE(review): this rejects a bare 0 followed by ',', ']', '}' or white
+// space, which JSON allows - confirm whether that is intended.
+lexer_state_t handle_decimal(lexer_input_t input) {
+  if (input == LEXER_INPUT_DECIMAL) {
+    return LEXER_STATE_FRACTION;
+  }
+
+  return LEXER_STATE_ERROR;
+}
+
+// NUMBER state: consumes integer digits; '.' starts the fraction, white space
+// ends the number, ',' returns to the state on top of the lexer stack and a
+// closing brace/bracket ends the enclosing collection.
+lexer_state_t handle_number(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_ZERO:
+  case LEXER_INPUT_NON_ZERO:
+    return LEXER_STATE_NUMBER;
+  case LEXER_INPUT_DECIMAL:
+    return LEXER_STATE_FRACTION;
+  case LEXER_INPUT_CLOSE_BRACE:
+  case LEXER_INPUT_CLOSE_BRACKET:
+    return handle_collection_end(input);
+  case LEXER_INPUT_COMMA:
+    // TODO (Abdelrahman): Figure out how to handle this
+    return lexer->stack.stack[lexer->stack.size - 1];
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_NUMBER_END;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// FRACTION state: consumes fraction digits; 'e'/'E' starts the exponent, the
+// terminators behave as in handle_number().
+lexer_state_t handle_fraction(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_ZERO:
+  case LEXER_INPUT_NON_ZERO:
+    return LEXER_STATE_FRACTION;
+  case LEXER_INPUT_CLOSE_BRACE:
+  case LEXER_INPUT_CLOSE_BRACKET:
+    return handle_collection_end(input);
+  case LEXER_INPUT_LOWER_E:
+  case LEXER_INPUT_UPPER_E:
+    return LEXER_STATE_EXPONENT;
+  case LEXER_INPUT_COMMA:
+    // TODO (Abdelrahman): Figure out how to handle this
+    return lexer->stack.stack[lexer->stack.size - 1];
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_NUMBER_END;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// EXPONENT state (after 'e'/'E'): a digit starts the power, a sign moves to
+// EXP_SIGN.
+lexer_state_t handle_exponent(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_ZERO:
+  case LEXER_INPUT_NON_ZERO:
+    return LEXER_STATE_POWER;
+  case LEXER_INPUT_PLUS:
+  case LEXER_INPUT_MINUS:
+    return LEXER_STATE_EXP_SIGN;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// EXP_SIGN state: the sign must be followed by a digit.
+lexer_state_t handle_exp_sign(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_ZERO:
+  case LEXER_INPUT_NON_ZERO:
+    return LEXER_STATE_POWER;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// POWER state: consumes exponent digits; the terminators behave as in
+// handle_number().
+lexer_state_t handle_power(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_ZERO:
+  case LEXER_INPUT_NON_ZERO:
+    return LEXER_STATE_POWER;
+  case LEXER_INPUT_CLOSE_BRACE:
+  case LEXER_INPUT_CLOSE_BRACKET:
+    return handle_collection_end(input);
+  case LEXER_INPUT_COMMA:
+    // TODO (Abdelrahman): Figure out how to handle this
+    return lexer->stack.stack[lexer->stack.size - 1];
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_NUMBER_END;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// NUMBER_END state: skips trailing white space after a number, then expects
+// ',' or a closing brace/bracket.
+lexer_state_t handle_number_end(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_NUMBER_END;
+  case LEXER_INPUT_CLOSE_BRACE:
+  case LEXER_INPUT_CLOSE_BRACKET:
+    return handle_collection_end(input);
+  case LEXER_INPUT_COMMA:
+    // TODO (Abdelrahman): Figure out how to handle this
+    return lexer->stack.stack[lexer->stack.size - 1];
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// T/TR/TRU/TRUE states: matches the remaining letters of "true" one at a time.
+// From TRUE every input moves to KEYWORD_END.
+lexer_state_t handle_true(lexer_input_t input, lexer_state_t start_state) {
+  switch (start_state) {
+  case LEXER_STATE_T:
+    return input == LEXER_INPUT_LOWER_R ? LEXER_STATE_TR : LEXER_STATE_ERROR;
+  case LEXER_STATE_TR:
+    return input == LEXER_INPUT_LOWER_U ? LEXER_STATE_TRU : LEXER_STATE_ERROR;
+  case LEXER_STATE_TRU:
+    return input == LEXER_INPUT_LOWER_E ? LEXER_STATE_TRUE : LEXER_STATE_ERROR;
+  case LEXER_STATE_TRUE:
+    return LEXER_STATE_KEYWORD_END;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// F/FA/FAL/FALS/FALSE states: matches the remaining letters of "false" one at
+// a time. From FALSE every input moves to KEYWORD_END.
+lexer_state_t handle_false(lexer_input_t input, lexer_state_t start_state) {
+  switch (start_state) {
+  case LEXER_STATE_F:
+    return input == LEXER_INPUT_LOWER_A ? LEXER_STATE_FA : LEXER_STATE_ERROR;
+  case LEXER_STATE_FA:
+    return input == LEXER_INPUT_LOWER_L ? LEXER_STATE_FAL : LEXER_STATE_ERROR;
+  case LEXER_STATE_FAL:
+    return input == LEXER_INPUT_LOWER_S ? LEXER_STATE_FALS : LEXER_STATE_ERROR;
+  case LEXER_STATE_FALS:
+    return input == LEXER_INPUT_LOWER_E ? LEXER_STATE_FALSE : LEXER_STATE_ERROR;
+  case LEXER_STATE_FALSE:
+    return LEXER_STATE_KEYWORD_END;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// N/NU/NUL/NULL states: matches the remaining letters of "null" one at a time.
+// From NULL every input moves to KEYWORD_END.
+lexer_state_t handle_null(lexer_input_t input, lexer_state_t start_state) {
+  switch (start_state) {
+  case LEXER_STATE_N:
+    return input == LEXER_INPUT_LOWER_U ? LEXER_STATE_NU : LEXER_STATE_ERROR;
+  case LEXER_STATE_NU:
+    return input == LEXER_INPUT_LOWER_L ? LEXER_STATE_NUL : LEXER_STATE_ERROR;
+  case LEXER_STATE_NUL:
+    return input == LEXER_INPUT_LOWER_L ? LEXER_STATE_NULL : LEXER_STATE_ERROR;
+  case LEXER_STATE_NULL:
+    return LEXER_STATE_KEYWORD_END;
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
+
+// KEYWORD_END state: skips trailing white space after a keyword, then expects
+// ',' or a closing brace/bracket.
+lexer_state_t handle_keyword_end(lexer_input_t input) {
+  switch (input) {
+  case LEXER_INPUT_WHITE_SPACE:
+    return LEXER_STATE_KEYWORD_END;
+  case LEXER_INPUT_CLOSE_BRACE:
+  case LEXER_INPUT_CLOSE_BRACKET:
+    return handle_collection_end(input);
+  case LEXER_INPUT_COMMA:
+    // TODO (Abdelrahman): Figure out how to handle this
+    return lexer->stack.stack[lexer->stack.size - 1];
+  default:
+    return LEXER_STATE_ERROR;
+  }
+}
diff --git a/mk_table_generator b/mk_table_generator
new file mode 100755
index 0000000..344d345
--- /dev/null
+++ b/mk_table_generator
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+clang -Iinclude -Iinclude/lexer generate_state_table.c -o gentable