Initial work on converting the state machine to a state table

This commit is contained in:
Abdelrahman Said 2023-07-23 22:48:28 +01:00
parent 6e93d3ecd1
commit 91162654b3
2 changed files with 678 additions and 0 deletions

675
generate_state_table.c Normal file
View File

@ -0,0 +1,675 @@
#include "aliases.h"
#include "lexer_data.h"
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define STRING_BUF_LENGTH 100
// ---- Output helpers -------------------------------------------------------
// write_table emits the contents of state_table to the output file;
// clear_file truncates that file first.
INTERNAL void write_table(void);
INTERNAL void clear_file(void);
// Classifies a raw character into one of the LEXER_INPUT_* classes.
INTERNAL lexer_input_t char_type(char input);
// Advances the file-scope current_state by one input.
INTERNAL void lexer_state_machine(lexer_input_t input);
// ---- Per-state transition handlers ----------------------------------------
// Each handler receives the classified input and (except
// handle_input_after_collection_end, which returns nothing) returns the next
// lexer state.
INTERNAL lexer_state_t handle_lexer_start(lexer_input_t input);
// NOTE(review): this handler takes `char` while lexer_state_machine passes it
// a lexer_input_t and the body compares against LEXER_INPUT_WHITE_SPACE --
// confirm whether the parameter should be lexer_input_t.
INTERNAL lexer_state_t handle_last_collection(char input);
INTERNAL lexer_state_t handle_collection_end(lexer_input_t input);
INTERNAL void handle_input_after_collection_end(lexer_input_t input);
INTERNAL lexer_state_t handle_object(lexer_input_t input);
INTERNAL lexer_state_t handle_array(lexer_input_t input);
INTERNAL lexer_state_t handle_key(lexer_input_t input);
INTERNAL lexer_state_t handle_value(lexer_input_t input);
INTERNAL lexer_state_t handle_string(lexer_input_t input);
INTERNAL lexer_state_t handle_string_end(lexer_input_t input);
INTERNAL lexer_state_t handle_escape_sequence(lexer_input_t input);
// return_state is the state to move to when the input is a hex digit.
INTERNAL lexer_state_t handle_unicode_hex(lexer_input_t input,
lexer_state_t return_state);
INTERNAL lexer_state_t handle_decimal(lexer_input_t input);
INTERNAL lexer_state_t handle_number(lexer_input_t input);
INTERNAL lexer_state_t handle_fraction(lexer_input_t input);
INTERNAL lexer_state_t handle_exponent(lexer_input_t input);
INTERNAL lexer_state_t handle_exp_sign(lexer_input_t input);
INTERNAL lexer_state_t handle_power(lexer_input_t input);
INTERNAL lexer_state_t handle_number_end(lexer_input_t input);
// The keyword handlers take the state the lexer is currently in
// (start_state) and select the expected next letter from it.
INTERNAL lexer_state_t handle_true(lexer_input_t input,
lexer_state_t start_state);
INTERNAL lexer_state_t handle_false(lexer_input_t input,
lexer_state_t start_state);
INTERNAL lexer_state_t handle_null(lexer_input_t input,
lexer_state_t start_state);
INTERNAL lexer_state_t handle_keyword_end(lexer_input_t input);
// ---- File-scope state -----------------------------------------------------
// State read and updated by lexer_state_machine; starts at LEXER_STATE_START.
INTERNAL lexer_state_t current_state = LEXER_STATE_START;
// Transition table indexed as [state][input]. It is zero-initialised here and
// nothing in the visible code fills it in before write_table dumps it.
INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
0};
// Path of the generated table file, relative to the working directory.
INTERNAL const char *filename = "./include/lexer/lexer_state_transitions.table";
// Program entry point: empties the output file, then writes the state table
// into it. The exit status is always EXIT_SUCCESS, since neither helper
// reports failure to its caller.
int main(void) {
  clear_file();
  write_table();

  return EXIT_SUCCESS;
}
// Truncates the output file (creating it if it does not exist) by opening it
// in "w" mode and closing it again, so that write_table's append-mode output
// starts from an empty file.
//
// Failures are reported on stderr; the function has no return value, so the
// caller is not informed.
void clear_file(void) {
  FILE *fp = fopen(filename, "w");
  if (!fp) {
    // Diagnostics belong on stderr so they never mix with regular output.
    fprintf(stderr, "Failed to open file\n");
    return;
  }

  // fclose can fail as well; report it instead of silently ignoring it.
  if (fclose(fp) != 0) {
    fprintf(stderr, "Failed to close file\n");
  }
}
// Appends the contents of state_table to the output file, one row per lexer
// state, formatted as:
//
//   { v0, v1, ..., vN, },
//
// where each value is the numeric value of the table entry.
//
// Failures to open, write or close the file are reported on stderr; the
// function has no return value, so the caller is not informed.
void write_table(void) {
  FILE *fp = fopen(filename, "a");
  if (!fp) {
    fprintf(stderr, "Failed to open file\n");
    return;
  }

  // Write straight to the stream instead of formatting into a scratch buffer
  // first; stop at the first failed write.
  int failed = 0;

  for (u64 i = 0; i < COUNT_LEXER_STATES && !failed; ++i) {
    failed = fputs("{ ", fp) == EOF;

    for (u64 j = 0; j < COUNT_LEXER_INPUTS && !failed; ++j) {
      // The argument is cast to unsigned long long, so the matching
      // conversion is %llu (not %lld).
      failed = fprintf(fp, "%llu, ", (unsigned long long)state_table[i][j]) < 0;
    }

    if (!failed) {
      failed = fputs("},\n", fp) == EOF;
    }
  }

  // fclose flushes buffered data, so a failure here also means lost output.
  if (fclose(fp) != 0) {
    failed = 1;
  }

  if (failed) {
    fprintf(stderr, "Failed to write file\n");
  }
}
// Classifies a raw character into a lexer input class.
//
// Whitespace (as reported by isspace) maps to LEXER_INPUT_WHITE_SPACE and the
// digits '1'..'9' map to LEXER_INPUT_NON_ZERO. The punctuation, letters and
// '0' listed in the switch below each have a dedicated class; every other
// character maps to LEXER_INPUT_OTHER.
INTERNAL lexer_input_t char_type(char input) {
  // The <ctype.h> functions require a value representable as unsigned char
  // (or EOF). Plain char may be signed, so convert before calling isspace to
  // avoid undefined behaviour on bytes >= 0x80.
  if (isspace((unsigned char)input)) {
    return LEXER_INPUT_WHITE_SPACE;
  } else if (input >= '1' && input <= '9') {
    return LEXER_INPUT_NON_ZERO;
  }

  switch (input) {
  // Structural characters
  case '{':
    return LEXER_INPUT_OPEN_BRACE;
  case '}':
    return LEXER_INPUT_CLOSE_BRACE;
  case '[':
    return LEXER_INPUT_OPEN_BRACKET;
  case ']':
    return LEXER_INPUT_CLOSE_BRACKET;
  case ',':
    return LEXER_INPUT_COMMA;
  case ':':
    return LEXER_INPUT_COLON;
  case '"':
    return LEXER_INPUT_DOUBLE_QUOTE;
  case '\\':
    return LEXER_INPUT_BACK_SLASH;
  case '/':
    return LEXER_INPUT_FORWARD_SLASH;
  // Lower-case letters with a dedicated class
  case 'a':
    return LEXER_INPUT_LOWER_A;
  case 'b':
    return LEXER_INPUT_LOWER_B;
  case 'c':
    return LEXER_INPUT_LOWER_C;
  case 'd':
    return LEXER_INPUT_LOWER_D;
  case 'e':
    return LEXER_INPUT_LOWER_E;
  case 'f':
    return LEXER_INPUT_LOWER_F;
  case 'l':
    return LEXER_INPUT_LOWER_L;
  case 'n':
    return LEXER_INPUT_LOWER_N;
  case 'r':
    return LEXER_INPUT_LOWER_R;
  case 's':
    return LEXER_INPUT_LOWER_S;
  case 't':
    return LEXER_INPUT_LOWER_T;
  case 'u':
    return LEXER_INPUT_LOWER_U;
  // Upper-case letters with a dedicated class
  case 'A':
    return LEXER_INPUT_UPPER_A;
  case 'B':
    return LEXER_INPUT_UPPER_B;
  case 'C':
    return LEXER_INPUT_UPPER_C;
  case 'D':
    return LEXER_INPUT_UPPER_D;
  case 'E':
    return LEXER_INPUT_UPPER_E;
  case 'F':
    return LEXER_INPUT_UPPER_F;
  // Number-related characters
  case '-':
    return LEXER_INPUT_MINUS;
  case '+':
    return LEXER_INPUT_PLUS;
  case '.':
    return LEXER_INPUT_DECIMAL;
  case '0':
    return LEXER_INPUT_ZERO;
  default:
    return LEXER_INPUT_OTHER;
  }
}
// Advances the file-scope current_state by one classified input.
//
// Each state is routed to its handler and current_state is replaced with the
// handler's result. The two collection-end states are the exception: their
// handler returns nothing, so current_state is left untouched for them.
void lexer_state_machine(lexer_input_t input) {
  switch (current_state) {
  // ---- Document / collection structure ----
  case LEXER_STATE_START:
    current_state = handle_lexer_start(input);
    break;
  case LEXER_STATE_OBJECT_START:
  case LEXER_STATE_OBJECT:
    current_state = handle_object(input);
    break;
  case LEXER_STATE_ARRAY_START:
  case LEXER_STATE_ARRAY:
    current_state = handle_array(input);
    break;
  case LEXER_STATE_OBJECT_END:
  case LEXER_STATE_ARRAY_END:
    // No state change here: the handler has no return value.
    handle_input_after_collection_end(input);
    break;
  case LEXER_STATE_LAST_COLLECTION:
    current_state = handle_last_collection(input);
    break;

  // ---- Keys and values ----
  case LEXER_STATE_KEY:
    current_state = handle_key(input);
    break;
  case LEXER_STATE_VALUE:
    current_state = handle_value(input);
    break;

  // ---- Numbers ----
  case LEXER_STATE_DECIMAL:
    current_state = handle_decimal(input);
    break;
  case LEXER_STATE_NUMBER:
    current_state = handle_number(input);
    break;
  case LEXER_STATE_FRACTION:
    current_state = handle_fraction(input);
    break;
  case LEXER_STATE_EXPONENT:
    current_state = handle_exponent(input);
    break;
  case LEXER_STATE_EXP_SIGN:
    current_state = handle_exp_sign(input);
    break;
  case LEXER_STATE_POWER:
    current_state = handle_power(input);
    break;
  case LEXER_STATE_NUMBER_END:
    current_state = handle_number_end(input);
    break;

  // ---- Strings and escapes ----
  case LEXER_STATE_STRING:
    current_state = handle_string(input);
    break;
  case LEXER_STATE_STRING_END:
    current_state = handle_string_end(input);
    break;
  case LEXER_STATE_ESCAPE_SEQUENCE:
    current_state = handle_escape_sequence(input);
    break;
  // Each hex-digit state names the state that follows a valid digit; the
  // fourth digit leads back to LEXER_STATE_STRING.
  case LEXER_STATE_UNICODE_HEX1:
    current_state = handle_unicode_hex(input, LEXER_STATE_UNICODE_HEX2);
    break;
  case LEXER_STATE_UNICODE_HEX2:
    current_state = handle_unicode_hex(input, LEXER_STATE_UNICODE_HEX3);
    break;
  case LEXER_STATE_UNICODE_HEX3:
    current_state = handle_unicode_hex(input, LEXER_STATE_UNICODE_HEX4);
    break;
  case LEXER_STATE_UNICODE_HEX4:
    current_state = handle_unicode_hex(input, LEXER_STATE_STRING);
    break;

  // ---- Keywords ----
  // The keyword handlers are told which state the lexer is in, which is
  // exactly current_state for every case in each group.
  case LEXER_STATE_T:
  case LEXER_STATE_TR:
  case LEXER_STATE_TRU:
  case LEXER_STATE_TRUE:
    current_state = handle_true(input, current_state);
    break;
  case LEXER_STATE_F:
  case LEXER_STATE_FA:
  case LEXER_STATE_FAL:
  case LEXER_STATE_FALS:
  case LEXER_STATE_FALSE:
    current_state = handle_false(input, current_state);
    break;
  case LEXER_STATE_N:
  case LEXER_STATE_NU:
  case LEXER_STATE_NUL:
  case LEXER_STATE_NULL:
    current_state = handle_null(input, current_state);
    break;
  case LEXER_STATE_KEYWORD_END:
    current_state = handle_keyword_end(input);
    break;

  // ---- Error (and the enum's count sentinel) ----
  case LEXER_STATE_ERROR:
  case COUNT_LEXER_STATES:
    current_state = LEXER_STATE_ERROR;
    break;
  }
}
// Transition for LEXER_STATE_START.
//
// Whitespace keeps the lexer in the start state, '{' and '[' inputs open an
// object or an array respectively, and any other input is an error.
lexer_state_t handle_lexer_start(lexer_input_t input) {
  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_START;
  }

  if (input == LEXER_INPUT_OPEN_BRACE) {
    return LEXER_STATE_OBJECT_START;
  }

  if (input == LEXER_INPUT_OPEN_BRACKET) {
    return LEXER_STATE_ARRAY_START;
  }

  return LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_LAST_COLLECTION: whitespace stays in the same
// state, anything else is an error.
//
// NOTE(review): the parameter is declared as `char` but is compared against
// the LEXER_INPUT_WHITE_SPACE input class and is passed a lexer_input_t by
// lexer_state_machine -- confirm whether it should be lexer_input_t.
lexer_state_t handle_last_collection(char input) {
  return input == LEXER_INPUT_WHITE_SPACE ? LEXER_STATE_LAST_COLLECTION
                                          : LEXER_STATE_ERROR;
}
// Handles a closing '}' or ']' by checking it against the collection state
// stored at the top of the lexer's stack, emitting the matching token and
// returning LEXER_STATE_OBJECT_END or LEXER_STATE_ARRAY_END. A closer that
// does not match the stacked state yields LEXER_STATE_ERROR.
//
// NOTE(review): `lexer` is neither a parameter nor declared anywhere in the
// visible file, so this body cannot compile as shown; it looks like it was
// carried over from the runtime lexer -- confirm.
// NOTE(review): `input` is a lexer_input_t, yet it is compared with the
// character literals '}' and ']' while the callers in this file switch on
// LEXER_INPUT_CLOSE_BRACE / LEXER_INPUT_CLOSE_BRACKET -- confirm which is
// intended.
// TODO (Abdelrahman): Figure out how to handle this
lexer_state_t handle_collection_end(lexer_input_t input) {
// No need to ignore space as this is only called when input is } or ]
// Peek (without popping) at the state on top of the stack.
lexer->current = lexer->stack.stack[lexer->stack.size - 1];
bool object_end = lexer->current == LEXER_STATE_OBJECT && input == '}';
if (object_end) {
token_t *token;
// If a token is already pending, the closing token goes into the extra
// token slot instead of overwriting it.
if (lexer->token_ready) {
lexer->has_extra_token = true;
token = &(lexer->extra_token);
} else {
lexer->token_ready = true;
token = &(lexer->token);
}
set_token(token, lexer->line, lexer->column, TK_R_BRACE,
(token_value_t){0});
return LEXER_STATE_OBJECT_END;
}
bool array_end = lexer->current == LEXER_STATE_ARRAY && input == ']';
if (array_end) {
token_t *token;
// Same pending-token handling as for the object case above.
if (lexer->token_ready) {
lexer->has_extra_token = true;
token = &(lexer->extra_token);
} else {
lexer->token_ready = true;
token = &(lexer->token);
}
set_token(token, lexer->line, lexer->column, TK_R_BRACKET,
(token_value_t){0});
return LEXER_STATE_ARRAY_END;
}
// The closer did not match the collection on top of the stack.
return LEXER_STATE_ERROR;
}
// Handles the input that follows an object/array end: for '}' or ']' it marks
// a token as ready and stores a TK_R_BRACE / TK_R_BRACKET token; every other
// input is ignored. Nothing is returned, so the caller's state is unchanged.
//
// NOTE(review): `lexer` is not declared anywhere in the visible file, so this
// body cannot compile as shown -- confirm how it should be supplied.
// NOTE(review): `input` is a lexer_input_t but the cases are the character
// literals '}' and ']' rather than LEXER_INPUT_CLOSE_BRACE /
// LEXER_INPUT_CLOSE_BRACKET -- confirm which is intended.
// TODO (Abdelrahman): Figure out how to handle this
void handle_input_after_collection_end(lexer_input_t input) {
switch (input) {
case '}':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACE,
(token_value_t){0});
break;
case ']':
lexer->token_ready = true;
set_token(&(lexer->token), lexer->line, lexer->column, TK_R_BRACKET,
(token_value_t){0});
break;
}
}
// Transition for LEXER_STATE_OBJECT_START / LEXER_STATE_OBJECT.
//
// Whitespace stays inside the object, a double quote starts a key, a closing
// brace is delegated to handle_collection_end, and anything else is an error.
lexer_state_t handle_object(lexer_input_t input) {
  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_OBJECT;
  }

  if (input == LEXER_INPUT_DOUBLE_QUOTE) {
    return LEXER_STATE_KEY;
  }

  if (input == LEXER_INPUT_CLOSE_BRACE) {
    return handle_collection_end(input);
  }

  return LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_ARRAY_START / LEXER_STATE_ARRAY.
//
// Whitespace stays inside the array and a closing bracket is delegated to
// handle_collection_end; every other input is treated as the start of a
// value and forwarded to handle_value.
lexer_state_t handle_array(lexer_input_t input) {
  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_ARRAY;
  }

  if (input == LEXER_INPUT_CLOSE_BRACKET) {
    return handle_collection_end(input);
  }

  return handle_value(input);
}
// Transition for LEXER_STATE_KEY: the next state is LEXER_STATE_STRING no
// matter what the input is.
lexer_state_t handle_key(lexer_input_t input) {
  (void)input; // intentionally unused

  return LEXER_STATE_STRING;
}
// Transition for LEXER_STATE_VALUE (also used by handle_array for inputs that
// start an element).
//
// Picks the state matching the first character of the value; whitespace keeps
// waiting for a value and unrecognised inputs are errors.
lexer_state_t handle_value(lexer_input_t input) {
  switch (input) {
  case LEXER_INPUT_WHITE_SPACE:
    return LEXER_STATE_VALUE;

  // Nested collections
  case LEXER_INPUT_OPEN_BRACE:
    return LEXER_STATE_OBJECT_START;
  case LEXER_INPUT_OPEN_BRACKET:
    return LEXER_STATE_ARRAY_START;

  // Strings
  case LEXER_INPUT_DOUBLE_QUOTE:
    return LEXER_STATE_STRING;

  // Numbers: a leading zero gets its own state, everything else (including a
  // minus sign) goes to the general number state.
  case LEXER_INPUT_ZERO:
    return LEXER_STATE_DECIMAL;
  case LEXER_INPUT_MINUS:
  case LEXER_INPUT_NON_ZERO:
    return LEXER_STATE_NUMBER;

  // First letter of the keywords handled by handle_true / handle_false /
  // handle_null
  case LEXER_INPUT_LOWER_T:
    return LEXER_STATE_T;
  case LEXER_INPUT_LOWER_F:
    return LEXER_STATE_F;
  case LEXER_INPUT_LOWER_N:
    return LEXER_STATE_N;

  default:
    return LEXER_STATE_ERROR;
  }
}
// Transition for LEXER_STATE_STRING.
//
// A backslash begins an escape sequence, a double quote ends the string, and
// every other input stays inside the string.
lexer_state_t handle_string(lexer_input_t input) {
  if (input == LEXER_INPUT_BACK_SLASH) {
    return LEXER_STATE_ESCAPE_SEQUENCE;
  }

  if (input == LEXER_INPUT_DOUBLE_QUOTE) {
    return LEXER_STATE_STRING_END;
  }

  return LEXER_STATE_STRING;
}
// Transition for LEXER_STATE_STRING_END. Whitespace stays in the same state.
// Otherwise the state popped from the lexer's stack decides what may follow:
// a key followed by ':' leads to LEXER_STATE_VALUE, a value followed by ','
// returns to the state now on top of the stack, a closing '}' or ']' is
// delegated to handle_collection_end, and anything else is an error.
//
// NOTE(review): `lexer` and `stack_pop` are not declared anywhere in the
// visible file, so this body cannot compile as shown -- confirm how they
// should be supplied.
// NOTE(review): `input` is a lexer_input_t but is compared with the character
// literals ':', ',', '}' and ']' -- confirm whether the LEXER_INPUT_*
// constants are intended.
// NOTE(review): handle_collection_end is called with two arguments below,
// while its prototype in this file takes only `input`.
// TODO (Abdelrahman): Figure out how to handle this
lexer_state_t handle_string_end(lexer_input_t input) {
switch (input) {
case LEXER_INPUT_WHITE_SPACE:
return LEXER_STATE_STRING_END;
}
// Pop the state that was active before the string started.
lexer->current = stack_pop(&(lexer->stack));
bool key_end = lexer->current == LEXER_STATE_KEY && input == ':';
if (key_end) {
return LEXER_STATE_VALUE;
}
bool value_end = lexer->current == LEXER_STATE_VALUE && input == ',';
if (value_end) {
// Resume whatever state is now on top of the stack (peek, no pop).
return lexer->stack.stack[lexer->stack.size - 1];
}
bool collection_end = input == '}' || input == ']';
return collection_end ? handle_collection_end(lexer, input)
: LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_ESCAPE_SEQUENCE (the input right after a
// backslash inside a string).
//
// 'u' starts a four-digit hex escape; the single-character escapes return to
// the string state; any other input is an error.
lexer_state_t handle_escape_sequence(lexer_input_t input) {
  if (input == LEXER_INPUT_LOWER_U) {
    return LEXER_STATE_UNICODE_HEX1;
  }

  switch (input) {
  case LEXER_INPUT_DOUBLE_QUOTE:
  case LEXER_INPUT_BACK_SLASH:
  case LEXER_INPUT_FORWARD_SLASH:
  case LEXER_INPUT_LOWER_B:
  case LEXER_INPUT_LOWER_F:
  case LEXER_INPUT_LOWER_N:
  case LEXER_INPUT_LOWER_R:
  case LEXER_INPUT_LOWER_T:
    return LEXER_STATE_STRING;
  default:
    return LEXER_STATE_ERROR;
  }
}
// Transition shared by the four LEXER_STATE_UNICODE_HEX* states.
//
// input:        classified input character.
// return_state: state to move to when the input is a hexadecimal digit.
//
// Returns return_state for 0-9, a-f and A-F, and LEXER_STATE_ERROR otherwise.
INTERNAL lexer_state_t handle_unicode_hex(lexer_input_t input,
                                          lexer_state_t return_state) {
  lexer_state_t next = LEXER_STATE_ERROR;

  switch (input) {
  // Decimal digits
  case LEXER_INPUT_ZERO:
  case LEXER_INPUT_NON_ZERO:
  // Lower-case hex letters
  case LEXER_INPUT_LOWER_A:
  case LEXER_INPUT_LOWER_B:
  case LEXER_INPUT_LOWER_C:
  case LEXER_INPUT_LOWER_D:
  case LEXER_INPUT_LOWER_E:
  case LEXER_INPUT_LOWER_F:
  // Upper-case hex letters
  case LEXER_INPUT_UPPER_A:
  case LEXER_INPUT_UPPER_B:
  case LEXER_INPUT_UPPER_C:
  case LEXER_INPUT_UPPER_D:
  case LEXER_INPUT_UPPER_E:
  case LEXER_INPUT_UPPER_F:
    next = return_state;
    break;
  default:
    break;
  }

  return next;
}
// Transition for LEXER_STATE_DECIMAL (entered from handle_value on a leading
// zero): only a decimal point is accepted, leading to LEXER_STATE_FRACTION.
//
// NOTE(review): this makes a zero that is followed by anything other than
// '.' (for example a comma or a closing bracket) an error -- confirm that
// this is intended.
lexer_state_t handle_decimal(lexer_input_t input) {
  return input == LEXER_INPUT_DECIMAL ? LEXER_STATE_FRACTION
                                      : LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_NUMBER (the integer part of a number).
//
// Digits stay in the number, '.' starts the fraction, whitespace ends the
// number, a closing brace/bracket is delegated to handle_collection_end, and
// a comma returns the state on top of the lexer's stack. Anything else is an
// error.
lexer_state_t handle_number(lexer_input_t input) {
  if (input == LEXER_INPUT_ZERO || input == LEXER_INPUT_NON_ZERO) {
    return LEXER_STATE_NUMBER;
  }

  if (input == LEXER_INPUT_DECIMAL) {
    return LEXER_STATE_FRACTION;
  }

  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_NUMBER_END;
  }

  if (input == LEXER_INPUT_CLOSE_BRACE || input == LEXER_INPUT_CLOSE_BRACKET) {
    return handle_collection_end(input);
  }

  if (input == LEXER_INPUT_COMMA) {
    // TODO (Abdelrahman): Figure out how to handle this
    return lexer->stack.stack[lexer->stack.size - 1];
  }

  return LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_FRACTION (the part of a number after '.').
//
// Digits stay in the fraction, 'e'/'E' starts the exponent, whitespace ends
// the number, a closing brace/bracket is delegated to handle_collection_end,
// and a comma returns the state on top of the lexer's stack. Anything else is
// an error.
lexer_state_t handle_fraction(lexer_input_t input) {
  if (input == LEXER_INPUT_ZERO || input == LEXER_INPUT_NON_ZERO) {
    return LEXER_STATE_FRACTION;
  }

  if (input == LEXER_INPUT_LOWER_E || input == LEXER_INPUT_UPPER_E) {
    return LEXER_STATE_EXPONENT;
  }

  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_NUMBER_END;
  }

  if (input == LEXER_INPUT_CLOSE_BRACE || input == LEXER_INPUT_CLOSE_BRACKET) {
    return handle_collection_end(input);
  }

  if (input == LEXER_INPUT_COMMA) {
    // TODO (Abdelrahman): Figure out how to handle this
    return lexer->stack.stack[lexer->stack.size - 1];
  }

  return LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_EXPONENT (the input right after 'e'/'E').
//
// A digit starts the power directly, a '+' or '-' sign goes through
// LEXER_STATE_EXP_SIGN first, and anything else is an error.
lexer_state_t handle_exponent(lexer_input_t input) {
  if (input == LEXER_INPUT_PLUS || input == LEXER_INPUT_MINUS) {
    return LEXER_STATE_EXP_SIGN;
  }

  if (input == LEXER_INPUT_ZERO || input == LEXER_INPUT_NON_ZERO) {
    return LEXER_STATE_POWER;
  }

  return LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_EXP_SIGN (the input right after the exponent's
// sign): a digit must follow, anything else is an error.
lexer_state_t handle_exp_sign(lexer_input_t input) {
  int is_digit = input == LEXER_INPUT_ZERO || input == LEXER_INPUT_NON_ZERO;

  return is_digit ? LEXER_STATE_POWER : LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_POWER (the digits of an exponent).
//
// Digits stay in the power, whitespace ends the number, a closing
// brace/bracket is delegated to handle_collection_end, and a comma returns
// the state on top of the lexer's stack. Anything else is an error.
lexer_state_t handle_power(lexer_input_t input) {
  if (input == LEXER_INPUT_ZERO || input == LEXER_INPUT_NON_ZERO) {
    return LEXER_STATE_POWER;
  }

  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_NUMBER_END;
  }

  if (input == LEXER_INPUT_CLOSE_BRACE || input == LEXER_INPUT_CLOSE_BRACKET) {
    return handle_collection_end(input);
  }

  if (input == LEXER_INPUT_COMMA) {
    // TODO (Abdelrahman): Figure out how to handle this
    return lexer->stack.stack[lexer->stack.size - 1];
  }

  return LEXER_STATE_ERROR;
}
// Transition for LEXER_STATE_NUMBER_END (whitespace seen after a number).
//
// Further whitespace stays in this state, a closing brace/bracket is
// delegated to handle_collection_end, and a comma returns the state on top of
// the lexer's stack. Anything else is an error.
lexer_state_t handle_number_end(lexer_input_t input) {
  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_NUMBER_END;
  }

  if (input == LEXER_INPUT_CLOSE_BRACE || input == LEXER_INPUT_CLOSE_BRACKET) {
    return handle_collection_end(input);
  }

  if (input == LEXER_INPUT_COMMA) {
    // TODO (Abdelrahman): Figure out how to handle this
    return lexer->stack.stack[lexer->stack.size - 1];
  }

  return LEXER_STATE_ERROR;
}
// Transition for the states spelling the keyword "true".
//
// input:       classified input character.
// start_state: the keyword state the lexer is currently in.
//
// From T, TR and TRU the next letter ('r', 'u', 'e') advances one state and
// any other input is an error. From TRUE the result is
// LEXER_STATE_KEYWORD_END regardless of the input. Any other start_state is
// an error.
lexer_state_t handle_true(lexer_input_t input, lexer_state_t start_state) {
  lexer_input_t expected;
  lexer_state_t next;

  switch (start_state) {
  case LEXER_STATE_T:
    expected = LEXER_INPUT_LOWER_R;
    next = LEXER_STATE_TR;
    break;
  case LEXER_STATE_TR:
    expected = LEXER_INPUT_LOWER_U;
    next = LEXER_STATE_TRU;
    break;
  case LEXER_STATE_TRU:
    expected = LEXER_INPUT_LOWER_E;
    next = LEXER_STATE_TRUE;
    break;
  case LEXER_STATE_TRUE:
    // Keyword complete: the input is not inspected.
    return LEXER_STATE_KEYWORD_END;
  default:
    return LEXER_STATE_ERROR;
  }

  if (input != expected) {
    return LEXER_STATE_ERROR;
  }

  return next;
}
// Transition for the states spelling the keyword "false".
//
// input:       classified input character.
// start_state: the keyword state the lexer is currently in.
//
// From F, FA, FAL and FALS the next letter ('a', 'l', 's', 'e') advances one
// state and any other input is an error. From FALSE the result is
// LEXER_STATE_KEYWORD_END regardless of the input. Any other start_state is
// an error.
lexer_state_t handle_false(lexer_input_t input, lexer_state_t start_state) {
  lexer_input_t expected;
  lexer_state_t next;

  switch (start_state) {
  case LEXER_STATE_F:
    expected = LEXER_INPUT_LOWER_A;
    next = LEXER_STATE_FA;
    break;
  case LEXER_STATE_FA:
    expected = LEXER_INPUT_LOWER_L;
    next = LEXER_STATE_FAL;
    break;
  case LEXER_STATE_FAL:
    expected = LEXER_INPUT_LOWER_S;
    next = LEXER_STATE_FALS;
    break;
  case LEXER_STATE_FALS:
    expected = LEXER_INPUT_LOWER_E;
    next = LEXER_STATE_FALSE;
    break;
  case LEXER_STATE_FALSE:
    // Keyword complete: the input is not inspected.
    return LEXER_STATE_KEYWORD_END;
  default:
    return LEXER_STATE_ERROR;
  }

  if (input != expected) {
    return LEXER_STATE_ERROR;
  }

  return next;
}
// Transition for the states spelling the keyword "null".
//
// input:       classified input character.
// start_state: the keyword state the lexer is currently in.
//
// From N, NU and NUL the next letter ('u', 'l', 'l') advances one state and
// any other input is an error. From NULL the result is
// LEXER_STATE_KEYWORD_END regardless of the input. Any other start_state is
// an error.
lexer_state_t handle_null(lexer_input_t input, lexer_state_t start_state) {
  lexer_input_t expected;
  lexer_state_t next;

  switch (start_state) {
  case LEXER_STATE_N:
    expected = LEXER_INPUT_LOWER_U;
    next = LEXER_STATE_NU;
    break;
  case LEXER_STATE_NU:
    expected = LEXER_INPUT_LOWER_L;
    next = LEXER_STATE_NUL;
    break;
  case LEXER_STATE_NUL:
    expected = LEXER_INPUT_LOWER_L;
    next = LEXER_STATE_NULL;
    break;
  case LEXER_STATE_NULL:
    // Keyword complete: the input is not inspected.
    return LEXER_STATE_KEYWORD_END;
  default:
    return LEXER_STATE_ERROR;
  }

  if (input != expected) {
    return LEXER_STATE_ERROR;
  }

  return next;
}
// Transition for LEXER_STATE_KEYWORD_END (after a complete keyword).
//
// Whitespace stays in this state, a closing brace/bracket is delegated to
// handle_collection_end, and a comma returns the state on top of the lexer's
// stack. Anything else is an error.
lexer_state_t handle_keyword_end(lexer_input_t input) {
  if (input == LEXER_INPUT_WHITE_SPACE) {
    return LEXER_STATE_KEYWORD_END;
  }

  if (input == LEXER_INPUT_CLOSE_BRACE || input == LEXER_INPUT_CLOSE_BRACKET) {
    return handle_collection_end(input);
  }

  if (input == LEXER_INPUT_COMMA) {
    // TODO (Abdelrahman): Figure out how to handle this
    return lexer->stack.stack[lexer->stack.size - 1];
  }

  return LEXER_STATE_ERROR;
}

3
mk_table_generator Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Builds the state-table generator from generate_state_table.c into an
# executable named "gentable", with include/ and include/lexer/ on the header
# search path.
clang \
  -Iinclude \
  -Iinclude/lexer \
  generate_state_table.c \
  -o gentable