Start implementing using the state transition table in the lexer

This commit is contained in:
Abdelrahman Said 2023-07-30 11:47:26 +01:00
parent de57d9f14b
commit ec6df32839
2 changed files with 118 additions and 72 deletions

View File

@ -4,9 +4,6 @@
#include "aliases.h"
#include <stdbool.h>
#define VALID_JSON true
#define INVALID_JSON false
typedef const char *str_view_t;
typedef enum {

View File

@ -1,6 +1,7 @@
#include "lexer.h"
#include "aliases.h"
#include "dstring.h"
#include "lexer_data.h"
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
@ -13,43 +14,6 @@
#define MAX_STACK_CAPACITY 1024
#define STRING_BUF_START_CAPACITY 1024
// Every state the lexer state machine can be in, grouped below by the kind of
// JSON construct being read. Enumerators take consecutive values starting at 0.
typedef enum {
// GENERAL STATES
LEXER_STATE_START,
LEXER_STATE_ERROR,
LEXER_STATE_VALUE,
// COLLECTION STATES
LEXER_STATE_OBJECT_START,
LEXER_STATE_OBJECT,
LEXER_STATE_OBJECT_END,
LEXER_STATE_ARRAY_START,
LEXER_STATE_ARRAY,
LEXER_STATE_ARRAY_END,
LEXER_STATE_LAST_COLLECTION,
// OBJECT STATES
LEXER_STATE_KEY,
// NUMBER STATES
LEXER_STATE_DECIMAL,
LEXER_STATE_NUMBER,
LEXER_STATE_FRACTION,
LEXER_STATE_EXPONENT,
LEXER_STATE_EXP_SIGN,
LEXER_STATE_POWER,
LEXER_STATE_NUMBER_END,
// STRING STATES
LEXER_STATE_STRING,
LEXER_STATE_STRING_END,
LEXER_STATE_ESCAPE_SEQUENCE,
LEXER_STATE_UNICODE_HEX,
// KEYWORD STATES
LEXER_STATE_TRUE,
LEXER_STATE_FALSE,
LEXER_STATE_NULL,
LEXER_STATE_KEYWORD_END,
// Not a state: as the last enumerator, its value equals the number of states
// declared above it. Keep it last.
COUNT_LEXER_STATES,
} lexer_state_t;
typedef struct {
lexer_state_t stack[MAX_STACK_CAPACITY];
u64 size;
@ -84,6 +48,7 @@ struct lexer_s {
u64 text_length;
const char *text;
lexer_state_t current;
lexer_state_t next;
state_stack_t stack;
lexer_string_t keyword;
lexer_string_t codepoint;
@ -95,9 +60,8 @@ struct lexer_s {
dstr_t *error_message;
};
#if 0
INTERNAL lexer_input_t char_type(char input);
#endif
INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value);
INTERNAL lexer_state_t stack_pop(state_stack_t *stack);
@ -111,31 +75,13 @@ INTERNAL token_t dstr_to_numerical_token(const dstr_t *str);
INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type,
token_value_t value);
INTERNAL void lexer_state_machine(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_lexer_start(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_last_collection(char input);
INTERNAL lexer_state_t handle_collection_end(lexer_t *lexer, char input);
INTERNAL void handle_input_after_collection_end(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_object(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_array(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_key(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_value(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_string(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_string_end(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_escape_sequence(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_decimal(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_number(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_fraction(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_exponent(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_exp_sign(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_power(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_number_end(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_keyword(char input);
INTERNAL lexer_state_t handle_true(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_false(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_null(lexer_t *lexer, char input);
INTERNAL lexer_state_t handle_keyword_end(lexer_t *lexer, char input);
INTERNAL void finalise_state_transition(lexer_t *lexer);
INTERNAL void post_keyword(lexer_t *lexer);
INTERNAL void set_numerical_token(lexer_t *lexer);
// State transition table: state_table[state][input] is the state to move to
// when `input` (a lexer_input_t character class) is read while in `state`.
// The rows are supplied by the included lexer_state_transitions.table file.
INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
#include "lexer_state_transitions.table"
};
void lexer_init(lexer_t **lexer) {
if (*lexer) {
@ -154,6 +100,7 @@ void lexer_init(lexer_t **lexer) {
(*lexer)->text_length = 0;
(*lexer)->text = "";
(*lexer)->current = LEXER_STATE_START;
(*lexer)->next = LEXER_STATE_START;
(*lexer)->keyword.type = LEXER_STRING_KEYWORD;
(*lexer)->codepoint.type = LEXER_STRING_UNICODE;
(*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY);
@ -203,9 +150,13 @@ lex_result_t get_next_token(lexer_t *lexer, const char *text) {
c = lexer->text[(lexer->cursor)++];
lexer_state_machine(lexer, c);
lexer_input_t current_input = char_type(c);
if (c == '\n') {
lexer->next = state_table[lexer->current][current_input];
finalise_state_transition(lexer);
if (current_input == LEXER_INPUT_NEWLINE) {
++(lexer->line);
lexer->column = 0;
} else {
@ -301,9 +252,10 @@ void print_token(token_t token) {
printf("}\n");
}
#if 0
INTERNAL lexer_input_t char_type(char input) {
if (isspace(input)) {
if (input == '\n') {
return LEXER_INPUT_NEWLINE;
} else if (isspace(input)) {
return LEXER_INPUT_WHITE_SPACE;
} else if (input >= '1' && input <= '9') {
return LEXER_INPUT_NON_ZERO;
@ -376,7 +328,6 @@ INTERNAL lexer_input_t char_type(char input) {
return LEXER_INPUT_OTHER;
}
}
#endif
void stack_push(state_stack_t *stack, lexer_state_t state) {
if (stack->size + 1 >= MAX_STACK_CAPACITY) {
@ -483,6 +434,104 @@ void set_token(token_t *token, u64 line, u64 column, token_type type,
};
}
/*
 * Commits the pending transition held in lexer->next.
 *
 * Runs whatever side effect is attached to the target state (pushing onto the
 * collection stack, producing a keyword or numerical token), possibly
 * redirects lexer->next to a different state, and finally copies lexer->next
 * into lexer->current.
 *
 * lexer: lexer whose `next` field has already been set to the target state.
 */
void finalise_state_transition(lexer_t *lexer) {
  switch (lexer->next) {
  case LEXER_STATE_OBJECT_START:
    // Record the enclosing collection so that a later VALUE_END can return
    // to it, then continue inside the object.
    stack_push(&(lexer->stack), LEXER_STATE_OBJECT);
    lexer->next = LEXER_STATE_OBJECT;
    break;
  case LEXER_STATE_ARRAY_START:
    stack_push(&(lexer->stack), LEXER_STATE_ARRAY);
    lexer->next = LEXER_STATE_ARRAY;
    break;
  case LEXER_STATE_TRUE:
  case LEXER_STATE_FALSE:
  case LEXER_STATE_NULL:
    post_keyword(lexer);
    break;
  case LEXER_STATE_VALUE_END:
    // A number has no closing delimiter, so its token is produced when the
    // value ends, based on the state we are leaving.
    switch (lexer->current) {
    case LEXER_STATE_NUMBER:
    case LEXER_STATE_FRACTION:
    case LEXER_STATE_POWER:
    case LEXER_STATE_NUMBER_END:
      set_numerical_token(lexer);
      break;
    default:
      break;
    }

    // Resume the enclosing collection. With an empty stack there is nothing
    // to resume, and `size - 1` would wrap around and index far outside the
    // array, so treat that case as a lexing error instead.
    if (lexer->stack.size == 0) {
      lexer->next = LEXER_STATE_ERROR;
    } else {
      lexer->next = lexer->stack.stack[lexer->stack.size - 1];
    }
    break;
  case LEXER_STATE_NUMBER_END:
    set_numerical_token(lexer);
    break;
  default:
    // Every other target state needs no extra work.
    break;
  }

  lexer->current = lexer->next;
}
/*
 * Produces the token for the keyword identified by lexer->current.
 *
 * LEXER_STATE_NULL yields a TK_NULL token, LEXER_STATE_TRUE and
 * LEXER_STATE_FALSE yield a TK_BOOL token carrying the matching boolean. The
 * token column is the lexer column minus the keyword length. On success the
 * token is flagged as ready and lexer->current becomes
 * LEXER_STATE_KEYWORD_END; for any other state lexer->current becomes
 * LEXER_STATE_ERROR and no token is produced.
 *
 * NOTE(review): finalise_state_transition assigns lexer->next to
 * lexer->current right after calling this function, so the states written
 * here do not survive that call path - confirm whether lexer->next was meant.
 */
void post_keyword(lexer_t *lexer) {
  const lexer_state_t finished = lexer->current;
  token_type type;
  token_value_t value;
  u64 length;

  if (finished == LEXER_STATE_NULL) {
    length = 4;
    type = TK_NULL;
    value = (token_value_t){0};
  } else if (finished == LEXER_STATE_TRUE) {
    length = 4;
    type = TK_BOOL;
    value = (token_value_t){.boolean = true};
  } else if (finished == LEXER_STATE_FALSE) {
    length = 5;
    type = TK_BOOL;
    value = (token_value_t){.boolean = false};
  } else {
    // Not a keyword state: nothing to emit.
    lexer->current = LEXER_STATE_ERROR;
    return;
  }

  // The keyword starts `length` characters before the current column.
  set_token(&(lexer->token), lexer->line, lexer->column - length, type, value);

  lexer->token_ready = true;
  lexer->current = LEXER_STATE_KEYWORD_END;
}
/*
 * Converts the text accumulated in lexer->current_string into a numerical
 * token, stores it in lexer->token and flags the token as ready.
 *
 * The token column is the lexer column minus the length of the accumulated
 * text.
 */
void set_numerical_token(lexer_t *lexer) {
  const u64 start_column =
      lexer->column - dstr_length(lexer->current_string);
  const token_t parsed = dstr_to_numerical_token(lexer->current_string);

  set_token(&(lexer->token), lexer->line, start_column, parsed.type,
            parsed.value);
  lexer->token_ready = true;
}
void lexer_state_machine(lexer_t *lexer, char input) {
switch (lexer->current) {
case LEXER_STATE_START: