Start implementing using the state transition table in the lexer
This commit is contained in:
parent
de57d9f14b
commit
ec6df32839
@ -4,9 +4,6 @@
|
||||
#include "aliases.h"
|
||||
#include <stdbool.h>
|
||||
|
||||
#define VALID_JSON true
|
||||
#define INVALID_JSON false
|
||||
|
||||
typedef const char *str_view_t;
|
||||
|
||||
typedef enum {
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include "lexer.h"
|
||||
#include "aliases.h"
|
||||
#include "dstring.h"
|
||||
#include "lexer_data.h"
|
||||
#include <assert.h>
|
||||
#include <ctype.h>
|
||||
#include <stdbool.h>
|
||||
@ -13,43 +14,6 @@
|
||||
#define MAX_STACK_CAPACITY 1024
|
||||
#define STRING_BUF_START_CAPACITY 1024
|
||||
|
||||
/*
 * States of the lexer's state machine.
 *
 * Enumerators are numbered consecutively from 0 in declaration order, and
 * COUNT_LEXER_STATES is deliberately kept last so that it equals the number
 * of real states; it is used as the first dimension of state_table.
 */
typedef enum {
  /* General states */
  LEXER_STATE_START = 0,
  LEXER_STATE_ERROR,
  LEXER_STATE_VALUE,

  /* Collection states */
  LEXER_STATE_OBJECT_START,
  LEXER_STATE_OBJECT,
  LEXER_STATE_OBJECT_END,
  LEXER_STATE_ARRAY_START,
  LEXER_STATE_ARRAY,
  LEXER_STATE_ARRAY_END,
  LEXER_STATE_LAST_COLLECTION,

  /* Object states */
  LEXER_STATE_KEY,

  /* Number states */
  LEXER_STATE_DECIMAL,
  LEXER_STATE_NUMBER,
  LEXER_STATE_FRACTION,
  LEXER_STATE_EXPONENT,
  LEXER_STATE_EXP_SIGN,
  LEXER_STATE_POWER,
  LEXER_STATE_NUMBER_END,

  /* String states */
  LEXER_STATE_STRING,
  LEXER_STATE_STRING_END,
  LEXER_STATE_ESCAPE_SEQUENCE,
  LEXER_STATE_UNICODE_HEX,

  /* Keyword states */
  LEXER_STATE_TRUE,
  LEXER_STATE_FALSE,
  LEXER_STATE_NULL,
  LEXER_STATE_KEYWORD_END,

  /* Number of states above; must remain the final enumerator. */
  COUNT_LEXER_STATES,
} lexer_state_t;
|
||||
|
||||
typedef struct {
|
||||
lexer_state_t stack[MAX_STACK_CAPACITY];
|
||||
u64 size;
|
||||
@ -84,6 +48,7 @@ struct lexer_s {
|
||||
u64 text_length;
|
||||
const char *text;
|
||||
lexer_state_t current;
|
||||
lexer_state_t next;
|
||||
state_stack_t stack;
|
||||
lexer_string_t keyword;
|
||||
lexer_string_t codepoint;
|
||||
@ -95,9 +60,8 @@ struct lexer_s {
|
||||
dstr_t *error_message;
|
||||
};
|
||||
|
||||
#if 0
|
||||
INTERNAL lexer_input_t char_type(char input);
|
||||
#endif
|
||||
|
||||
INTERNAL void stack_push(state_stack_t *stack, lexer_state_t value);
|
||||
INTERNAL lexer_state_t stack_pop(state_stack_t *stack);
|
||||
|
||||
@ -111,31 +75,13 @@ INTERNAL token_t dstr_to_numerical_token(const dstr_t *str);
|
||||
INTERNAL void set_token(token_t *token, u64 line, u64 column, token_type type,
|
||||
token_value_t value);
|
||||
|
||||
INTERNAL void lexer_state_machine(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_lexer_start(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_last_collection(char input);
|
||||
INTERNAL lexer_state_t handle_collection_end(lexer_t *lexer, char input);
|
||||
INTERNAL void handle_input_after_collection_end(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_object(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_array(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_key(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_value(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_string(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_string_end(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_escape_sequence(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_unicode_sequence(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_decimal(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_number(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_fraction(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_exponent(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_exp_sign(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_power(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_number_end(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_keyword(char input);
|
||||
INTERNAL lexer_state_t handle_true(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_false(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_null(lexer_t *lexer, char input);
|
||||
INTERNAL lexer_state_t handle_keyword_end(lexer_t *lexer, char input);
|
||||
INTERNAL void finalise_state_transition(lexer_t *lexer);
|
||||
INTERNAL void post_keyword(lexer_t *lexer);
|
||||
INTERNAL void set_numerical_token(lexer_t *lexer);
|
||||
|
||||
INTERNAL lexer_state_t state_table[COUNT_LEXER_STATES][COUNT_LEXER_INPUTS] = {
|
||||
#include "lexer_state_transitions.table"
|
||||
};
|
||||
|
||||
void lexer_init(lexer_t **lexer) {
|
||||
if (*lexer) {
|
||||
@ -154,6 +100,7 @@ void lexer_init(lexer_t **lexer) {
|
||||
(*lexer)->text_length = 0;
|
||||
(*lexer)->text = "";
|
||||
(*lexer)->current = LEXER_STATE_START;
|
||||
(*lexer)->next = LEXER_STATE_START;
|
||||
(*lexer)->keyword.type = LEXER_STRING_KEYWORD;
|
||||
(*lexer)->codepoint.type = LEXER_STRING_UNICODE;
|
||||
(*lexer)->current_string = dstr_with_capacity(STRING_BUF_START_CAPACITY);
|
||||
@ -203,9 +150,13 @@ lex_result_t get_next_token(lexer_t *lexer, const char *text) {
|
||||
|
||||
c = lexer->text[(lexer->cursor)++];
|
||||
|
||||
lexer_state_machine(lexer, c);
|
||||
lexer_input_t current_input = char_type(c);
|
||||
|
||||
if (c == '\n') {
|
||||
lexer->next = state_table[lexer->current][current_input];
|
||||
|
||||
finalise_state_transition(lexer);
|
||||
|
||||
if (current_input == LEXER_INPUT_NEWLINE) {
|
||||
++(lexer->line);
|
||||
lexer->column = 0;
|
||||
} else {
|
||||
@ -301,9 +252,10 @@ void print_token(token_t token) {
|
||||
printf("}\n");
|
||||
}
|
||||
|
||||
#if 0
|
||||
INTERNAL lexer_input_t char_type(char input) {
|
||||
if (isspace(input)) {
|
||||
if (input == '\n') {
|
||||
return LEXER_INPUT_NEWLINE;
|
||||
} else if (isspace(input)) {
|
||||
return LEXER_INPUT_WHITE_SPACE;
|
||||
} else if (input >= '1' && input <= '9') {
|
||||
return LEXER_INPUT_NON_ZERO;
|
||||
@ -376,7 +328,6 @@ INTERNAL lexer_input_t char_type(char input) {
|
||||
return LEXER_INPUT_OTHER;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void stack_push(state_stack_t *stack, lexer_state_t state) {
|
||||
if (stack->size + 1 >= MAX_STACK_CAPACITY) {
|
||||
@ -483,6 +434,104 @@ void set_token(token_t *token, u64 line, u64 column, token_type type,
|
||||
};
|
||||
}
|
||||
|
||||
/*
 * Runs the side effects attached to the pending transition stored in
 * lexer->next, then commits it by copying lexer->next into lexer->current.
 *
 *  - OBJECT_START / ARRAY_START: push the matching collection state onto the
 *    lexer's stack and redirect the transition to that collection state.
 *  - TRUE / FALSE / NULL: delegate to post_keyword().
 *  - VALUE_END: emit a numerical token when the state being left is one of
 *    the number states, then continue in the state stored at index
 *    size - 1 of the lexer's stack.
 *  - NUMBER_END: emit a numerical token.
 *
 * Every other pending state is committed unchanged.
 */
void finalise_state_transition(lexer_t *lexer) {
  state_stack_t *states = &(lexer->stack);
  const lexer_state_t pending = lexer->next;

  if (pending == LEXER_STATE_OBJECT_START) {
    stack_push(states, LEXER_STATE_OBJECT);
    lexer->next = LEXER_STATE_OBJECT;
  } else if (pending == LEXER_STATE_ARRAY_START) {
    stack_push(states, LEXER_STATE_ARRAY);
    lexer->next = LEXER_STATE_ARRAY;
  } else if (pending == LEXER_STATE_TRUE || pending == LEXER_STATE_FALSE ||
             pending == LEXER_STATE_NULL) {
    // NOTE(review): post_keyword() writes lexer->current, and the final
    // assignment in this function overwrites that value with lexer->next.
    // Confirm this is intended.
    post_keyword(lexer);
  } else if (pending == LEXER_STATE_VALUE_END) {
    const lexer_state_t leaving = lexer->current;
    const bool leaving_number = leaving == LEXER_STATE_NUMBER ||
                                leaving == LEXER_STATE_FRACTION ||
                                leaving == LEXER_STATE_POWER ||
                                leaving == LEXER_STATE_NUMBER_END;

    if (leaving_number) {
      set_numerical_token(lexer);
    }

    // NOTE(review): the index size - 1 is computed without checking that the
    // stack holds at least one entry. Confirm callers guarantee a non-empty
    // stack whenever VALUE_END is reached.
    lexer->next = states->stack[states->size - 1];
  } else if (pending == LEXER_STATE_NUMBER_END) {
    set_numerical_token(lexer);
  }

  // Commit the (possibly redirected) transition.
  lexer->current = lexer->next;
}
|
||||
|
||||
/*
 * Emits the token for a completed keyword, selected by lexer->current:
 *
 *   LEXER_STATE_NULL  -> TK_NULL with a zeroed value
 *   LEXER_STATE_TRUE  -> TK_BOOL with .boolean = true
 *   LEXER_STATE_FALSE -> TK_BOOL with .boolean = false
 *
 * On success the token is written to lexer->token with the lexer's current
 * line and the keyword's starting column, lexer->token_ready is set and
 * lexer->current becomes LEXER_STATE_KEYWORD_END.
 *
 * For any other current state, lexer->current is set to LEXER_STATE_ERROR
 * and no token is produced.
 */
void post_keyword(lexer_t *lexer) {
  u64 keyword_char_count = 0;
  token_type type;
  token_value_t value;

  switch (lexer->current) {
  case LEXER_STATE_NULL:
    keyword_char_count = sizeof("null") - 1;
    type = TK_NULL;
    value = (token_value_t){0};

    break;
  case LEXER_STATE_TRUE:
    keyword_char_count = sizeof("true") - 1;
    type = TK_BOOL;
    value = (token_value_t){.boolean = true};

    break;
  case LEXER_STATE_FALSE:
    keyword_char_count = sizeof("false") - 1;
    type = TK_BOOL;
    value = (token_value_t){.boolean = false};

    break;
  default:
    lexer->current = LEXER_STATE_ERROR;

    return;
  }

  // The column arithmetic is done in u64 (presumably unsigned 64-bit, going
  // by the alias name). Subtracting a keyword length larger than the current
  // column would wrap around to a huge bogus column, so clamp at 0 instead.
  u64 column = 0;
  if (lexer->column >= keyword_char_count) {
    column = lexer->column - keyword_char_count;
  }

  set_token(&(lexer->token), lexer->line, column, type, value);

  lexer->token_ready = true;

  lexer->current = LEXER_STATE_KEYWORD_END;
}
|
||||
|
||||
/*
 * Flags a token as ready and fills lexer->token with the numerical token
 * obtained by converting lexer->current_string through
 * dstr_to_numerical_token().
 *
 * The token is placed on the lexer's current line. Its column is the
 * lexer's current column minus the length of the buffered string, clamped
 * at 0.
 */
void set_numerical_token(lexer_t *lexer) {
  lexer->token_ready = true;

  // The subtraction is done in u64 (presumably unsigned 64-bit, going by the
  // alias name). A buffered string longer than the current column would wrap
  // around to a huge bogus column, so clamp at 0 instead.
  u64 length = dstr_length(lexer->current_string);
  u64 column = 0;
  if (lexer->column >= length) {
    column = lexer->column - length;
  }

  token_t token = dstr_to_numerical_token(lexer->current_string);

  set_token(&(lexer->token), lexer->line, column, token.type, token.value);
}
|
||||
|
||||
void lexer_state_machine(lexer_t *lexer, char input) {
|
||||
switch (lexer->current) {
|
||||
case LEXER_STATE_START:
|
||||
|
Loading…
Reference in New Issue
Block a user