Separated lexing logic further. Added file for lexical analysis functions

This commit is contained in:
rexy712 2020-03-17 15:59:51 -07:00
parent 21e4583827
commit 8e454b3d5a
4 changed files with 343 additions and 254 deletions

View File

@ -17,7 +17,7 @@ option(BUILD_TESTS "Build test programs" OFF)
option(ENABLE_PROFILING "Enable asan" OFF) option(ENABLE_PROFILING "Enable asan" OFF)
mark_as_advanced(ENABLE_PROFILING) mark_as_advanced(ENABLE_PROFILING)
set(SOURCE_LIST "src/rjp_ordered_object.c" "src/rjp_unordered_object.c" "src/input.c" "src/output.c" "src/rjp_array.c" "src/rjp.c" "src/rjp_object.c" "src/rjp_string.c" "src/tree.c") set(SOURCE_LIST "src/rjp_lex.c" "src/rjp_ordered_object.c" "src/rjp_unordered_object.c" "src/rjp_parse.c" "src/output.c" "src/rjp_array.c" "src/rjp.c" "src/rjp_object.c" "src/rjp_string.c" "src/tree.c")
if(ENABLE_SHARED) if(ENABLE_SHARED)
add_library(rjp SHARED ${SOURCE_LIST}) add_library(rjp SHARED ${SOURCE_LIST})
set_target_properties(rjp PROPERTIES SOVERSION "${rjp_VERSION_MAJOR}.${rjp_VERSION_MINOR}.${rjp_VERSION_REVISION}") set_target_properties(rjp PROPERTIES SOVERSION "${rjp_VERSION_MAJOR}.${rjp_VERSION_MINOR}.${rjp_VERSION_REVISION}")

59
include/rjp_lex.h Normal file
View File

@ -0,0 +1,59 @@
#ifndef RJP_LEX_H
#define RJP_LEX_H
#include "rjp.h"
#define rjp_lex_accept 1
/*
 * States of the lexer's state machine. The state the machine stops in is also
 * the category irjp_lex() reports for the token it just scanned.
 * NOTE(review): the values with bit 0 set look like the accepting (complete
 * token) states, matching the rjp_lex_accept mask defined above - confirm
 * against the code that consumes these values before renumbering anything.
 */
typedef enum RJP_lex_category{
rjp_lex_start = 0, //nothing consumed yet
rjp_lex_obracket = 3, //'['
rjp_lex_obrace = 5, //'{'
rjp_lex_cbracket = 7, //']'
rjp_lex_cbrace = 9, //'}'
rjp_lex_spaces = 11, //run of ' ', '\t', '\v', '\f'
rjp_lex_quote = 12, //inside a string, closing '"' not seen yet
rjp_lex_t = 14, //partial match of "true"
rjp_lex_tr = 16,
rjp_lex_tru = 18,
rjp_lex_true = 19, //complete "true"
rjp_lex_f = 20, //partial match of "false"
rjp_lex_fa = 22,
rjp_lex_fal = 24,
rjp_lex_fals = 26,
rjp_lex_false = 27, //complete "false"
rjp_lex_n = 28, //partial match of "null"
rjp_lex_nu = 30,
rjp_lex_nul = 32,
rjp_lex_null = 33, //complete "null"
rjp_lex_escaped = 34, //just consumed a backslash inside a string
rjp_lex_string = 35, //string including its closing '"'
rjp_lex_comma = 37, //','
rjp_lex_colon = 39, //':'
rjp_lex_number = 41, //integer digits
rjp_lex_decimal = 42, //integer digits followed by '.'
rjp_lex_fnumber = 43, //digits after the '.'
rjp_lex_fnum_e = 44, //fractional number followed by 'e' or 'E'
rjp_lex_sci_num = 45, //exponent digits
rjp_lex_slash = 46, //a single '/', possible start of a comment
rjp_lex_line_comment = 47, //inside a "//" comment
rjp_lex_signed_number = 49, //a leading '+' or '-'
rjp_lex_sci_num_signed = 51, //sign following the exponent marker
rjp_lex_newlines = 53, //'\n' or '\r'
rjp_lex_block_comment_start = 54, //inside a "/*" comment
rjp_lex_block_comment_end1 = 56, //saw '*' inside a block comment
rjp_lex_block_comment = 57, //block comment including its closing "*/"
rjp_lex_invalid = 1000, //no transition exists for the character
rjp_lex_unrecognized_word = 1002, //alphanumeric run that is not a known token
rjp_lex_end = 1004, //terminating NUL reached at the start of a token
}RJP_lex_category;
/*
 * Cursor into the text being lexed. irjp_lex() reads and updates every field
 * except str on each call.
 */
typedef struct RJP_lex_state{
const char* str; //text being lexed; irjp_lex() stops at its terminating NUL
RJP_lex_category node; //current state of the lexer state machine
RJP_index length; //number of characters in the most recently scanned token
RJP_index offset; //index into str where the most recently scanned token starts
}RJP_lex_state;
/*
 * Scan the next token of state->str. Moves state->offset past the previously
 * scanned token, stores the new token's length in state->length and returns the
 * state the machine stopped in. state->node is reset to rjp_lex_start on return.
 */
RJP_lex_category irjp_lex(RJP_lex_state* state);
#endif

281
src/rjp_lex.c Normal file
View File

@ -0,0 +1,281 @@
#include "rjp_lex.h"
#include "rjp.h"
#include <ctype.h> //isalpha, etc
//Finish a token: put the state machine back on its start node so the next
//scan begins fresh, and pass the reached category through to the caller
static RJP_lex_category irjp_lex_accept(RJP_lex_category val, RJP_lex_state* state){
	const RJP_lex_category reached = val;
	state->node = rjp_lex_start;
	return reached;
}
//Return 1 when ch is a space, tab, form feed or vertical tab, otherwise 0.
//'\n' and '\r' are not matched here.
static int irjp_is_space(char ch){
	if(ch == ' ' || ch == '\t' || ch == '\f' || ch == '\v')
		return 1;
	return 0;
}
//Transition out of the start node: classify the first character of a token.
//Returns rjp_lex_invalid for any character no token can begin with.
static inline RJP_lex_category irjp_lex_do_start(char ch){
	//the decimal digits are guaranteed to be contiguous in the execution character set
	if(ch >= '0' && ch <= '9')
		return rjp_lex_number;

	switch(ch){
	case 0:    return rjp_lex_end;
	//structural characters
	case '{':  return rjp_lex_obrace;
	case '}':  return rjp_lex_cbrace;
	case '[':  return rjp_lex_obracket;
	case ']':  return rjp_lex_cbracket;
	case ',':  return rjp_lex_comma;
	case ':':  return rjp_lex_colon;
	//strings and comments
	case '"':  return rjp_lex_quote;
	case '/':  return rjp_lex_slash;
	//first letter of true, false, null
	case 't':  return rjp_lex_t;
	case 'f':  return rjp_lex_f;
	case 'n':  return rjp_lex_n;
	//number sign
	case '-':
	case '+':
		return rjp_lex_signed_number;
	//whitespace; line breaks get their own category
	case '\r':
	case '\n':
		return rjp_lex_newlines;
	case ' ':
	case '\t':
	case '\f':
	case '\v':
		return rjp_lex_spaces;
	default:
		return rjp_lex_invalid;
	}
}
//Inside a run of horizontal whitespace: keep going while ch is still
//whitespace, otherwise there is no transition and the run ends
static inline RJP_lex_category irjp_lex_do_spaces(char ch){
	return irjp_is_space(ch) ? rjp_lex_spaces : rjp_lex_invalid;
}
//Transition out of the integer-digits state.
//  digit  -> stay in rjp_lex_number
//  '.'    -> rjp_lex_decimal
//  letter -> rjp_lex_unrecognized_word
//  other  -> rjp_lex_invalid (no transition; the number ends here)
static inline RJP_lex_category irjp_lex_do_number(char ch){
	//<ctype.h> functions require a value representable as unsigned char (or EOF);
	//passing a negative plain char (any byte >= 0x80 where char is signed) is undefined
	const unsigned char uch = (unsigned char)ch;

	if(isdigit(uch))
		return rjp_lex_number;
	if(ch == '.')
		return rjp_lex_decimal;
	if(isalpha(uch))
		return rjp_lex_unrecognized_word;
	return rjp_lex_invalid;
}
//Transition out of the state reached after a leading '+' or '-'.
//Everything except another sign is handled exactly like the digit state.
//NOTE(review): a second sign directly after the first moves to rjp_lex_number,
//so input such as "+-" is categorised as a number - confirm this is intended.
static inline RJP_lex_category irjp_lex_do_signed_num(char ch){
	if(ch == '-' || ch == '+')
		return rjp_lex_number;
	return irjp_lex_do_number(ch);
}
//Transition out of the state reached after the '.' of a number: only a digit
//continues the token (into rjp_lex_fnumber), anything else has no transition
static inline RJP_lex_category irjp_lex_do_decimal(char ch){
	//cast first: handing a negative plain char to isdigit is undefined behaviour
	if(isdigit((unsigned char)ch))
		return rjp_lex_fnumber;
	return rjp_lex_invalid;
}
//Transition out of the fractional-digits state.
//  digit        -> stay in rjp_lex_fnumber
//  'e' or 'E'   -> rjp_lex_fnum_e (start of an exponent)
//  other letter -> rjp_lex_unrecognized_word
//  other        -> rjp_lex_invalid (no transition; the number ends here)
static inline RJP_lex_category irjp_lex_do_fnumber(char ch){
	//<ctype.h> functions are only defined for unsigned char values and EOF
	const unsigned char uch = (unsigned char)ch;

	if(isdigit(uch))
		return rjp_lex_fnumber;
	if(ch == 'e' || ch == 'E')
		return rjp_lex_fnum_e;
	if(isalpha(uch))
		return rjp_lex_unrecognized_word;
	return rjp_lex_invalid;
}
//Transition out of the state reached after the exponent marker: a sign leads
//to rjp_lex_sci_num_signed, a digit straight to rjp_lex_sci_num, and anything
//else has no transition
static inline RJP_lex_category irjp_lex_do_fnum_e(char ch){
	if(ch == '-' || ch == '+')
		return rjp_lex_sci_num_signed;
	//cast first: handing a negative plain char to isdigit is undefined behaviour
	if(isdigit((unsigned char)ch))
		return rjp_lex_sci_num;
	return rjp_lex_invalid;
}
//Transition out of the state reached after the exponent's sign: only a digit
//continues the token (into rjp_lex_sci_num)
static inline RJP_lex_category irjp_lex_do_sci_num_signed(char ch){
	//cast first: handing a negative plain char to isdigit is undefined behaviour
	if(isdigit((unsigned char)ch))
		return rjp_lex_sci_num;
	return rjp_lex_invalid;
}
//Transition out of the exponent-digits state.
//  digit  -> stay in rjp_lex_sci_num
//  letter -> rjp_lex_unrecognized_word
//  other  -> rjp_lex_invalid (no transition; the number ends here)
static inline RJP_lex_category irjp_lex_do_sci_num(char ch){
	//<ctype.h> functions are only defined for unsigned char values and EOF
	const unsigned char uch = (unsigned char)ch;

	if(isdigit(uch))
		return rjp_lex_sci_num;
	if(isalpha(uch))
		return rjp_lex_unrecognized_word;
	return rjp_lex_invalid;
}
//Transition while inside a string literal.
//The closing quote completes the string, a backslash enters the escape state,
//a raw line break has no transition, and every other character stays in the string.
static inline RJP_lex_category irjp_lex_do_quote(char ch){
	if(ch == '"')
		return rjp_lex_string;
	if(ch == '\\')
		return rjp_lex_escaped;
	if(ch == '\n' || ch == '\r')
		return rjp_lex_invalid;
	return rjp_lex_quote;
}
//Transition out of the state reached after a single '/': a second '/' opens a
//line comment, a '*' opens a block comment, nothing else may follow
static inline RJP_lex_category irjp_lex_do_slash(char ch){
	if(ch == '/')
		return rjp_lex_line_comment;
	if(ch == '*')
		return rjp_lex_block_comment_start;
	return rjp_lex_invalid;
}
//Transition while inside a "//" comment: the comment runs up to, but does not
//include, the next line break or the terminating NUL
static inline RJP_lex_category irjp_lex_do_line_comment(char ch){
	const int at_line_end = (ch == '\n' || ch == '\r' || ch == 0);
	return at_line_end ? rjp_lex_invalid : rjp_lex_line_comment;
}
//Transition while inside the body of a block comment: a '*' might begin the
//closing sequence, every other character keeps us in the body
static inline RJP_lex_category irjp_lex_do_block_comment_start(char ch){
	return (ch == '*') ? rjp_lex_block_comment_end1 : rjp_lex_block_comment_start;
}
static inline RJP_lex_category irjp_lex_do_block_comment_end1(char ch){
if(ch == '/')
return rjp_lex_block_comment;
return rjp_lex_block_comment_start;
}
//One step along a keyword (true/false/null): move to `next` when ch is the
//expected letter, otherwise the text is some other word
static inline RJP_lex_category irjp_lex_keyword_step(char ch, char expected, RJP_lex_category next){
	return (ch == expected) ? next : rjp_lex_unrecognized_word;
}

//Single step of the lexer state machine.
//  ch:   the next input character
//  node: the state the machine is currently in
//Returns the state to move to, or rjp_lex_invalid when `node` has no
//transition for `ch` (including states that never accept further input).
static RJP_lex_category irjp_lex_char(char ch, RJP_lex_category node){
	switch(node){
	case rjp_lex_start:
		return irjp_lex_do_start(ch);

	//whitespace
	case rjp_lex_spaces:
		return irjp_lex_do_spaces(ch);

	//numbers
	case rjp_lex_signed_number:
		return irjp_lex_do_signed_num(ch);
	case rjp_lex_number:
		return irjp_lex_do_number(ch);
	case rjp_lex_decimal:
		return irjp_lex_do_decimal(ch);
	case rjp_lex_fnumber:
		return irjp_lex_do_fnumber(ch);
	case rjp_lex_fnum_e:
		return irjp_lex_do_fnum_e(ch);
	case rjp_lex_sci_num_signed:
		return irjp_lex_do_sci_num_signed(ch);
	case rjp_lex_sci_num:
		return irjp_lex_do_sci_num(ch);

	//strings
	case rjp_lex_quote:
		return irjp_lex_do_quote(ch);
	case rjp_lex_escaped:
		//whatever follows the backslash is taken as part of the string
		return rjp_lex_quote;

	//comments
	case rjp_lex_slash:
		return irjp_lex_do_slash(ch);
	case rjp_lex_line_comment:
		return irjp_lex_do_line_comment(ch);
	case rjp_lex_block_comment_start:
		return irjp_lex_do_block_comment_start(ch);
	case rjp_lex_block_comment_end1:
		return irjp_lex_do_block_comment_end1(ch);

	//true
	case rjp_lex_t:
		return irjp_lex_keyword_step(ch, 'r', rjp_lex_tr);
	case rjp_lex_tr:
		return irjp_lex_keyword_step(ch, 'u', rjp_lex_tru);
	case rjp_lex_tru:
		return irjp_lex_keyword_step(ch, 'e', rjp_lex_true);

	//false
	case rjp_lex_f:
		return irjp_lex_keyword_step(ch, 'a', rjp_lex_fa);
	case rjp_lex_fa:
		return irjp_lex_keyword_step(ch, 'l', rjp_lex_fal);
	case rjp_lex_fal:
		return irjp_lex_keyword_step(ch, 's', rjp_lex_fals);
	case rjp_lex_fals:
		return irjp_lex_keyword_step(ch, 'e', rjp_lex_false);

	//null
	case rjp_lex_n:
		return irjp_lex_keyword_step(ch, 'u', rjp_lex_nu);
	case rjp_lex_nu:
		return irjp_lex_keyword_step(ch, 'l', rjp_lex_nul);
	case rjp_lex_nul:
		return irjp_lex_keyword_step(ch, 'l', rjp_lex_null);

	//a finished keyword or an unknown word grows for as long as letters or
	//digits follow, and is then an unknown word
	case rjp_lex_true:
	case rjp_lex_false:
	case rjp_lex_null:
	case rjp_lex_unrecognized_word:
		//<ctype.h> functions are only defined for unsigned char values and EOF,
		//so a plain char must be converted before the call
		if(isalnum((unsigned char)ch))
			return rjp_lex_unrecognized_word;
		return rjp_lex_invalid;

	//every remaining state is a complete single token with no outgoing transitions
	default:
		break;
	}
	return rjp_lex_invalid;
}
//Scan the next token of state->str.
//Skips over the previously scanned token, then feeds characters to the state
//machine until it has no transition for the next character or the terminating
//NUL has been processed. On return state->offset is the start of the token,
//state->length its length, and state->node has been reset for the next call.
//Returns the state the machine stopped in.
RJP_lex_category irjp_lex(RJP_lex_state* state){
	//step past the token handed out by the previous call
	state->offset += state->length;
	state->length = 0;

	const char* cursor = state->str + state->offset;
	while(1){
		const RJP_lex_category next = irjp_lex_char(*cursor, state->node);
		if(next == rjp_lex_invalid)
			break; //this character is not part of the token
		state->node = next;
		if(*cursor == 0)
			break; //never count or walk past the terminator
		++cursor;
		++state->length;
	}
	return irjp_lex_accept(state->node, state);
}

View File

@ -23,55 +23,12 @@
#include "rjp_value.h" #include "rjp_value.h"
#include "rjp_string.h" #include "rjp_string.h"
#include "memory.h" #include "memory.h"
#include "rjp_lex.h"
#include <stdlib.h> //strtod, strtol #include <stdlib.h> //strtod, strtol
#include <ctype.h> //isalpha, etc
#include <string.h> //memcpy #include <string.h> //memcpy
#define RJP_INITIAL_PARSE_DEPTH 16
#define rjp_lex_accept 1 #define RJP_INITIAL_PARSE_DEPTH 16
/*
 * States of the lexer's state machine; the state the machine stops in is what
 * irjp_lex() returns for a token.
 * NOTE(review): the values with bit 0 set look like the accepting (complete
 * token) states, matching the rjp_lex_accept mask - confirm before renumbering.
 */
typedef enum RJP_lex_category{
rjp_lex_start = 0,
rjp_lex_obracket = 3,
rjp_lex_obrace = 5,
rjp_lex_cbracket = 7,
rjp_lex_cbrace = 9,
rjp_lex_spaces = 11,
rjp_lex_quote = 12,
rjp_lex_t = 14,
rjp_lex_tr = 16,
rjp_lex_tru = 18,
rjp_lex_true = 19,
rjp_lex_f = 20,
rjp_lex_fa = 22,
rjp_lex_fal = 24,
rjp_lex_fals = 26,
rjp_lex_false = 27,
rjp_lex_n = 28,
rjp_lex_nu = 30,
rjp_lex_nul = 32,
rjp_lex_null = 33,
rjp_lex_escaped = 34,
rjp_lex_string = 35,
rjp_lex_comma = 37,
rjp_lex_colon = 39,
rjp_lex_number = 41,
rjp_lex_decimal = 42,
rjp_lex_fnumber = 43,
rjp_lex_fnum_e = 44,
rjp_lex_sci_num = 45,
rjp_lex_slash = 46,
rjp_lex_line_comment = 47,
rjp_lex_signed_number = 49,
rjp_lex_sci_num_signed = 51,
rjp_lex_newlines = 53,
rjp_lex_block_comment_start = 54,
rjp_lex_block_comment_end1 = 56,
rjp_lex_block_comment = 57,
rjp_lex_invalid = 1000, //no transition exists for the character
rjp_lex_unrecognized_word = 1002, //alphanumeric run that is not a known token
rjp_lex_end = 1004, //terminating NUL reached at the start of a token
}RJP_lex_category;
typedef enum RJP_yacc_target{ typedef enum RJP_yacc_target{
rjp_yacc_end, rjp_yacc_end,
@ -86,12 +43,6 @@ typedef enum RJP_yacc_target{
rjp_yacc_obj_comma rjp_yacc_obj_comma
}RJP_yacc_target; }RJP_yacc_target;
/*
 * Cursor into the text being lexed. irjp_lex() reads and updates every field
 * except str on each call.
 */
typedef struct RJP_lex_state{
const char* str; //text being lexed; irjp_lex() stops at its terminating NUL
RJP_lex_category node; //current state of the lexer state machine
RJP_index length; //number of characters in the most recently scanned token
RJP_index offset; //index into str where the most recently scanned token starts
}RJP_lex_state;
typedef struct RJP_yacc_stack{ typedef struct RJP_yacc_stack{
RJP_yacc_target* stack; RJP_yacc_target* stack;
RJP_index position; RJP_index position;
@ -140,208 +91,6 @@ static void irjp_yacc_stack_set(RJP_yacc_stack* s, RJP_yacc_target target){
s->stack[s->position] = target; s->stack[s->position] = target;
} }
//Finish a token: put the state machine back on its start node so the next
//scan begins fresh, and pass the reached category through to the caller
static RJP_lex_category irjp_lex_accept(RJP_lex_category val, RJP_lex_state* state){
	const RJP_lex_category reached = val;
	state->node = rjp_lex_start;
	return reached;
}
//Single step of the lexer state machine.
//  ch:   the next input character
//  node: the state the machine is currently in
//Returns the state to move to, or rjp_lex_invalid when there is no transition.
//A `break` out of the switch reaches the final `return node;`, i.e. the machine
//stays in its current state.
//NOTE(review): the <ctype.h> calls below receive a plain char; a negative value
//(any byte >= 0x80 where char is signed) is undefined behaviour unless it is
//converted to unsigned char first.
static RJP_lex_category irjp_lex_char(char ch, RJP_lex_category node){
switch(node){
case rjp_lex_start:
if(ch == 0)
return rjp_lex_end;
else if(ch == '{')
return rjp_lex_obrace;
else if(ch == '}')
return rjp_lex_cbrace;
else if(ch == '[')
return rjp_lex_obracket;
else if(ch == ']')
return rjp_lex_cbracket;
else if(ch == '"')
return rjp_lex_quote;
else if(ch == ',')
return rjp_lex_comma;
else if(ch == ':')
return rjp_lex_colon;
else if(isdigit(ch))
return rjp_lex_number;
else if(ch == '+' || ch == '-')
return rjp_lex_signed_number;
//line breaks are tested before isspace() so they get their own category
else if(ch == '\n' || ch == '\r')
return rjp_lex_newlines;
else if(isspace(ch))
return rjp_lex_spaces;
else if(ch == 't')
return rjp_lex_t;
else if(ch == 'f')
return rjp_lex_f;
else if(ch == 'n')
return rjp_lex_n;
else if(ch == '/')
return rjp_lex_slash;
return rjp_lex_invalid;
//whitespace
case rjp_lex_spaces:
//isspace() also matches '\n' and '\r', so line breaks following spaces are
//absorbed into the same run
if(isspace(ch))
break;
return rjp_lex_invalid;
//numbers
case rjp_lex_signed_number:
if(ch == '-' || ch == '+'){
return rjp_lex_number;
}else{
//the rjp_lex_number label sits inside this else block, so both states share
//the digit / '.' / letter checks below
//fallthrough
case rjp_lex_number:
if(isdigit(ch))
return rjp_lex_number;
if(ch == '.')
return rjp_lex_decimal;
else if(isalpha(ch))
return rjp_lex_unrecognized_word;
}
return rjp_lex_invalid;
case rjp_lex_decimal:
if(isdigit(ch))
return rjp_lex_fnumber;
return rjp_lex_invalid;
case rjp_lex_fnumber:
if(isdigit(ch))
break;
else if(ch == 'e' || ch == 'E')
return rjp_lex_fnum_e;
else if(isalpha(ch))
return rjp_lex_unrecognized_word;
return rjp_lex_invalid;
case rjp_lex_fnum_e:
if(ch == '-' || ch == '+')
return rjp_lex_sci_num_signed;
else if(isdigit(ch))
return rjp_lex_sci_num;
return rjp_lex_invalid;
case rjp_lex_sci_num_signed:
if(isdigit(ch))
return rjp_lex_sci_num;
return rjp_lex_unrecognized_word;
case rjp_lex_sci_num:
if(isdigit(ch))
break;
else if(isalpha(ch))
return rjp_lex_unrecognized_word;
return rjp_lex_invalid;
//strings
case rjp_lex_quote:
if(ch == '\\')
return rjp_lex_escaped;
else if(ch == '\n' || ch == '\r')
return rjp_lex_invalid;
else if(ch == '"')
return rjp_lex_string;
break;
case rjp_lex_escaped:
//whatever follows the backslash is taken as part of the string
return rjp_lex_quote;
case rjp_lex_string:
return rjp_lex_invalid;
//comments
case rjp_lex_slash:
if(ch == '/')
return rjp_lex_line_comment;
else if(ch == '*')
return rjp_lex_block_comment_start;
return rjp_lex_invalid;
case rjp_lex_line_comment:
if(ch == '\n' || ch == '\r' || ch == 0) //don't consume this character
return rjp_lex_invalid;
break;
case rjp_lex_block_comment_start:
if(ch == '*')
return rjp_lex_block_comment_end1;
break;
case rjp_lex_block_comment_end1:
//NOTE(review): a second '*' here goes back to the comment body, so a comment
//ending in "**/" is not recognised as closed
if(ch == '/')
return rjp_lex_block_comment;
return rjp_lex_block_comment_start;
//true
case rjp_lex_t:
if(ch != 'r')
return rjp_lex_unrecognized_word;
return rjp_lex_tr;
case rjp_lex_tr:
if(ch != 'u')
return rjp_lex_unrecognized_word;
return rjp_lex_tru;
case rjp_lex_tru:
if(ch != 'e')
return rjp_lex_unrecognized_word;
return rjp_lex_true;
//false
case rjp_lex_f:
if(ch != 'a')
return rjp_lex_unrecognized_word;
return rjp_lex_fa;
case rjp_lex_fa:
if(ch != 'l')
return rjp_lex_unrecognized_word;
return rjp_lex_fal;
case rjp_lex_fal:
if(ch != 's')
return rjp_lex_unrecognized_word;
return rjp_lex_fals;
case rjp_lex_fals:
if(ch != 'e')
return rjp_lex_unrecognized_word;
return rjp_lex_false;
//null
case rjp_lex_n:
if(ch != 'u')
return rjp_lex_unrecognized_word;
return rjp_lex_nu;
case rjp_lex_nu:
if(ch != 'l')
return rjp_lex_unrecognized_word;
return rjp_lex_nul;
case rjp_lex_nul:
if(ch != 'l')
return rjp_lex_unrecognized_word;
return rjp_lex_null;
//a finished keyword followed by more letters or digits is an unknown word
case rjp_lex_true:
case rjp_lex_false:
case rjp_lex_null:
if(!isalnum(ch))
return rjp_lex_invalid;
return rjp_lex_unrecognized_word;
case rjp_lex_unrecognized_word:
if(isalnum(ch))
break;
return rjp_lex_invalid;
//fallthrough
default:
return rjp_lex_invalid;
};
//reached only through a `break` above: remain in the current state
return node;
}
//Scan the next token of state->str.
//Skips over the previously scanned token, then feeds characters to the state
//machine until it has no transition for the next character or the terminating
//NUL has been processed. On return state->offset is the start of the token,
//state->length its length, and state->node has been reset for the next call.
//Returns the state the machine stopped in.
static RJP_lex_category irjp_lex(RJP_lex_state* state){
	//step past the token handed out by the previous call
	state->offset += state->length;
	state->length = 0;

	const char* cursor = state->str + state->offset;
	while(1){
		const RJP_lex_category next = irjp_lex_char(*cursor, state->node);
		if(next == rjp_lex_invalid)
			break; //this character is not part of the token
		state->node = next;
		if(*cursor == 0)
			break; //never count or walk past the terminator
		++cursor;
		++state->length;
	}
	return irjp_lex_accept(state->node, state);
}
static int irjp_init_value(RJP_value* newval, RJP_lex_category cat, RJP_yacc_state* state){ static int irjp_init_value(RJP_value* newval, RJP_lex_category cat, RJP_yacc_state* state){
RJP_index length = state->lexstate.length; RJP_index length = state->lexstate.length;
RJP_index offset = state->lexstate.offset; RJP_index offset = state->lexstate.offset;