Separate tokenizing logic from loop

This commit is contained in:
rexy712 2020-03-16 18:28:28 -07:00
parent e38245261e
commit 21e4583827

View File

@@ -144,264 +144,199 @@ static RJP_lex_category irjp_lex_accept(RJP_lex_category val, RJP_lex_state* sta
state->node = rjp_lex_start;
return val;
}
static RJP_lex_category irjp_lex_error(RJP_lex_state* state){
return irjp_lex_accept(rjp_lex_invalid, state);
//Single step of the lexer state machine: given the current state `node` and
//the next input character `ch`, return the state the lexer moves to.
//
//  ch:   the character being examined; a 0 byte in the start state yields
//        rjp_lex_end.
//  node: the state the lexer is currently in.
//
//Return value:
//  - a different state when `ch` causes a transition,
//  - `node` itself when `ch` extends the current token without a transition,
//  - rjp_lex_invalid when there is no transition for `ch` out of `node`
//    (this includes every state that has no case below). The caller is
//    expected to treat that as "the current token stops before `ch`".
//
//This function has no side effects; it does not touch any lexer state object.
static RJP_lex_category irjp_lex_char(char ch, RJP_lex_category node){
	//The <ctype.h> classifiers are only defined for arguments representable
	//as unsigned char (or EOF). Plain char may be signed, so a byte >= 0x80
	//would be passed as a negative value; classify through this copy instead.
	const unsigned char uch = (unsigned char)ch;

	switch(node){
	case rjp_lex_start:
		if(ch == 0)
			return rjp_lex_end;
		else if(ch == '{')
			return rjp_lex_obrace;
		else if(ch == '}')
			return rjp_lex_cbrace;
		else if(ch == '[')
			return rjp_lex_obracket;
		else if(ch == ']')
			return rjp_lex_cbracket;
		else if(ch == '"')
			return rjp_lex_quote;
		else if(ch == ',')
			return rjp_lex_comma;
		else if(ch == ':')
			return rjp_lex_colon;
		else if(isdigit(uch))
			return rjp_lex_number;
		else if(ch == '+' || ch == '-')
			return rjp_lex_signed_number;
		//newlines are checked before isspace so they get their own category
		else if(ch == '\n' || ch == '\r')
			return rjp_lex_newlines;
		else if(isspace(uch))
			return rjp_lex_spaces;
		else if(ch == 't')
			return rjp_lex_t;
		else if(ch == 'f')
			return rjp_lex_f;
		else if(ch == 'n')
			return rjp_lex_n;
		else if(ch == '/')
			return rjp_lex_slash;
		return rjp_lex_invalid;

	//whitespace
	case rjp_lex_spaces:
		if(isspace(uch))
			break;
		return rjp_lex_invalid;

	//numbers
	case rjp_lex_signed_number:
		//a second sign character moves on to the plain number state
		if(ch == '-' || ch == '+')
			return rjp_lex_number;
		//anything else is handled exactly like the number state
		//fallthrough
	case rjp_lex_number:
		if(isdigit(uch))
			return rjp_lex_number;
		if(ch == '.')
			return rjp_lex_decimal;
		else if(isalpha(uch))
			return rjp_lex_unrecognized_word;
		return rjp_lex_invalid;
	case rjp_lex_decimal:
		if(isdigit(uch))
			return rjp_lex_fnumber;
		return rjp_lex_invalid;
	case rjp_lex_fnumber:
		if(isdigit(uch))
			break;
		else if(ch == 'e' || ch == 'E')
			return rjp_lex_fnum_e;
		else if(isalpha(uch))
			return rjp_lex_unrecognized_word;
		return rjp_lex_invalid;
	case rjp_lex_fnum_e:
		if(ch == '-' || ch == '+')
			return rjp_lex_sci_num_signed;
		else if(isdigit(uch))
			return rjp_lex_sci_num;
		return rjp_lex_invalid;
	case rjp_lex_sci_num_signed:
		if(isdigit(uch))
			return rjp_lex_sci_num;
		return rjp_lex_unrecognized_word;
	case rjp_lex_sci_num:
		if(isdigit(uch))
			break;
		else if(isalpha(uch))
			return rjp_lex_unrecognized_word;
		return rjp_lex_invalid;

	//strings
	case rjp_lex_quote:
		if(ch == '\\')
			return rjp_lex_escaped;
		else if(ch == '\n' || ch == '\r')
			return rjp_lex_invalid;
		else if(ch == '"')
			return rjp_lex_string;
		break;
	case rjp_lex_escaped:
		//whatever follows the backslash is taken as part of the string
		return rjp_lex_quote;
	case rjp_lex_string:
		return rjp_lex_invalid;

	//comments
	case rjp_lex_slash:
		if(ch == '/')
			return rjp_lex_line_comment;
		else if(ch == '*')
			return rjp_lex_block_comment_start;
		return rjp_lex_invalid;
	case rjp_lex_line_comment:
		if(ch == '\n' || ch == '\r' || ch == 0) //don't consume this character
			return rjp_lex_invalid;
		break;
	case rjp_lex_block_comment_start:
		if(ch == '*')
			return rjp_lex_block_comment_end1;
		break;
	case rjp_lex_block_comment_end1:
		if(ch == '/')
			return rjp_lex_block_comment;
		return rjp_lex_block_comment_start;

	//true
	case rjp_lex_t:
		if(ch != 'r')
			return rjp_lex_unrecognized_word;
		return rjp_lex_tr;
	case rjp_lex_tr:
		if(ch != 'u')
			return rjp_lex_unrecognized_word;
		return rjp_lex_tru;
	case rjp_lex_tru:
		if(ch != 'e')
			return rjp_lex_unrecognized_word;
		return rjp_lex_true;

	//false
	case rjp_lex_f:
		if(ch != 'a')
			return rjp_lex_unrecognized_word;
		return rjp_lex_fa;
	case rjp_lex_fa:
		if(ch != 'l')
			return rjp_lex_unrecognized_word;
		return rjp_lex_fal;
	case rjp_lex_fal:
		if(ch != 's')
			return rjp_lex_unrecognized_word;
		return rjp_lex_fals;
	case rjp_lex_fals:
		if(ch != 'e')
			return rjp_lex_unrecognized_word;
		return rjp_lex_false;

	//null
	case rjp_lex_n:
		if(ch != 'u')
			return rjp_lex_unrecognized_word;
		return rjp_lex_nu;
	case rjp_lex_nu:
		if(ch != 'l')
			return rjp_lex_unrecognized_word;
		return rjp_lex_nul;
	case rjp_lex_nul:
		if(ch != 'l')
			return rjp_lex_unrecognized_word;
		return rjp_lex_null;

	//a completed keyword followed by more alphanumerics is no longer a keyword
	case rjp_lex_true:
	case rjp_lex_false:
	case rjp_lex_null:
		if(!isalnum(uch))
			return rjp_lex_invalid;
		return rjp_lex_unrecognized_word;
	case rjp_lex_unrecognized_word:
		if(isalnum(uch))
			break;
		return rjp_lex_invalid;

	//states with no outgoing transitions
	default:
		return rjp_lex_invalid;
	}
	//reached via `break`: the character was absorbed without a state change
	return node;
}
static RJP_lex_category irjp_lex(RJP_lex_state* state){
state->offset += state->length;
state->length = 0;
for(const char* c = state->str+state->offset;1;++c,++state->length){
char ch = *c;
switch(state->node){
case rjp_lex_start:
if(ch == '{')
state->node = rjp_lex_obrace;
else if(ch == '}')
state->node = rjp_lex_cbrace;
else if(ch == '[')
state->node = rjp_lex_obracket;
else if(ch == ']')
state->node = rjp_lex_cbracket;
else if(ch == '"')
state->node = rjp_lex_quote;
else if(ch == ',')
state->node = rjp_lex_comma;
else if(ch == ':')
state->node = rjp_lex_colon;
else if(isdigit(ch))
state->node = rjp_lex_number;
else if(ch == '+' || ch == '-')
state->node = rjp_lex_signed_number;
else if(ch == '\n' || ch == '\r')
state->node = rjp_lex_newlines;
else if(isspace(ch))
state->node = rjp_lex_spaces;
else if(ch == 't')
state->node = rjp_lex_t;
else if(ch == 'f')
state->node = rjp_lex_f;
else if(ch == 'n')
state->node = rjp_lex_n;
else if(ch == '/')
state->node = rjp_lex_slash;
else if(ch == 0)
return irjp_lex_accept(rjp_lex_end, state);
else
state->node = rjp_lex_invalid;
break;
//punctuation
case rjp_lex_obracket:
return irjp_lex_accept(rjp_lex_obracket, state);
case rjp_lex_obrace:
return irjp_lex_accept(rjp_lex_obrace, state);
case rjp_lex_cbracket:
return irjp_lex_accept(rjp_lex_cbracket, state);
case rjp_lex_cbrace:
return irjp_lex_accept(rjp_lex_cbrace, state);
case rjp_lex_comma:
return irjp_lex_accept(rjp_lex_comma, state);
case rjp_lex_colon:
return irjp_lex_accept(rjp_lex_colon, state);
//whitespace
case rjp_lex_newlines:
return irjp_lex_accept(rjp_lex_newlines, state);
case rjp_lex_spaces:
if(isspace(ch))
break;
return irjp_lex_accept(rjp_lex_spaces, state);
//numbers
case rjp_lex_signed_number:
if(ch == '-' || ch == '+'){
state->node = rjp_lex_number;
break;
}else{
//fallthrough
case rjp_lex_number:
if(isdigit(ch))
break;
if(ch == '.')
state->node = rjp_lex_decimal;
else if(isalpha(ch))
state->node = rjp_lex_unrecognized_word;
else
return irjp_lex_accept(rjp_lex_number, state);
}
break;
case rjp_lex_decimal:
if(isdigit(ch))
state->node = rjp_lex_fnumber;
else
state->node = rjp_lex_invalid;
break;
case rjp_lex_fnumber:
if(isdigit(ch))
break;
if(ch == 'e' || ch == 'E')
state->node = rjp_lex_fnum_e;
else if(isalpha(ch))
state->node = rjp_lex_unrecognized_word;
else
return irjp_lex_accept(rjp_lex_fnumber, state);
break;
case rjp_lex_fnum_e:
if(ch == '-' || ch == '+')
state->node = rjp_lex_sci_num_signed;
else if(isdigit(ch))
state->node = rjp_lex_sci_num;
else
state->node = rjp_lex_invalid;
break;
case rjp_lex_sci_num_signed:
if(isdigit(ch))
state->node = rjp_lex_sci_num;
else
state->node = rjp_lex_unrecognized_word;
break;
case rjp_lex_sci_num:
if(isdigit(ch))
break;
else if(isalpha(ch))
state->node = rjp_lex_unrecognized_word;
else
return irjp_lex_accept(rjp_lex_sci_num, state);
break;
//strings
case rjp_lex_quote:
if(ch == '\\')
state->node = rjp_lex_escaped;
else if(ch == '\n' || ch == '\r')
state->node = rjp_lex_invalid;
else if(ch == '"')
state->node = rjp_lex_string;
break;
case rjp_lex_escaped:
state->node = rjp_lex_quote;
break;
case rjp_lex_string:
return irjp_lex_accept(rjp_lex_string, state);
//comments
case rjp_lex_slash:
if(ch == '/')
state->node = rjp_lex_line_comment;
else if(ch == '*')
state->node = rjp_lex_block_comment_start;
else
state->node = rjp_lex_invalid;
break;
case rjp_lex_line_comment:
if(ch == '\n' || ch == '\r' || ch == 0) //don't consume this character
return irjp_lex_accept(rjp_lex_line_comment, state);
break;
case rjp_lex_block_comment_start:
if(ch == '*')
state->node = rjp_lex_block_comment_end1;
break;
case rjp_lex_block_comment_end1:
if(ch == '/')
state->node = rjp_lex_block_comment;
else
state->node = rjp_lex_block_comment_start;
break;
case rjp_lex_block_comment:
return irjp_lex_accept(rjp_lex_block_comment, state);
//true
case rjp_lex_t:
if(ch != 'r')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_tr;
break;
case rjp_lex_tr:
if(ch != 'u')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_tru;
break;
case rjp_lex_tru:
if(ch != 'e')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_true;
break;
case rjp_lex_true:
if(!isalnum(ch))
return irjp_lex_accept(rjp_lex_true, state);
else
state->node = rjp_lex_unrecognized_word;
break;
//false
case rjp_lex_f:
if(ch != 'a')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_fa;
break;
case rjp_lex_fa:
if(ch != 'l')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_fal;
break;
case rjp_lex_fal:
if(ch != 's')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_fals;
break;
case rjp_lex_fals:
if(ch != 'e')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_false;
break;
case rjp_lex_false:
if(!isalnum(ch))
return irjp_lex_accept(rjp_lex_false, state);
state->node = rjp_lex_unrecognized_word;
break;
//null
case rjp_lex_n:
if(ch != 'u')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_nu;
break;
case rjp_lex_nu:
if(ch != 'l')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_nul;
break;
case rjp_lex_nul:
if(ch != 'l')
state->node = rjp_lex_unrecognized_word;
else
state->node = rjp_lex_null;
break;
case rjp_lex_null:
if(!isalnum(ch))
return irjp_lex_accept(rjp_lex_null, state);
state->node = rjp_lex_unrecognized_word;
break;
case rjp_lex_unrecognized_word:
if(isalnum(ch))
break;
state->node = rjp_lex_invalid;
//fallthrough
case rjp_lex_invalid:
default:
return irjp_lex_error(state);
};
if(ch == 0)
RJP_lex_category cat = irjp_lex_char(*c, state->node);
if(cat == rjp_lex_invalid)
return irjp_lex_accept(state->node, state);
state->node = cat;
if(*c == 0)
break;
}
return irjp_lex_accept(state->node, state);