rjp/src/rjp_parse.c
2020-04-09 13:03:29 -07:00

435 lines
14 KiB
C

/**
rjp
Copyright (C) 2018-2020 rexy712
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
//TODO: Scientific notation
#include "rjp.h"
#include "rjp_internal.h"
#include "rjp_value.h"
#include "rjp_string.h"
#include "rjp_lex.h"
#include <stdio.h> //snprintf
#include <stdlib.h> //strtod, strtol
#include <string.h> //memcpy
#define RJP_INITIAL_PARSE_DEPTH 16
//Result codes produced by the parser. The numeric value is what ends up in
//RJP_parse_error.errcode, so the values are pinned explicitly.
typedef enum RJP_parse_status{
	RJP_PARSE_STATUS_SUC = 0,                 //token accepted / parse finished successfully
	RJP_PARSE_STATUS_ERR = 1,                 //unspecified error
	RJP_PARSE_STATUS_MISSING_VALUE = 2,       //token cannot be used as an array element or member value
	RJP_PARSE_STATUS_MISSING_COMMA = 3,       //expected ',' or the closing token of the container
	RJP_PARSE_STATUS_INVALID = 4,             //invalid lex token (including a comment when comments are disallowed)
	RJP_PARSE_STATUS_NO_ROOT_VALUE = 5,       //first token cannot start a root value
	RJP_PARSE_STATUS_MISSING_KEY = 6,         //expected a (non empty) string key
	RJP_PARSE_STATUS_MISSING_COLON = 7,       //expected ':' after a key
	RJP_PARSE_STATUS_EXCESS_DATA = 8,         //more tokens after the root value was completed
	RJP_PARSE_STATUS_MISSING_CLOSE_BRACE = 9  //input ended while an object or array was still open
}RJP_parse_status;
//What the parser expects the next significant token to be. One of these sits on
//top of the parse stack at all times.
typedef enum RJP_parse_target{
	rjp_parse_end = 0,             //root value already seen; any further token is checked for excess data
	rjp_parse_start = 1,           //root value expected
	rjp_parse_first_mem_key = 2,   //member key or '}' expected
	rjp_parse_mem_key = 3,         //member key expected
	rjp_parse_arr_first_value = 4, //array element or ']' expected
	rjp_parse_arr_value = 5,       //array element expected
	rjp_parse_arr_comma = 6,       //',' or ']' expected
	rjp_parse_key_colon = 7,       //':' expected
	rjp_parse_obj_value = 8,       //member value expected
	rjp_parse_obj_comma = 9        //',' or '}' expected
}RJP_parse_target;
//Growable stack of parse targets. The entry at index `position` is the current target.
typedef struct RJP_parse_stack{
RJP_parse_target* stack; //storage obtained from rjp_alloc
RJP_index position; //index of the current (top) entry
RJP_index size; //number of entries `stack` has room for
}RJP_parse_stack;
//Everything the parser tracks while consuming tokens.
typedef struct RJP_parse_state{
RJP_parse_stack target_stack; //expected-token stack: the root entry plus one entry per open object/array
RJP_value* root; //root value of the tree being built
RJP_value* curr; //container currently receiving members/elements
RJP_value* lastadded; //most recently created value; member values are initialized through it
RJP_lex_state lexstate; //lexer state; str + offset and length describe the current token
//NOTE: these two are named the reverse of what they track. `row` is reset to 1 on a
//newline token and advanced by lexstate.length in the parse loops, while `column` is
//incremented once per newline token. rjp_parse/rjp_parse_cback swap them back when
//filling in RJP_parse_error.
int row, column;
_Bool allow_comments; //set from the RJP_PARSE_ALLOW_COMMENTS flag
_Bool allow_trail_comma; //set from the RJP_PARSE_ALLOW_TRAILING_COMMA flag
}RJP_parse_state;
//Set up a target stack with room for RJP_INITIAL_PARSE_DEPTH entries. The bottom
//entry is rjp_parse_start, so the first significant token is treated as the root value.
static void irjp_init_parse_stack(RJP_parse_stack* s){
	s->position = 0;
	s->size = RJP_INITIAL_PARSE_DEPTH;

	RJP_parse_target* storage = rjp_alloc(sizeof(RJP_parse_target) * s->size);
	storage[0] = rjp_parse_start;
	s->stack = storage;
}
//Release the stack storage and leave the stack pointer NULL.
//position and size are left as they were.
static void irjp_delete_parse_stack(RJP_parse_stack* s){
	RJP_parse_target* storage = s->stack;
	s->stack = NULL;
	rjp_free(storage);
}
//Move the stack into a fresh allocation holding `newsize` entries.
//All s->size existing entries are copied, so newsize must not be smaller than s->size.
static void irjp_resize_parse_stack(RJP_parse_stack* s, RJP_index newsize){
	RJP_parse_target* previous = s->stack;
	RJP_parse_target* grown = rjp_alloc(sizeof(RJP_parse_target) * newsize);

	memcpy(grown, previous, s->size * sizeof(RJP_parse_target));
	s->stack = grown;
	s->size = newsize;
	rjp_free(previous);
}
//Push `target`, doubling the storage first when the new entry would not fit.
static void irjp_parse_stack_push(RJP_parse_stack* s, RJP_parse_target target){
	const RJP_index next = s->position + 1;

	if(next == s->size){
		irjp_resize_parse_stack(s, s->size*2);
	}
	s->position = next;
	s->stack[next] = target;
}
//Remove the top entry and return it. No underflow check is performed.
static RJP_parse_target irjp_parse_stack_pop(RJP_parse_stack* s){
	const RJP_parse_target top = s->stack[s->position];
	s->position--;
	return top;
}
//Return the top entry without removing it.
static RJP_parse_target irjp_parse_stack_current(RJP_parse_stack* s){
	const RJP_parse_target* top = s->stack + s->position;
	return *top;
}
//Overwrite the top entry with `target`; the stack depth does not change.
static void irjp_parse_stack_set(RJP_parse_stack* s, RJP_parse_target target){
	RJP_parse_target* top = s->stack + s->position;
	*top = target;
}
//Fill in `newval` from the token the lexer is currently positioned on.
//  newval: value to initialize
//  cat:    lex category of the current token
//  state:  parse state; the token text starts at lexstate.str + lexstate.offset and
//          is lexstate.length long
//An opening brace/bracket also pushes the matching "first entry" target and makes
//state->lastadded the current container.
//Returns 0 on success, 1 when the token cannot start a value or the string
//conversion failed.
static int irjp_init_value(RJP_value* newval, RJP_lex_category cat, RJP_parse_state* state){
	const RJP_index toklen = state->lexstate.length;
	const char* token = state->lexstate.str + state->lexstate.offset;

	switch(cat){
	case rjp_lex_string:
	{
		RJP_index converted_len;
		newval->type = rjp_json_string;
		newval->string.value = irjp_convert_string(token, toklen, &converted_len);
		if(!newval->string.value)
			return 1;
		newval->string.length = converted_len;
		return 0;
	}
	case rjp_lex_number:
		newval->type = rjp_json_integer;
		newval->integer = strtoll(token, NULL, 10);
		return 0;
	case rjp_lex_fnumber:
	case rjp_lex_sci_num:
		newval->type = rjp_json_dfloat;
		newval->dfloat = strtod(token, NULL);
		return 0;
	case rjp_lex_true:
	case rjp_lex_false:
		newval->type = rjp_json_boolean;
		newval->boolean = (cat == rjp_lex_true);
		return 0;
	case rjp_lex_null:
		newval->type = rjp_json_null;
		return 0;
	case rjp_lex_obrace:
		newval->type = rjp_json_object;
		//following tokens belong to this object until its closing brace
		irjp_parse_stack_push(&state->target_stack, rjp_parse_first_mem_key);
		state->curr = state->lastadded;
		return 0;
	case rjp_lex_obracket:
		newval->type = rjp_json_array;
		//following tokens belong to this array until its closing bracket
		irjp_parse_stack_push(&state->target_stack, rjp_parse_arr_first_value);
		state->curr = state->lastadded;
		return 0;
	default:
		return 1;
	}
}
//Append a new element to the current array (state->curr) and initialize it from
//the current token.
//  cat:   lex category of the current token
//  state: parse state; state->lastadded is updated to the new element
//Returns the new element, or NULL if the element could not be created or the
//token cannot be used as a value.
static RJP_value* irjp_add_value_to_array(RJP_lex_category cat, RJP_parse_state* state){
	RJP_value* element = rjp_new_element(state->curr);
	state->lastadded = element;
	//irjp_init_value dereferences its target, so a failed creation must stop here
	if(!element)
		return NULL;
	if(irjp_init_value(element, cat, state))
		return NULL;
	return element;
}
//Add a new member to the current object (state->curr), keyed by the string token
//`key` of length `keylen`.
//  state:  parse state; state->lastadded is updated to the new member
//  key:    raw key token text as produced by the lexer
//  keylen: length of the token
//The converted key is handed over to rjp_new_member_steal_key.
//Returns the new member, or NULL if the key could not be converted, is empty, or
//the member could not be created.
static RJP_value* irjp_add_value_to_object(RJP_parse_state* state, const char* key, RJP_index keylen){
	RJP_index newlen = 0; //stays 0 if the conversion fails before writing it
	char* newkey = irjp_convert_string(key, keylen, &newlen);
	if(!newkey) //conversion failed, nothing to release
		return NULL;
	if(!newlen){ //cannot have empty key
		rjp_free(newkey);
		return NULL;
	}
	return (state->lastadded = rjp_new_member_steal_key(state->curr, newkey, newlen));
}
//Map a comment token to the category the parser should act on: whitespace when
//comments are allowed, an invalid token otherwise.
static RJP_lex_category irjp_convert_comment(_Bool allow_comments){
	return allow_comments ? rjp_lex_spaces : rjp_lex_invalid;
}
//Prepare `state` for a new parse of `str`: position counters start at 1, the target
//stack is created and a zeroed root value is allocated. The root doubles as the
//current container and the last added value until something else is parsed.
//The option flags and the rest of the lexer state are not touched here.
static void irjp_init_parse_state(RJP_parse_state* state, const char* str){
	state->row = 1;
	state->column = 1;
	irjp_init_parse_stack(&state->target_stack);
	state->lexstate.str = (char*)str;

	RJP_value* root = rjp_calloc(1, sizeof(RJP_value));
	state->lastadded = root;
	state->curr = root;
	state->root = root;
}
//Release the target stack and the lexer state. The value tree (state->root) and the
//state structure itself are left alone.
static void irjp_delete_parse_state(RJP_parse_state* state){
	RJP_parse_stack* targets = &state->target_stack;
	RJP_lex_state* lexer = &state->lexstate;

	irjp_delete_parse_stack(targets);
	irjp_delete_lex_state(lexer);
}
//Like irjp_delete_parse_state, but also frees the value tree built so far and clears
//state->root. The state structure itself is not freed.
static void irjp_delete_parse_state_no_preserve_root(RJP_parse_state* state){
	RJP_value* tree = state->root;

	irjp_delete_parse_state(state);
	state->root = NULL;
	rjp_free_value(tree);
}
//Leave the container that was just closed: drop its target from the stack and make
//its parent the current container again.
static void irjp_parse_leave_container(RJP_parse_state* state){
	irjp_parse_stack_pop(&state->target_stack);
	state->curr = state->curr->parent;
}
//Feed one accepted lexer token to the parser.
//  cat:   lex category of the token
//  state: parse state, updated according to the target on top of the stack
//Whitespace, newlines and (when allowed) comments never reach the target logic.
//Returns RJP_PARSE_STATUS_SUC when the token was acceptable, otherwise the status
//describing what was expected instead.
static int irjp_parse_handle_lexcat(RJP_lex_category cat, RJP_parse_state* state){
	RJP_parse_stack* const targets = &state->target_stack;

	//comments become whitespace or an invalid token depending on the parse flags
	if(cat == rjp_lex_line_comment || cat == rjp_lex_block_comment){
		cat = irjp_convert_comment(state->allow_comments);
	}

	if(cat == rjp_lex_spaces){
		return RJP_PARSE_STATUS_SUC;
	}
	if(cat == rjp_lex_newlines){
		//`row` holds the position within the line and `column` counts newline tokens;
		//the names are swapped back when an error is reported
		state->row = 1;
		++(state->column);
		return RJP_PARSE_STATUS_SUC;
	}
	if(cat == rjp_lex_invalid){
		return RJP_PARSE_STATUS_INVALID;
	}

	//NOTE: in every case the stack top is updated BEFORE a value is initialized, because
	//initializing an object/array pushes a new target on top of it.
	switch(irjp_parse_stack_current(targets)){
	case rjp_parse_start:
		irjp_parse_stack_set(targets, rjp_parse_end);
		if(irjp_init_value(state->root, cat, state))
			return RJP_PARSE_STATUS_NO_ROOT_VALUE;
		return RJP_PARSE_STATUS_SUC;

	case rjp_parse_first_mem_key:
		if(cat == rjp_lex_cbrace){
			irjp_parse_leave_container(state);
			return RJP_PARSE_STATUS_SUC;
		}
		//fallthrough
	case rjp_parse_mem_key:
		if(cat != rjp_lex_string)
			return RJP_PARSE_STATUS_MISSING_KEY;
		irjp_parse_stack_set(targets, rjp_parse_key_colon);
		if(!irjp_add_value_to_object(state, state->lexstate.str+state->lexstate.offset, state->lexstate.length))
			return RJP_PARSE_STATUS_MISSING_KEY;
		return RJP_PARSE_STATUS_SUC;

	case rjp_parse_arr_first_value:
		if(cat == rjp_lex_cbracket){
			irjp_parse_leave_container(state);
			return RJP_PARSE_STATUS_SUC;
		}
		//fallthrough
	case rjp_parse_arr_value:
		irjp_parse_stack_set(targets, rjp_parse_arr_comma);
		if(!irjp_add_value_to_array(cat, state))
			return RJP_PARSE_STATUS_MISSING_VALUE;
		return RJP_PARSE_STATUS_SUC;

	case rjp_parse_key_colon:
		if(cat != rjp_lex_colon)
			return RJP_PARSE_STATUS_MISSING_COLON;
		irjp_parse_stack_set(targets, rjp_parse_obj_value);
		return RJP_PARSE_STATUS_SUC;

	case rjp_parse_obj_value:
		irjp_parse_stack_set(targets, rjp_parse_obj_comma);
		if(irjp_init_value(state->lastadded, cat, state))
			return RJP_PARSE_STATUS_MISSING_VALUE;
		return RJP_PARSE_STATUS_SUC;

	case rjp_parse_obj_comma:
		if(cat == rjp_lex_comma){
			//with trailing commas allowed, a closing brace may directly follow the comma
			irjp_parse_stack_set(targets, state->allow_trail_comma ? rjp_parse_first_mem_key : rjp_parse_mem_key);
			return RJP_PARSE_STATUS_SUC;
		}
		if(cat == rjp_lex_cbrace){
			irjp_parse_leave_container(state);
			return RJP_PARSE_STATUS_SUC;
		}
		return RJP_PARSE_STATUS_MISSING_COMMA;

	case rjp_parse_arr_comma:
		if(cat == rjp_lex_comma){
			//with trailing commas allowed, a closing bracket may directly follow the comma
			irjp_parse_stack_set(targets, state->allow_trail_comma ? rjp_parse_arr_first_value : rjp_parse_arr_value);
			return RJP_PARSE_STATUS_SUC;
		}
		if(cat == rjp_lex_cbracket){
			irjp_parse_leave_container(state);
			return RJP_PARSE_STATUS_SUC;
		}
		return RJP_PARSE_STATUS_MISSING_COMMA;

	case rjp_parse_end:
		//root value is complete; a token that is not at the terminating NUL is excess data
		if(state->lexstate.str[state->lexstate.offset] != 0)
			return RJP_PARSE_STATUS_EXCESS_DATA;
		return RJP_PARSE_STATUS_SUC;
	}
	return RJP_PARSE_STATUS_SUC;
}
//Handle the final token returned by the lexer. rjp_lex_end is a nonaccepting state to break the
//parse loop. It is a successful state though, as it just indicates end of input.
//Returns:
//  RJP_PARSE_STATUS_MISSING_CLOSE_BRACE if an object or array is still open,
//  RJP_PARSE_STATUS_INVALID if the lexer stopped on anything other than end of input,
//  RJP_PARSE_STATUS_NO_ROOT_VALUE if the input ended before any root value was seen,
//  RJP_PARSE_STATUS_SUC otherwise.
static int irjp_handle_final_parse_token(RJP_parse_state* state, RJP_lex_category cat){
	if(state->target_stack.position != 0)
		return RJP_PARSE_STATUS_MISSING_CLOSE_BRACE;
	if(cat != rjp_lex_end)
		return RJP_PARSE_STATUS_INVALID;
	//The bottom target only leaves rjp_parse_start once a root token has been handled.
	//Still seeing it here means the input held nothing but whitespace/allowed comments,
	//which is not a JSON document.
	if(irjp_parse_stack_current(&state->target_stack) == rjp_parse_start)
		return RJP_PARSE_STATUS_NO_ROOT_VALUE;
	return RJP_PARSE_STATUS_SUC;
}
//Basic parse loop: lex the input token by token and hand every accepted token to
//irjp_parse_handle_lexcat. Stops at the first non-success status; otherwise the token
//that ended the loop decides the result.
static int irjp_parse(RJP_parse_state* state){
	RJP_lex_category cat = irjp_lex(&state->lexstate);

	while(cat & rjp_lex_accept){
		const RJP_parse_status status = irjp_parse_handle_lexcat(cat, state);
		if(status != RJP_PARSE_STATUS_SUC)
			return status;

		cat = irjp_lex(&state->lexstate);
		//advance the in-line position by lexstate.length as left by the lex call above
		state->row += state->lexstate.length;
	}
	return irjp_handle_final_parse_token(state, cat);
}
//Callback parse loop: same as irjp_parse, except tokens come from irjp_lex_cback,
//which is given the user supplied callback.
static int irjp_parse_cback(RJP_parse_state* state, RJP_parse_callback* cback){
	RJP_lex_category cat = irjp_lex_cback(&state->lexstate, cback);

	while(cat & rjp_lex_accept){
		const RJP_parse_status status = irjp_parse_handle_lexcat(cat, state);
		if(status != RJP_PARSE_STATUS_SUC)
			return status;

		cat = irjp_lex_cback(&state->lexstate, cback);
		//advance the in-line position by lexstate.length as left by the lex call above
		state->row += state->lexstate.length;
	}
	return irjp_handle_final_parse_token(state, cat);
}
char* rjp_parse_error_to_string(const RJP_parse_error* err){
const RJP_parse_state* state = (const RJP_parse_state*)err->parsestate;
RJP_parse_status status = err->errcode;
char* buffer = NULL;
const char* format = NULL;
switch(status){
case RJP_PARSE_STATUS_MISSING_VALUE:
format = "Expected value before '%.*s'";
buffer = rjp_alloc(snprintf(NULL, 0, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset)) + 1);
sprintf(buffer, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset));
break;
case RJP_PARSE_STATUS_MISSING_COMMA:
format = "Expected comma before '%.*s'";
buffer = rjp_alloc(snprintf(NULL, 0, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset)) + 1);
sprintf(buffer, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset));
break;
case RJP_PARSE_STATUS_INVALID:
format = "Invalid lex token '%.*s'";
buffer = rjp_alloc(snprintf(NULL, 0, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset)) + 1);
sprintf(buffer, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset));
break;
case RJP_PARSE_STATUS_NO_ROOT_VALUE:
format = "Missing root JSON value";
buffer = rjp_alloc(snprintf(NULL, 0, "%s", format) + 1);
sprintf(buffer, "%s", format);
break;
case RJP_PARSE_STATUS_MISSING_KEY:
format = "Expected key before '%.*s'";
buffer = rjp_alloc(snprintf(NULL, 0, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset)) + 1);
sprintf(buffer, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset));
break;
case RJP_PARSE_STATUS_MISSING_COLON:
format = "Expected colon before '%.*s'";
buffer = rjp_alloc(snprintf(NULL, 0, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset)) + 1);
sprintf(buffer, format, (int)state->lexstate.length, (state->lexstate.str + state->lexstate.offset));
break;
case RJP_PARSE_STATUS_EXCESS_DATA:
format = "Excess data after JSON";
buffer = rjp_alloc(snprintf(NULL, 0, "%s", format) + 1);
sprintf(buffer, "%s", format);
break;
case RJP_PARSE_STATUS_MISSING_CLOSE_BRACE:
format = "Missing closing brace";
buffer = rjp_alloc(snprintf(NULL, 0, "%s", format) + 1);
sprintf(buffer, "%s", format);
break;
default:
break;
};
return buffer;
}
//Release the parse state (including the partially built value tree) that a failed
//parse attached to `err`.
//Safe to call with a NULL err, with an err whose parsestate is NULL, and more than
//once on the same err: the parsestate pointer is cleared after it has been freed.
void rjp_delete_parse_error(RJP_parse_error* err){
	if(!err || !err->parsestate)
		return;

	RJP_parse_state* state = (RJP_parse_state*)err->parsestate;
	irjp_delete_parse_state_no_preserve_root(state);
	rjp_free(state);
	err->parsestate = NULL;
}
//Parse `str` with no optional features enabled and without error reporting.
//Returns whatever rjp_parse returns for RJP_PARSE_NONE and a NULL error pointer.
RJP_value* rjp_simple_parse(const char* str){
	RJP_value* parsed = rjp_parse(str, RJP_PARSE_NONE, NULL);
	return parsed;
}
RJP_value* rjp_parse(const char* str, int flags, RJP_parse_error* err){
RJP_parse_state* state = rjp_calloc(sizeof(RJP_parse_state), 1);
state->allow_comments = (flags & RJP_PARSE_ALLOW_COMMENTS);
state->allow_trail_comma = (flags & RJP_PARSE_ALLOW_TRAILING_COMMA);
irjp_init_parse_state(state, str);
irjp_init_lex_state(&state->lexstate);
int status = irjp_parse(state);
if(status == RJP_PARSE_STATUS_SUC){
irjp_delete_parse_state(state);
RJP_value* root = state->root;
rjp_free(state);
return root;
}else{
if(err){
err->parsestate = state;
err->errcode = status;
err->row = state->column;
err->column = state->row;
}else{
irjp_delete_parse_state_no_preserve_root(state);
rjp_free(state);
}
return NULL;
}
}
//Callback based parse. Runs identical to normal parsing except sets up callback
//lex state and calls callback lex function
RJP_value* rjp_parse_cback(int flags, RJP_parse_callback* cback, RJP_parse_error* err){
RJP_parse_state* state = rjp_calloc(sizeof(RJP_parse_state), 1);
state->allow_comments = (flags & RJP_PARSE_ALLOW_COMMENTS);
state->allow_trail_comma = (flags & RJP_PARSE_ALLOW_TRAILING_COMMA);
irjp_init_parse_state(state, NULL);
irjp_init_lex_cback_state(&state->lexstate);
int status = irjp_parse_cback(state, cback);
if(status == RJP_PARSE_STATUS_SUC){
irjp_delete_parse_state(state);
RJP_value* root = state->root;
rjp_free(state);
return root;
}else{
if(err){
err->parsestate = state;
err->errcode = status;
err->row = state->column;
err->column = state->row;
}else{
irjp_delete_parse_state_no_preserve_root(state);
rjp_free(state);
}
return NULL;
}
}