Working on enabling chunked reading

2019-06-10 17:51:45 -07:00 · 2019-06-10 17:51:45 -07:00 · dc0c003785
commit dc0c003785
parent 4577836f8e
3 changed files with 316 additions and 219 deletions
--- a/3
+++ b/3
@ -0,0 +1,3 @@
+Change string handling to work with chunked reading
+Change numeral handling to work with chunked reading
+handle scientific notation
--- a/include/rjp.h
+++ b/include/rjp.h
@ -84,6 +84,7 @@ typedef struct RJP_search_res{

 //Convert C string consisting of json data into RJP's format
 RJP_value* rjp_parse(const char* str);
+RJP_value* rjp_parse_chunked(const char* str, RJP_value* prev_chunk);

 //Initialize a root RJP_value to NULL
 RJP_value* rjp_init_json(void);
--- a/src/input.c
+++ b/src/input.c
@ -24,14 +24,17 @@
 #include "memory.h"
 #include <stdlib.h> //strtod, strtol
 #include <stdio.h>  //fprintf, stderr
+#include <string.h> //memset

 //types of searches in the text
 typedef enum json_search_target{
-	json_key,
-	json_colon,
-	json_comma,
-	json_value,
-	json_none
+	json_target_key, //expecting '"' opening a member key, or '}' closing the object
+	json_target_colon, //expecting ':' between a key and its value
+	json_target_comma, //expecting ',' or a closing '}' / ']'
+	json_target_value, //expecting the start of any value
+	json_target_string, //not referenced by the visible parser code; presumably for resuming a string split across chunks - TODO confirm
+	json_target_numeral, //not referenced by the visible parser code; presumably for resuming a number split across chunks - TODO confirm
+	json_target_none //a root scalar has been parsed; only whitespace is accepted afterwards
 }json_search_target;

 static RJP_value* _rjp__add_value(RJP_value* curr, RJP_value new_val){
@ -49,234 +52,324 @@ static RJP_value* _rjp__add_value(RJP_value* curr, RJP_value new_val){
 	curr->object.last->value = new_val;
 	return &curr->object.last->value;
 }
-#define syntax_error(msg, row, column)\
-	do{DIAG_PRINT(stderr, "Syntax error! %s (%i:%i)\n", msg, row, column);rjp_free_value(root);return NULL;}while(0)
-
 #define MAX_DEPTH 16
-RJP_value* rjp_parse(const char* str){
-	RJP_value* root = 0;
-	RJP_value* curr = 0;
-	int row = 1, column = 0;
-	int in_line_comment = 0;
-	int in_block_comment = 0;

-	//keep track of where we are in a given subobject
-	int state_stack[MAX_DEPTH] = {0},*top = state_stack;
+typedef struct RJP_string_state{ //bookkeeping for a string that is still being read
+	int escaped; //presumably nonzero when the previous character was a backslash; not referenced in the visible code - TODO confirm
+	int in_utf_sequence; //presumably nonzero while inside a unicode escape; not referenced in the visible code - TODO confirm
+	char* buffer; //store partial string here only when chunked reading and chunk ends mid string
+}RJP_string_state;
+
+typedef struct RJP_numeral_state{ //bookkeeping for a number that is still being read
+	int numlen; //presumably the count of numeral characters collected so far; not referenced in the visible code - TODO confirm
+	char* buffer; //store partial number string here only when chunked reading and chunk ends mid number
+}RJP_numeral_state;
+
+typedef struct RJP_parse_state{ //everything the parser carries from one character to the next
+	RJP_value* root; //top level value; NULL until the first value has been created
+	RJP_value* curr; //object or array that new members and values are added to
+	union{ //string and numeral scratch share storage, so only one of them is meaningful at a time
+		RJP_string_state str_state;
+		RJP_numeral_state num_state;
+	};
+	int row, column; //position in the input, printed in syntax error messages
+	int in_line_comment; //nonzero while skipping a line comment
+	int in_block_comment; //nonzero while skipping a block comment
+	int target_stack[MAX_DEPTH]; //json_search_target values; a new slot is taken for each nested object
+	int* target; //slot of target_stack currently in use
+}RJP_parse_state;
+
+//Reset a parse state so that parsing starts at the beginning of a document
+//state: state to initialize; every field is overwritten, nothing is read or freed
+void _rjp__init_parse_state(RJP_parse_state* state){
+	//zero everything first so the string/numeral scratch union and the
+	//comment flags never hold garbage
+	memset(state, 0, sizeof(*state));
+	state->root = NULL;
+	state->curr = NULL;
+	//rows are reported 1-based, as the parser did before it was split into handlers
+	state->row = 1;
+	state->column = 0;
+	state->target = state->target_stack;
+}
+
+//Report a syntax error at the current position and discard the partially built tree
+//msg: description of the problem
+//state: parse state; its root is released and root/curr are cleared
+static void syntax_error(const char* msg, RJP_parse_state* state){
+	DIAG_PRINT(stderr, "Syntax error! %s (%i:%i)\n", msg, state->row, state->column);
+	rjp_free_value(state->root);
+	//the tree has been released: clear the pointers so that no caller can
+	//return or walk it afterwards
+	state->root = NULL;
+	state->curr = NULL;
+}
+
+//Consume the piece of a comment that starts at str, if there is one
+//str: current position in the NUL terminated input
+//state: parse state; the in_line_comment / in_block_comment flags are updated
+//Returns the number of characters handled (1 or 2), or 0 when str is neither
+//inside a comment nor at the start of one
+int _rjp__handle_comment(const char* str, RJP_parse_state* state){
+	char c = *str;
+	if(state->in_line_comment){
+		//a line comment runs up to and including the newline
+		if(c == '\n')
+			state->in_line_comment = 0;
+		return 1;
+	}else if(state->in_block_comment){
+		//star followed by slash closes the block comment
+		if(c == '*' && *(str+1) == '/'){
+			state->in_block_comment = 0;
+			return 2;
+		}
+		return 1;
+	//slash followed by star opens a block comment
+	}else if(c == '/' && *(str+1) == '*'){
+		state->in_block_comment = 1;
+		return 2;
+	//two slashes open a line comment
+	}else if(c == '/' && *(str+1) == '/'){
+		state->in_line_comment = 1;
+		return 2;
+	}
+	return 0;
+}
+//Look for the start of a member key inside the current object
+//str: current position in the input
+//state: parse state; on success the key is added to state->curr and the
+//       target becomes json_target_colon
+//Returns the number of characters consumed, or -1 on error
+int _rjp__handle_key(const char* str, RJP_parse_state* state){
+	char c = *str;
+	//start of key
+	if(c == '"'){
+		if(state->curr == NULL){
+			syntax_error("Key found outside of object definition!", state);
+			return -1;
+		}
+
+		int keylen;
+		int inclen;
+		char* new_string = _rjp__parse_string(state->root, str+1, &inclen, &keylen, &state->row, &state->column);
+		if(!new_string){
+			if(!keylen)
+				syntax_error("Cannot have empty key name!", state);
+			return -1;
+		}
+		_rjp__add_member_no_alloc(&(state->curr->object), new_string, keylen);
+		*state->target = json_target_colon;
+		//same advance as the code this replaces: one step past the opening
+		//quote, inclen more, plus the step the parse loop used to add itself
+		return inclen+2;
+	//end of this object (object is empty)
+	}else if(c == '}'){
+		//curr can be NULL here just as in the '"' branch above; refuse
+		//instead of dereferencing it
+		if(state->curr == NULL){
+			syntax_error("Unexpected '}' outside of object definition!", state);
+			return -1;
+		}
+		state->curr = state->curr->parent;
+		if(state->target != state->target_stack)
+			--state->target;
+		return 1;
+
+	//unrecognized character
+	}else if(!_rjp__is_whitespace(c)){
+		syntax_error("Unexpected character, expected '\"'!", state);
+		return -1;
+	}
+	return 1;
+}
+
+//Look for the ':' that separates a key from its value
+//Returns 1 (one character consumed), or -1 after reporting a syntax error
+int _rjp__handle_colon(const char* str, RJP_parse_state* state){
+	const char ch = str[0];
+	if(ch != ':'){
+		//only whitespace may sit between the key and the colon
+		if(!_rjp__is_whitespace(ch)){
+			syntax_error( "Unexpected character, expected ':'!", state);
+			return -1;
+		}
+		return 1;
+	}
+	//colon found, a value has to follow
+	*state->target = json_target_value;
+	return 1;
+}
+//Look for what may follow a value: ',' or the end of the current container
+//str: current position in the input
+//state: parse state; target and curr are updated
+//Returns 1 (one character consumed), or -1 after reporting a syntax error
+int _rjp__handle_comma(const char* str, RJP_parse_state* state){
+	char c = *str;
+	//every branch below reads state->curr; it is NULL once the outermost
+	//container has been closed (assuming the root has no parent), so a
+	//separator or closer at that point is an error rather than a crash
+	if(state->curr == NULL && (c == ',' || c == '}' || c == ']')){
+		syntax_error("Unexpected character!", state);
+		return -1;
+	}
+	//comma separating keys in an object or values in an array
+	if(c == ','){
+		*state->target = (state->curr->type == json_array ? json_target_value : json_target_key);
+
+	//end of object
+	}else if(c == '}'){
+		if(state->curr->type == json_array){
+			syntax_error("Unexpected end of object within array!", state);
+			return -1;
+		}
+		state->curr = state->curr->parent;
+		if(state->target != state->target_stack)
+			--state->target;
+	//end of array
+	}else if(c == ']' && state->curr->type == json_array){
+		state->curr = state->curr->parent;
+	//unrecognized character
+	}else if(!_rjp__is_whitespace(c)){
+		syntax_error("Unexpected character, expected ','!", state);
+		return -1;
+	}
+	return 1;
+}
+
+//Look for the start of a value: object, array, string, number, true, false or null
+//str: current position in the NUL terminated input
+//state: parse state; the new value is attached to state->curr (or becomes
+//       the root) and the target is updated for what must follow
+//Returns the number of characters consumed, or -1 on error
+int _rjp__handle_value(const char* str, RJP_parse_state* state){
+	//object
+	char c = *str;
+	if(c == '{'){
+		if(!state->root){
+			state->root = _rjp__add_value(NULL, rjp_object());
+			state->curr = state->root;
+			*state->target = json_target_key;
+		}else{
+			//a nested object takes a new slot on the target stack; refuse to
+			//run off the end of target_stack
+			if(state->target - state->target_stack >= MAX_DEPTH-1){
+				syntax_error("Maximum object nesting depth exceeded!", state);
+				return -1;
+			}
+			state->curr = _rjp__add_value(state->curr, rjp_object());
+			*state->target = json_target_comma;
+			++state->target;
+			*state->target = json_target_key;
+		}
+		return 1;
+	}
+	else if(c == '['){
+		if(!state->root){
+			state->root = _rjp__add_value(NULL, rjp_array());
+			state->curr = state->root;
+
+		}else{
+			state->curr = _rjp__add_value(state->curr, rjp_array());
+		}
+		return 1;
+	}
+	//curr is NULL before any container exists; a stray ']' then falls
+	//through to the unrecognized character error below
+	else if(c == ']' && state->curr && state->curr->type == json_array){ //empty array
+		*state->target = json_target_comma;
+		state->curr = state->curr->parent;
+		return 1;
+	}
+	//strings
+	else if(c == '"'){
+		int vallen, inclen;
+		char* new_string = _rjp__parse_string(state->root, str+1, &inclen, &vallen, &state->row, &state->column);
+		if(!new_string){
+			if(vallen == 0){
+				//empty string value: store a single NUL byte
+				new_string = rjp_calloc(1, 1);
+			}else{
+				return -1;
+			}
+		}
+		_rjp__add_value(state->curr, rjp_string(new_string, vallen));
+		*state->target = json_target_comma;
+		return inclen+2;
+	}
+	//numbers
+	else if((c >= '0' && c <= '9') || c == '-'){
+		//a number with no enclosing container is the whole document
+		if(!state->curr)
+			*state->target = json_target_none;
+		else
+			*state->target = json_target_comma;
+		int numlen;
+		int floating = 0; //is an int or a double
+		for(numlen = 1;*(str+numlen) >= '0' && *(str+numlen) <= '9';++numlen);
+		if(*(str+numlen) == '.'){ //if we have a decimal, make it a double and continue parsing as a number
+			int i = ++numlen;
+			for(;*(str+numlen) >= '0' && *(str+numlen) <= '9';++numlen);
+			if(i == numlen){ //no number after decimal
+				syntax_error("Missing numerals after decimal place!", state);
+				return -1;
+			}
+			floating = 1;
+		}
+		if(*(str+numlen) == '\0' && state->curr){ //hit EOF early
+			syntax_error("Unexpected EOF before end of object!", state);
+			return -1;
+		}
+		if(c == '-' && numlen == 1){ //only have a '-' with no numbers
+			syntax_error("Missing numerals after '-' sign!", state);
+			return -1;
+		}
+		if(floating){
+			if(!state->root){
+				state->root = state->curr = _rjp__add_value(NULL, rjp_dfloat(strtod(str, NULL)));
+			}else{
+				_rjp__add_value(state->curr, rjp_dfloat(strtod(str, NULL)));
+			}
+		}else{
+			if(!state->root){
+				state->root = state->curr = _rjp__add_value(NULL, rjp_integer(strtol(str, NULL, 10)));
+			}else{
+				_rjp__add_value(state->curr, rjp_integer(strtol(str, NULL, 10)));
+			}
+		}
+		state->column += numlen;
+		return numlen;
+	}
+	//booleans and null
+	else if(!strncmp(str, "true", 4)){
+		if(!state->curr){
+			*state->target = json_target_none;
+			state->root = state->curr = _rjp__add_value(state->curr, rjp_boolean(1));
+		}else{
+			*state->target = json_target_comma;
+			_rjp__add_value(state->curr, rjp_boolean(1));
+		}
+		state->column += 3;
+		return 4;
+	}else if(!strncmp(str, "false", 5)){
+		if(!state->curr){
+			*state->target = json_target_none;
+			state->root = state->curr = _rjp__add_value(state->curr, rjp_boolean(0));
+		}else{
+			*state->target = json_target_comma;
+			_rjp__add_value(state->curr, rjp_boolean(0));
+		}
+		state->column += 4;
+		return 5;
+	}else if(!strncmp(str, "null", 4)){
+		if(!state->curr){
+			*state->target = json_target_none;
+			state->root = state->curr = _rjp__add_value(state->curr, rjp_null());
+		}else{
+			*state->target = json_target_comma;
+			_rjp__add_value(state->curr, rjp_null());
+		}
+		state->column += 3;
+		return 4;
+	}
+	//unrecognized character
+	else if(!_rjp__is_whitespace(c)){
+		syntax_error("Unexpected character!", state);
+		return -1;
+	}
+	return 1;
+}
+
+//Parse a complete, NUL terminated json document
+//str: the json text
+//Returns the root value, or NULL when a syntax error was reported or the
+//input contained no value
+RJP_value* rjp_parse(const char* str){
+	RJP_parse_state state;
+	_rjp__init_parse_state(&state);

 	//initially search for the root object
-	*top = json_value;
+	*state.target = json_target_value;

-	for(;*str != '\0';++str){
+	int inc = 0;
+	for(;*str != '\0';str += inc){
 		char c = *str;

 		//keep track of position in input file
 		if(c == '\n'){
-			++row;
-			column = 0;
+			++state.row;
+			state.column = 0;
 		}else{
-			++column;
+			++state.column;
 		}

-		//Handle comments
-		if(in_line_comment){
-			if(c == '\n')
-				in_line_comment = 0;
-		}
-		else if(in_block_comment){
-			if(c == '*' && *(str+1) == '/'){
-				in_block_comment = 0;
-				++str;
-			}
-		}
-		else if(c == '/' && *(str+1) == '/'){
-			in_line_comment = 1;
-			++str;
-		}
-		else if(c == '/' && *(str+1) == '*'){
-			in_block_comment = 1;
-			++str;
+		if((inc = _rjp__handle_comment(str, &state))){
+			continue;
 		}

-		else if(*top == json_key){
-			//start of key
-			if(c == '"'){
-				if(curr == NULL)
-					syntax_error("Key found outside of object definition!", row, column);
-
-				int keylen;
-				int inclen;
-				char* new_string = _rjp__parse_string(root, ++str, &inclen, &keylen, &row, &column);
-				if(!new_string){
-					if(!keylen)
-						syntax_error("Cannot have empty key name!", row, column);
-					return NULL;
-				}
-				_rjp__add_member_no_alloc(&curr->object, new_string, keylen);
-				str += inclen;
-				*top = json_colon;
-			//end of this object (object is empty)
-			}else if(c == '}'){
-				curr = curr->parent;
-				if(top != state_stack)
-					--top;
-
-			//unrecognized character
-			}else if(!_rjp__is_whitespace(c)){
-				syntax_error("Unexpected character, expected '\"'!", row, column);
+		switch(*state.target){
+		case json_target_key:
+			inc = _rjp__handle_key(str, &state);
+			break;
+		case json_target_colon:
+			inc = _rjp__handle_colon(str, &state);
+			break;
+		case json_target_comma:
+			inc = _rjp__handle_comma(str, &state);
+			break;
+		case json_target_value:
+			inc = _rjp__handle_value(str, &state);
+			break;
+		case json_target_none:
+			if(!_rjp__is_whitespace(*str)){
+				syntax_error("Unexpected character!", &state);
+				return NULL;
 			}
-		}
-		else if(*top == json_colon){
-			//colon after a key
-			if(c == ':'){
-				*top = json_value;
-			//unrecognized character
-			}else if(!_rjp__is_whitespace(c)){
-				syntax_error( "Unexpected character, expected ':'!", row, column);
-			}
-		}
-		else if(*top == json_comma){
-			//comma separating keys in an object or values in an array
-			if(c == ','){
-				*top = (curr->type == json_array ? json_value : json_key);
-
-			//end of object
-			}else if(c == '}'){
-				if(curr->type == json_array){
-					syntax_error("Unexpected end of object within array!", row, column);
-				}
-				curr = curr->parent;
-				if(top != state_stack)
-					--top;
-			//end of array
-			}else if(c == ']' && curr->type == json_array){
-				curr = curr->parent;
-			//unrecognized character
-			}else if(!_rjp__is_whitespace(c)){
-				syntax_error("Unexpected character, expected ','!", row, column);
-			}
-		}
-		else if(*top == json_value){
-			//object
-			if(c == '{'){
-				if(!root){
-					root = _rjp__add_value(NULL, rjp_object());
-					curr = root;
-					*top = json_key;
-				}else{
-					curr = _rjp__add_value(curr, rjp_object());
-					*top = json_comma;
-					++top;
-					*top = json_key;
-				}
-			}
-			else if(c == '['){
-				if(!root){
-					root = _rjp__add_value(NULL, rjp_array());
-					curr = root;
-
-				}else{
-					curr = _rjp__add_value(curr, rjp_array());
-				}
-			}
-			else if(c == ']' && curr->type == json_array){ //empty array
-				*top = json_comma;
-				curr = curr->parent;
-			}
-			//strings
-			else if(c == '"'){
-				int vallen, inclen;
-				++str;
-				char* new_string = _rjp__parse_string(root, str, &inclen, &vallen, &row, &column);
-				if(!new_string){
-					if(vallen == 0){
-						new_string = rjp_calloc(1, 1);
-					}else{
-						return NULL;
-					}
-				}
-				_rjp__add_value(curr, rjp_string(new_string, vallen));
-				str += inclen;
-				*top = json_comma;
-			}
-			//numbers
-			else if((c >= '0' && c <= '9') || c == '-'){
-				if(!curr)
-					*top = json_none;
-				else
-					*top = json_comma;
-				int numlen;
-				int floating = 0; //is an int or a double
-				for(numlen = 1;*(str+numlen) >= '0' && *(str+numlen) <= '9';++numlen);
-				if(*(str+numlen) == '.'){ //if we have a decimal, make it a double and continue parsing as a number
-					int i = ++numlen;
-					for(;*(str+numlen) >= '0' && *(str+numlen) <= '9';++numlen);
-					if(i == numlen){ //no number after decimal
-						syntax_error("Missing numerals after decimal place!", row, column);
-					}
-					floating = 1;
-				}
-				if(*(str+numlen) == '\0' && curr){ //hit EOF early
-					syntax_error("Unexpected EOF before end of object!", row, column);
-				}
-				if(c == '-' && numlen == 1){ //only have a '-' with no numbers
-					syntax_error("Missing numerals ofter '-' sign!", row, column);
-				}
-				if(floating){
-					if(!root){
-						root = curr = _rjp__add_value(NULL, rjp_dfloat(strtod(str, NULL)));
-					}else{
-						_rjp__add_value(curr, rjp_dfloat(strtod(str, NULL)));
-					}
-				}else{
-					if(!root){
-						root = curr = _rjp__add_value(NULL, rjp_integer(strtol(str, NULL, 10)));
-					}else{
-						_rjp__add_value(curr, rjp_integer(strtol(str, NULL, 10)));
-					}
-				}
-				str += (numlen-1);
-				column += numlen;
-			}
-			//booleans and null
-			else if(!strncmp(str, "true", 4)){
-				if(!curr){
-					*top = json_none;
-					root = curr = _rjp__add_value(curr, rjp_boolean(1));
-				}else{
-					*top = json_comma;
-					_rjp__add_value(curr, rjp_boolean(1));
-				}
-				str += 3;column += 3;
-			}else if(!strncmp(str, "false", 5)){
-				if(!curr){
-					*top = json_none;
-					root = curr = _rjp__add_value(curr, rjp_boolean(0));
-				}else{
-					*top = json_comma;
-					_rjp__add_value(curr, rjp_boolean(0));
-				}
-				str += 4;column += 4;
-			}else if(!strncmp(str, "null", 4)){
-				if(!curr){
-					*top = json_none;
-					root = curr = _rjp__add_value(curr, rjp_null());
-				}else{
-					*top = json_comma;
-					_rjp__add_value(curr, rjp_null());
-				}
-				str += 3;column += 3;
-			}
-			//unrecognized character
-			else if(!_rjp__is_whitespace(c)){
-				syntax_error("Unexpected character!", row, column);
-			}
-		}else if(*top == json_none && !_rjp__is_whitespace(c)){
-			syntax_error("Unexpected character!", row, column);
-		}
+			inc = 1;
+			break;
+		default:
+			inc = 1;
+			break;
+		};
+		//handlers return -1 once they have given up on the input; stop here
+		//instead of stepping str backwards and parsing on
+		if(inc < 0)
+			return NULL;
 	}
-	return root;
+	return state.root;
+}
+//Parse one chunk of json text
+//With no previous chunk (prev_chunk == NULL) str is parsed as a complete
+//document by rjp_parse; continuing from a previous chunk is not implemented
+//and yields NULL
+RJP_value* rjp_parse_chunked(const char* str, RJP_value* prev_chunk){
+	if(prev_chunk != NULL)
+		return NULL;
+	return rjp_parse(str);
 }
-
-#undef syntax_error
-
-