Added support for unicode escape sequences

2018-12-15 05:33:44 -08:00 · 2018-12-15 05:33:44 -08:00 · 20e46d2af2
commit 20e46d2af2
parent 383cb5af54
4 changed files with 204 additions and 11 deletions
--- a/include/rjp_internal.h
+++ b/include/rjp_internal.h
@ -1,3 +1,21 @@
 /**
 	rjp
 	Copyright (C) 2018 rexy712
 	This program is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
 	the Free Software Foundation, either version 3 of the License, or
 	(at your option) any later version.
 	This program is distributed in the hope that it will be useful,
 	but WITHOUT ANY WARRANTY; without even the implied warranty of
 	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 	GNU General Public License for more details.
 	You should have received a copy of the GNU General Public License
 	along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef RJP_INTERNAL_H
 #define RJP_INTERNAL_H
--- a/src/input.c
+++ b/src/input.c
@ -17,7 +17,6 @@
 */
 //TODO: Scientific notation
 //TODO: \e escape sequence in strings
 #include "rjp.h"
 #include "rjp_internal.h"
--- a/src/output.c
+++ b/src/output.c
@ -1,3 +1,21 @@
 /**
 	rjp
 	Copyright (C) 2018 rexy712
 	This program is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
 	the Free Software Foundation, either version 3 of the License, or
 	(at your option) any later version.
 	This program is distributed in the hope that it will be useful,
 	but WITHOUT ANY WARRANTY; without even the implied warranty of
 	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 	GNU General Public License for more details.
 	You should have received a copy of the GNU General Public License
 	along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "rjp.h"
 #include "rjp_internal.h"
--- a/src/strings.c
+++ b/src/strings.c
@ -1,32 +1,178 @@
 /**
 	rjp
 	Copyright (C) 2018 rexy712
 	This program is free software: you can redistribute it and/or modify
 	it under the terms of the GNU General Public License as published by
 	the Free Software Foundation, either version 3 of the License, or
 	(at your option) any later version.
 	This program is distributed in the hope that it will be useful,
 	but WITHOUT ANY WARRANTY; without even the implied warranty of
 	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 	GNU General Public License for more details.
 	You should have received a copy of the GNU General Public License
 	along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "rjp.h"
 #include "rjp_internal.h"
 #include <stdio.h> //fprintf
 #include <stdlib.h> //malloc, free
 #include <stdint.h>
 //Parse exactly four hexadecimal digits (either case) starting at c.
 //The first digit is the most significant nibble of the 16 bit result.
 //Returns 0 as soon as a character that is not a hex digit is met, so the
 //input "0000" cannot be told apart from a failed parse.
 static uint32_t utf_strtol_4(const char* c){
 	uint32_t value = 0;
 	for(int shift = 12;shift >= 0;shift -= 4, ++c){
 		const char ch = *c;
 		uint32_t nibble;
 		if(ch >= '0' && ch <= '9'){
 			nibble = (uint32_t)(ch - '0');
 		}else if(ch >= 'A' && ch <= 'F'){
 			nibble = (uint32_t)(ch - 'A') + 10;
 		}else if(ch >= 'a' && ch <= 'f'){
 			nibble = (uint32_t)(ch - 'a') + 10;
 		}else{
 			return 0;
 		}
 		value |= nibble << shift;
 	}
 	return value;
 }
 //Decode one "\uXXXX" escape, or a "\uXXXX\uXXXX" UTF-16 surrogate pair,
 //starting at str.
 //On success the first code unit is stored in *high; *low receives the
 //trailing surrogate of a pair, or 0 when the escape is a single code unit.
 //The return value is the number of input characters consumed: 6 for a
 //single escape, 12 for a pair.
 //On failure both outputs are zeroed and 0 is returned, so callers must check
 //the result before using it as an offset. Failure covers a missing "\u"
 //prefix, a first unit that utf_strtol_4 reports as 0, a trailing surrogate
 //with no leading one, and a leading surrogate that is not immediately
 //followed by an escape in the range DC00..DFFF.
 static int decode_unicode_escape(const char* str, uint32_t* high, uint32_t* low){
 	*high = 0;
 	*low = 0;
 	if(str[0] != '\\' || str[1] != 'u'){ //invalid
 		return 0;
 	}
 	uint32_t first = utf_strtol_4(str+2);
 	if(!first){
 		return 0;
 	}
 	if((first & 0xF800) != 0xD800){ //not a surrogate, stands on its own
 		*high = first;
 		return 6;
 	}
 	if((first & 0xFC00) != 0xD800){ //a trailing surrogate cannot start a pair
 		return 0;
 	}
 	//utf_strtol_4 succeeded, so str[2..5] are hex digits and str[6] is in bounds
 	if(str[6] != '\\' || str[7] != 'u'){
 		return 0;
 	}
 	uint32_t second = utf_strtol_4(str+8);
 	if((second & 0xFC00) != 0xDC00){ //also rejects the 0 of a failed parse
 		return 0;
 	}
 	*high = first;
 	*low = second;
 	return 12;
 }
 //Combine a UTF-16 surrogate pair into the code point it encodes.
 //high is expected in 0xD800..0xDBFF and low in 0xDC00..0xDFFF; only the low
 //ten bits of each are used. The result is 0x10000 plus the 20 bit value
 //built from those bits. The offset has to be added rather than OR'd in:
 //the shifted payload of any leading surrogate >= 0xD840 already has bit 16
 //set, and OR would silently drop the carry.
 static uint32_t u16_surrogate_pair_to_codepoint(uint32_t high, uint32_t low){
 	uint32_t payload = ((high & 0x03FF) << 10) | (low & 0x03FF);
 	return payload + 0x10000;
 }
 //Turn the code unit(s) of a decoded escape into a single code point.
 //A low of 0 means high is a lone code unit and is returned unchanged;
 //otherwise the two values are combined as a UTF-16 surrogate pair.
 static uint32_t utf_to_codepoint(uint32_t high, uint32_t low){
 	return low ? u16_surrogate_pair_to_codepoint(high, low) : high;
 }
 //Number of bytes codepoint_to_u8 writes for codepoint: 1 to 4,
 //or 0 when the value lies above 0x10FFFF.
 static int codepoint_strlen(uint32_t codepoint){
 	if(codepoint > 0x10FFFF){
 		return 0;
 	}
 	if(codepoint > 0xFFFF){
 		return 4;
 	}
 	if(codepoint > 0x07FF){
 		return 3;
 	}
 	if(codepoint > 0x007F){
 		return 2;
 	}
 	return 1;
 }
 //Encode codepoint as UTF-8 into dest and return the number of bytes written
 //(1 to 4). No terminator is appended. Returns 0 and leaves dest untouched
 //when codepoint is above 0x10FFFF.
 static int codepoint_to_u8(char* dest, uint32_t codepoint){
 	int length;
 	uint32_t lead_marker;
 	if(codepoint <= 0x007F){
 		length = 1;
 		lead_marker = 0x00;
 	}else if(codepoint <= 0x07FF){
 		length = 2;
 		lead_marker = 0xC0;
 	}else if(codepoint <= 0xFFFF){
 		length = 3;
 		lead_marker = 0xE0;
 	}else if(codepoint <= 0x10FFFF){
 		length = 4;
 		lead_marker = 0xF0;
 	}else{
 		return 0;
 	}
 	//fill the continuation bytes back to front, six payload bits each
 	uint32_t remaining = codepoint;
 	for(int pos = length-1;pos > 0;--pos){
 		dest[pos] = (remaining & 0x3F) | 0x80;
 		remaining >>= 6;
 	}
 	//whatever is left belongs in the lead byte
 	dest[0] = remaining | lead_marker;
 	return length;
 }
 //Decode the UTF-8 sequence starting at u into a code point.
 //The lead byte selects a length of one to four bytes and every following
 //byte must be a continuation byte (10xxxxxx).
 //Returns 0 for an invalid lead byte or a missing/invalid continuation byte.
 //Continuation bytes are checked in order, so a sequence cut short by the
 //string terminator is rejected without reading past it.
 static uint32_t u8_to_codepoint(const char* u){
 	//work on unsigned bytes so the masks do not depend on the signedness of char
 	const unsigned char* bytes = (const unsigned char*)u;
 	uint32_t codepoint;
 	int trailing;
 	if((bytes[0] & 0x80) == 0){
 		//one byte
 		return bytes[0];
 	}else if((bytes[0] & 0xE0) == 0xC0){
 		//two byte
 		codepoint = bytes[0] & 0x1F;
 		trailing = 1;
 	}else if((bytes[0] & 0xF0) == 0xE0){
 		//three byte
 		codepoint = bytes[0] & 0x0F;
 		trailing = 2;
 	}else if((bytes[0] & 0xF8) == 0xF0){
 		//four byte
 		codepoint = bytes[0] & 0x07;
 		trailing = 3;
 	}else{
 		//invalid
 		return 0;
 	}
 	for(int i = 1;i <= trailing;++i){
 		if((bytes[i] & 0xC0) != 0x80){
 			return 0;
 		}
 		codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
 	}
 	return codepoint;
 }
 //Convert escape sequences in strings
 char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, int* column){
 	char* new_string;
 	++(*column); //account for starting quotation mark
-	for(*len = 0;*(str+*len) != '"';++(*len), ++(*column)){
+	int oldpos = 0;
-		if(*(str+*len) == '\\'){
+	int newpos = 0;
-			++(*len);
+	for(;*(str+oldpos) != '"';++oldpos, ++(newpos), ++(*column)){
-			++(*column);
+		if(*(str+oldpos) == '\\'){
-		}else if(*(str+*len) == '\0'){
+			if(*(str+oldpos+1) == 'u'){
-			*len = 1;
+				uint32_t high, low;
 				oldpos += (decode_unicode_escape(str+oldpos, &high, &low)-1);
 				newpos += (codepoint_strlen(utf_to_codepoint(high, low))-1);
 			}else{
 				++oldpos;
 				++(*column);
 			}
 		}else if(*(str+oldpos) == '\0'){
 			newpos = 1;
 			fprintf(stderr, "Syntax error! %s (%i:%i)\n", "Unexpected EOF in string!", *row, *column);
 			rjp_free_value(root);
 			return NULL;
-		}else if(*(str+*len) == '\n'){
+		}else if(*(str+oldpos) == '\n'){
 			++(*row);
 			*column = 0;
 		}
 	}
-	if(*len == 0){
+	*len = oldpos;
 	if(newpos == 0){
 		return NULL;
 	}
-	new_string = rjp_alloc(*len + 1);
+	new_string = rjp_alloc(newpos + 1);
-	new_string[*len] = 0;
+	new_string[newpos] = 0;
 	for(int i = 0;*str != '"';++i,++str){
 		if(*str == '\\'){
 			++str;
@ -52,6 +198,18 @@ char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, i
 			case 'f':
 				new_string[i] = '\f';
 				break;
 			case 'u':;
 				uint32_t high, low;
 				uint32_t codepoint;
 				--str;
 				str += (decode_unicode_escape(str, &high, &low) - 1);
 				if(!high){
 					rjp_free(new_string);
 					return NULL;
 				}
 				codepoint = utf_to_codepoint(high, low);
 				i += (codepoint_to_u8(new_string+i, codepoint)-1);
 				break;
 			default:
 				new_string[i] = *str;
 				break;