Added support for unicode escape sequences

2018-12-15 05:33:44 -08:00 · 2018-12-15 05:33:44 -08:00 · 20e46d2af2
commit 20e46d2af2
parent 383cb5af54
4 changed files with 204 additions and 11 deletions
--- a/include/rjp_internal.h
+++ b/include/rjp_internal.h
@ -1,3 +1,21 @@
+/**
+	rjp
+	Copyright (C) 2018 rexy712
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 #ifndef RJP_INTERNAL_H
 #define RJP_INTERNAL_H

--- a/src/input.c
+++ b/src/input.c
@ -17,7 +17,6 @@
 */

 //TODO: Scientific notation
-//TODO: \e escape sequence in strings

 #include "rjp.h"
 #include "rjp_internal.h"
--- a/src/output.c
+++ b/src/output.c
@ -1,3 +1,21 @@
+/**
+	rjp
+	Copyright (C) 2018 rexy712
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 #include "rjp.h"
 #include "rjp_internal.h"

--- a/src/strings.c
+++ b/src/strings.c
@ -1,32 +1,178 @@
+/**
+	rjp
+	Copyright (C) 2018 rexy712
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
 #include "rjp.h"
 #include "rjp_internal.h"

 #include <stdio.h> //fprintf
 #include <stdlib.h> //malloc, free
+#include <stdint.h>
+
+static uint32_t utf_strtol_4(const char* c){
+	uint32_t ret = 0;
+	for(size_t i = 0;i < 4;++i){
+		if(c[i] >= '0' && c[i] <= '9'){
+			ret |= ((c[i] ^ 0x30) << (4*(3-i)));
+		}else if(c[i] >= 'A' && c[i] <= 'F'){
+			ret |= ((c[i] - 0x37) << (4*(3-i)));
+		}else if(c[i] >= 'a' && c[i] <= 'f'){
+			ret |= ((c[i] - 0x57) << (4*(3-i)));
+		}else{
+			return 0;
+		}
+	}
+	return ret;
+}
+
+
+static int decode_unicode_escape(const char* str, uint32_t* high, uint32_t* low){
+	if(*str != '\\' || *(str+1) != 'u'){ //invalid
+		return *low = *high = 0;
+	}
+	*high = utf_strtol_4(str+2);
+	if(!*high)
+		return *low = *high = 0;
+	if((*high & 0xF800) == 0xD800){ //utf-16
+		if(*(str+6) != '\\' || *(str+7) != 'u'){
+			return *low = *high = 0;
+		}
+		*low = utf_strtol_4(str+8);
+		return 12;
+	}else{
+		*low = 0;
+	}
+	return 6;
+}
+
+static uint32_t u16_surrogate_pair_to_codepoint(uint32_t high, uint32_t low){
+	uint32_t codepoint;
+
+	codepoint = ((high & 0x07FF) << 10) | (1 << 16);
+	codepoint = codepoint | (low & 0x03FF);
+
+	return codepoint;
+}
+
+static uint32_t utf_to_codepoint(uint32_t high, uint32_t low){
+	if(!low) //utf8
+		return high;
+	return u16_surrogate_pair_to_codepoint(high, low);
+}
+static int codepoint_strlen(uint32_t codepoint){
+	if(codepoint <= 0x007F){
+		return 1;
+	}else if(codepoint <= 0x07FF){
+		return 2;
+	}else if(codepoint <= 0xFFFF){
+		return 3;
+	}else if(codepoint <= 0x10FFFF){
+		return 4;
+	}else{
+		return 0;
+	}
+}
+
+static int codepoint_to_u8(char* dest, uint32_t codepoint){
+	if(codepoint <= 0x007F){
+		dest[0] = codepoint;
+		return 1;
+	}else if(codepoint <= 0x07FF){
+		dest[0] = (codepoint >> 6) | 0xC0;
+		dest[1] = (codepoint & 0x3F) | 0x80;
+		return 2;
+	}else if(codepoint <= 0xFFFF){
+		dest[0] = (codepoint >> 12) | 0xE0;
+		dest[1] = ((codepoint >> 6) & 0x3F) | 0x80;
+		dest[2] = (codepoint & 0x3F) | 0x80;
+		return 3;
+	}else if(codepoint <= 0x10FFFF){
+		dest[0] = (codepoint >> 18) | 0xF0;
+		dest[1] = ((codepoint >> 12) & 0x3F) | 0x80;
+		dest[2] = ((codepoint >> 6) & 0x3F) | 0x80;
+		dest[3] = (codepoint & 0x3F) | 0x80;
+		return 4;
+	}else{
+		return 0;
+	}
+}
+static uint32_t u8_to_codepoint(char* u){
+	if((u[0] & 0x80) == 0){
+		//one byte
+		return u[0];
+	}else if((u[0] & 0xE0) == 0xC0){
+		//two byte
+		uint32_t codepoint;
+		codepoint = (u[0] & 0x1F) << 6;
+		codepoint |= (u[1] & 0x3F);
+		return codepoint;
+	}else if((u[0] & 0xF0) == 0xE0){
+		//three byte
+		uint32_t codepoint;
+		codepoint = (u[0] & 0x0F) << 12;
+		codepoint |= (u[1] & 0x3F) << 6;
+		codepoint |= (u[2] & 0x3F);
+		return codepoint;
+	}else if((u[0] & 0xF8) == 0xF0){
+		//four byte
+		uint32_t codepoint;
+		codepoint = (u[0] & 0x07) << 18;
+		codepoint |= (u[1] & 0x3F) << 12;
+		codepoint |= (u[2] & 0x3F) << 6;
+		codepoint |= (u[3] & 0x3F);
+		return codepoint;
+	}else{
+		//invalid
+		return 0;
+	}
+}

 //Convert escape sequences in strings
 char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, int* column){
 	char* new_string;
 	++(*column); //account for starting quotation mark
-	for(*len = 0;*(str+*len) != '"';++(*len), ++(*column)){
-		if(*(str+*len) == '\\'){
-			++(*len);
+	int oldpos = 0;
+	int newpos = 0;
+	for(;*(str+oldpos) != '"';++oldpos, ++(newpos), ++(*column)){
+		if(*(str+oldpos) == '\\'){
+			if(*(str+oldpos+1) == 'u'){
+				uint32_t high, low;
+				oldpos += (decode_unicode_escape(str+oldpos, &high, &low)-1);
+				newpos += (codepoint_strlen(utf_to_codepoint(high, low))-1);
+			}else{
+				++oldpos;
 				++(*column);
-		}else if(*(str+*len) == '\0'){
-			*len = 1;
+			}
+		}else if(*(str+oldpos) == '\0'){
+			newpos = 1;
 			fprintf(stderr, "Syntax error! %s (%i:%i)\n", "Unexpected EOF in string!", *row, *column);
 			rjp_free_value(root);
 			return NULL;
-		}else if(*(str+*len) == '\n'){
+		}else if(*(str+oldpos) == '\n'){
 			++(*row);
 			*column = 0;
 		}
 	}
-	if(*len == 0){
+	*len = oldpos;
+	if(newpos == 0){
 		return NULL;
 	}
-	new_string = rjp_alloc(*len + 1);
-	new_string[*len] = 0;
+	new_string = rjp_alloc(newpos + 1);
+	new_string[newpos] = 0;
 	for(int i = 0;*str != '"';++i,++str){
 		if(*str == '\\'){
 			++str;
@ -52,6 +198,18 @@ char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, i
 			case 'f':
 				new_string[i] = '\f';
 				break;
+			case 'u':;
+				uint32_t high, low;
+				uint32_t codepoint;
+				--str;
+				str += (decode_unicode_escape(str, &high, &low) - 1);
+				if(!high){
+					rjp_free(new_string);
+					return NULL;
+				}
+				codepoint = utf_to_codepoint(high, low);
+				i += (codepoint_to_u8(new_string+i, codepoint)-1);
+				break;
 			default:
 				new_string[i] = *str;
 				break;