Added support for unicode escape sequences

This commit is contained in:
rexy712 2018-12-15 05:33:44 -08:00
parent 383cb5af54
commit 20e46d2af2
4 changed files with 204 additions and 11 deletions

View File

@ -1,3 +1,21 @@
/**
rjp
Copyright (C) 2018 rexy712
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef RJP_INTERNAL_H #ifndef RJP_INTERNAL_H
#define RJP_INTERNAL_H #define RJP_INTERNAL_H

View File

@ -17,7 +17,6 @@
*/ */
//TODO: Scientific notation //TODO: Scientific notation
//TODO: \e escape sequence in strings
#include "rjp.h" #include "rjp.h"
#include "rjp_internal.h" #include "rjp_internal.h"

View File

@ -1,3 +1,21 @@
/**
rjp
Copyright (C) 2018 rexy712
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "rjp.h" #include "rjp.h"
#include "rjp_internal.h" #include "rjp_internal.h"

View File

@ -1,32 +1,178 @@
/**
rjp
Copyright (C) 2018 rexy712
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "rjp.h" #include "rjp.h"
#include "rjp_internal.h" #include "rjp_internal.h"
#include <stdio.h> //fprintf #include <stdio.h> //fprintf
#include <stdlib.h> //malloc, free #include <stdlib.h> //malloc, free
#include <stdint.h>
//Parse exactly four hexadecimal digits (upper or lower case) starting at c
//and return their value. Returns 0 as soon as a non-hex character is met,
//so an input of "0000" cannot be told apart from a parse failure.
static uint32_t utf_strtol_4(const char* c){
	uint32_t value = 0;
	for(size_t pos = 0;pos < 4;++pos){
		const char ch = c[pos];
		uint32_t nibble;
		if(ch >= '0' && ch <= '9'){
			nibble = (uint32_t)(ch - '0');
		}else if(ch >= 'A' && ch <= 'F'){
			nibble = (uint32_t)(ch - 'A') + 10;
		}else if(ch >= 'a' && ch <= 'f'){
			nibble = (uint32_t)(ch - 'a') + 10;
		}else{
			return 0;
		}
		//most significant digit comes first, so shift in from the right
		value = (value << 4) | nibble;
	}
	return value;
}
//Decode one "\uXXXX" escape, or a "\uXXXX\uXXXX" UTF-16 surrogate pair,
//located at str.
//
//On success:
//  - a non-surrogate escape stores its value in *high, sets *low to 0 and
//    returns 6 (the number of source characters consumed);
//  - a surrogate pair stores the high surrogate (0xD800-0xDBFF) in *high,
//    the low surrogate (0xDC00-0xDFFF) in *low and returns 12.
//On failure both *high and *low are set to 0 and 0 is returned. Failure
//covers: str not starting with "\u", non-hex digits, a lone low surrogate,
//a high surrogate not followed by a second "\u" escape, and a second escape
//that is not a low surrogate.
//Because utf_strtol_4 returns 0 for both "0000" and invalid digits, the
//escape "\u0000" is also reported as a failure.
static int decode_unicode_escape(const char* str, uint32_t* high, uint32_t* low){
	*high = 0;
	*low = 0;
	if(str[0] != '\\' || str[1] != 'u'){ //invalid
		return 0;
	}
	const uint32_t first = utf_strtol_4(str+2);
	if(!first){
		return 0;
	}
	if(first < 0xD800 || first > 0xDFFF){
		//ordinary code unit, no pairing required
		*high = first;
		return 6;
	}
	if(first > 0xDBFF){
		//a low surrogate may never start a pair
		return 0;
	}
	//str[2..5] were hex digits, so str[6] is at worst the terminator and
	//str[7] is only read when str[6] is a backslash
	if(str[6] != '\\' || str[7] != 'u'){
		return 0;
	}
	const uint32_t second = utf_strtol_4(str+8);
	if(second < 0xDC00 || second > 0xDFFF){
		//also catches a failed parse of the second escape (value 0)
		return 0;
	}
	*high = first;
	*low = second;
	return 12;
}
//Combine a UTF-16 surrogate pair into the code point it encodes.
//high is expected to be in 0xD800-0xDBFF and low in 0xDC00-0xDFFF; only the
//low ten bits of each are used, so the result is always within
//0x10000-0x10FFFF.
static uint32_t u16_surrogate_pair_to_codepoint(uint32_t high, uint32_t low){
	const uint32_t high_bits = high & 0x03FF;
	const uint32_t low_bits = low & 0x03FF;
	//The 0x10000 offset must be added, not OR-ed: the shifted high bits can
	//already occupy bit 16 (any high surrogate >= 0xD840).
	return 0x10000 + ((high_bits << 10) | low_bits);
}
//Turn the (high, low) result of an escape decode into a single code point.
//A zero low value means high already is the complete code point; otherwise
//the two values are combined as a UTF-16 surrogate pair.
static uint32_t utf_to_codepoint(uint32_t high, uint32_t low){
	return low ? u16_surrogate_pair_to_codepoint(high, low) : high;
}
//Number of bytes codepoint_to_u8 would write for codepoint: 1 to 4, or 0
//when the value is above 0x10FFFF.
static int codepoint_strlen(uint32_t codepoint){
	if(codepoint > 0x10FFFF){
		return 0;
	}
	if(codepoint > 0xFFFF){
		return 4;
	}
	if(codepoint > 0x07FF){
		return 3;
	}
	if(codepoint > 0x007F){
		return 2;
	}
	return 1;
}
//Write codepoint to dest as a UTF-8 byte sequence (no terminator is added).
//Returns the number of bytes written (1 to 4), or 0 without touching dest
//when codepoint is above 0x10FFFF. dest must have room for the sequence.
static int codepoint_to_u8(char* dest, uint32_t codepoint){
	int count;
	uint32_t lead_marker;
	if(codepoint > 0x10FFFF){
		return 0;
	}
	if(codepoint <= 0x007F){
		dest[0] = codepoint;
		return 1;
	}
	if(codepoint <= 0x07FF){
		count = 2;
		lead_marker = 0xC0;
	}else if(codepoint <= 0xFFFF){
		count = 3;
		lead_marker = 0xE0;
	}else{
		count = 4;
		lead_marker = 0xF0;
	}
	//continuation bytes carry six payload bits each, filled back to front
	for(int idx = count-1;idx > 0;--idx){
		dest[idx] = (codepoint & 0x3F) | 0x80;
		codepoint >>= 6;
	}
	//whatever bits remain belong in the lead byte
	dest[0] = codepoint | lead_marker;
	return count;
}
//Decode the UTF-8 sequence starting at u into a code point.
//The sequence length (1 to 4 bytes) is taken from the lead byte; returns 0
//when the lead byte matches none of the four lead patterns. Continuation
//bytes are not validated, only their low six bits are used.
static uint32_t u8_to_codepoint(char* u){
	const unsigned char lead = (unsigned char)u[0];
	uint32_t codepoint;
	int trailing;
	if((lead & 0x80) == 0){
		//one byte
		return lead;
	}
	if((lead & 0xE0) == 0xC0){
		//two byte
		codepoint = lead & 0x1F;
		trailing = 1;
	}else if((lead & 0xF0) == 0xE0){
		//three byte
		codepoint = lead & 0x0F;
		trailing = 2;
	}else if((lead & 0xF8) == 0xF0){
		//four byte
		codepoint = lead & 0x07;
		trailing = 3;
	}else{
		//invalid
		return 0;
	}
	//append six payload bits from each continuation byte
	for(int idx = 1;idx <= trailing;++idx){
		codepoint = (codepoint << 6) | ((unsigned char)u[idx] & 0x3F);
	}
	return codepoint;
}
//Convert escape sequences in strings //Convert escape sequences in strings
char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, int* column){ char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, int* column){
char* new_string; char* new_string;
++(*column); //account for starting quotation mark ++(*column); //account for starting quotation mark
for(*len = 0;*(str+*len) != '"';++(*len), ++(*column)){ int oldpos = 0;
if(*(str+*len) == '\\'){ int newpos = 0;
++(*len); for(;*(str+oldpos) != '"';++oldpos, ++(newpos), ++(*column)){
++(*column); if(*(str+oldpos) == '\\'){
}else if(*(str+*len) == '\0'){ if(*(str+oldpos+1) == 'u'){
*len = 1; uint32_t high, low;
oldpos += (decode_unicode_escape(str+oldpos, &high, &low)-1);
newpos += (codepoint_strlen(utf_to_codepoint(high, low))-1);
}else{
++oldpos;
++(*column);
}
}else if(*(str+oldpos) == '\0'){
newpos = 1;
fprintf(stderr, "Syntax error! %s (%i:%i)\n", "Unexpected EOF in string!", *row, *column); fprintf(stderr, "Syntax error! %s (%i:%i)\n", "Unexpected EOF in string!", *row, *column);
rjp_free_value(root); rjp_free_value(root);
return NULL; return NULL;
}else if(*(str+*len) == '\n'){ }else if(*(str+oldpos) == '\n'){
++(*row); ++(*row);
*column = 0; *column = 0;
} }
} }
if(*len == 0){ *len = oldpos;
if(newpos == 0){
return NULL; return NULL;
} }
new_string = rjp_alloc(*len + 1); new_string = rjp_alloc(newpos + 1);
new_string[*len] = 0; new_string[newpos] = 0;
for(int i = 0;*str != '"';++i,++str){ for(int i = 0;*str != '"';++i,++str){
if(*str == '\\'){ if(*str == '\\'){
++str; ++str;
@ -52,6 +198,18 @@ char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, i
case 'f': case 'f':
new_string[i] = '\f'; new_string[i] = '\f';
break; break;
case 'u':;
uint32_t high, low;
uint32_t codepoint;
--str;
str += (decode_unicode_escape(str, &high, &low) - 1);
if(!high){
rjp_free(new_string);
return NULL;
}
codepoint = utf_to_codepoint(high, low);
i += (codepoint_to_u8(new_string+i, codepoint)-1);
break;
default: default:
new_string[i] = *str; new_string[i] = *str;
break; break;