Added support for unicode escape sequences
This commit is contained in:
parent
383cb5af54
commit
20e46d2af2
@ -1,3 +1,21 @@
|
||||
/**
|
||||
rjp
|
||||
Copyright (C) 2018 rexy712
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef RJP_INTERNAL_H
|
||||
#define RJP_INTERNAL_H
|
||||
|
||||
|
||||
@ -17,7 +17,6 @@
|
||||
*/
|
||||
|
||||
//TODO: Scientific notation
|
||||
//TODO: \e escape sequence in strings
|
||||
|
||||
#include "rjp.h"
|
||||
#include "rjp_internal.h"
|
||||
|
||||
18
src/output.c
18
src/output.c
@ -1,3 +1,21 @@
|
||||
/**
|
||||
rjp
|
||||
Copyright (C) 2018 rexy712
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "rjp.h"
|
||||
#include "rjp_internal.h"
|
||||
|
||||
|
||||
176
src/strings.c
176
src/strings.c
@ -1,32 +1,178 @@
|
||||
/**
|
||||
rjp
|
||||
Copyright (C) 2018 rexy712
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "rjp.h"
|
||||
#include "rjp_internal.h"
|
||||
|
||||
#include <stdio.h> //fprintf
|
||||
#include <stdlib.h> //malloc, free
|
||||
#include <stdint.h>
|
||||
|
||||
//Read exactly four hexadecimal digits (upper or lower case) starting at c.
//Returns the 16 bit value they spell. Returns 0 as soon as a character that
//is not a hex digit is met, which includes the terminating NUL, so nothing
//past the end of a short string is read. A valid "0000" also yields 0.
static uint32_t utf_strtol_4(const char* c){
	uint32_t value = 0;
	for(int pos = 0;pos < 4;++pos){
		const char ch = c[pos];
		uint32_t nibble;
		if(ch >= '0' && ch <= '9'){
			nibble = (uint32_t)(ch - '0');
		}else if(ch >= 'A' && ch <= 'F'){
			nibble = (uint32_t)(ch - 'A') + 10;
		}else if(ch >= 'a' && ch <= 'f'){
			nibble = (uint32_t)(ch - 'a') + 10;
		}else{
			return 0;
		}
		//most significant digit comes first, so shift what we have and append
		value = (value << 4) | nibble;
	}
	return value;
}
|
||||
|
||||
|
||||
//Decode the "\uXXXX" escape at the start of str, together with the second
//"\uXXXX" that must follow it when the first unit is a UTF-16 high surrogate.
//  str:  text positioned on the backslash of the escape
//  high: receives the first 16 bit unit (the only one outside of a pair)
//  low:  receives the low surrogate of a pair, or 0 when there is no pair
//Returns the number of input characters consumed: 6 for a single escape and
//12 for a surrogate pair. On malformed input both *high and *low are set to 0
//and 0 is returned, so callers must check for failure before using the
//result as an offset into the string.
//Malformed input is: a missing "\u" prefix, non-hex digits, a low surrogate
//(0xDC00-0xDFFF) with no preceding high surrogate, or a high surrogate
//(0xD800-0xDBFF) that is not immediately followed by an escaped low surrogate.
//"\u0000" is also reported as a failure because utf_strtol_4 cannot tell a
//zero value from a parse error.
static int decode_unicode_escape(const char* str, uint32_t* high, uint32_t* low){
	//start from the failure state so every early return reports it
	*high = 0;
	*low = 0;

	if(str[0] != '\\' || str[1] != 'u'){
		return 0;
	}
	const uint32_t first = utf_strtol_4(str+2);
	if(!first){
		return 0;
	}

	if(first < 0xD800 || first > 0xDFFF){ //ordinary BMP value, no pair needed
		*high = first;
		return 6;
	}
	if(first >= 0xDC00){ //low surrogate cannot lead a pair
		return 0;
	}

	//first is a high surrogate; the four hex digits above were all valid so
	//str[6] is inside the string (at worst it is the terminating NUL)
	if(str[6] != '\\' || str[7] != 'u'){
		return 0;
	}
	const uint32_t second = utf_strtol_4(str+8);
	if(second < 0xDC00 || second > 0xDFFF){ //also covers a failed parse (0)
		return 0;
	}

	*high = first;
	*low = second;
	return 12;
}
|
||||
|
||||
//Combine a UTF-16 surrogate pair into the code point it encodes.
//  high: leading surrogate, expected in 0xD800-0xDBFF
//  low:  trailing surrogate, expected in 0xDC00-0xDFFF
//Each surrogate carries 10 payload bits; the resulting 20 bit number is
//offset by 0x10000. The offset must be added, not OR'd in: the payload of the
//high surrogate reaches bit 16 as soon as it is 0x40 or larger, so an OR
//would map e.g. D840 DC00 to 0x10000 instead of 0x20000.
//Only the low 10 bits of each argument are used; ranges are not validated.
static uint32_t u16_surrogate_pair_to_codepoint(uint32_t high, uint32_t low){
	const uint32_t lead = (high & 0x03FF) << 10;
	const uint32_t trail = low & 0x03FF;

	return 0x10000 + (lead | trail);
}
|
||||
|
||||
//Turn the unit(s) produced by a decoded escape into a single code point.
//A zero low unit means there was no surrogate pair and high is passed
//through unchanged; otherwise high and low are combined as a surrogate pair.
static uint32_t utf_to_codepoint(uint32_t high, uint32_t low){
	return low ? u16_surrogate_pair_to_codepoint(high, low) : high;
}
|
||||
//Number of bytes codepoint_to_u8 writes for the given code point:
//1 up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, 4 up to 0x10FFFF.
//Returns 0 for values above 0x10FFFF, which cannot be encoded.
static int codepoint_strlen(uint32_t codepoint){
	if(codepoint > 0x10FFFF)
		return 0;
	if(codepoint > 0xFFFF)
		return 4;
	if(codepoint > 0x07FF)
		return 3;
	if(codepoint > 0x007F)
		return 2;
	return 1;
}
|
||||
|
||||
//Encode codepoint as UTF-8 into dest.
//  dest: output buffer with room for up to 4 bytes; no terminator is written
//Returns the number of bytes written (1-4), or 0 without touching dest when
//codepoint is above 0x10FFFF.
static int codepoint_to_u8(char* dest, uint32_t codepoint){
	int count;
	uint32_t lead;

	if(codepoint <= 0x007F){
		dest[0] = codepoint;
		return 1;
	}
	if(codepoint <= 0x07FF){
		count = 2;
		lead = 0xC0;
	}else if(codepoint <= 0xFFFF){
		count = 3;
		lead = 0xE0;
	}else if(codepoint <= 0x10FFFF){
		count = 4;
		lead = 0xF0;
	}else{
		return 0;
	}

	//fill the continuation bytes back to front, 6 payload bits each
	uint32_t rest = codepoint;
	for(int i = count-1;i > 0;--i){
		dest[i] = (rest & 0x3F) | 0x80;
		rest >>= 6;
	}
	//whatever is left belongs in the lead byte next to its length marker
	dest[0] = rest | lead;
	return count;
}
|
||||
//Decode the UTF-8 sequence starting at u into a code point.
//The length (1-4 bytes) is taken from the bit pattern of the first byte and
//6 payload bits are taken from each following byte. Following bytes are not
//checked for the continuation marker.
//Returns 0 when the first byte is not a valid lead byte.
static uint32_t u8_to_codepoint(char* u){
	const int first = u[0];
	int extra;
	uint32_t codepoint;

	if((first & 0x80) == 0){
		return first; //plain single byte value
	}
	if((first & 0xE0) == 0xC0){
		extra = 1;
		codepoint = first & 0x1F;
	}else if((first & 0xF0) == 0xE0){
		extra = 2;
		codepoint = first & 0x0F;
	}else if((first & 0xF8) == 0xF0){
		extra = 3;
		codepoint = first & 0x07;
	}else{
		return 0; //stray continuation byte or out of range lead byte
	}

	//append the payload of each trailing byte below what was gathered so far
	for(int i = 1;i <= extra;++i){
		codepoint = (codepoint << 6) | (u[i] & 0x3F);
	}
	return codepoint;
}
|
||||
|
||||
//Convert escape sequences in strings
|
||||
char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, int* column){
|
||||
char* new_string;
|
||||
++(*column); //account for starting quotation mark
|
||||
for(*len = 0;*(str+*len) != '"';++(*len), ++(*column)){
|
||||
if(*(str+*len) == '\\'){
|
||||
++(*len);
|
||||
int oldpos = 0;
|
||||
int newpos = 0;
|
||||
for(;*(str+oldpos) != '"';++oldpos, ++(newpos), ++(*column)){
|
||||
if(*(str+oldpos) == '\\'){
|
||||
if(*(str+oldpos+1) == 'u'){
|
||||
uint32_t high, low;
|
||||
oldpos += (decode_unicode_escape(str+oldpos, &high, &low)-1);
|
||||
newpos += (codepoint_strlen(utf_to_codepoint(high, low))-1);
|
||||
}else{
|
||||
++oldpos;
|
||||
++(*column);
|
||||
}else if(*(str+*len) == '\0'){
|
||||
*len = 1;
|
||||
}
|
||||
}else if(*(str+oldpos) == '\0'){
|
||||
newpos = 1;
|
||||
fprintf(stderr, "Syntax error! %s (%i:%i)\n", "Unexpected EOF in string!", *row, *column);
|
||||
rjp_free_value(root);
|
||||
return NULL;
|
||||
}else if(*(str+*len) == '\n'){
|
||||
}else if(*(str+oldpos) == '\n'){
|
||||
++(*row);
|
||||
*column = 0;
|
||||
}
|
||||
}
|
||||
if(*len == 0){
|
||||
*len = oldpos;
|
||||
if(newpos == 0){
|
||||
return NULL;
|
||||
}
|
||||
new_string = rjp_alloc(*len + 1);
|
||||
new_string[*len] = 0;
|
||||
new_string = rjp_alloc(newpos + 1);
|
||||
new_string[newpos] = 0;
|
||||
for(int i = 0;*str != '"';++i,++str){
|
||||
if(*str == '\\'){
|
||||
++str;
|
||||
@ -52,6 +198,18 @@ char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, i
|
||||
case 'f':
|
||||
new_string[i] = '\f';
|
||||
break;
|
||||
case 'u':;
|
||||
uint32_t high, low;
|
||||
uint32_t codepoint;
|
||||
--str;
|
||||
str += (decode_unicode_escape(str, &high, &low) - 1);
|
||||
if(!high){
|
||||
rjp_free(new_string);
|
||||
return NULL;
|
||||
}
|
||||
codepoint = utf_to_codepoint(high, low);
|
||||
i += (codepoint_to_u8(new_string+i, codepoint)-1);
|
||||
break;
|
||||
default:
|
||||
new_string[i] = *str;
|
||||
break;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user