Added support for unicode escape sequences
This commit is contained in:
parent
383cb5af54
commit
20e46d2af2
@ -1,3 +1,21 @@
|
|||||||
|
/**
|
||||||
|
rjp
|
||||||
|
Copyright (C) 2018 rexy712
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
#ifndef RJP_INTERNAL_H
|
#ifndef RJP_INTERNAL_H
|
||||||
#define RJP_INTERNAL_H
|
#define RJP_INTERNAL_H
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
//TODO: Scientific notation
|
//TODO: Scientific notation
|
||||||
//TODO: \e escape sequence in strings
|
|
||||||
|
|
||||||
#include "rjp.h"
|
#include "rjp.h"
|
||||||
#include "rjp_internal.h"
|
#include "rjp_internal.h"
|
||||||
|
|||||||
18
src/output.c
18
src/output.c
@ -1,3 +1,21 @@
|
|||||||
|
/**
|
||||||
|
rjp
|
||||||
|
Copyright (C) 2018 rexy712
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "rjp.h"
|
#include "rjp.h"
|
||||||
#include "rjp_internal.h"
|
#include "rjp_internal.h"
|
||||||
|
|
||||||
|
|||||||
178
src/strings.c
178
src/strings.c
@ -1,32 +1,178 @@
|
|||||||
|
/**
|
||||||
|
rjp
|
||||||
|
Copyright (C) 2018 rexy712
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "rjp.h"
|
#include "rjp.h"
|
||||||
#include "rjp_internal.h"
|
#include "rjp_internal.h"
|
||||||
|
|
||||||
#include <stdio.h> //fprintf
|
#include <stdio.h> //fprintf
|
||||||
#include <stdlib.h> //malloc, free
|
#include <stdlib.h> //malloc, free
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
//Parse exactly four hexadecimal digits (upper or lower case) starting at c.
//Returns the 16-bit value they spell, most significant digit first.
//Returns 0 as soon as a non-hex character (including the terminator) is met,
//so a result of 0 cannot be told apart from the digits "0000".
static uint32_t utf_strtol_4(const char* c){
	uint32_t value = 0;
	for(size_t pos = 0;pos < 4;++pos){
		const char digit = c[pos];
		uint32_t nibble;
		if(digit >= '0' && digit <= '9'){
			nibble = (uint32_t)(digit - '0');
		}else if(digit >= 'A' && digit <= 'F'){
			nibble = (uint32_t)(digit - 'A') + 10;
		}else if(digit >= 'a' && digit <= 'f'){
			nibble = (uint32_t)(digit - 'a') + 10;
		}else{
			return 0;
		}
		//shift in one nibble at a time; after four rounds the first digit sits in bits 12..15
		value = (value << 4) | nibble;
	}
	return value;
}
|
||||||
|
|
||||||
|
|
||||||
|
//Decode one "\uXXXX" escape, or a "\uXXXX\uXXXX" UTF-16 surrogate pair,
//beginning at str (which must point at the backslash).
//
//  high - receives the first (or only) 16-bit code unit
//  low  - receives the low surrogate of a pair, or 0 when there is no pair
//
//Returns the number of characters consumed: 6 for a single escape, 12 for a
//surrogate pair. On malformed input returns 0 and sets *high and *low to 0;
//callers must check for this before using the result as an offset.
//
//Rejected as malformed:
//  - anything not starting with "\u" followed by four hex digits
//  - \u0000, because utf_strtol_4 reports failure as 0
//  - a low surrogate (DC00-DFFF) that is not preceded by a high surrogate
//  - a high surrogate (D800-DBFF) that is not followed by a low surrogate escape
static int decode_unicode_escape(const char* str, uint32_t* high, uint32_t* low){
	*high = 0;
	*low = 0;

	if(str[0] != '\\' || str[1] != 'u'){ //invalid
		return 0;
	}
	uint32_t first = utf_strtol_4(str+2);
	if(!first){
		return 0;
	}
	if((first & 0xFC00) == 0xDC00){ //low surrogate with no high surrogate before it
		return 0;
	}
	if((first & 0xFC00) != 0xD800){ //ordinary code unit, no pair needed
		*high = first;
		return 6;
	}

	//high surrogate: the four hex digits above were valid, so str[6] is readable
	if(str[6] != '\\' || str[7] != 'u'){
		return 0;
	}
	uint32_t second = utf_strtol_4(str+8);
	if((second & 0xFC00) != 0xDC00){ //also catches a failed parse (0)
		return 0;
	}
	*high = first;
	*low = second;
	return 12;
}
|
||||||
|
|
||||||
|
//Combine a UTF-16 surrogate pair into the code point it represents.
//  high - high (leading) surrogate, expected in D800-DBFF
//  low  - low (trailing) surrogate, expected in DC00-DFFF
//Only the low ten bits of each argument are used; the ranges are not checked.
//Returns a value in 0x10000-0x10FFFF.
static uint32_t u16_surrogate_pair_to_codepoint(uint32_t high, uint32_t low){
	uint32_t codepoint;

	//The 0x10000 offset must be added, not OR-ed: the shifted high bits reach
	//bit 16 themselves for every pair above U+1FFFF.
	codepoint = 0x10000 + ((high & 0x03FF) << 10);
	codepoint = codepoint + (low & 0x03FF);

	return codepoint;
}
|
||||||
|
|
||||||
|
//Turn the code unit(s) of a decoded escape into a single code point.
//A low of 0 means there was no surrogate pair, so high already is the code point;
//otherwise the two units are combined as a UTF-16 surrogate pair.
static uint32_t utf_to_codepoint(uint32_t high, uint32_t low){
	return low ? u16_surrogate_pair_to_codepoint(high, low) : high;
}
|
||||||
|
//Number of bytes needed to store codepoint as UTF-8 (1 to 4),
//or 0 when the value lies above 0x10FFFF.
static int codepoint_strlen(uint32_t codepoint){
	if(codepoint > 0x10FFFF){
		return 0;
	}
	if(codepoint > 0xFFFF){
		return 4;
	}
	if(codepoint > 0x07FF){
		return 3;
	}
	if(codepoint > 0x007F){
		return 2;
	}
	return 1;
}
|
||||||
|
|
||||||
|
//Write codepoint to dest as UTF-8. No terminator is written.
//  dest      - output buffer; up to 4 bytes are stored
//  codepoint - value to encode
//Returns the number of bytes stored (1 to 4), or 0 with dest untouched
//when codepoint is above 0x10FFFF.
static int codepoint_to_u8(char* dest, uint32_t codepoint){
	if(codepoint > 0x10FFFF){
		return 0;
	}
	if(codepoint <= 0x007F){
		//plain ASCII, stored as is
		dest[0] = (char)codepoint;
		return 1;
	}
	if(codepoint <= 0x07FF){
		//110xxxxx 10xxxxxx
		dest[0] = (char)(0xC0 | (codepoint >> 6));
		dest[1] = (char)(0x80 | (codepoint & 0x3F));
		return 2;
	}
	if(codepoint <= 0xFFFF){
		//1110xxxx 10xxxxxx 10xxxxxx
		dest[0] = (char)(0xE0 | (codepoint >> 12));
		dest[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
		dest[2] = (char)(0x80 | (codepoint & 0x3F));
		return 3;
	}
	//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	dest[0] = (char)(0xF0 | (codepoint >> 18));
	dest[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
	dest[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
	dest[3] = (char)(0x80 | (codepoint & 0x3F));
	return 4;
}
|
||||||
|
//Decode the single UTF-8 sequence that starts at u.
//Returns its code point, or 0 when the sequence is invalid: a lead byte that
//does not start a 1-4 byte sequence, or a following byte that is not a
//continuation byte (10xxxxxx).
//Decoding stops at the first bad byte, so a sequence cut short by the string
//terminator is reported as invalid without reading past the terminator.
//Overlong forms and surrogate values are not rejected.
static uint32_t u8_to_codepoint(char* u){
	//work on unsigned bytes so the masks behave the same whether char is signed or not
	const unsigned char* bytes = (const unsigned char*)u;
	const unsigned char lead = bytes[0];
	uint32_t codepoint;
	int trailing;

	if((lead & 0x80) == 0){
		//one byte
		return lead;
	}else if((lead & 0xE0) == 0xC0){
		//two byte
		codepoint = lead & 0x1F;
		trailing = 1;
	}else if((lead & 0xF0) == 0xE0){
		//three byte
		codepoint = lead & 0x0F;
		trailing = 2;
	}else if((lead & 0xF8) == 0xF0){
		//four byte
		codepoint = lead & 0x07;
		trailing = 3;
	}else{
		//invalid
		return 0;
	}

	for(int i = 1;i <= trailing;++i){
		if((bytes[i] & 0xC0) != 0x80){
			//not a continuation byte (this also catches the terminator)
			return 0;
		}
		codepoint = (codepoint << 6) | (uint32_t)(bytes[i] & 0x3F);
	}
	return codepoint;
}
|
||||||
|
|
||||||
//Convert escape sequences in strings
|
//Convert escape sequences in strings
|
||||||
char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, int* column){
|
char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, int* column){
|
||||||
char* new_string;
|
char* new_string;
|
||||||
++(*column); //account for starting quotation mark
|
++(*column); //account for starting quotation mark
|
||||||
for(*len = 0;*(str+*len) != '"';++(*len), ++(*column)){
|
int oldpos = 0;
|
||||||
if(*(str+*len) == '\\'){
|
int newpos = 0;
|
||||||
++(*len);
|
for(;*(str+oldpos) != '"';++oldpos, ++(newpos), ++(*column)){
|
||||||
++(*column);
|
if(*(str+oldpos) == '\\'){
|
||||||
}else if(*(str+*len) == '\0'){
|
if(*(str+oldpos+1) == 'u'){
|
||||||
*len = 1;
|
uint32_t high, low;
|
||||||
|
oldpos += (decode_unicode_escape(str+oldpos, &high, &low)-1);
|
||||||
|
newpos += (codepoint_strlen(utf_to_codepoint(high, low))-1);
|
||||||
|
}else{
|
||||||
|
++oldpos;
|
||||||
|
++(*column);
|
||||||
|
}
|
||||||
|
}else if(*(str+oldpos) == '\0'){
|
||||||
|
newpos = 1;
|
||||||
fprintf(stderr, "Syntax error! %s (%i:%i)\n", "Unexpected EOF in string!", *row, *column);
|
fprintf(stderr, "Syntax error! %s (%i:%i)\n", "Unexpected EOF in string!", *row, *column);
|
||||||
rjp_free_value(root);
|
rjp_free_value(root);
|
||||||
return NULL;
|
return NULL;
|
||||||
}else if(*(str+*len) == '\n'){
|
}else if(*(str+oldpos) == '\n'){
|
||||||
++(*row);
|
++(*row);
|
||||||
*column = 0;
|
*column = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(*len == 0){
|
*len = oldpos;
|
||||||
|
if(newpos == 0){
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
new_string = rjp_alloc(*len + 1);
|
new_string = rjp_alloc(newpos + 1);
|
||||||
new_string[*len] = 0;
|
new_string[newpos] = 0;
|
||||||
for(int i = 0;*str != '"';++i,++str){
|
for(int i = 0;*str != '"';++i,++str){
|
||||||
if(*str == '\\'){
|
if(*str == '\\'){
|
||||||
++str;
|
++str;
|
||||||
@ -52,6 +198,18 @@ char* _rjp__parse_string(RJP_value* root, const char* str, int* len, int* row, i
|
|||||||
case 'f':
|
case 'f':
|
||||||
new_string[i] = '\f';
|
new_string[i] = '\f';
|
||||||
break;
|
break;
|
||||||
|
case 'u':;
|
||||||
|
uint32_t high, low;
|
||||||
|
uint32_t codepoint;
|
||||||
|
--str;
|
||||||
|
str += (decode_unicode_escape(str, &high, &low) - 1);
|
||||||
|
if(!high){
|
||||||
|
rjp_free(new_string);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
codepoint = utf_to_codepoint(high, low);
|
||||||
|
i += (codepoint_to_u8(new_string+i, codepoint)-1);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
new_string[i] = *str;
|
new_string[i] = *str;
|
||||||
break;
|
break;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user