Skip to content

Commit

Permalink
Added surrogate pairs support (JSON support is full now), removed PAR…
Browse files Browse the repository at this point in the history
…SON_VERSION macro.
  • Loading branch information
kgabis committed Apr 10, 2014
1 parent 19a0d79 commit c707051
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 43 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Parson is a lightweight [json](http://json.org) parser and reader written in C.

##Features
* Full JSON support
* Lightweight (only 2 files)
* Simple API
* Addressing json values with dot notation (similar to C structs or objects in most OO languages, e.g. "objectA.objectB.value")
Expand Down
94 changes: 59 additions & 35 deletions parson.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
Parson ( http://kgabis.github.com/parson/ )
Copyright (c) 2013 Krzysztof Gabis
Copyright (c) 2012 - 2014 Krzysztof Gabis
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -102,6 +102,7 @@ static JSON_Value * json_value_init_null(void);

/* Parser */
static void skip_quotes(const char **string);
static int parse_utf_16(char **processed, char **unprocessed);
static const char * get_processed_string(const char **string);
static JSON_Value * parse_object_value(const char **string, size_t nesting);
static JSON_Value * parse_array_value(const char **string, size_t nesting);
Expand Down Expand Up @@ -380,63 +381,86 @@ static void skip_quotes(const char **string) {
skip_char(string);
}

/* Decodes one JSON "\uXXXX" escape (or a "\uXXXX\uXXXX" surrogate pair) into
   UTF-8.

   processed   - in/out: address of the write cursor. Between 1 and 4 bytes are
                 stored starting at *processed. On success *processed is left
                 on the LAST byte written (not one past it).
   unprocessed - in/out: address of the read cursor. On entry *unprocessed must
                 point at the 'u' that follows the backslash. On success it is
                 left on the LAST hex digit consumed (not one past it).

   Returns SUCCESS when the escape was decoded, ERROR when the hex digits are
   malformed, a lead surrogate is not followed by a valid trail surrogate, or
   a trail surrogate appears on its own. On ERROR nothing is written and
   neither cursor is updated.

   NOTE(review): "\u0000" stores a NUL byte; a caller that treats the output
   as a C string will see it as the terminator. */
static int parse_utf_16(char **processed, char **unprocessed) {
    unsigned int cp, lead, trail;
    char *processed_ptr = *processed;
    char *unprocessed_ptr = *unprocessed;
    unprocessed_ptr++; /* skips u */
    /* sscanf returns 0 (not EOF) on a matching failure, so require exactly one
       successful conversion; otherwise cp would be read uninitialised. */
    if (!is_utf((const unsigned char*)unprocessed_ptr) ||
        sscanf(unprocessed_ptr, "%4x", &cp) != 1) {
        return ERROR;
    }
    if (cp < 0x80) {
        *processed_ptr = (char)cp;                              /* 0xxxxxxx */
    } else if (cp < 0x800) {
        *processed_ptr++ = (char)(((cp >> 6) & 0x1F) | 0xC0);   /* 110xxxxx */
        *processed_ptr   = (char)(((cp     ) & 0x3F) | 0x80);   /* 10xxxxxx */
    } else if (cp < 0xD800 || cp > 0xDFFF) {
        *processed_ptr++ = (char)(((cp >> 12) & 0x0F) | 0xE0);  /* 1110xxxx */
        *processed_ptr++ = (char)(((cp >> 6)  & 0x3F) | 0x80);  /* 10xxxxxx */
        *processed_ptr   = (char)(((cp     )  & 0x3F) | 0x80);  /* 10xxxxxx */
    } else if (cp >= 0xD800 && cp <= 0xDBFF) { /* lead surrogate (0xD800..0xDBFF) */
        lead = cp;
        /* The four hex digits were validated above, so stepping over them
           lands at worst on the string terminator. */
        unprocessed_ptr += 4;
        /* The || chain short-circuits, so a terminator stops the scan before
           any further byte is read. */
        if (*unprocessed_ptr++ != '\\' || *unprocessed_ptr++ != 'u' || /* starts with \u? */
            !is_utf((const unsigned char*)unprocessed_ptr) ||
            sscanf(unprocessed_ptr, "%4x", &trail) != 1 ||
            trail < 0xDC00 || trail > 0xDFFF) { /* valid trail surrogate? (0xDC00..0xDFFF) */
            return ERROR;
        }
        /* Combine the two 10-bit halves and add the supplementary-plane offset. */
        cp = ((((lead - 0xD800) & 0x3FF) << 10) | ((trail - 0xDC00) & 0x3FF)) + 0x010000;
        *processed_ptr++ = (char)(((cp >> 18) & 0x07) | 0xF0);  /* 11110xxx */
        *processed_ptr++ = (char)(((cp >> 12) & 0x3F) | 0x80);  /* 10xxxxxx */
        *processed_ptr++ = (char)(((cp >> 6)  & 0x3F) | 0x80);  /* 10xxxxxx */
        *processed_ptr   = (char)(((cp     )  & 0x3F) | 0x80);  /* 10xxxxxx */
    } else { /* trail surrogate before lead surrogate */
        return ERROR;
    }
    /* Advance to the last of the four hex digits just parsed; the caller's
       own increment then moves past the escape. */
    unprocessed_ptr += 3;
    *processed = processed_ptr;
    *unprocessed = unprocessed_ptr;
    return SUCCESS;
}

/* Returns contents of a string inside double quotes and parses escaped
characters inside.
Example: "\u006Corem ipsum" -> lorem ipsum */
static const char * get_processed_string(const char **string) {
const char *string_start = *string;
char *output, *processed_ptr, *unprocessed_ptr, current_char;
unsigned int utf_val;
char *output = NULL, *processed_ptr = NULL, *unprocessed_ptr = NULL;
skip_quotes(string);
if (**string == '\0')
return NULL;
output = parson_strndup(string_start + 1, *string - string_start - 2);
output = parson_strndup(string_start + 1, *string - string_start - 2);
if (!output)
return NULL;
processed_ptr = unprocessed_ptr = output;
while (*unprocessed_ptr) {
current_char = *unprocessed_ptr;
if (current_char == '\\') {
while (*unprocessed_ptr != '\0') {
if (*unprocessed_ptr == '\\') {
unprocessed_ptr++;
current_char = *unprocessed_ptr;
switch (current_char) {
switch (*unprocessed_ptr) {
case '\"': case '\\': case '/': break;
case 'b': current_char = '\b'; break;
case 'f': current_char = '\f'; break;
case 'n': current_char = '\n'; break;
case 'r': current_char = '\r'; break;
case 't': current_char = '\t'; break;
case 'b': *processed_ptr = '\b'; break;
case 'f': *processed_ptr = '\f'; break;
case 'n': *processed_ptr = '\n'; break;
case 'r': *processed_ptr = '\r'; break;
case 't': *processed_ptr = '\t'; break;
case 'u':
unprocessed_ptr++;
if (!is_utf((const unsigned char*)unprocessed_ptr) ||
sscanf(unprocessed_ptr, "%4x", &utf_val) == EOF) {
parson_free(output);
return NULL;
}
if (utf_val < 0x80) {
current_char = utf_val;
} else if (utf_val < 0x800) {
*processed_ptr++ = (utf_val >> 6) | 0xC0;
current_char = ((utf_val | 0x80) & 0xBF);
} else {
*processed_ptr++ = (utf_val >> 12) | 0xE0;
*processed_ptr++ = (((utf_val >> 6) | 0x80) & 0xBF);
current_char = ((utf_val | 0x80) & 0xBF);
if (parse_utf_16(&processed_ptr, &unprocessed_ptr) == ERROR) {
parson_free(output);
return NULL;
}
unprocessed_ptr += 3;
break;
default:
parson_free(output);
return NULL;
break;
}
} else if ((unsigned char)current_char < 0x20) { /* 0x00-0x19 are invalid characters for json string (http://www.ietf.org/rfc/rfc4627.txt) */
parson_free(output);
} else if ((unsigned char)*unprocessed_ptr < 0x20) {
parson_free(output); /* 0x00-0x19 are invalid characters for json string (http://www.ietf.org/rfc/rfc4627.txt) */
return NULL;
} else {
*processed_ptr = *unprocessed_ptr;
}
*processed_ptr = current_char;
processed_ptr++;
unprocessed_ptr++;
processed_ptr++, unprocessed_ptr++;
}
*processed_ptr = '\0';
if (try_realloc((void**)&output, strlen(output) + 1) == ERROR)
Expand Down
4 changes: 1 addition & 3 deletions parson.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
Parson ( http://kgabis.github.com/parson/ )
Copyright (c) 2013 Krzysztof Gabis
Copyright (c) 2012 - 2014 Krzysztof Gabis
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -30,8 +30,6 @@ extern "C"
#endif

#include <stddef.h> /* size_t */

#define PARSON_VERSION 20131130

/* Types and enums */
typedef struct json_object_t JSON_Object;
Expand Down
10 changes: 6 additions & 4 deletions tests.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
Parson ( http://kgabis.github.com/parson/ )
Copyright (c) 2013 Krzysztof Gabis
Copyright (c) 2012 - 2014 Krzysztof Gabis
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -36,7 +36,7 @@
void test_suite_1(void);
void test_suite_2(JSON_Value *value);
void test_suite_2_no_comments(void);
void test_suite_2_with_commnets(void);
void test_suite_2_with_comments(void);
void test_suite_3(void);

char *read_file(const char *filename);
Expand All @@ -50,7 +50,7 @@ int main() {
/* print_commits_info("torvalds", "linux"); */
test_suite_1();
test_suite_2_no_comments();
test_suite_2_with_commnets();
test_suite_2_with_comments();
test_suite_3();
printf("Tests failed: %d\n", tests_failed);
printf("Tests passed: %d\n", tests_passed);
Expand Down Expand Up @@ -87,6 +87,7 @@ void test_suite_2(JSON_Value *root_value) {
TEST(STREQ(json_object_get_string(root_object, "string"), "lorem ipsum"));
TEST(STREQ(json_object_get_string(root_object, "utf string"), "lorem ipsum"));
TEST(STREQ(json_object_get_string(root_object, "utf-8 string"), "あいうえお"));
TEST(STREQ(json_object_get_string(root_object, "surrogate string"), "lorem𝄞ipsum𝍧lorem"));
TEST(json_object_get_number(root_object, "positive one") == 1.0);
TEST(json_object_get_number(root_object, "negative one") == -1.0);
TEST(json_object_get_number(root_object, "hard to parse number") == -0.000314);
Expand Down Expand Up @@ -145,7 +146,7 @@ void test_suite_2_no_comments(void) {
json_value_free(root_value);
}

void test_suite_2_with_commnets(void) {
void test_suite_2_with_comments(void) {
const char *filename = "tests/test_2_comments.txt";
JSON_Value *root_value = NULL;
printf("Testing %s:\n", filename);
Expand Down Expand Up @@ -199,6 +200,7 @@ void test_suite_3(void) {
TEST(json_parse_string("[-07]") == NULL);
TEST(json_parse_string("[-007]") == NULL);
TEST(json_parse_string("[-07.0]") == NULL);
TEST(json_parse_string("[\"\\uDF67\\uD834\"]") == NULL); /* wrong order surrogate pair */
}

void print_commits_info(const char *username, const char *repo) {
Expand Down
3 changes: 2 additions & 1 deletion tests/test_2.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"string" : "lorem ipsum",
"utf string" : "\u006corem\u0020ipsum",
"utf-8 string": "あいうえお",
"utf-8 string": "あいうえお",
"surrogate string": "lorem\uD834\uDD1Eipsum\uD834\uDF67lorem",
"positive one" : 1,
"negative one" : -1,
"pi" : 3.14,
Expand Down
1 change: 1 addition & 0 deletions tests/test_2_comments.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"string" : "lorem ipsum", // lorem ipsum
"utf string" : "\u006corem\u0020ipsum", // lorem ipsum //
"utf-8 string": "あいうえお", // /* lorem ipsum */
"surrogate string": "lorem\uD834\uDD1Eipsum\uD834\uDF67lorem",
"positive one" : 1,
"negative one" : -1,
"pi" : 3.14,
Expand Down

0 comments on commit c707051

Please sign in to comment.