Added surrogate pairs support (JSON support is full now), removed PAR…

…SON_VERSION macro.
nyteshade · Apr 10, 2014 · c707051 · c707051
1 parent 19a0d79
commit c707051
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,7 @@
 Parson is a lighweight [json](http://json.org) parser and reader written in C.  
 
 ##Features
+* Full JSON support
 * Lightweight (only 2 files)
 * Simple API
 * Addressing json values with dot notation (similiar to C structs or objects in most OO languages, e.g. "objectA.objectB.value")

diff --git a/parson.c b/parson.c
@@ -1,6 +1,6 @@
 /*
  Parson ( http://kgabis.github.com/parson/ )
- Copyright (c) 2013 Krzysztof Gabis
+ Copyright (c) 2012 - 2014 Krzysztof Gabis
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -102,6 +102,7 @@ static JSON_Value * json_value_init_null(void);
 
 /* Parser */
 static void         skip_quotes(const char **string);
+static int          parse_utf_16(char **processed, char **unprocessed);
 static const char * get_processed_string(const char **string);
 static JSON_Value * parse_object_value(const char **string, size_t nesting);
 static JSON_Value * parse_array_value(const char **string, size_t nesting);
@@ -380,63 +381,86 @@ static void skip_quotes(const char **string) {
     skip_char(string);
 }
 
+/* Decodes one JSON "\uXXXX" escape into UTF-8.
+   On entry *unprocessed points at the 'u' that follows the backslash and
+   *processed at the slot for the first output byte. A lead surrogate
+   (0xD800..0xDBFF) must be followed immediately by a "\uXXXX" trail
+   surrogate (0xDC00..0xDFFF); the pair is combined into a single code point
+   and emitted as four bytes. A trail surrogate on its own is rejected.
+   On SUCCESS both pointers are left on the last byte consumed/written, so the
+   caller is expected to step past them itself. On ERROR nothing has been
+   written and neither *processed nor *unprocessed is modified. */
+static int parse_utf_16(char **processed, char **unprocessed) {
+    unsigned int cp, lead, trail;
+    char *processed_ptr = *processed;
+    char *unprocessed_ptr = *unprocessed;
+    unprocessed_ptr++; /* skips u */
+    /* sscanf reports the number of converted fields: 0 (matching failure) and
+       EOF both leave cp unset, so require exactly one conversion */
+    if (!is_utf((const unsigned char*)unprocessed_ptr) || sscanf(unprocessed_ptr, "%4x", &cp) != 1) {
+        return ERROR;
+    }
+    if (cp < 0x80) {
+        *processed_ptr = cp; /* 0xxxxxxx */
+    } else if (cp < 0x800) {
+        *processed_ptr++ = ((cp >> 6) & 0x1F) | 0xC0; /* 110xxxxx */
+        *processed_ptr   = ((cp     ) & 0x3F) | 0x80; /* 10xxxxxx */
+    } else if (cp < 0xD800 || cp > 0xDFFF) {
+        *processed_ptr++ = ((cp >> 12) & 0x0F) | 0xE0; /* 1110xxxx */
+        *processed_ptr++ = ((cp >> 6)  & 0x3F) | 0x80; /* 10xxxxxx */
+        *processed_ptr   = ((cp     )  & 0x3F) | 0x80; /* 10xxxxxx */
+    } else if (cp <= 0xDBFF) { /* lead surrogate (0xD800..0xDBFF); lower bound already established above */
+        lead = cp;
+        unprocessed_ptr += 4; /* should always be within the buffer, otherwise the checks above would fail */
+        /* each comparison advances the pointer; || short-circuits, so a
+           mismatch (including the terminating NUL) stops before reading further */
+        if (*unprocessed_ptr++ != '\\' || *unprocessed_ptr++ != 'u' || /* starts with \u? */
+            !is_utf((const unsigned char*)unprocessed_ptr)          ||
+            sscanf(unprocessed_ptr, "%4x", &trail) != 1             ||
+            trail < 0xDC00 || trail > 0xDFFF) { /* valid trail surrogate? (0xDC00..0xDFFF) */
+                return ERROR;
+        }
+        /* 10 payload bits from each half, offset into the supplementary planes */
+        cp = ((((lead-0xD800)&0x3FF)<<10)|((trail-0xDC00)&0x3FF))+0x010000;
+        *processed_ptr++ = (((cp >> 18) & 0x07) | 0xF0); /* 11110xxx */
+        *processed_ptr++ = (((cp >> 12) & 0x3F) | 0x80); /* 10xxxxxx */
+        *processed_ptr++ = (((cp >> 6)  & 0x3F) | 0x80); /* 10xxxxxx */
+        *processed_ptr   = (((cp     )  & 0x3F) | 0x80); /* 10xxxxxx */
+    } else { /* trail surrogate before lead surrogate */
+        return ERROR;
+    }
+    unprocessed_ptr += 3; /* from the first hex digit of the last escape to its fourth */
+    *processed = processed_ptr;
+    *unprocessed = unprocessed_ptr;
+    return SUCCESS;
+}
+
 /* Returns contents of a string inside double quotes and parses escaped
  characters inside.
  Example: "\u006Corem ipsum" -> lorem ipsum */
 static const char * get_processed_string(const char **string) {
     const char *string_start = *string;
-    char *output, *processed_ptr, *unprocessed_ptr, current_char;
-    unsigned int utf_val;
+    char *output = NULL, *processed_ptr = NULL, *unprocessed_ptr = NULL;
     skip_quotes(string);
     if (**string == '\0')
         return NULL;
-    output = parson_strndup(string_start + 1, *string  - string_start - 2);
+    output = parson_strndup(string_start + 1, *string - string_start - 2);
     if (!output)
         return NULL;
     processed_ptr = unprocessed_ptr = output;
-    while (*unprocessed_ptr) {
-        current_char = *unprocessed_ptr;
-        if (current_char == '\\') {
+    while (*unprocessed_ptr != '\0') {
+        if (*unprocessed_ptr == '\\') {
             unprocessed_ptr++;
-            current_char = *unprocessed_ptr;
-            switch (current_char) {
+            switch (*unprocessed_ptr) {
                 case '\"': case '\\': case '/': break;
-                case 'b': current_char = '\b'; break;
-                case 'f': current_char = '\f'; break;
-                case 'n': current_char = '\n'; break;
-                case 'r': current_char = '\r'; break;
-                case 't': current_char = '\t'; break;
+                case 'b': *processed_ptr = '\b'; break;
+                case 'f': *processed_ptr = '\f'; break;
+                case 'n': *processed_ptr = '\n'; break;
+                case 'r': *processed_ptr = '\r'; break;
+                case 't': *processed_ptr = '\t'; break;
                 case 'u':
-                    unprocessed_ptr++;
-                    if (!is_utf((const unsigned char*)unprocessed_ptr) ||
-                        sscanf(unprocessed_ptr, "%4x", &utf_val) == EOF) {
-                            parson_free(output);
-                            return NULL;
-                    }
-                    if (utf_val < 0x80) {
-                        current_char = utf_val;
-                    } else if (utf_val < 0x800) {
-                        *processed_ptr++ = (utf_val >> 6) | 0xC0;
-                        current_char = ((utf_val | 0x80) & 0xBF);
-                    } else {
-                        *processed_ptr++ = (utf_val >> 12) | 0xE0;
-                        *processed_ptr++ = (((utf_val >> 6) | 0x80) & 0xBF);
-                        current_char = ((utf_val | 0x80) & 0xBF);
+                    if (parse_utf_16(&processed_ptr, &unprocessed_ptr) == ERROR) {
+                        parson_free(output);
+                        return NULL;
                     }
-                    unprocessed_ptr += 3;
                     break;
                 default:
                     parson_free(output);
                     return NULL;
                     break;
             }
-        } else if ((unsigned char)current_char < 0x20) { /* 0x00-0x19 are invalid characters for json string (http://www.ietf.org/rfc/rfc4627.txt) */
-            parson_free(output);
+        } else if ((unsigned char)*unprocessed_ptr < 0x20) {
+            parson_free(output); /* 0x00-0x19 are invalid characters for json string (http://www.ietf.org/rfc/rfc4627.txt) */
             return NULL;
+        } else {
+            *processed_ptr = *unprocessed_ptr;
         }
-        *processed_ptr = current_char;
-        processed_ptr++;
-        unprocessed_ptr++;
+        processed_ptr++, unprocessed_ptr++;
     }
     *processed_ptr = '\0';
     if (try_realloc((void**)&output, strlen(output) + 1) == ERROR)

diff --git a/parson.h b/parson.h
@@ -1,6 +1,6 @@
 /*
  Parson ( http://kgabis.github.com/parson/ )
- Copyright (c) 2013 Krzysztof Gabis
+ Copyright (c) 2012 - 2014 Krzysztof Gabis
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -30,8 +30,6 @@ extern "C"
 #endif    
 
 #include <stddef.h>   /* size_t */    
-
-#define PARSON_VERSION 20131130
 
 /* Types and enums */
 typedef struct json_object_t JSON_Object;

diff --git a/tests.c b/tests.c
@@ -1,6 +1,6 @@
 /*
  Parson ( http://kgabis.github.com/parson/ )
- Copyright (c) 2013 Krzysztof Gabis
+ Copyright (c) 2012 - 2014 Krzysztof Gabis
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -36,7 +36,7 @@
 void test_suite_1(void);
 void test_suite_2(JSON_Value *value);
 void test_suite_2_no_comments(void);
-void test_suite_2_with_commnets(void);
+void test_suite_2_with_comments(void);
 void test_suite_3(void);
 
 char *read_file(const char *filename);
@@ -50,7 +50,7 @@ int main() {
     /* print_commits_info("torvalds", "linux"); */
     test_suite_1();
     test_suite_2_no_comments();
-    test_suite_2_with_commnets();
+    test_suite_2_with_comments();
     test_suite_3();
     printf("Tests failed: %d\n", tests_failed);
     printf("Tests passed: %d\n", tests_passed);
@@ -87,6 +87,7 @@ void test_suite_2(JSON_Value *root_value) {
     TEST(STREQ(json_object_get_string(root_object, "string"), "lorem ipsum"));
     TEST(STREQ(json_object_get_string(root_object, "utf string"), "lorem ipsum"));
     TEST(STREQ(json_object_get_string(root_object, "utf-8 string"), "あいうえお"));
+    TEST(STREQ(json_object_get_string(root_object, "surrogate string"), "lorem𝄞ipsum𝍧lorem"));
     TEST(json_object_get_number(root_object, "positive one") == 1.0);
     TEST(json_object_get_number(root_object, "negative one") == -1.0);
     TEST(json_object_get_number(root_object, "hard to parse number") == -0.000314);
@@ -145,7 +146,7 @@ void test_suite_2_no_comments(void) {
     json_value_free(root_value);
 }
 
-void test_suite_2_with_commnets(void) {
+void test_suite_2_with_comments(void) {
     const char *filename = "tests/test_2_comments.txt";
     JSON_Value *root_value = NULL;
     printf("Testing %s:\n", filename);
@@ -199,6 +200,7 @@ void test_suite_3(void) {
     TEST(json_parse_string("[-07]") == NULL);
     TEST(json_parse_string("[-007]") == NULL);
     TEST(json_parse_string("[-07.0]") == NULL);
+    TEST(json_parse_string("[\"\\uDF67\\uD834\"]") == NULL); /* wrong order surrogate pair */
 }
 
 void print_commits_info(const char *username, const char *repo) {

diff --git a/tests/test_2.txt b/tests/test_2.txt
@@ -1,7 +1,8 @@
 {
 	"string" : "lorem ipsum",
 	"utf string" : "\u006corem\u0020ipsum",
-   "utf-8 string": "あいうえお",
+    "utf-8 string": "あいうえお",
+    "surrogate string": "lorem\uD834\uDD1Eipsum\uD834\uDF67lorem",
 	"positive one" : 1,
 	"negative one" : -1,
 	"pi" : 3.14,

diff --git a/tests/test_2_comments.txt b/tests/test_2_comments.txt
@@ -8,6 +8,7 @@
 	"string" : "lorem ipsum", // lorem ipsum 
 	"utf string" : "\u006corem\u0020ipsum", // lorem ipsum //
    "utf-8 string": "あいうえお", // /* lorem ipsum */
+    "surrogate string": "lorem\uD834\uDD1Eipsum\uD834\uDF67lorem",
 	"positive one" : 1, 
 	"negative one" : -1,
 	"pi" : 3.14,