1 | /* 2 | * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io> 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 | */ 16 | 17 | #include "yajl_encode.h" 18 | 19 | #include <assert.h> 20 | #include <stdlib.h> 21 | #include <string.h> 22 | #include <stdio.h> 23 | 24 | static void CharToHex(unsigned int c, unsigned char * hexBuf) 25 | { 26 | const unsigned char *hexchar = (const unsigned char * ) "0123456789ABCDEF"; 27 | hexBuf[0] = hexchar[c >> 4]; 28 | hexBuf[1] = hexchar[c & 0x0F]; 29 | } 30 | 31 | void 32 | yajl_string_encode(const yajl_print_t print, 33 | void * ctx, 34 | const unsigned char * str, 35 | size_t len, 36 | unsigned int escape_solidus) 37 | { 38 | size_t beg = 0; 39 | size_t end = 0; 40 | unsigned char hexBuf[7]; 41 | hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0'; 42 | hexBuf[6] = 0; 43 | 44 | while (end < len) { 45 | const char * escaped = NULL; 46 | switch (str[end]) { 47 | case '\r': escaped = "\\r"; break; 48 | case '\n': escaped = "\\n"; break; 49 | case '\\': escaped = "\\\\"; break; 50 | /* it is not required to escape a solidus in JSON: 51 | * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt 52 | * specifically, this production from the grammar: 53 | * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF 54 | */ 55 | case '/': if (escape_solidus) escaped = "\\/"; break; 56 | case '"': escaped = "\\\""; break; 57 | case '\f': escaped = "\\f"; break; 58 | case '\b': escaped = "\\b"; break; 59 | case '\t': escaped = "\\t"; break; 60 | default: 61 | if ((unsigned char) str[end] < 32) { 62 | CharToHex((unsigned int) str[end], hexBuf + 4); 63 | escaped = (char *) hexBuf; 64 | } 65 | break; 66 | } 67 | if (escaped != NULL) { 68 | print(ctx, (const char *) (str + beg), end - beg); 69 | print(ctx, escaped, (size_t)strlen(escaped)); 70 | beg = ++end; 71 | } else { 72 | ++end; 73 | } 74 | } 75 | print(ctx, (const char *) (str + beg), end - beg); 76 | } 77 | 78 | static void hexToDigit(unsigned int * val, const unsigned char * hex) 79 | { 80 | unsigned int i; 81 | for (i=0;i<4;i++) { 82 | unsigned char c = hex[i]; 83 | if (c >= 'A') c = (unsigned char) ((c & ~0x20) - 7); 84 | c = (unsigned char) (c - '0'); 85 | assert(!(c & 0xF0)); 86 | *val = (*val << 4) | c; 87 | } 88 | } 89 | 90 | static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf) 91 | { 92 | if (codepoint < 0x80) { 93 | utf8Buf[0] = (char) codepoint; 94 | utf8Buf[1] = 0; 95 | } else if (codepoint < 0x0800) { 96 | utf8Buf[0] = (char) ((codepoint >> 6) | 0xC0); 97 | utf8Buf[1] = (char) ((codepoint & 0x3F) | 0x80); 98 | utf8Buf[2] = 0; 99 | } else if (codepoint < 0x10000) { 100 | utf8Buf[0] = (char) ((codepoint >> 12) | 0xE0); 101 | utf8Buf[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80); 102 | utf8Buf[2] = (char) ((codepoint & 0x3F) | 0x80); 103 | utf8Buf[3] = 0; 104 | } else if (codepoint < 0x200000) { 105 | utf8Buf[0] =(char)((codepoint >> 18) | 0xF0); 106 | utf8Buf[1] =(char)(((codepoint >> 12) & 0x3F) | 0x80); 107 | utf8Buf[2] =(char)(((codepoint >> 6) & 0x3F) | 0x80); 108 | utf8Buf[3] =(char)((codepoint & 0x3F) | 0x80); 109 | utf8Buf[4] = 0; 110 | } else { 111 | utf8Buf[0] = '?'; 112 | utf8Buf[1] = 0; 113 | } 114 | } 115 | 116 | void yajl_string_decode(yajl_buf buf, const unsigned char * str, 117 | size_t len) 118 | { 119 | size_t beg = 0; 120 | size_t end = 0; 121 | 122 | while (end < len) { 123 | if (str[end] == '\\') { 124 | char utf8Buf[5]; 125 | const char * unescaped = "?"; 126 | yajl_buf_append(buf, str + beg, end - beg); 127 | switch (str[++end]) { 128 | case 'r': unescaped = "\r"; break; 129 | case 'n': unescaped = "\n"; break; 130 | case '\\': unescaped = "\\"; break; 131 | case '/': unescaped = "/"; break; 132 | case '"': unescaped = "\""; break; 133 | case 'f': unescaped = "\f"; break; 134 | case 'b': unescaped = "\b"; break; 135 | case 't': unescaped = "\t"; break; 136 | case 'u': { 137 | unsigned int codepoint = 0; 138 | hexToDigit(&codepoint, str + ++end); 139 | end+=3; 140 | /* check if this is a surrogate */ 141 | if ((codepoint & 0xFC00) == 0xD800) { 142 | end++; 143 | if (str[end] == '\\' && str[end + 1] == 'u') { 144 | unsigned int surrogate = 0; 145 | hexToDigit(&surrogate, str + end + 2); 146 | codepoint = 147 | (((codepoint & 0x3F) << 10) | 148 | ((((codepoint >> 6) & 0xF) + 1) << 16) | 149 | (surrogate & 0x3FF)); 150 | end += 5; 151 | } else { 152 | unescaped = "?"; 153 | break; 154 | } 155 | } 156 | 157 | Utf32toUtf8(codepoint, utf8Buf); 158 | unescaped = utf8Buf; 159 | 160 | if (codepoint == 0) { 161 | yajl_buf_append(buf, unescaped, (size_t) 1); 162 | beg = ++end; 163 | continue; 164 | } 165 | 166 | break; 167 | } 168 | default: 169 | assert("this should never happen" == NULL); 170 | } 171 | yajl_buf_append(buf, unescaped, (size_t)strlen(unescaped)); 172 | beg = ++end; 173 | } else { 174 | end++; 175 | } 176 | } 177 | yajl_buf_append(buf, str + beg, end - beg); 178 | } 179 | 180 | #define ADV_PTR s++; if (!(len--)) return 0; 181 | 182 | int yajl_string_validate_utf8(const unsigned char * s, size_t len) 183 | { 184 | if (!len) return 1; 185 | if (!s) return 0; 186 | 187 | while (len--) { 188 | /* single byte */ 189 | if (*s <= 0x7f) { 190 | /* noop */ 191 | } 192 | /* two byte */ 193 | else if ((*s >> 5) == 0x6) { 194 | ADV_PTR; 195 | if (!((*s >> 6) == 0x2)) return 0; 196 | } 197 | /* three byte */ 198 | else if ((*s >> 4) == 0x0e) { 199 | ADV_PTR; 200 | if (!((*s >> 6) == 0x2)) return 0; 201 | ADV_PTR; 202 | if (!((*s >> 6) == 0x2)) return 0; 203 | } 204 | /* four byte */ 205 | else if ((*s >> 3) == 0x1e) { 206 | ADV_PTR; 207 | if (!((*s >> 6) == 0x2)) return 0; 208 | ADV_PTR; 209 | if (!((*s >> 6) == 0x2)) return 0; 210 | ADV_PTR; 211 | if (!((*s >> 6) == 0x2)) return 0; 212 | } else { 213 | return 0; 214 | } 215 | 216 | s++; 217 | } 218 | 219 | return 1; 220 | }