1 | /*
2 | * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
3 | *
4 | * Permission to use, copy, modify, and/or distribute this software for any
5 | * purpose with or without fee is hereby granted, provided that the above
6 | * copyright notice and this permission notice appear in all copies.
7 | *
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 | */
16 |
17 | #include "yajl_encode.h"
18 |
19 | #include <assert.h>
20 | #include <stdlib.h>
21 | #include <string.h>
22 | #include <stdio.h>
23 |
/*
 * Store the two upper-case hexadecimal digits of c into hexBuf[0] (high
 * nibble) and hexBuf[1] (low nibble).  No terminator is written.
 * c is expected to fit in one byte; a larger value would index past the
 * 16-entry digit table.
 */
static void CharToHex(unsigned int c, unsigned char * hexBuf)
{
    static const char digits[] = "0123456789ABCDEF";
    const unsigned int high = c >> 4;
    const unsigned int low = c & 0x0F;

    hexBuf[0] = (unsigned char) digits[high];
    hexBuf[1] = (unsigned char) digits[low];
}
30 |
31 | void
32 | yajl_string_encode(const yajl_print_t print,
33 | void * ctx,
34 | const unsigned char * str,
35 | size_t len,
36 | unsigned int escape_solidus)
37 | {
38 | size_t beg = 0;
39 | size_t end = 0;
40 | unsigned char hexBuf[7];
41 | hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
42 | hexBuf[6] = 0;
43 |
44 | while (end < len) {
45 | const char * escaped = NULL;
46 | switch (str[end]) {
47 | case '\r': escaped = "\\r"; break;
48 | case '\n': escaped = "\\n"; break;
49 | case '\\': escaped = "\\\\"; break;
50 | /* it is not required to escape a solidus in JSON:
51 | * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
52 | * specifically, this production from the grammar:
53 | * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
54 | */
55 | case '/': if (escape_solidus) escaped = "\\/"; break;
56 | case '"': escaped = "\\\""; break;
57 | case '\f': escaped = "\\f"; break;
58 | case '\b': escaped = "\\b"; break;
59 | case '\t': escaped = "\\t"; break;
60 | default:
61 | if ((unsigned char) str[end] < 32) {
62 | CharToHex((unsigned int) str[end], hexBuf + 4);
63 | escaped = (char *) hexBuf;
64 | }
65 | break;
66 | }
67 | if (escaped != NULL) {
68 | print(ctx, (const char *) (str + beg), end - beg);
69 | print(ctx, escaped, (size_t)strlen(escaped));
70 | beg = ++end;
71 | } else {
72 | ++end;
73 | }
74 | }
75 | print(ctx, (const char *) (str + beg), end - beg);
76 | }
77 |
/*
 * Shift four hexadecimal digits read from hex[0..3] into *val, most
 * significant digit first.  *val is not cleared first: its previous
 * contents are shifted left by 16 bits in total.  Upper- and lower-case
 * letters are both accepted.  A byte that does not reduce to a value in
 * 0..15 trips an assertion.
 */
static void hexToDigit(unsigned int * val, const unsigned char * hex)
{
    const unsigned char * const stop = hex + 4;
    const unsigned char * cursor;

    for (cursor = hex; cursor != stop; cursor++) {
        unsigned char nibble = *cursor;

        if (nibble >= 'A') {
            /* clear the case bit, then close the gap between '9' and 'A' */
            nibble = (unsigned char) ((nibble & ~0x20) - 7);
        }
        nibble = (unsigned char) (nibble - '0');
        assert(nibble <= 0x0F);
        *val = (*val << 4) | nibble;
    }
}
89 |
/*
 * Encode codepoint as a NUL-terminated UTF-8 sequence in utf8Buf, which
 * must have room for five bytes.  Values of 0x200000 and above cannot be
 * expressed in four bytes and are written as "?".  No check is made for
 * surrogate values or for values above U+10FFFF.
 */
static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf)
{
    char * out = utf8Buf;

    if (codepoint < 0x80) {
        *out++ = (char) codepoint;
    } else if (codepoint < 0x0800) {
        *out++ = (char) (0xC0 | (codepoint >> 6));
        *out++ = (char) (0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x10000) {
        *out++ = (char) (0xE0 | (codepoint >> 12));
        *out++ = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        *out++ = (char) (0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x200000) {
        *out++ = (char) (0xF0 | (codepoint >> 18));
        *out++ = (char) (0x80 | ((codepoint >> 12) & 0x3F));
        *out++ = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        *out++ = (char) (0x80 | (codepoint & 0x3F));
    } else {
        *out++ = '?';
    }
    *out = 0;
}
115 |
116 | void yajl_string_decode(yajl_buf buf, const unsigned char * str,
117 | size_t len)
118 | {
119 | size_t beg = 0;
120 | size_t end = 0;
121 |
122 | while (end < len) {
123 | if (str[end] == '\\') {
124 | char utf8Buf[5];
125 | const char * unescaped = "?";
126 | yajl_buf_append(buf, str + beg, end - beg);
127 | switch (str[++end]) {
128 | case 'r': unescaped = "\r"; break;
129 | case 'n': unescaped = "\n"; break;
130 | case '\\': unescaped = "\\"; break;
131 | case '/': unescaped = "/"; break;
132 | case '"': unescaped = "\""; break;
133 | case 'f': unescaped = "\f"; break;
134 | case 'b': unescaped = "\b"; break;
135 | case 't': unescaped = "\t"; break;
136 | case 'u': {
137 | unsigned int codepoint = 0;
138 | hexToDigit(&codepoint, str + ++end);
139 | end+=3;
140 | /* check if this is a surrogate */
141 | if ((codepoint & 0xFC00) == 0xD800) {
142 | end++;
143 | if (str[end] == '\\' && str[end + 1] == 'u') {
144 | unsigned int surrogate = 0;
145 | hexToDigit(&surrogate, str + end + 2);
146 | codepoint =
147 | (((codepoint & 0x3F) << 10) |
148 | ((((codepoint >> 6) & 0xF) + 1) << 16) |
149 | (surrogate & 0x3FF));
150 | end += 5;
151 | } else {
152 | unescaped = "?";
153 | break;
154 | }
155 | }
156 |
157 | Utf32toUtf8(codepoint, utf8Buf);
158 | unescaped = utf8Buf;
159 |
160 | if (codepoint == 0) {
161 | yajl_buf_append(buf, unescaped, (size_t) 1);
162 | beg = ++end;
163 | continue;
164 | }
165 |
166 | break;
167 | }
168 | default:
169 | assert("this should never happen" == NULL);
170 | }
171 | yajl_buf_append(buf, unescaped, (size_t)strlen(unescaped));
172 | beg = ++end;
173 | } else {
174 | end++;
175 | }
176 | }
177 | yajl_buf_append(buf, str + beg, end - beg);
178 | }
179 |
/*
 * Step to the next input byte, or make the enclosing function return 0 when
 * the input ends in the middle of a multi-byte sequence.  Operates on the
 * enclosing function's locals `s` (cursor) and `len` (bytes remaining after
 * the one at the cursor).
 */
#define ADV_PTR do { s++; if (!(len--)) return 0; } while (0)

/*
 * Check that s[0..len) is well-formed UTF-8.
 *
 * Returns 1 when every byte belongs to a well-formed sequence (an empty
 * input is valid, whatever s is) and 0 otherwise, including when s is NULL
 * with a non-zero len.  Besides stray or missing continuation bytes, the
 * following are rejected:
 *   - overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F),
 *   - encoded UTF-16 surrogates U+D800..U+DFFF (ED A0..BF),
 *   - code points above U+10FFFF (F4 90.. and lead bytes F5..FF).
 */
int yajl_string_validate_utf8(const unsigned char * s, size_t len)
{
    if (!len) return 1;
    if (!s) return 0;

    while (len--) {
        const unsigned char lead = *s;

        if (lead <= 0x7F) {
            /* single byte: nothing further to check */
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            /* two bytes; C0 and C1 could only encode overlong ASCII */
            ADV_PTR;
            if ((*s & 0xC0) != 0x80) return 0;
        } else if ((lead & 0xF0) == 0xE0) {
            /* three bytes; the first continuation byte is range-limited
             * for two particular lead bytes */
            unsigned char lo = 0x80;
            unsigned char hi = 0xBF;
            if (lead == 0xE0) lo = 0xA0;       /* below U+0800: overlong */
            else if (lead == 0xED) hi = 0x9F;  /* U+D800..U+DFFF: surrogates */
            ADV_PTR;
            if (*s < lo || *s > hi) return 0;
            ADV_PTR;
            if ((*s & 0xC0) != 0x80) return 0;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            /* four bytes */
            unsigned char lo = 0x80;
            unsigned char hi = 0xBF;
            if (lead == 0xF0) lo = 0x90;       /* below U+10000: overlong */
            else if (lead == 0xF4) hi = 0x8F;  /* above U+10FFFF */
            ADV_PTR;
            if (*s < lo || *s > hi) return 0;
            ADV_PTR;
            if ((*s & 0xC0) != 0x80) return 0;
            ADV_PTR;
            if ((*s & 0xC0) != 0x80) return 0;
        } else {
            /* stray continuation byte or a lead byte that is never valid */
            return 0;
        }

        s++;
    }

    return 1;
}