1    | /*
2    |  * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
3    |  *
4    |  * Permission to use, copy, modify, and/or distribute this software for any
5    |  * purpose with or without fee is hereby granted, provided that the above
6    |  * copyright notice and this permission notice appear in all copies.
7    |  *
8    |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9    |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10   |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11   |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12   |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13   |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14   |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15   |  */
16   | 
17   | #include "yajl_encode.h"
18   | 
19   | #include <assert.h>
20   | #include <stdlib.h>
21   | #include <string.h>
22   | #include <stdio.h>
23   | 
/*
 * Write the two upper-case hexadecimal digits of a byte value.
 *
 * c      - value to format; the high nibble is taken as c >> 4, so the
 *          value is expected to fit in one byte.
 * hexBuf - receives exactly two characters (high nibble first); no
 *          terminator is written.
 */
static void CharToHex(unsigned int c, unsigned char * hexBuf)
{
    static const char digits[] = "0123456789ABCDEF";
    const unsigned int hi = c >> 4;
    const unsigned int lo = c & 0x0F;

    hexBuf[0] = (unsigned char) digits[hi];
    hexBuf[1] = (unsigned char) digits[lo];
}
30   | 
31   | void
32   | yajl_string_encode(const yajl_print_t print,
33   |                    void * ctx,
34   |                    const unsigned char * str,
35   |                    size_t len,
36   |                    unsigned int escape_solidus)
37   | {
38   |     size_t beg = 0;
39   |     size_t end = 0;
40   |     unsigned char hexBuf[7];
41   |     hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
42   |     hexBuf[6] = 0;
43   | 
44   |     while (end < len) {
45   |         const char * escaped = NULL;
46   |         switch (str[end]) {
47   |             case '\r': escaped = "\\r"; break;
48   |             case '\n': escaped = "\\n"; break;
49   |             case '\\': escaped = "\\\\"; break;
50   |             /* it is not required to escape a solidus in JSON:
51   |              * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
52   |              * specifically, this production from the grammar:
53   |              *   unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
54   |              */
55   |             case '/': if (escape_solidus) escaped = "\\/"; break;
56   |             case '"': escaped = "\\\""; break;
57   |             case '\f': escaped = "\\f"; break;
58   |             case '\b': escaped = "\\b"; break;
59   |             case '\t': escaped = "\\t"; break;
60   |             default:
61   |                 if ((unsigned char) str[end] < 32) {
62   |                     CharToHex((unsigned int) str[end], hexBuf + 4);
63   |                     escaped = (char *) hexBuf;
64   |                 }
65   |                 break;
66   |         }
67   |         if (escaped != NULL) {
68   |             print(ctx, (const char *) (str + beg), end - beg);
69   |             print(ctx, escaped, (size_t)strlen(escaped));
70   |             beg = ++end;
71   |         } else {
72   |             ++end;
73   |         }
74   |     }
75   |     print(ctx, (const char *) (str + beg), end - beg);
76   | }
77   | 
/*
 * Shift four hexadecimal digits into *val.
 *
 * val - accumulator; for each digit it is shifted left by four bits and
 *       the digit's value is OR-ed in. It is not reset here, so the
 *       caller supplies the starting value.
 * hex - four characters, each one of 0-9, a-f or A-F. Anything else
 *       trips the assert.
 */
static void hexToDigit(unsigned int * val, const unsigned char * hex)
{
    const unsigned char * const stop = hex + 4;

    for (; hex != stop; ++hex) {
        unsigned char nibble = *hex;

        /* letters: clear the lower-case bit, then close the gap between
         * '9' and 'A' so that 'A' lands on '0' + 10 */
        if (nibble >= 'A') {
            nibble = (unsigned char) ((nibble & ~0x20) - 7);
        }
        nibble = (unsigned char) (nibble - '0');
        assert(!(nibble & 0xF0));

        *val = (*val << 4) | nibble;
    }
}
89   | 
/*
 * Encode one code point as a NUL-terminated UTF-8 sequence.
 *
 * codepoint - value to encode.
 * utf8Buf   - receives one to four bytes plus a terminating 0, so it
 *             needs room for five bytes.
 *
 * Values of 0x200000 and above cannot be represented in four bytes and
 * are written as "?".
 */
static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf) 
{
    char * out = utf8Buf;

    if (codepoint < 0x80) {
        *out++ = (char) codepoint;
    } else if (codepoint < 0x0800) {
        *out++ = (char) (0xC0 | (codepoint >> 6));
        *out++ = (char) (0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x10000) {
        *out++ = (char) (0xE0 | (codepoint >> 12));
        *out++ = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        *out++ = (char) (0x80 | (codepoint & 0x3F));
    } else if (codepoint < 0x200000) {
        *out++ = (char) (0xF0 | (codepoint >> 18));
        *out++ = (char) (0x80 | ((codepoint >> 12) & 0x3F));
        *out++ = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        *out++ = (char) (0x80 | (codepoint & 0x3F));
    } else {
        *out++ = '?';
    }

    *out = 0;
}
115  | 
116  | void yajl_string_decode(yajl_buf buf, const unsigned char * str,
117  |                         size_t len)
118  | {
119  |     size_t beg = 0;
120  |     size_t end = 0;    
121  | 
122  |     while (end < len) {
123  |         if (str[end] == '\\') {
124  |             char utf8Buf[5];
125  |             const char * unescaped = "?";
126  |             yajl_buf_append(buf, str + beg, end - beg);
127  |             switch (str[++end]) {
128  |                 case 'r': unescaped = "\r"; break;
129  |                 case 'n': unescaped = "\n"; break;
130  |                 case '\\': unescaped = "\\"; break;
131  |                 case '/': unescaped = "/"; break;
132  |                 case '"': unescaped = "\""; break;
133  |                 case 'f': unescaped = "\f"; break;
134  |                 case 'b': unescaped = "\b"; break;
135  |                 case 't': unescaped = "\t"; break;
136  |                 case 'u': {
137  |                     unsigned int codepoint = 0;
138  |                     hexToDigit(&codepoint, str + ++end);
139  |                     end+=3;
140  |                     /* check if this is a surrogate */
141  |                     if ((codepoint & 0xFC00) == 0xD800) {
142  |                         end++;
143  |                         if (str[end] == '\\' && str[end + 1] == 'u') {
144  |                             unsigned int surrogate = 0;
145  |                             hexToDigit(&surrogate, str + end + 2);
146  |                             codepoint =
147  |                                 (((codepoint & 0x3F) << 10) | 
148  |                                  ((((codepoint >> 6) & 0xF) + 1) << 16) | 
149  |                                  (surrogate & 0x3FF));
150  |                             end += 5;
151  |                         } else {
152  |                             unescaped = "?";
153  |                             break;
154  |                         }
155  |                     }
156  |                     
157  |                     Utf32toUtf8(codepoint, utf8Buf);
158  |                     unescaped = utf8Buf;
159  | 
160  |                     if (codepoint == 0) {
161  |                         yajl_buf_append(buf, unescaped, (size_t) 1);
162  |                         beg = ++end;
163  |                         continue;
164  |                     }
165  | 
166  |                     break;
167  |                 }
168  |                 default:
169  |                     assert("this should never happen" == NULL);
170  |             }
171  |             yajl_buf_append(buf, unescaped, (size_t)strlen(unescaped));
172  |             beg = ++end;
173  |         } else {
174  |             end++;
175  |         }
176  |     }
177  |     yajl_buf_append(buf, str + beg, end - beg);
178  | }
179  | 
/* Step to the next input byte, failing validation if the input ends in
 * the middle of a multi-byte sequence. */
#define ADV_PTR do { s++; if (!(len--)) return 0; } while (0)

/*
 * Check that a byte sequence is well-formed UTF-8.
 *
 * s   - bytes to check; may be NULL only when len is 0.
 * len - number of bytes in s.
 *
 * Returns 1 if the whole input is valid (an empty input is valid) and 0
 * otherwise. Besides stray or missing continuation bytes, the following
 * are rejected:
 *   - overlong encodings (lead bytes C0/C1, E0 followed by 80..9F,
 *     F0 followed by 80..8F),
 *   - encoded surrogates U+D800..U+DFFF (ED followed by A0..BF),
 *   - values above U+10FFFF (F4 followed by 90..BF, lead bytes F5..FF).
 */
int yajl_string_validate_utf8(const unsigned char * s, size_t len)
{
    if (!len) return 1;
    if (!s) return 0;

    while (len--) {
        const unsigned char lead = *s;

        /* single byte */
        if (lead <= 0x7f) {
            /* noop */
        }
        /* two byte; C0 and C1 could only encode overlong forms */
        else if (lead >= 0xc2 && lead <= 0xdf) {
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
        }
        /* three byte */
        else if ((lead >> 4) == 0x0e) {
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
            /* E0 80..9F is an overlong form of a shorter sequence */
            if (lead == 0xe0 && *s < 0xa0) return 0;
            /* ED A0..BF would encode a UTF-16 surrogate */
            if (lead == 0xed && *s > 0x9f) return 0;
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
        }
        /* four byte; F5 and above are beyond U+10FFFF */
        else if (lead >= 0xf0 && lead <= 0xf4) {
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
            /* F0 80..8F is an overlong form of a shorter sequence */
            if (lead == 0xf0 && *s < 0x90) return 0;
            /* F4 90..BF is beyond U+10FFFF */
            if (lead == 0xf4 && *s > 0x8f) return 0;
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
            ADV_PTR;
            if ((*s >> 6) != 0x2) return 0;
        } else {
            return 0;
        }

        s++;
    }

    return 1;
}