Source File src/yajl

1    | /*
2    |  * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
3    |  *
4    |  * Permission to use, copy, modify, and/or distribute this software for any
5    |  * purpose with or without fee is hereby granted, provided that the above
6    |  * copyright notice and this permission notice appear in all copies.
7    |  *
8    |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9    |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10   |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11   |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12   |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13   |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14   |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15   |  */
16   | 
17   | /**
18   |  * A JSON text lexical analyzer.
19   |  *
20   |  * The implementation.
21   |  **/
22   | 
23   | #include "yajl_lex.h"
24   | #include "yajl_buf.h"
25   | 
26   | #include <stdlib.h>
27   | #include <stdio.h>
28   | #include <assert.h>
29   | #include <string.h>
30   | 
#ifdef YAJL_LEXER_DEBUG
/*+
 * map a token to a short printable name.
 *
 * Only compiled when YAJL_LEXER_DEBUG is defined, where yajl_lex_lex() uses
 * it to label each token in its trace output.  The opening and closing
 * variants of braces and brackets deliberately share one name each, and any
 * token without a case below is reported as "unknown".
 +*/
static const char *
tokToStr(yajl_tok tok)
{
    const char * name = "unknown";

    switch (tok) {
        case yajl_tok_bool:
            name = "bool";
            break;
        case yajl_tok_colon:
            name = "colon";
            break;
        case yajl_tok_comma:
            name = "comma";
            break;
        case yajl_tok_eof:
            name = "eof";
            break;
        case yajl_tok_error:
            name = "error";
            break;
        case yajl_tok_left_brace:
        case yajl_tok_right_brace:
            name = "brace";
            break;
        case yajl_tok_left_bracket:
        case yajl_tok_right_bracket:
            name = "bracket";
            break;
        case yajl_tok_null:
            name = "null";
            break;
        case yajl_tok_integer:
            name = "integer";
            break;
        case yajl_tok_double:
            name = "double";
            break;
        case yajl_tok_string:
            name = "string";
            break;
        case yajl_tok_string_with_escapes:
            name = "string_with_escapes";
            break;
    }

    return name;
}
#endif
54   | 
55   | /*
56   |  * Impact of the stream parsing feature on the lexer:
57   |  *
58   |  * YAJL supports parsing of streams.  That is, the ability to parse the first
59   |  * bits of a chunk of JSON before the last bits are available (still on
60   |  * the network or disk).  This makes the lexer more complex.  The
61   |  * responsibility of the lexer is to handle transparently the case where
62   |  * a chunk boundary falls in the middle of a token.  This is
63   |  * accomplished via a buffer and a character reading abstraction.
64   |  *
65   |  * Overview of implementation
66   |  *
67   |  * When we lex to end of input string before end of token is hit, we
68   |  * copy all of the input text composing the token into our lexBuf.
69   |  *
70   |  * Every time we read a character, we do so through the readChar function.
71   |  * readChar's responsibility is to handle pulling all chars from the buffer
72   |  * before pulling chars from input text
73   |  */
74   | 
75   | /*+ the (private) lexer context +*/
76   | struct yajl_lexer_t {
77   |     /*+ the current line count +*/
78   |     size_t lineOff;
79   |     /* the current character offset into the current line (i.e. since the last '\r' or '\n') */
80   |     size_t charOff;
81   | 
82   |     /*+ error +*/
83   |     yajl_lex_error error;
84   | 
85   |     /*+ a input buffer to handle the case where a token is spread over
86   |      * multiple chunks +*/
87   |     yajl_buf buf;
88   | 
89   |     /*+ in the case where we have data in the lexBuf, bufOff holds
90   |      * the current offset into the lexBuf. +*/
91   |     size_t bufOff;
92   | 
93   |     /*+ are we using the lex buf? +*/
94   |     /* bool */ int bufInUse;
95   | 
96   |     /*+ shall we allow comments? +*/
97   |     /* bool */ int allowComments;
98   | 
99   |     /*+ shall we validate utf8 inside strings? +*/
100  |     /* bool */ int validateUTF8;
101  | 
102  |     /* the allocator functions being used by this lexer */
103  |     yajl_alloc_funcs * alloc;
104  | };
105  | 
/*
 * readChar(lxr, txt, off): fetch the next input byte and step past it.
 *
 * While the lexer's buffer is in use and still holds unread bytes, the byte
 * comes from that buffer and lxr->bufOff is advanced; otherwise it comes
 * from txt[*off] and *off is advanced.  The macro performs no bounds check
 * on txt, so callers must test *off against the chunk length first.
 *
 * Every argument is fully parenthesised, but lxr is evaluated several
 * times: do not pass an expression with side effects.
 */
#define readChar(lxr, txt, off)                                              \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                         \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                            \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
     ((txt)[(*(off))++]))

/*
 * unreadChar(lxr, off): step back over the byte most recently read.
 *
 * If *off is non-zero the chunk offset is decremented, otherwise the byte
 * is taken to have come from the lexer's buffer and lxr->bufOff is
 * decremented instead.
 */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
112  | 
113  | /*+
114  |  * allocate a lexer context
115  |  *
116  |  * Returns a lexer context object that must be passed to calls to
117  |  * yajl_lex_lex(), etc., and which must be passed to yajl_lex_free() when lexing
118  |  * is complete (successfully or not).
119  |  +*/
120  | yajl_lexer
121  | yajl_lex_alloc(yajl_alloc_funcs * alloc, /*+ allocator functions, e.g. from yajl_set_default_alloc_funcs() +*/
122  |                /* bool */ int allowComments, /*+ should this lexer handle comments embedded in the JSON text? +*/
123  |                /* bool */ int validateUTF8)  /*+ should this lexer validate UTF8 characters? +*/
124  | {
125  |     yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
126  |     memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
127  |     lxr->buf = yajl_buf_alloc(alloc);
128  |     lxr->allowComments = allowComments;
129  |     lxr->validateUTF8 = validateUTF8;
130  |     lxr->alloc = alloc;
131  |     return lxr;
132  | }
133  | 
134  | /*+ free a lexer context +*/
135  | void
136  | yajl_lex_free(yajl_lexer lxr)           /*+ the lexer context to free +*/
137  | {
138  |     yajl_buf_free(lxr->buf);
139  |     YA_FREE(lxr->alloc, lxr);
140  |     return;
141  | }
142  | 
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10

/*+
 * a lookup table, indexed by byte value, which lets us quickly determine
 * the following properties of a character:
 *
 * VEC - valid escaped control char (may follow a backslash)
 *
 * IJC - invalid json char (may not appear raw inside a string)
 *
 * VHC - valid hex char
 *
 * NFP - needs further processing (from a string scanning perspective)
 *
 * NUC - needs utf8 checking when enabled (from a string scanning perspective)
 *
 * note.  the solidus '/' may be escaped or not.
 +*/
static const char charLookupTable[256] =
{
    /* 0x00 - 0x0f */
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    /* 0x10 - 0x1f */
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,

    /* 0x20 - 0x2f: '"' is 0x22, '/' is 0x2f */
    0, 0, NFP|VEC|IJC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, VEC,
    /* 0x30 - 0x3f: '0' - '9' are 0x30 - 0x39 */
    VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4f: 'A' - 'F' are 0x41 - 0x46 */
    0, VHC, VHC, VHC, VHC, VHC, VHC, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: '\\' is 0x5c */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NFP|VEC|IJC, 0, 0, 0,

    /* 0x60 - 0x6f: 'a' - 'f' are 0x61 - 0x66, 'n' is 0x6e */
    0, VHC, VEC|VHC, VHC, VHC, VHC, VEC|VHC, 0, 0, 0, 0, 0, 0, 0, VEC, 0,
    /* 0x70 - 0x7f: 'r' is 0x72, 't' is 0x74 */
    0, 0, VEC, 0, VEC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x80 - 0x8f */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
    /* 0x90 - 0x9f */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
    /* 0xa0 - 0xaf */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
    /* 0xb0 - 0xbf */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
    /* 0xc0 - 0xcf */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
    /* 0xd0 - 0xdf */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
    /* 0xe0 - 0xef */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC,
    /* 0xf0 - 0xff */
    NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC, NUC
};
206  | 
/* Return yajl_tok_eof from the enclosing function once the chunk offset has
 * reached the end of the chunk.  Expects `offset` (size_t *) and
 * `jsonTextLen` to be in scope.  Wrapped in do/while(0) so that it behaves
 * as a single statement, e.g. in an unbraced if/else. */
#define UTF8_CHECK_EOF \
    do { if (*offset >= jsonTextLen) { return yajl_tok_eof; } } while (0)
208  | 
209  | /*+
210  |  *  process a variable length utf8 encoded codepoint.
211  |  *
212  |  *  returns:
213  |  *
214  |  *    yajl_tok_string - if valid utf8 char was parsed and offset was
215  |  *                      advanced
216  |  *
217  |  *    yajl_tok_eof - if end of input was hit before validation could
218  |  *                   complete
219  |  *
220  |  *    yajl_tok_error - if invalid utf8 was encountered
221  |  *
222  |  *  NOTE: on error the offset will point to the first char of the
223  |  *  invalid utf8
224  |  +*/
225  | static yajl_tok
226  | yajl_lex_utf8_char(yajl_lexer lexer,
227  |                    const unsigned char * jsonText,
228  |                    size_t jsonTextLen,
229  |                    size_t * offset,
230  |                    unsigned int curChar)
231  | {
232  |     if (curChar <= 0x7f) {
233  |         /* single byte */
234  |         return yajl_tok_string;
235  |     } else if ((curChar >> 5) == 0x6) {
236  |         /* two byte */
237  |         UTF8_CHECK_EOF;
238  |         curChar = readChar(lexer, jsonText, offset);
239  |         if ((curChar >> 6) == 0x2) return yajl_tok_string;
240  |     } else if ((curChar >> 4) == 0x0e) {
241  |         /* three byte */
242  |         UTF8_CHECK_EOF;
243  |         curChar = readChar(lexer, jsonText, offset);
244  |         if ((curChar >> 6) == 0x2) {
245  |             UTF8_CHECK_EOF;
246  |             curChar = readChar(lexer, jsonText, offset);
247  |             if ((curChar >> 6) == 0x2) return yajl_tok_string;
248  |         }
249  |     } else if ((curChar >> 3) == 0x1e) {
250  |         /* four byte */
251  |         UTF8_CHECK_EOF;
252  |         curChar = readChar(lexer, jsonText, offset);
253  |         if ((curChar >> 6) == 0x2) {
254  |             UTF8_CHECK_EOF;
255  |             curChar = readChar(lexer, jsonText, offset);
256  |             if ((curChar >> 6) == 0x2) {
257  |                 UTF8_CHECK_EOF;
258  |                 curChar = readChar(lexer, jsonText, offset);
259  |                 if ((curChar >> 6) == 0x2) return yajl_tok_string;
260  |             }
261  |         }
262  |     }
263  | 
264  |     return yajl_tok_error;
265  | }
266  | 
/* Used inside yajl_lex_string(): once the chunk offset has reached the end
 * of the chunk, set `tok` to yajl_tok_eof and jump to the
 * `finish_string_lex` label.  Expects `tok`, `offset` and `jsonTextLen` to
 * be in scope.  Wrapped in do/while(0) so that it behaves as a single
 * statement, e.g. in an unbraced if/else. */
#define STR_CHECK_EOF \
    do { \
        if (*offset >= jsonTextLen) { \
            tok = yajl_tok_eof; \
            goto finish_string_lex; \
        } \
    } while (0)
272  | 
273  | /*+
274  |  *  scan a string for interesting characters that might need further
275  |  *  review.
276  |  *
277  |  *  returns the number of chars that are uninteresting and can be skipped.
278  |  *
279  |  * (lth) hi world, any thoughts on how to make this routine faster?
280  |  +*/
281  | static size_t
282  | yajl_string_scan(const unsigned char * buf, size_t len, /* bool */ int utf8check)
283  | {
284  |     unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
285  |     size_t skip = 0;
286  |     while (skip < len && !(charLookupTable[*buf] & mask))
287  |     {
288  |         skip++;
289  |         buf++;
290  |     }
291  |     return skip;
292  | }
293  | 
294  | /*+
295  |  * lex a string.
296  |  *
297  |  * a token is returned which has the following meanings:
298  |  *
299  |  * yajl_tok_string: lex of string was successful.  offset points to
300  |  *                  terminating '"'.
301  |  *
302  |  * yajl_tok_eof: end of text was encountered before we could complete
303  |  *               the lex.
304  |  *
305  |  * yajl_tok_error: embedded in the string were unallowable chars.  offset
306  |  *               points to the offending char
307  |  +*/
308  | static yajl_tok
309  | yajl_lex_string(yajl_lexer lexer,       /*+ the current lexer context +*/
310  |                 const unsigned char * jsonText, /*+ a pointer to the beginning of the JSON text +*/
311  |                 size_t jsonTextLen,             /*+ length of the JSON text +*/
312  |                 size_t * offset)                /*+ offset of the string to be lexed +*/
313  | {
314  |     yajl_tok tok = yajl_tok_error;
315  |     int hasEscapes = 0;
316  | 
317  |     for (;;) {
318  |         unsigned char curChar;
319  | 
320  |         /* now jump into a faster scanning routine to skip as much
321  |          * of the buffers as possible */
322  |         {
323  |             const unsigned char * p;
324  |             size_t len;
325  | 
326  |             if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
327  |                  lexer->bufOff < yajl_buf_len(lexer->buf)))
328  |             {
329  |                 p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
330  |                      (lexer->bufOff));
331  |                 len = yajl_buf_len(lexer->buf) - lexer->bufOff;
332  |                 lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
333  |             }
334  |             else if (*offset < jsonTextLen)
335  |             {
336  |                 p = jsonText + *offset;
337  |                 len = jsonTextLen - *offset;
338  |                 *offset += yajl_string_scan(p, len, lexer->validateUTF8);
339  |             }
340  |         }
341  | 
342  |         STR_CHECK_EOF;
343  | 
344  |         curChar = readChar(lexer, jsonText, offset);
345  | 
346  |         /* quote terminates */
347  |         if (curChar == '"') {
348  |             tok = yajl_tok_string;
349  |             break;
350  |         }
351  |         /* backslash escapes a set of control chars, */
352  |         else if (curChar == '\\') {
353  |             hasEscapes = 1;
354  |             STR_CHECK_EOF;
355  | 
356  |             /* special case \u */
357  |             curChar = readChar(lexer, jsonText, offset);
358  |             if (curChar == 'u') {
359  |                 unsigned int i = 0;
360  | 
361  |                 for (i=0;i<4;i++) {
362  |                     STR_CHECK_EOF;
363  |                     curChar = readChar(lexer, jsonText, offset);
364  |                     if (!(charLookupTable[curChar] & VHC)) {
365  |                         /* back up to offending char */
366  |                         unreadChar(lexer, offset);
367  |                         lexer->error = yajl_lex_string_invalid_hex_char;
368  |                         goto finish_string_lex;
369  |                     }
370  |                 }
371  |             } else if (!(charLookupTable[curChar] & VEC)) {
372  |                 /* back up to offending char */
373  |                 unreadChar(lexer, offset);
374  |                 lexer->error = yajl_lex_string_invalid_escaped_char;
375  |                 goto finish_string_lex;
376  |             }
377  |         }
378  |         /* when not validating UTF8 it's a simple table lookup to determine
379  |          * if the present character is invalid */
380  |         else if(charLookupTable[curChar] & IJC) {
381  |             /* back up to offending char */
382  |             unreadChar(lexer, offset);
383  |             lexer->error = yajl_lex_string_invalid_json_char;
384  |             goto finish_string_lex;
385  |         }
386  |         /* when in validate UTF8 mode we need to do some extra work */
387  |         else if (lexer->validateUTF8) {
388  |             yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
389  |                                             offset, (unsigned int) curChar);
390  | 
391  |             if (t == yajl_tok_eof) {
392  |                 tok = yajl_tok_eof;
393  |                 goto finish_string_lex;
394  |             } else if (t == yajl_tok_error) {
395  |                 lexer->error = yajl_lex_string_invalid_utf8;
396  |                 goto finish_string_lex;
397  |             }
398  |         }
399  |         /* accept it, and move on */
400  |     }
401  |   finish_string_lex:
402  |     /* tell our buddy, the parser, wether he needs to process this string
403  |      * again */
404  |     if (hasEscapes && tok == yajl_tok_string) {
405  |         tok = yajl_tok_string_with_escapes;
406  |     }
407  | 
408  |     return tok;
409  | }
410  | 
/* Return yajl_tok_eof from the enclosing function once the chunk offset has
 * reached the end of the chunk.  Expects `offset` (size_t *) and
 * `jsonTextLen` to be in scope.  Wrapped in do/while(0) so that it behaves
 * as a single statement, e.g. in an unbraced if/else. */
#define RETURN_IF_EOF \
    do { if (*offset >= jsonTextLen) return yajl_tok_eof; } while (0)
412  | 
413  | static yajl_tok
414  | yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
415  |                 size_t jsonTextLen, size_t * offset)
416  | {
417  |     /*
418  |      * XXX:  numbers are the only entities in json that we must lex
419  |      *       _beyond_ in order to know that they are complete.  There
420  |      *       is an ambiguous case for integers at EOF.
421  |      */
422  | 
423  |     unsigned char c;
424  | 
425  |     yajl_tok tok = yajl_tok_integer;
426  | 
427  |     RETURN_IF_EOF;
428  |     c = readChar(lexer, jsonText, offset);
429  | 
430  |     /* optional leading minus */
431  |     if (c == '-') {
432  |         RETURN_IF_EOF;
433  |         c = readChar(lexer, jsonText, offset);
434  |     }
435  | 
436  |     /* a single zero, or a series of integers */
437  |     if (c == '0') {
438  |         RETURN_IF_EOF;
439  |         c = readChar(lexer, jsonText, offset);
440  |     } else if (c >= '1' && c <= '9') {
441  |         do {
442  |             RETURN_IF_EOF;
443  |             c = readChar(lexer, jsonText, offset);
444  |         } while (c >= '0' && c <= '9');
445  |     } else {
446  |         unreadChar(lexer, offset);
447  |         lexer->error = yajl_lex_missing_integer_after_minus;
448  |         return yajl_tok_error;
449  |     }
450  | 
451  |     /* optional fraction (indicates this is floating point) */
452  |     if (c == '.') {
453  |         int numRd = 0;
454  | 
455  |         RETURN_IF_EOF;
456  |         c = readChar(lexer, jsonText, offset);
457  | 
458  |         while (c >= '0' && c <= '9') {
459  |             numRd++;
460  |             RETURN_IF_EOF;
461  |             c = readChar(lexer, jsonText, offset);
462  |         }
463  | 
464  |         if (!numRd) {
465  |             unreadChar(lexer, offset);
466  |             lexer->error = yajl_lex_missing_integer_after_decimal;
467  |             return yajl_tok_error;
468  |         }
469  |         tok = yajl_tok_double;
470  |     }
471  | 
472  |     /* optional exponent (indicates this is floating point) */
473  |     if (c == 'e' || c == 'E') {
474  |         RETURN_IF_EOF;
475  |         c = readChar(lexer, jsonText, offset);
476  | 
477  |         /* optional sign */
478  |         if (c == '+' || c == '-') {
479  |             RETURN_IF_EOF;
480  |             c = readChar(lexer, jsonText, offset);
481  |         }
482  | 
483  |         if (c >= '0' && c <= '9') {
484  |             do {
485  |                 RETURN_IF_EOF;
486  |                 c = readChar(lexer, jsonText, offset);
487  |             } while (c >= '0' && c <= '9');
488  |         } else {
489  |             unreadChar(lexer, offset);
490  |             lexer->error = yajl_lex_missing_integer_after_exponent;
491  |             return yajl_tok_error;
492  |         }
493  |         tok = yajl_tok_double;
494  |     }
495  | 
496  |     /* we always go "one too far" */
497  |     unreadChar(lexer, offset);
498  | 
499  |     return tok;
500  | }
501  | 
502  | static yajl_tok
503  | yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
504  |                  size_t jsonTextLen, size_t * offset)
505  | {
506  |     unsigned char c;
507  | 
508  |     yajl_tok tok = yajl_tok_comment;
509  | 
510  |     RETURN_IF_EOF;
511  |     c = readChar(lexer, jsonText, offset);
512  | 
513  |     /* either slash or star expected */
514  |     if (c == '/') {
515  |         /* now we throw away until end of line */
516  |         do {
517  |             RETURN_IF_EOF;
518  |             c = readChar(lexer, jsonText, offset);
519  |         } while (c != '\n');
520  |     } else if (c == '*') {
521  |         /* now we throw away until end of comment */
522  |         for (;;) {
523  |             RETURN_IF_EOF;
524  |             c = readChar(lexer, jsonText, offset);
525  |             if (c == '*') {
526  |                 RETURN_IF_EOF;
527  |                 c = readChar(lexer, jsonText, offset);
528  |                 if (c == '/') {
529  |                     break;
530  |                 } else {
531  |                     unreadChar(lexer, offset);
532  |                 }
533  |             }
534  |         }
535  |     } else {
536  |         lexer->error = yajl_lex_invalid_char;
537  |         tok = yajl_tok_error;
538  |     }
539  | 
540  |     return tok;
541  | }
542  | 
543  | /*+
544  |  * Begin or continue a lexer.
545  |  *
546  |  * Returns a JSON lexical token for the parser.
547  |  *
548  |  * When you pass the next chunk of data, context should be reinitialized to
549  |  * zero.  xxx ???
550  |  +*/
551  | yajl_tok
552  | yajl_lex_lex(yajl_lexer lexer,          /*+ the current lexer context +*/
553  |              const unsigned char * jsonText, /*+ a chunk of JSON text to be analysed +*/
554  |              size_t jsonTextLen,             /*+ length of this chunk +*/
555  |              size_t * offset,           /*+ Offset is both input & output!  It
556  |                                          * should be initialized to zero for a
557  |                                          * new chunk of target text, and upon
558  |                                          * subsetquent calls with the same
559  |                                          * target text should passed with the
560  |                                          * value of the previous invocation.
561  |                                          *
562  |                                          * The caller may be interested in the
563  |                                          * value of offset when an error is
564  |                                          * returned from the lexer.  This allows
565  |                                          * the caller to render useful error
566  |                                          * messages.
567  |                                          +*/
568  |              const unsigned char ** outBuf, /*+ Finally, the output buffer is
569  |                                              * usually just a pointer into the
570  |                                              * jsonText, however in cases where
571  |                                              * the entity being lexed spans
572  |                                              * multiple chunks, the lexer will
573  |                                              * buffer the entity and the data
574  |                                              * returned will be a pointer into
575  |                                              * that buffer. +*/
576  |              size_t * outLen)           /*+ This behavior is abstracted from
577  |                                          * client code except for the
578  |                                          * performance implications which
579  |                                          * require that the client choose a
580  |                                          * reasonable chunk size to get adequate
581  |                                          * performance. +*/
582  | {
583  |     yajl_tok tok = yajl_tok_error;
584  |     unsigned char c;
585  |     size_t startOffset = *offset;
586  | 
587  |     *outBuf = NULL;
588  |     *outLen = 0;
589  | 
590  |     for (;;) {
591  |         assert(*offset <= jsonTextLen);
592  | 
593  |         if (*offset >= jsonTextLen) {
594  |             tok = yajl_tok_eof;
595  |             goto lexed;
596  |         }
597  | 
598  |         c = readChar(lexer, jsonText, offset);
599  | 
600  |         switch (c) {
601  |             case '{':
602  |                 tok = yajl_tok_left_bracket;
603  |                 goto lexed;
604  |             case '}':
605  |                 tok = yajl_tok_right_bracket;
606  |                 goto lexed;
607  |             case '[':
608  |                 tok = yajl_tok_left_brace;
609  |                 goto lexed;
610  |             case ']':
611  |                 tok = yajl_tok_right_brace;
612  |                 goto lexed;
613  |             case ',':
614  |                 tok = yajl_tok_comma;
615  |                 goto lexed;
616  |             case ':':
617  |                 tok = yajl_tok_colon;
618  |                 goto lexed;
619  |             case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
620  |                 startOffset++;
621  |                 break;
622  |             case 't': {
623  |                 const char * want = "rue";
624  |                 do {
625  |                     if (*offset >= jsonTextLen) {
626  |                         tok = yajl_tok_eof;
627  |                         goto lexed;
628  |                     }
629  |                     c = readChar(lexer, jsonText, offset);
630  |                     if (c != *want) {
631  |                         unreadChar(lexer, offset);
632  |                         lexer->error = yajl_lex_invalid_string;
633  |                         tok = yajl_tok_error;
634  |                         goto lexed;
635  |                     }
636  |                 } while (*(++want));
637  |                 tok = yajl_tok_bool;
638  |                 goto lexed;
639  |             }
640  |             case 'f': {
641  |                 const char * want = "alse";
642  |                 do {
643  |                     if (*offset >= jsonTextLen) {
644  |                         tok = yajl_tok_eof;
645  |                         goto lexed;
646  |                     }
647  |                     c = readChar(lexer, jsonText, offset);
648  |                     if (c != *want) {
649  |                         unreadChar(lexer, offset);
650  |                         lexer->error = yajl_lex_invalid_string;
651  |                         tok = yajl_tok_error;
652  |                         goto lexed;
653  |                     }
654  |                 } while (*(++want));
655  |                 tok = yajl_tok_bool;
656  |                 goto lexed;
657  |             }
658  |             case 'n': {
659  |                 const char * want = "ull";
660  |                 do {
661  |                     if (*offset >= jsonTextLen) {
662  |                         tok = yajl_tok_eof;
663  |                         goto lexed;
664  |                     }
665  |                     c = readChar(lexer, jsonText, offset);
666  |                     if (c != *want) {
667  |                         unreadChar(lexer, offset);
668  |                         lexer->error = yajl_lex_invalid_string;
669  |                         tok = yajl_tok_error;
670  |                         goto lexed;
671  |                     }
672  |                 } while (*(++want));
673  |                 tok = yajl_tok_null;
674  |                 goto lexed;
675  |             }
676  |             case '"': {
677  |                 tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
678  |                                       jsonTextLen, offset);
679  |                 goto lexed;
680  |             }
681  |             case '-':
682  |             case '0': case '1': case '2': case '3': case '4':
683  |             case '5': case '6': case '7': case '8': case '9': {
684  |                 /* integer parsing wants to start from the beginning */
685  |                 unreadChar(lexer, offset);
686  |                 tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
687  |                                       jsonTextLen, offset);
688  |                 goto lexed;
689  |             }
690  |             case '/':
691  |                 /* hey, look, a probable comment!  If comments are disabled
692  |                  * it's an error. */
693  |                 if (!lexer->allowComments) {
694  |                     unreadChar(lexer, offset);
695  |                     lexer->error = yajl_lex_unallowed_comment;
696  |                     tok = yajl_tok_error;
697  |                     goto lexed;
698  |                 }
699  |                 /* if comments are enabled, then we should try to lex
700  |                  * the thing.  possible outcomes are
701  |                  * - successful lex (tok_comment, which means continue),
702  |                  * - malformed comment opening (slash not followed by
703  |                  *   '*' or '/') (tok_error)
704  |                  * - eof hit. (tok_eof) */
705  |                 tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
706  |                                        jsonTextLen, offset);
707  |                 if (tok == yajl_tok_comment) {
708  |                     /* "error" is silly, but that's the initial
709  |                      * state of tok.  guilty until proven innocent. */
710  |                     tok = yajl_tok_error;
711  |                     yajl_buf_clear(lexer->buf);
712  |                     lexer->bufInUse = 0;
713  |                     startOffset = *offset;
714  |                     break;
715  |                 }
716  |                 /* hit error or eof, bail */
717  |                 goto lexed;
718  |             default:
719  |                 lexer->error = yajl_lex_invalid_char;
720  |                 tok = yajl_tok_error;
721  |                 goto lexed;
722  |         }
723  |     }
724  | 
725  | 
726  |   lexed:
727  |     /* need to append to buffer if the buffer is in use or
728  |      * if it's an EOF token */
729  |     if (tok == yajl_tok_eof || lexer->bufInUse) {
730  |         if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
731  |         lexer->bufInUse = 1;
732  |         yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
733  |         lexer->bufOff = 0;
734  | 
735  |         if (tok != yajl_tok_eof) {
736  |             *outBuf = yajl_buf_data(lexer->buf);
737  |             *outLen = yajl_buf_len(lexer->buf);
738  |             lexer->bufInUse = 0;
739  |         }
740  |     } else if (tok != yajl_tok_error) {
741  |         *outBuf = jsonText + startOffset;
742  |         *outLen = *offset - startOffset;
743  |     }
744  | 
745  |     /* special case for strings. skip the quotes. */
746  |     if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
747  |     {
748  |         assert(*outLen >= 2);
749  |         (*outBuf)++;
750  |         *outLen -= 2;
751  |     }
752  | 
753  | 
754  | #ifdef YAJL_LEXER_DEBUG
755  |     if (tok == yajl_tok_error) {
756  |         printf("lexical error: %s\n",
757  |                yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
758  |     } else if (tok == yajl_tok_eof) {
759  |         printf("EOF hit\n");
760  |     } else {
761  |         printf("lexed %s: '", tokToStr(tok));
762  |         fwrite(*outBuf, (size_t) 1, *outLen, stdout);
763  |         printf("'\n");
764  |     }
765  | #endif
766  | 
767  |     return tok;
768  | }
769  | 
770  | /*+
771  |  *  map a lexer error value, as obtained from yajl_lex_get_error(), to a
772  |  *  static, human readable description
773  |  +*/
774  | const char *
775  | yajl_lex_error_to_string(yajl_lex_error error) /*+ lexer error value +*/
776  | {
777  |     /* one test per known code; the matching literal is returned as-is */
778  |     if (error == yajl_lex_e_ok)
779  |         return "ok, no error";
780  |     if (error == yajl_lex_string_invalid_utf8)
781  |         return "invalid bytes in UTF8 string.";
782  |     if (error == yajl_lex_string_invalid_escaped_char)
783  |         return "inside a string, '\\' occurs before a character "
784  |                "which it may not.";
785  |     if (error == yajl_lex_string_invalid_json_char)
786  |         return "invalid character inside string.";
787  |     if (error == yajl_lex_string_invalid_hex_char)
788  |         return "invalid (non-hex) character occurs after '\\u' inside "
789  |                "string.";
790  |     if (error == yajl_lex_invalid_char)
791  |         return "invalid char in json text.";
792  |     if (error == yajl_lex_invalid_string)
793  |         return "invalid string in json text.";
794  |     if (error == yajl_lex_missing_integer_after_exponent)
795  |         return "malformed number, a digit is required after the exponent.";
796  |     if (error == yajl_lex_missing_integer_after_decimal)
797  |         return "malformed number, a digit is required after the "
798  |                "decimal point.";
799  |     if (error == yajl_lex_missing_integer_after_minus)
800  |         return "malformed number, a digit is required after the "
801  |                "minus sign.";
802  |     if (error == yajl_lex_unallowed_comment)
803  |         return "probable comment found in input text, comments are "
804  |                "not enabled.";
805  | 
806  |     /* no known code matched, e.g. a value cast from a plain integer */
807  |     return "unknown error code";
808  | }
809  | 
810  | 
811  | /*+
812  |  *  fetch the detailed lexical error value stored in the lexer; meant to
813  |  *  be consulted when yajl_lex_lex() returns yajl_tok_error.
814  |  *
815  |  *  Returns a value that may be passed to yajl_lex_error_to_string(), or
816  |  *  (yajl_lex_error) -1 when lexer is NULL.
817  |  +*/
818  | yajl_lex_error
819  | yajl_lex_get_error(yajl_lexer lexer)    /*+ the current lexer context +*/
820  | {
821  |     /* a NULL lexer is answered with -1 instead of being dereferenced */
822  |     return (lexer != NULL) ? lexer->error : (yajl_lex_error) -1;
823  | }
824  | 
825  | /*+
826  |  *  A helper for locating the line of an error in the input: returns the
827  |  *  number of lines lexed by this lexer instance.  lexer must not be NULL.
828  |  +*/
829  | size_t
830  | yajl_lex_current_line(yajl_lexer lexer) /*+ the current lexer context +*/
831  | {
832  |     const size_t linesLexed = lexer->lineOff;
833  |     return linesLexed;
834  | }
835  | 
836  | /*+
837  |  *  A helper for locating the exact position of an error in the input:
838  |  *  returns the number of chars lexed by this lexer instance since the
839  |  *  last \n or \r.  lexer must not be NULL.
840  |  +*/
841  | size_t
842  | yajl_lex_current_char(yajl_lexer lexer) /*+ the current lexer context +*/
843  | {
844  |     const size_t charsLexed = lexer->charOff;
845  |     return charsLexed;
846  | }
847  | 
848  | /*+
849  |  * look ahead one token, then restore the lexer's buffer bookkeeping
850  |  *
851  |  * Returns the next token yajl_lex_lex() will return.
852  |  +*/
853  | yajl_tok
854  | yajl_lex_peek(yajl_lexer lexer, /*+ the current lexer context +*/
855  |                        const unsigned char * jsonText,
856  |                        size_t jsonTextLen,
857  |                        size_t offset)
858  | {
859  |     /* snapshot of the buffer state that lexing is allowed to change */
860  |     const size_t savedBufLen = yajl_buf_len(lexer->buf);
861  |     const size_t savedBufOff = lexer->bufOff;
862  |     const int savedBufInUse = lexer->bufInUse;
863  |     const unsigned char * tokText;  /* filled in by the lexer, unused here */
864  |     size_t tokLen;
865  |     yajl_tok peeked = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
866  |                                    &tokText, &tokLen);
867  | 
868  |     /* offset was a by-value copy; only the buffer state needs rolling back */
869  |     lexer->bufOff = savedBufOff;
870  |     lexer->bufInUse = savedBufInUse;
871  |     yajl_buf_truncate(lexer->buf, savedBufLen);
872  | 
873  |     return peeked;
874  | }