1 | /*
2 | * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
3 | *
4 | * Permission to use, copy, modify, and/or distribute this software for any
5 | * purpose with or without fee is hereby granted, provided that the above
6 | * copyright notice and this permission notice appear in all copies.
7 | *
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 | */
16 |
17 | /**
18 | * A JSON text lexical analyzer.
19 | *
20 | * The implementation.
21 | **/
22 |
23 | #include "yajl_lex.h"
24 | #include "yajl_buf.h"
25 |
26 | #include <stdlib.h>
27 | #include <stdio.h>
28 | #include <assert.h>
29 | #include <string.h>
30 |
#ifdef YAJL_LEXER_DEBUG
/*
 * Map a token type to a short name for the YAJL_LEXER_DEBUG trace printed at
 * the end of yajl_lex_lex().
 *
 * Left and right delimiters get distinct names so the trace is unambiguous,
 * and yajl_tok_comment (produced internally by yajl_lex_comment()) is named
 * too.  Any value without a case falls through to "unknown".
 */
static const char *
tokToStr(yajl_tok tok)
{
    switch (tok) {
        case yajl_tok_bool: return "bool";
        case yajl_tok_colon: return "colon";
        case yajl_tok_comma: return "comma";
        case yajl_tok_eof: return "eof";
        case yajl_tok_error: return "error";
        case yajl_tok_left_brace: return "left_brace";
        case yajl_tok_left_bracket: return "left_bracket";
        case yajl_tok_null: return "null";
        case yajl_tok_integer: return "integer";
        case yajl_tok_double: return "double";
        case yajl_tok_right_brace: return "right_brace";
        case yajl_tok_right_bracket: return "right_bracket";
        case yajl_tok_string: return "string";
        case yajl_tok_string_with_escapes: return "string_with_escapes";
        case yajl_tok_comment: return "comment";
    }
    return "unknown";
}
#endif
54 |
55 | /*
56 | * Impact of the stream parsing feature on the lexer:
57 | *
58 | * YAJL supports parsing of streams. That is, the ability to parse the first
59 | * bits of a chunk of JSON before the last bits are available (still on
60 | * the network or disk). This makes the lexer more complex. The
61 | * responsibility of the lexer is to handle transparently the case where
62 | * a chunk boundary falls in the middle of a token. This is
63 | * accomplished via a buffer and a character reading abstraction.
64 | *
65 | * Overview of implementation
66 | *
67 | * When we lex to end of input string before end of token is hit, we
68 | * copy all of the input text composing the token into our lexBuf.
69 | *
70 | * Every time we read a character, we do so through the readChar function.
71 | * readChar's responsibility is to handle pulling all chars from the buffer
72 | * before pulling chars from input text
73 | */
74 |
75 | /*+ the (private) lexer context +*/
76 | struct yajl_lexer_t {
77 | /*+ the current line count +*/
78 | size_t lineOff;
79 | /* the current character offset into the current line (i.e. since the last '\r' or '\n') */
80 | size_t charOff;
81 |
82 | /*+ error +*/
83 | yajl_lex_error error;
84 |
85 | /*+ a input buffer to handle the case where a token is spread over
86 | * multiple chunks +*/
87 | yajl_buf buf;
88 |
89 | /*+ in the case where we have data in the lexBuf, bufOff holds
90 | * the current offset into the lexBuf. +*/
91 | size_t bufOff;
92 |
93 | /*+ are we using the lex buf? +*/
94 | /* bool */ int bufInUse;
95 |
96 | /*+ shall we allow comments? +*/
97 | /* bool */ int allowComments;
98 |
99 | /*+ shall we validate utf8 inside strings? +*/
100 | /* bool */ int validateUTF8;
101 |
102 | /* the allocator functions being used by this lexer */
103 | yajl_alloc_funcs * alloc;
104 | };
105 |
/*
 * readChar(lxr, txt, off) -- fetch the next input byte.
 *
 * While the lexer's buffer is in use and still has unread bytes, the byte
 * comes from the buffer and lxr->bufOff is advanced; otherwise it comes from
 * txt[*off] and *off is advanced.  There is no bounds check on txt: callers
 * test *off against the chunk length before reading.
 *
 * Every parameter is parenthesized so any pointer expression may be passed,
 * but lxr is evaluated several times: do not pass arguments with side
 * effects.
 */
#define readChar(lxr, txt, off)                                            \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                       \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                          \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) +                \
        ((lxr)->bufOff)++)) :                                              \
     ((txt)[(*(off))++]))

/*
 * unreadChar(lxr, off) -- push back the byte most recently read.
 *
 * Steps *off back when it is non-zero, otherwise steps lxr->bufOff back
 * (the byte must then have come from the buffer).
 */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
112 |
113 | /*+
114 | * allocate a lexer context
115 | *
116 | * Returns a lexer context object that must be passed to calls to
117 | * yajl_lex_lex(), etc., and which must be passed to yajl_lex_free() when lexing
118 | * is complete (successfully or not).
119 | +*/
120 | yajl_lexer
121 | yajl_lex_alloc(yajl_alloc_funcs * alloc, /*+ allocator functions, e.g. from yajl_set_default_alloc_funcs() +*/
122 | /* bool */ int allowComments, /*+ should this lexer handle comments embedded in the JSON text? +*/
123 | /* bool */ int validateUTF8) /*+ should this lexer validate UTF8 characters? +*/
124 | {
125 | yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
126 | memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
127 | lxr->buf = yajl_buf_alloc(alloc);
128 | lxr->allowComments = allowComments;
129 | lxr->validateUTF8 = validateUTF8;
130 | lxr->alloc = alloc;
131 | return lxr;
132 | }
133 |
134 | /*+ free a lexer context +*/
135 | void
136 | yajl_lex_free(yajl_lexer lxr) /*+ the lexer context to free +*/
137 | {
138 | yajl_buf_free(lxr->buf);
139 | YA_FREE(lxr->alloc, lxr);
140 | return;
141 | }
142 |
#define VEC 0x01
#define IJC 0x02
#define VHC 0x04
#define NFP 0x08
#define NUC 0x10

/*+
 * a lookup table, indexed by byte value, which lets us quickly determine
 * five things about a character:
 *
 * VEC - valid escaped control char (may follow a backslash in a string)
 *
 * IJC - invalid json char (may not appear unescaped inside a string)
 *
 * VHC - valid hex char
 *
 * NFP - needs further processing (from a string scanning perspective)
 *
 * NUC - needs utf8 checking when enabled (from a string scanning perspective)
 *
 * note. the solidus '/' may be escaped or not.
 *
 * note. 'u' is deliberately not marked VEC: the string lexer handles the
 * "\u" escape separately, before consulting this table.
 +*/
static const unsigned char charLookupTable[256] =
{
/*00*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*08*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*10*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,
/*18*/ IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    , IJC    ,

/*20*/ 0      , 0      , NFP|VEC|IJC, 0  , 0      , 0      , 0      , 0      ,
/*28*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , VEC    ,
/*30*/ VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    ,
/*38*/ VHC    , VHC    , 0      , 0      , 0      , 0      , 0      , 0      ,

/*40*/ 0      , VHC    , VHC    , VHC    , VHC    , VHC    , VHC    , 0      ,
/*48*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
/*50*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
/*58*/ 0      , 0      , 0      , 0      , NFP|VEC|IJC, 0  , 0      , 0      ,

/*60*/ 0      , VHC    , VEC|VHC, VHC    , VHC    , VHC    , VEC|VHC, 0      ,
/*68*/ 0      , 0      , 0      , 0      , 0      , 0      , VEC    , 0      ,
/*70*/ 0      , 0      , VEC    , 0      , VEC    , 0      , 0      , 0      ,
/*78*/ 0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,

/*80*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*88*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*90*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*98*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

/*a0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*a8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*b0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*b8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

/*c0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*c8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*d0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*d8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,

/*e0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*e8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*f0*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    ,
/*f8*/ NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC    , NUC
};
206 |
207 | #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
208 |
209 | /*+
210 | * process a variable length utf8 encoded codepoint.
211 | *
212 | * returns:
213 | *
214 | * yajl_tok_string - if valid utf8 char was parsed and offset was
215 | * advanced
216 | *
217 | * yajl_tok_eof - if end of input was hit before validation could
218 | * complete
219 | *
220 | * yajl_tok_error - if invalid utf8 was encountered
221 | *
222 | * NOTE: on error the offset will point to the first char of the
223 | * invalid utf8
224 | +*/
225 | static yajl_tok
226 | yajl_lex_utf8_char(yajl_lexer lexer,
227 | const unsigned char * jsonText,
228 | size_t jsonTextLen,
229 | size_t * offset,
230 | unsigned int curChar)
231 | {
232 | if (curChar <= 0x7f) {
233 | /* single byte */
234 | return yajl_tok_string;
235 | } else if ((curChar >> 5) == 0x6) {
236 | /* two byte */
237 | UTF8_CHECK_EOF;
238 | curChar = readChar(lexer, jsonText, offset);
239 | if ((curChar >> 6) == 0x2) return yajl_tok_string;
240 | } else if ((curChar >> 4) == 0x0e) {
241 | /* three byte */
242 | UTF8_CHECK_EOF;
243 | curChar = readChar(lexer, jsonText, offset);
244 | if ((curChar >> 6) == 0x2) {
245 | UTF8_CHECK_EOF;
246 | curChar = readChar(lexer, jsonText, offset);
247 | if ((curChar >> 6) == 0x2) return yajl_tok_string;
248 | }
249 | } else if ((curChar >> 3) == 0x1e) {
250 | /* four byte */
251 | UTF8_CHECK_EOF;
252 | curChar = readChar(lexer, jsonText, offset);
253 | if ((curChar >> 6) == 0x2) {
254 | UTF8_CHECK_EOF;
255 | curChar = readChar(lexer, jsonText, offset);
256 | if ((curChar >> 6) == 0x2) {
257 | UTF8_CHECK_EOF;
258 | curChar = readChar(lexer, jsonText, offset);
259 | if ((curChar >> 6) == 0x2) return yajl_tok_string;
260 | }
261 | }
262 | }
263 |
264 | return yajl_tok_error;
265 | }
266 |
/*
 * Used while lexing a string: once the chunk is used up, record
 * yajl_tok_eof in the local `tok` and jump to the `finish_string_lex` label.
 * Relies on `offset`, `jsonTextLen`, `tok` and that label being in scope.
 *
 * Wrapped in do/while so that `STR_CHECK_EOF;` is exactly one statement and
 * cannot capture a following `else`.
 */
#define STR_CHECK_EOF                                                   \
    do {                                                                \
        if (*offset >= jsonTextLen) {                                   \
            tok = yajl_tok_eof;                                         \
            goto finish_string_lex;                                     \
        }                                                               \
    } while (0)
272 |
273 | /*+
274 | * scan a string for interesting characters that might need further
275 | * review.
276 | *
277 | * returns the number of chars that are uninteresting and can be skipped.
278 | *
279 | * (lth) hi world, any thoughts on how to make this routine faster?
280 | +*/
281 | static size_t
282 | yajl_string_scan(const unsigned char * buf, size_t len, /* bool */ int utf8check)
283 | {
284 | unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
285 | size_t skip = 0;
286 | while (skip < len && !(charLookupTable[*buf] & mask))
287 | {
288 | skip++;
289 | buf++;
290 | }
291 | return skip;
292 | }
293 |
294 | /*+
295 | * lex a string.
296 | *
297 | * a token is returned which has the following meanings:
298 | *
299 | * yajl_tok_string: lex of string was successful. offset points to
300 | * terminating '"'.
301 | *
302 | * yajl_tok_eof: end of text was encountered before we could complete
303 | * the lex.
304 | *
305 | * yajl_tok_error: embedded in the string were unallowable chars. offset
306 | * points to the offending char
307 | +*/
308 | static yajl_tok
309 | yajl_lex_string(yajl_lexer lexer, /*+ the current lexer context +*/
310 | const unsigned char * jsonText, /*+ a pointer to the beginning of the JSON text +*/
311 | size_t jsonTextLen, /*+ length of the JSON text +*/
312 | size_t * offset) /*+ offset of the string to be lexed +*/
313 | {
314 | yajl_tok tok = yajl_tok_error;
315 | int hasEscapes = 0;
316 |
317 | for (;;) {
318 | unsigned char curChar;
319 |
320 | /* now jump into a faster scanning routine to skip as much
321 | * of the buffers as possible */
322 | {
323 | const unsigned char * p;
324 | size_t len;
325 |
326 | if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
327 | lexer->bufOff < yajl_buf_len(lexer->buf)))
328 | {
329 | p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
330 | (lexer->bufOff));
331 | len = yajl_buf_len(lexer->buf) - lexer->bufOff;
332 | lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
333 | }
334 | else if (*offset < jsonTextLen)
335 | {
336 | p = jsonText + *offset;
337 | len = jsonTextLen - *offset;
338 | *offset += yajl_string_scan(p, len, lexer->validateUTF8);
339 | }
340 | }
341 |
342 | STR_CHECK_EOF;
343 |
344 | curChar = readChar(lexer, jsonText, offset);
345 |
346 | /* quote terminates */
347 | if (curChar == '"') {
348 | tok = yajl_tok_string;
349 | break;
350 | }
351 | /* backslash escapes a set of control chars, */
352 | else if (curChar == '\\') {
353 | hasEscapes = 1;
354 | STR_CHECK_EOF;
355 |
356 | /* special case \u */
357 | curChar = readChar(lexer, jsonText, offset);
358 | if (curChar == 'u') {
359 | unsigned int i = 0;
360 |
361 | for (i=0;i<4;i++) {
362 | STR_CHECK_EOF;
363 | curChar = readChar(lexer, jsonText, offset);
364 | if (!(charLookupTable[curChar] & VHC)) {
365 | /* back up to offending char */
366 | unreadChar(lexer, offset);
367 | lexer->error = yajl_lex_string_invalid_hex_char;
368 | goto finish_string_lex;
369 | }
370 | }
371 | } else if (!(charLookupTable[curChar] & VEC)) {
372 | /* back up to offending char */
373 | unreadChar(lexer, offset);
374 | lexer->error = yajl_lex_string_invalid_escaped_char;
375 | goto finish_string_lex;
376 | }
377 | }
378 | /* when not validating UTF8 it's a simple table lookup to determine
379 | * if the present character is invalid */
380 | else if(charLookupTable[curChar] & IJC) {
381 | /* back up to offending char */
382 | unreadChar(lexer, offset);
383 | lexer->error = yajl_lex_string_invalid_json_char;
384 | goto finish_string_lex;
385 | }
386 | /* when in validate UTF8 mode we need to do some extra work */
387 | else if (lexer->validateUTF8) {
388 | yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
389 | offset, (unsigned int) curChar);
390 |
391 | if (t == yajl_tok_eof) {
392 | tok = yajl_tok_eof;
393 | goto finish_string_lex;
394 | } else if (t == yajl_tok_error) {
395 | lexer->error = yajl_lex_string_invalid_utf8;
396 | goto finish_string_lex;
397 | }
398 | }
399 | /* accept it, and move on */
400 | }
401 | finish_string_lex:
402 | /* tell our buddy, the parser, wether he needs to process this string
403 | * again */
404 | if (hasEscapes && tok == yajl_tok_string) {
405 | tok = yajl_tok_string_with_escapes;
406 | }
407 |
408 | return tok;
409 | }
410 |
/*
 * Leave the enclosing function with yajl_tok_eof once the chunk is used up.
 * Relies on `offset` and `jsonTextLen` being in scope.
 *
 * Wrapped in do/while (and without a trailing semicolon of its own) so that
 * `RETURN_IF_EOF;` is exactly one statement and cannot capture a following
 * `else`.
 */
#define RETURN_IF_EOF \
    do { if (*offset >= jsonTextLen) return yajl_tok_eof; } while (0)
412 |
413 | static yajl_tok
414 | yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
415 | size_t jsonTextLen, size_t * offset)
416 | {
417 | /*
418 | * XXX: numbers are the only entities in json that we must lex
419 | * _beyond_ in order to know that they are complete. There
420 | * is an ambiguous case for integers at EOF.
421 | */
422 |
423 | unsigned char c;
424 |
425 | yajl_tok tok = yajl_tok_integer;
426 |
427 | RETURN_IF_EOF;
428 | c = readChar(lexer, jsonText, offset);
429 |
430 | /* optional leading minus */
431 | if (c == '-') {
432 | RETURN_IF_EOF;
433 | c = readChar(lexer, jsonText, offset);
434 | }
435 |
436 | /* a single zero, or a series of integers */
437 | if (c == '0') {
438 | RETURN_IF_EOF;
439 | c = readChar(lexer, jsonText, offset);
440 | } else if (c >= '1' && c <= '9') {
441 | do {
442 | RETURN_IF_EOF;
443 | c = readChar(lexer, jsonText, offset);
444 | } while (c >= '0' && c <= '9');
445 | } else {
446 | unreadChar(lexer, offset);
447 | lexer->error = yajl_lex_missing_integer_after_minus;
448 | return yajl_tok_error;
449 | }
450 |
451 | /* optional fraction (indicates this is floating point) */
452 | if (c == '.') {
453 | int numRd = 0;
454 |
455 | RETURN_IF_EOF;
456 | c = readChar(lexer, jsonText, offset);
457 |
458 | while (c >= '0' && c <= '9') {
459 | numRd++;
460 | RETURN_IF_EOF;
461 | c = readChar(lexer, jsonText, offset);
462 | }
463 |
464 | if (!numRd) {
465 | unreadChar(lexer, offset);
466 | lexer->error = yajl_lex_missing_integer_after_decimal;
467 | return yajl_tok_error;
468 | }
469 | tok = yajl_tok_double;
470 | }
471 |
472 | /* optional exponent (indicates this is floating point) */
473 | if (c == 'e' || c == 'E') {
474 | RETURN_IF_EOF;
475 | c = readChar(lexer, jsonText, offset);
476 |
477 | /* optional sign */
478 | if (c == '+' || c == '-') {
479 | RETURN_IF_EOF;
480 | c = readChar(lexer, jsonText, offset);
481 | }
482 |
483 | if (c >= '0' && c <= '9') {
484 | do {
485 | RETURN_IF_EOF;
486 | c = readChar(lexer, jsonText, offset);
487 | } while (c >= '0' && c <= '9');
488 | } else {
489 | unreadChar(lexer, offset);
490 | lexer->error = yajl_lex_missing_integer_after_exponent;
491 | return yajl_tok_error;
492 | }
493 | tok = yajl_tok_double;
494 | }
495 |
496 | /* we always go "one too far" */
497 | unreadChar(lexer, offset);
498 |
499 | return tok;
500 | }
501 |
502 | static yajl_tok
503 | yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
504 | size_t jsonTextLen, size_t * offset)
505 | {
506 | unsigned char c;
507 |
508 | yajl_tok tok = yajl_tok_comment;
509 |
510 | RETURN_IF_EOF;
511 | c = readChar(lexer, jsonText, offset);
512 |
513 | /* either slash or star expected */
514 | if (c == '/') {
515 | /* now we throw away until end of line */
516 | do {
517 | RETURN_IF_EOF;
518 | c = readChar(lexer, jsonText, offset);
519 | } while (c != '\n');
520 | } else if (c == '*') {
521 | /* now we throw away until end of comment */
522 | for (;;) {
523 | RETURN_IF_EOF;
524 | c = readChar(lexer, jsonText, offset);
525 | if (c == '*') {
526 | RETURN_IF_EOF;
527 | c = readChar(lexer, jsonText, offset);
528 | if (c == '/') {
529 | break;
530 | } else {
531 | unreadChar(lexer, offset);
532 | }
533 | }
534 | }
535 | } else {
536 | lexer->error = yajl_lex_invalid_char;
537 | tok = yajl_tok_error;
538 | }
539 |
540 | return tok;
541 | }
542 |
543 | /*+
544 | * Begin or continue a lexer.
545 | *
546 | * Returns a JSON lexical token for the parser.
547 | *
548 | * When you pass the next chunk of data, context should be reinitialized to
549 | * zero. xxx ???
550 | +*/
551 | yajl_tok
552 | yajl_lex_lex(yajl_lexer lexer, /*+ the current lexer context +*/
553 | const unsigned char * jsonText, /*+ a chunk of JSON text to be analysed +*/
554 | size_t jsonTextLen, /*+ length of this chunk +*/
555 | size_t * offset, /*+ Offset is both input & output! It
556 | * should be initialized to zero for a
557 | * new chunk of target text, and upon
558 | * subsetquent calls with the same
559 | * target text should passed with the
560 | * value of the previous invocation.
561 | *
562 | * The caller may be interested in the
563 | * value of offset when an error is
564 | * returned from the lexer. This allows
565 | * the caller to render useful error
566 | * messages.
567 | +*/
568 | const unsigned char ** outBuf, /*+ Finally, the output buffer is
569 | * usually just a pointer into the
570 | * jsonText, however in cases where
571 | * the entity being lexed spans
572 | * multiple chunks, the lexer will
573 | * buffer the entity and the data
574 | * returned will be a pointer into
575 | * that buffer. +*/
576 | size_t * outLen) /*+ This behavior is abstracted from
577 | * client code except for the
578 | * performance implications which
579 | * require that the client choose a
580 | * reasonable chunk size to get adequate
581 | * performance. +*/
582 | {
583 | yajl_tok tok = yajl_tok_error;
584 | unsigned char c;
585 | size_t startOffset = *offset;
586 |
587 | *outBuf = NULL;
588 | *outLen = 0;
589 |
590 | for (;;) {
591 | assert(*offset <= jsonTextLen);
592 |
593 | if (*offset >= jsonTextLen) {
594 | tok = yajl_tok_eof;
595 | goto lexed;
596 | }
597 |
598 | c = readChar(lexer, jsonText, offset);
599 |
600 | switch (c) {
601 | case '{':
602 | tok = yajl_tok_left_bracket;
603 | goto lexed;
604 | case '}':
605 | tok = yajl_tok_right_bracket;
606 | goto lexed;
607 | case '[':
608 | tok = yajl_tok_left_brace;
609 | goto lexed;
610 | case ']':
611 | tok = yajl_tok_right_brace;
612 | goto lexed;
613 | case ',':
614 | tok = yajl_tok_comma;
615 | goto lexed;
616 | case ':':
617 | tok = yajl_tok_colon;
618 | goto lexed;
619 | case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
620 | startOffset++;
621 | break;
622 | case 't': {
623 | const char * want = "rue";
624 | do {
625 | if (*offset >= jsonTextLen) {
626 | tok = yajl_tok_eof;
627 | goto lexed;
628 | }
629 | c = readChar(lexer, jsonText, offset);
630 | if (c != *want) {
631 | unreadChar(lexer, offset);
632 | lexer->error = yajl_lex_invalid_string;
633 | tok = yajl_tok_error;
634 | goto lexed;
635 | }
636 | } while (*(++want));
637 | tok = yajl_tok_bool;
638 | goto lexed;
639 | }
640 | case 'f': {
641 | const char * want = "alse";
642 | do {
643 | if (*offset >= jsonTextLen) {
644 | tok = yajl_tok_eof;
645 | goto lexed;
646 | }
647 | c = readChar(lexer, jsonText, offset);
648 | if (c != *want) {
649 | unreadChar(lexer, offset);
650 | lexer->error = yajl_lex_invalid_string;
651 | tok = yajl_tok_error;
652 | goto lexed;
653 | }
654 | } while (*(++want));
655 | tok = yajl_tok_bool;
656 | goto lexed;
657 | }
658 | case 'n': {
659 | const char * want = "ull";
660 | do {
661 | if (*offset >= jsonTextLen) {
662 | tok = yajl_tok_eof;
663 | goto lexed;
664 | }
665 | c = readChar(lexer, jsonText, offset);
666 | if (c != *want) {
667 | unreadChar(lexer, offset);
668 | lexer->error = yajl_lex_invalid_string;
669 | tok = yajl_tok_error;
670 | goto lexed;
671 | }
672 | } while (*(++want));
673 | tok = yajl_tok_null;
674 | goto lexed;
675 | }
676 | case '"': {
677 | tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
678 | jsonTextLen, offset);
679 | goto lexed;
680 | }
681 | case '-':
682 | case '0': case '1': case '2': case '3': case '4':
683 | case '5': case '6': case '7': case '8': case '9': {
684 | /* integer parsing wants to start from the beginning */
685 | unreadChar(lexer, offset);
686 | tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
687 | jsonTextLen, offset);
688 | goto lexed;
689 | }
690 | case '/':
691 | /* hey, look, a probable comment! If comments are disabled
692 | * it's an error. */
693 | if (!lexer->allowComments) {
694 | unreadChar(lexer, offset);
695 | lexer->error = yajl_lex_unallowed_comment;
696 | tok = yajl_tok_error;
697 | goto lexed;
698 | }
699 | /* if comments are enabled, then we should try to lex
700 | * the thing. possible outcomes are
701 | * - successful lex (tok_comment, which means continue),
702 | * - malformed comment opening (slash not followed by
703 | * '*' or '/') (tok_error)
704 | * - eof hit. (tok_eof) */
705 | tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
706 | jsonTextLen, offset);
707 | if (tok == yajl_tok_comment) {
708 | /* "error" is silly, but that's the initial
709 | * state of tok. guilty until proven innocent. */
710 | tok = yajl_tok_error;
711 | yajl_buf_clear(lexer->buf);
712 | lexer->bufInUse = 0;
713 | startOffset = *offset;
714 | break;
715 | }
716 | /* hit error or eof, bail */
717 | goto lexed;
718 | default:
719 | lexer->error = yajl_lex_invalid_char;
720 | tok = yajl_tok_error;
721 | goto lexed;
722 | }
723 | }
724 |
725 |
726 | lexed:
727 | /* need to append to buffer if the buffer is in use or
728 | * if it's an EOF token */
729 | if (tok == yajl_tok_eof || lexer->bufInUse) {
730 | if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
731 | lexer->bufInUse = 1;
732 | yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
733 | lexer->bufOff = 0;
734 |
735 | if (tok != yajl_tok_eof) {
736 | *outBuf = yajl_buf_data(lexer->buf);
737 | *outLen = yajl_buf_len(lexer->buf);
738 | lexer->bufInUse = 0;
739 | }
740 | } else if (tok != yajl_tok_error) {
741 | *outBuf = jsonText + startOffset;
742 | *outLen = *offset - startOffset;
743 | }
744 |
745 | /* special case for strings. skip the quotes. */
746 | if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
747 | {
748 | assert(*outLen >= 2);
749 | (*outBuf)++;
750 | *outLen -= 2;
751 | }
752 |
753 |
754 | #ifdef YAJL_LEXER_DEBUG
755 | if (tok == yajl_tok_error) {
756 | printf("lexical error: %s\n",
757 | yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
758 | } else if (tok == yajl_tok_eof) {
759 | printf("EOF hit\n");
760 | } else {
761 | printf("lexed %s: '", tokToStr(tok));
762 | fwrite(*outBuf, (size_t) 1, *outLen, stdout);
763 | printf("'\n");
764 | }
765 | #endif
766 |
767 | return tok;
768 | }
769 |
770 | /*+
771 | * convert a lexer error value returned by yajl_lex_get_error() to a
772 | * descriptive string
773 | +*/
774 | const char *
775 | yajl_lex_error_to_string(yajl_lex_error error) /*+ lexer error value +*/
776 | {
777 | switch (error) {
778 | case yajl_lex_e_ok:
779 | return "ok, no error";
780 | case yajl_lex_string_invalid_utf8:
781 | return "invalid bytes in UTF8 string.";
782 | case yajl_lex_string_invalid_escaped_char:
783 | return "inside a string, '\\' occurs before a character "
784 | "which it may not.";
785 | case yajl_lex_string_invalid_json_char:
786 | return "invalid character inside string.";
787 | case yajl_lex_string_invalid_hex_char:
788 | return "invalid (non-hex) character occurs after '\\u' inside "
789 | "string.";
790 | case yajl_lex_invalid_char:
791 | return "invalid char in json text.";
792 | case yajl_lex_invalid_string:
793 | return "invalid string in json text.";
794 | case yajl_lex_missing_integer_after_exponent:
795 | return "malformed number, a digit is required after the exponent.";
796 | case yajl_lex_missing_integer_after_decimal:
797 | return "malformed number, a digit is required after the "
798 | "decimal point.";
799 | case yajl_lex_missing_integer_after_minus:
800 | return "malformed number, a digit is required after the "
801 | "minus sign.";
802 | case yajl_lex_unallowed_comment:
803 | return "probable comment found in input text, comments are "
804 | "not enabled.";
805 | }
806 | /* NOTREACHED */
807 | return "unknown error code";
808 | }
809 |
810 |
811 | /*+
812 | * allows access to more specific information about the lexical
813 | * error when yajl_lex_lex() returns yajl_tok_error.
814 | *
815 | * Retunrs a value that may be passed to yajl_lex_error_to_string() to convert
816 | * it into descriptive error message text.
817 | +*/
818 | yajl_lex_error
819 | yajl_lex_get_error(yajl_lexer lexer) /*+ the current lexer context +*/
820 | {
821 | if (lexer == NULL) return (yajl_lex_error) -1;
822 | return lexer->error;
823 | }
824 |
825 | /*+
826 | * A helper for finding the line number of error in the input.
827 | *
828 | * Returns the number of lines lexed by this lexer instance
829 | +*/
830 | size_t
831 | yajl_lex_current_line(yajl_lexer lexer) /*+ the current lexer context +*/
832 | {
833 | return lexer->lineOff;
834 | }
835 |
836 | /*+
837 | * A helper for finding the exact context of an error in the input.
838 | *
839 | * get the number of chars lexed by this lexer instance since the last
840 | * \n or \r
841 | +*/
842 | size_t
843 | yajl_lex_current_char(yajl_lexer lexer) /*+ the current lexer context +*/
844 | {
845 | return lexer->charOff;
846 | }
847 |
848 | /*+
849 | * have a peek at the next token, but don't move the lexer forward
850 | *
851 | * Returns the next token yagl_lex_lex() will return.
852 | +*/
853 | yajl_tok
854 | yajl_lex_peek(yajl_lexer lexer, /*+ the current lexer context +*/
855 | const unsigned char * jsonText,
856 | size_t jsonTextLen,
857 | size_t offset)
858 | {
859 | const unsigned char * outBuf;
860 | size_t outLen;
861 | size_t bufLen = yajl_buf_len(lexer->buf);
862 | size_t bufOff = lexer->bufOff;
863 | /* bool */ int bufInUse = lexer->bufInUse;
864 | yajl_tok tok;
865 |
866 | tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
867 | &outBuf, &outLen);
868 |
869 | lexer->bufOff = bufOff;
870 | lexer->bufInUse = bufInUse;
871 | yajl_buf_truncate(lexer->buf, bufLen);
872 |
873 | return tok;
874 | }