1 | /* 2 | * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io> 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 | */ 16 | 17 | /** 18 | * A JSON text lexical analyzer. 19 | * 20 | * The implementation. 21 | **/ 22 | 23 | #include "yajl_lex.h" 24 | #include "yajl_buf.h" 25 | 26 | #include <stdlib.h> 27 | #include <stdio.h> 28 | #include <assert.h> 29 | #include <string.h> 30 | 31 | #ifdef YAJL_LEXER_DEBUG 32 | static const char * 33 | tokToStr(yajl_tok tok) 34 | { 35 | switch (tok) { 36 | case yajl_tok_bool: return "bool"; 37 | case yajl_tok_colon: return "colon"; 38 | case yajl_tok_comma: return "comma"; 39 | case yajl_tok_eof: return "eof"; 40 | case yajl_tok_error: return "error"; 41 | case yajl_tok_left_brace: return "brace"; 42 | case yajl_tok_left_bracket: return "bracket"; 43 | case yajl_tok_null: return "null"; 44 | case yajl_tok_integer: return "integer"; 45 | case yajl_tok_double: return "double"; 46 | case yajl_tok_right_brace: return "brace"; 47 | case yajl_tok_right_bracket: return "bracket"; 48 | case yajl_tok_string: return "string"; 49 | case yajl_tok_string_with_escapes: return "string_with_escapes"; 50 | } 51 | return "unknown"; 52 | } 53 | #endif 54 | 55 | /* 56 | * Impact of the stream parsing feature on the lexer: 57 | * 58 | * YAJL supports parsing of streams. That is, the ability to parse the first 59 | * bits of a chunk of JSON before the last bits are available (still on 60 | * the network or disk). This makes the lexer more complex. The 61 | * responsibility of the lexer is to handle transparently the case where 62 | * a chunk boundary falls in the middle of a token. This is 63 | * accomplished is via a buffer and a character reading abstraction. 64 | * 65 | * Overview of implementation 66 | * 67 | * When we lex to end of input string before end of token is hit, we 68 | * copy all of the input text composing the token into our lexBuf. 69 | * 70 | * Every time we read a character, we do so through the readChar function. 71 | * readChar's responsibility is to handle pulling all chars from the buffer 72 | * before pulling chars from input text 73 | */ 74 | 75 | /*+ the (private) lexer context +*/ 76 | struct yajl_lexer_t { 77 | /*+ the current line count +*/ 78 | size_t lineOff; 79 | /* the current character offset into the current line (i.e. since the last '\r' or '\n') */ 80 | size_t charOff; 81 | 82 | /*+ error +*/ 83 | yajl_lex_error error; 84 | 85 | /*+ a input buffer to handle the case where a token is spread over 86 | * multiple chunks +*/ 87 | yajl_buf buf; 88 | 89 | /*+ in the case where we have data in the lexBuf, bufOff holds 90 | * the current offset into the lexBuf. +*/ 91 | size_t bufOff; 92 | 93 | /*+ are we using the lex buf? +*/ 94 | /* bool */ int bufInUse; 95 | 96 | /*+ shall we allow comments? +*/ 97 | /* bool */ int allowComments; 98 | 99 | /*+ shall we validate utf8 inside strings? +*/ 100 | /* bool */ int validateUTF8; 101 | 102 | /* the allocator functions being used by this lexer */ 103 | yajl_alloc_funcs * alloc; 104 | }; 105 | 106 | #define readChar(lxr, txt, off) \ 107 | (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && lxr->bufOff < yajl_buf_len((lxr)->buf)) ? \ 108 | (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \ 109 | ((txt)[(*(off))++])) 110 | 111 | #define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--)) 112 | 113 | /*+ 114 | * allocate a lexer context 115 | * 116 | * Returns a lexer context object that must be passed to calls to 117 | * yajl_lex_lex(), etc., and which must be passed to yajl_lex_free() when lexing 118 | * is complete (successfully or not). 119 | +*/ 120 | yajl_lexer 121 | yajl_lex_alloc(yajl_alloc_funcs * alloc, /*+ allocator functions, e.g. from yajl_set_default_alloc_funcs() +*/ 122 | /* bool */ int allowComments, /*+ should this lexer handle comments embedded in the JSON text? +*/ 123 | /* bool */ int validateUTF8) /*+ should this lexer validate UTF8 characters? +*/ 124 | { 125 | yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t)); 126 | memset((void *) lxr, 0, sizeof(struct yajl_lexer_t)); 127 | lxr->buf = yajl_buf_alloc(alloc); 128 | lxr->allowComments = allowComments; 129 | lxr->validateUTF8 = validateUTF8; 130 | lxr->alloc = alloc; 131 | return lxr; 132 | } 133 | 134 | /*+ free a lexer context +*/ 135 | void 136 | yajl_lex_free(yajl_lexer lxr) /*+ the lexer context to free +*/ 137 | { 138 | yajl_buf_free(lxr->buf); 139 | YA_FREE(lxr->alloc, lxr); 140 | return; 141 | } 142 | 143 | #define VEC 0x01 144 | #define IJC 0x02 145 | #define VHC 0x04 146 | #define NFP 0x08 147 | #define NUC 0x10 148 | 149 | /*+ 150 | * a lookup table which lets us quickly determine three things: 151 | * 152 | * VEC - valid escaped control char 153 | * 154 | * IJC - invalid json char 155 | * 156 | * VHC - valid hex char 157 | * 158 | * NFP - needs further processing (from a string scanning perspective) 159 | * 160 | * NUC - needs utf8 checking when enabled (from a string scanning perspective) 161 | * 162 | * note. the solidus '/' may be escaped or not. 163 | +*/ 164 | static const char charLookupTable[256] = 165 | { 166 | /*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , 167 | /*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , 168 | /*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , 169 | /*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , 170 | 171 | /*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 , 172 | /*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC , 173 | /*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC , 174 | /*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 , 175 | 176 | /*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 , 177 | /*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 178 | /*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 179 | /*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 180 | 181 | /*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 , 182 | /*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 , 183 | /*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 , 184 | /*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 185 | 186 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 187 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 188 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 189 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 190 | 191 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 192 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 193 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 194 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 195 | 196 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 197 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 198 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 199 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 200 | 201 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 202 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 203 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , 204 | NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC 205 | }; 206 | 207 | #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; } 208 | 209 | /*+ 210 | * process a variable length utf8 encoded codepoint. 211 | * 212 | * returns: 213 | * 214 | * yajl_tok_string - if valid utf8 char was parsed and offset was 215 | * advanced 216 | * 217 | * yajl_tok_eof - if end of input was hit before validation could 218 | * complete 219 | * 220 | * yajl_tok_error - if invalid utf8 was encountered 221 | * 222 | * NOTE: on error the offset will point to the first char of the 223 | * invalid utf8 224 | +*/ 225 | static yajl_tok 226 | yajl_lex_utf8_char(yajl_lexer lexer, 227 | const unsigned char * jsonText, 228 | size_t jsonTextLen, 229 | size_t * offset, 230 | unsigned int curChar) 231 | { 232 | if (curChar <= 0x7f) { 233 | /* single byte */ 234 | return yajl_tok_string; 235 | } else if ((curChar >> 5) == 0x6) { 236 | /* two byte */ 237 | UTF8_CHECK_EOF; 238 | curChar = readChar(lexer, jsonText, offset); 239 | if ((curChar >> 6) == 0x2) return yajl_tok_string; 240 | } else if ((curChar >> 4) == 0x0e) { 241 | /* three byte */ 242 | UTF8_CHECK_EOF; 243 | curChar = readChar(lexer, jsonText, offset); 244 | if ((curChar >> 6) == 0x2) { 245 | UTF8_CHECK_EOF; 246 | curChar = readChar(lexer, jsonText, offset); 247 | if ((curChar >> 6) == 0x2) return yajl_tok_string; 248 | } 249 | } else if ((curChar >> 3) == 0x1e) { 250 | /* four byte */ 251 | UTF8_CHECK_EOF; 252 | curChar = readChar(lexer, jsonText, offset); 253 | if ((curChar >> 6) == 0x2) { 254 | UTF8_CHECK_EOF; 255 | curChar = readChar(lexer, jsonText, offset); 256 | if ((curChar >> 6) == 0x2) { 257 | UTF8_CHECK_EOF; 258 | curChar = readChar(lexer, jsonText, offset); 259 | if ((curChar >> 6) == 0x2) return yajl_tok_string; 260 | } 261 | } 262 | } 263 | 264 | return yajl_tok_error; 265 | } 266 | 267 | #define STR_CHECK_EOF \ 268 | if (*offset >= jsonTextLen) { \ 269 | tok = yajl_tok_eof; \ 270 | goto finish_string_lex; \ 271 | } 272 | 273 | /*+ 274 | * scan a string for interesting characters that might need further 275 | * review. 276 | * 277 | * returns the number of chars that are uninteresting and can be skipped. 278 | * 279 | * (lth) hi world, any thoughts on how to make this routine faster? 280 | +*/ 281 | static size_t 282 | yajl_string_scan(const unsigned char * buf, size_t len, /* bool */ int utf8check) 283 | { 284 | unsigned char mask = IJC|NFP|(utf8check ? NUC : 0); 285 | size_t skip = 0; 286 | while (skip < len && !(charLookupTable[*buf] & mask)) 287 | { 288 | skip++; 289 | buf++; 290 | } 291 | return skip; 292 | } 293 | 294 | /*+ 295 | * lex a string. 296 | * 297 | * a token is returned which has the following meanings: 298 | * 299 | * yajl_tok_string: lex of string was successful. offset points to 300 | * terminating '"'. 301 | * 302 | * yajl_tok_eof: end of text was encountered before we could complete 303 | * the lex. 304 | * 305 | * yajl_tok_error: embedded in the string were unallowable chars. offset 306 | * points to the offending char 307 | +*/ 308 | static yajl_tok 309 | yajl_lex_string(yajl_lexer lexer, /*+ the current lexer context +*/ 310 | const unsigned char * jsonText, /*+ a pointer to the beginning of the JSON text +*/ 311 | size_t jsonTextLen, /*+ length of the JSON text +*/ 312 | size_t * offset) /*+ offset of the string to be lexed +*/ 313 | { 314 | yajl_tok tok = yajl_tok_error; 315 | int hasEscapes = 0; 316 | 317 | for (;;) { 318 | unsigned char curChar; 319 | 320 | /* now jump into a faster scanning routine to skip as much 321 | * of the buffers as possible */ 322 | { 323 | const unsigned char * p; 324 | size_t len; 325 | 326 | if ((lexer->bufInUse && yajl_buf_len(lexer->buf) && 327 | lexer->bufOff < yajl_buf_len(lexer->buf))) 328 | { 329 | p = ((const unsigned char *) yajl_buf_data(lexer->buf) + 330 | (lexer->bufOff)); 331 | len = yajl_buf_len(lexer->buf) - lexer->bufOff; 332 | lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8); 333 | } 334 | else if (*offset < jsonTextLen) 335 | { 336 | p = jsonText + *offset; 337 | len = jsonTextLen - *offset; 338 | *offset += yajl_string_scan(p, len, lexer->validateUTF8); 339 | } 340 | } 341 | 342 | STR_CHECK_EOF; 343 | 344 | curChar = readChar(lexer, jsonText, offset); 345 | 346 | /* quote terminates */ 347 | if (curChar == '"') { 348 | tok = yajl_tok_string; 349 | break; 350 | } 351 | /* backslash escapes a set of control chars, */ 352 | else if (curChar == '\\') { 353 | hasEscapes = 1; 354 | STR_CHECK_EOF; 355 | 356 | /* special case \u */ 357 | curChar = readChar(lexer, jsonText, offset); 358 | if (curChar == 'u') { 359 | unsigned int i = 0; 360 | 361 | for (i=0;i<4;i++) { 362 | STR_CHECK_EOF; 363 | curChar = readChar(lexer, jsonText, offset); 364 | if (!(charLookupTable[curChar] & VHC)) { 365 | /* back up to offending char */ 366 | unreadChar(lexer, offset); 367 | lexer->error = yajl_lex_string_invalid_hex_char; 368 | goto finish_string_lex; 369 | } 370 | } 371 | } else if (!(charLookupTable[curChar] & VEC)) { 372 | /* back up to offending char */ 373 | unreadChar(lexer, offset); 374 | lexer->error = yajl_lex_string_invalid_escaped_char; 375 | goto finish_string_lex; 376 | } 377 | } 378 | /* when not validating UTF8 it's a simple table lookup to determine 379 | * if the present character is invalid */ 380 | else if(charLookupTable[curChar] & IJC) { 381 | /* back up to offending char */ 382 | unreadChar(lexer, offset); 383 | lexer->error = yajl_lex_string_invalid_json_char; 384 | goto finish_string_lex; 385 | } 386 | /* when in validate UTF8 mode we need to do some extra work */ 387 | else if (lexer->validateUTF8) { 388 | yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen, 389 | offset, (unsigned int) curChar); 390 | 391 | if (t == yajl_tok_eof) { 392 | tok = yajl_tok_eof; 393 | goto finish_string_lex; 394 | } else if (t == yajl_tok_error) { 395 | lexer->error = yajl_lex_string_invalid_utf8; 396 | goto finish_string_lex; 397 | } 398 | } 399 | /* accept it, and move on */ 400 | } 401 | finish_string_lex: 402 | /* tell our buddy, the parser, wether he needs to process this string 403 | * again */ 404 | if (hasEscapes && tok == yajl_tok_string) { 405 | tok = yajl_tok_string_with_escapes; 406 | } 407 | 408 | return tok; 409 | } 410 | 411 | #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof; 412 | 413 | static yajl_tok 414 | yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText, 415 | size_t jsonTextLen, size_t * offset) 416 | { 417 | /* 418 | * XXX: numbers are the only entities in json that we must lex 419 | * _beyond_ in order to know that they are complete. There 420 | * is an ambiguous case for integers at EOF. 421 | */ 422 | 423 | unsigned char c; 424 | 425 | yajl_tok tok = yajl_tok_integer; 426 | 427 | RETURN_IF_EOF; 428 | c = readChar(lexer, jsonText, offset); 429 | 430 | /* optional leading minus */ 431 | if (c == '-') { 432 | RETURN_IF_EOF; 433 | c = readChar(lexer, jsonText, offset); 434 | } 435 | 436 | /* a single zero, or a series of integers */ 437 | if (c == '0') { 438 | RETURN_IF_EOF; 439 | c = readChar(lexer, jsonText, offset); 440 | } else if (c >= '1' && c <= '9') { 441 | do { 442 | RETURN_IF_EOF; 443 | c = readChar(lexer, jsonText, offset); 444 | } while (c >= '0' && c <= '9'); 445 | } else { 446 | unreadChar(lexer, offset); 447 | lexer->error = yajl_lex_missing_integer_after_minus; 448 | return yajl_tok_error; 449 | } 450 | 451 | /* optional fraction (indicates this is floating point) */ 452 | if (c == '.') { 453 | int numRd = 0; 454 | 455 | RETURN_IF_EOF; 456 | c = readChar(lexer, jsonText, offset); 457 | 458 | while (c >= '0' && c <= '9') { 459 | numRd++; 460 | RETURN_IF_EOF; 461 | c = readChar(lexer, jsonText, offset); 462 | } 463 | 464 | if (!numRd) { 465 | unreadChar(lexer, offset); 466 | lexer->error = yajl_lex_missing_integer_after_decimal; 467 | return yajl_tok_error; 468 | } 469 | tok = yajl_tok_double; 470 | } 471 | 472 | /* optional exponent (indicates this is floating point) */ 473 | if (c == 'e' || c == 'E') { 474 | RETURN_IF_EOF; 475 | c = readChar(lexer, jsonText, offset); 476 | 477 | /* optional sign */ 478 | if (c == '+' || c == '-') { 479 | RETURN_IF_EOF; 480 | c = readChar(lexer, jsonText, offset); 481 | } 482 | 483 | if (c >= '0' && c <= '9') { 484 | do { 485 | RETURN_IF_EOF; 486 | c = readChar(lexer, jsonText, offset); 487 | } while (c >= '0' && c <= '9'); 488 | } else { 489 | unreadChar(lexer, offset); 490 | lexer->error = yajl_lex_missing_integer_after_exponent; 491 | return yajl_tok_error; 492 | } 493 | tok = yajl_tok_double; 494 | } 495 | 496 | /* we always go "one too far" */ 497 | unreadChar(lexer, offset); 498 | 499 | return tok; 500 | } 501 | 502 | static yajl_tok 503 | yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText, 504 | size_t jsonTextLen, size_t * offset) 505 | { 506 | unsigned char c; 507 | 508 | yajl_tok tok = yajl_tok_comment; 509 | 510 | RETURN_IF_EOF; 511 | c = readChar(lexer, jsonText, offset); 512 | 513 | /* either slash or star expected */ 514 | if (c == '/') { 515 | /* now we throw away until end of line */ 516 | do { 517 | RETURN_IF_EOF; 518 | c = readChar(lexer, jsonText, offset); 519 | } while (c != '\n'); 520 | } else if (c == '*') { 521 | /* now we throw away until end of comment */ 522 | for (;;) { 523 | RETURN_IF_EOF; 524 | c = readChar(lexer, jsonText, offset); 525 | if (c == '*') { 526 | RETURN_IF_EOF; 527 | c = readChar(lexer, jsonText, offset); 528 | if (c == '/') { 529 | break; 530 | } else { 531 | unreadChar(lexer, offset); 532 | } 533 | } 534 | } 535 | } else { 536 | lexer->error = yajl_lex_invalid_char; 537 | tok = yajl_tok_error; 538 | } 539 | 540 | return tok; 541 | } 542 | 543 | /*+ 544 | * Begin or continue a lexer. 545 | * 546 | * Returns a JSON lexical token for the parser. 547 | * 548 | * When you pass the next chunk of data, context should be reinitialized to 549 | * zero. xxx ??? 550 | +*/ 551 | yajl_tok 552 | yajl_lex_lex(yajl_lexer lexer, /*+ the current lexer context +*/ 553 | const unsigned char * jsonText, /*+ a chunk of JSON text to be analysed +*/ 554 | size_t jsonTextLen, /*+ length of this chunk +*/ 555 | size_t * offset, /*+ Offset is both input & output! It 556 | * should be initialized to zero for a 557 | * new chunk of target text, and upon 558 | * subsetquent calls with the same 559 | * target text should passed with the 560 | * value of the previous invocation. 561 | * 562 | * The caller may be interested in the 563 | * value of offset when an error is 564 | * returned from the lexer. This allows 565 | * the caller to render useful error 566 | * messages. 567 | +*/ 568 | const unsigned char ** outBuf, /*+ Finally, the output buffer is 569 | * usually just a pointer into the 570 | * jsonText, however in cases where 571 | * the entity being lexed spans 572 | * multiple chunks, the lexer will 573 | * buffer the entity and the data 574 | * returned will be a pointer into 575 | * that buffer. +*/ 576 | size_t * outLen) /*+ This behavior is abstracted from 577 | * client code except for the 578 | * performance implications which 579 | * require that the client choose a 580 | * reasonable chunk size to get adequate 581 | * performance. +*/ 582 | { 583 | yajl_tok tok = yajl_tok_error; 584 | unsigned char c; 585 | size_t startOffset = *offset; 586 | 587 | *outBuf = NULL; 588 | *outLen = 0; 589 | 590 | for (;;) { 591 | assert(*offset <= jsonTextLen); 592 | 593 | if (*offset >= jsonTextLen) { 594 | tok = yajl_tok_eof; 595 | goto lexed; 596 | } 597 | 598 | c = readChar(lexer, jsonText, offset); 599 | 600 | switch (c) { 601 | case '{': 602 | tok = yajl_tok_left_bracket; 603 | goto lexed; 604 | case '}': 605 | tok = yajl_tok_right_bracket; 606 | goto lexed; 607 | case '[': 608 | tok = yajl_tok_left_brace; 609 | goto lexed; 610 | case ']': 611 | tok = yajl_tok_right_brace; 612 | goto lexed; 613 | case ',': 614 | tok = yajl_tok_comma; 615 | goto lexed; 616 | case ':': 617 | tok = yajl_tok_colon; 618 | goto lexed; 619 | case '\t': case '\n': case '\v': case '\f': case '\r': case ' ': 620 | startOffset++; 621 | break; 622 | case 't': { 623 | const char * want = "rue"; 624 | do { 625 | if (*offset >= jsonTextLen) { 626 | tok = yajl_tok_eof; 627 | goto lexed; 628 | } 629 | c = readChar(lexer, jsonText, offset); 630 | if (c != *want) { 631 | unreadChar(lexer, offset); 632 | lexer->error = yajl_lex_invalid_string; 633 | tok = yajl_tok_error; 634 | goto lexed; 635 | } 636 | } while (*(++want)); 637 | tok = yajl_tok_bool; 638 | goto lexed; 639 | } 640 | case 'f': { 641 | const char * want = "alse"; 642 | do { 643 | if (*offset >= jsonTextLen) { 644 | tok = yajl_tok_eof; 645 | goto lexed; 646 | } 647 | c = readChar(lexer, jsonText, offset); 648 | if (c != *want) { 649 | unreadChar(lexer, offset); 650 | lexer->error = yajl_lex_invalid_string; 651 | tok = yajl_tok_error; 652 | goto lexed; 653 | } 654 | } while (*(++want)); 655 | tok = yajl_tok_bool; 656 | goto lexed; 657 | } 658 | case 'n': { 659 | const char * want = "ull"; 660 | do { 661 | if (*offset >= jsonTextLen) { 662 | tok = yajl_tok_eof; 663 | goto lexed; 664 | } 665 | c = readChar(lexer, jsonText, offset); 666 | if (c != *want) { 667 | unreadChar(lexer, offset); 668 | lexer->error = yajl_lex_invalid_string; 669 | tok = yajl_tok_error; 670 | goto lexed; 671 | } 672 | } while (*(++want)); 673 | tok = yajl_tok_null; 674 | goto lexed; 675 | } 676 | case '"': { 677 | tok = yajl_lex_string(lexer, (const unsigned char *) jsonText, 678 | jsonTextLen, offset); 679 | goto lexed; 680 | } 681 | case '-': 682 | case '0': case '1': case '2': case '3': case '4': 683 | case '5': case '6': case '7': case '8': case '9': { 684 | /* integer parsing wants to start from the beginning */ 685 | unreadChar(lexer, offset); 686 | tok = yajl_lex_number(lexer, (const unsigned char *) jsonText, 687 | jsonTextLen, offset); 688 | goto lexed; 689 | } 690 | case '/': 691 | /* hey, look, a probable comment! If comments are disabled 692 | * it's an error. */ 693 | if (!lexer->allowComments) { 694 | unreadChar(lexer, offset); 695 | lexer->error = yajl_lex_unallowed_comment; 696 | tok = yajl_tok_error; 697 | goto lexed; 698 | } 699 | /* if comments are enabled, then we should try to lex 700 | * the thing. possible outcomes are 701 | * - successful lex (tok_comment, which means continue), 702 | * - malformed comment opening (slash not followed by 703 | * '*' or '/') (tok_error) 704 | * - eof hit. (tok_eof) */ 705 | tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText, 706 | jsonTextLen, offset); 707 | if (tok == yajl_tok_comment) { 708 | /* "error" is silly, but that's the initial 709 | * state of tok. guilty until proven innocent. */ 710 | tok = yajl_tok_error; 711 | yajl_buf_clear(lexer->buf); 712 | lexer->bufInUse = 0; 713 | startOffset = *offset; 714 | break; 715 | } 716 | /* hit error or eof, bail */ 717 | goto lexed; 718 | default: 719 | lexer->error = yajl_lex_invalid_char; 720 | tok = yajl_tok_error; 721 | goto lexed; 722 | } 723 | } 724 | 725 | 726 | lexed: 727 | /* need to append to buffer if the buffer is in use or 728 | * if it's an EOF token */ 729 | if (tok == yajl_tok_eof || lexer->bufInUse) { 730 | if (!lexer->bufInUse) yajl_buf_clear(lexer->buf); 731 | lexer->bufInUse = 1; 732 | yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset); 733 | lexer->bufOff = 0; 734 | 735 | if (tok != yajl_tok_eof) { 736 | *outBuf = yajl_buf_data(lexer->buf); 737 | *outLen = yajl_buf_len(lexer->buf); 738 | lexer->bufInUse = 0; 739 | } 740 | } else if (tok != yajl_tok_error) { 741 | *outBuf = jsonText + startOffset; 742 | *outLen = *offset - startOffset; 743 | } 744 | 745 | /* special case for strings. skip the quotes. */ 746 | if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes) 747 | { 748 | assert(*outLen >= 2); 749 | (*outBuf)++; 750 | *outLen -= 2; 751 | } 752 | 753 | 754 | #ifdef YAJL_LEXER_DEBUG 755 | if (tok == yajl_tok_error) { 756 | printf("lexical error: %s\n", 757 | yajl_lex_error_to_string(yajl_lex_get_error(lexer))); 758 | } else if (tok == yajl_tok_eof) { 759 | printf("EOF hit\n"); 760 | } else { 761 | printf("lexed %s: '", tokToStr(tok)); 762 | fwrite(*outBuf, (size_t) 1, *outLen, stdout); 763 | printf("'\n"); 764 | } 765 | #endif 766 | 767 | return tok; 768 | } 769 | 770 | /*+ 771 | * convert a lexer error value returned by yajl_lex_get_error() to a 772 | * descriptive string 773 | +*/ 774 | const char * 775 | yajl_lex_error_to_string(yajl_lex_error error) /*+ lexer error value +*/ 776 | { 777 | switch (error) { 778 | case yajl_lex_e_ok: 779 | return "ok, no error"; 780 | case yajl_lex_string_invalid_utf8: 781 | return "invalid bytes in UTF8 string."; 782 | case yajl_lex_string_invalid_escaped_char: 783 | return "inside a string, '\\' occurs before a character " 784 | "which it may not."; 785 | case yajl_lex_string_invalid_json_char: 786 | return "invalid character inside string."; 787 | case yajl_lex_string_invalid_hex_char: 788 | return "invalid (non-hex) character occurs after '\\u' inside " 789 | "string."; 790 | case yajl_lex_invalid_char: 791 | return "invalid char in json text."; 792 | case yajl_lex_invalid_string: 793 | return "invalid string in json text."; 794 | case yajl_lex_missing_integer_after_exponent: 795 | return "malformed number, a digit is required after the exponent."; 796 | case yajl_lex_missing_integer_after_decimal: 797 | return "malformed number, a digit is required after the " 798 | "decimal point."; 799 | case yajl_lex_missing_integer_after_minus: 800 | return "malformed number, a digit is required after the " 801 | "minus sign."; 802 | case yajl_lex_unallowed_comment: 803 | return "probable comment found in input text, comments are " 804 | "not enabled."; 805 | } 806 | /* NOTREACHED */ 807 | return "unknown error code"; 808 | } 809 | 810 | 811 | /*+ 812 | * allows access to more specific information about the lexical 813 | * error when yajl_lex_lex() returns yajl_tok_error. 814 | * 815 | * Retunrs a value that may be passed to yajl_lex_error_to_string() to convert 816 | * it into descriptive error message text. 817 | +*/ 818 | yajl_lex_error 819 | yajl_lex_get_error(yajl_lexer lexer) /*+ the current lexer context +*/ 820 | { 821 | if (lexer == NULL) return (yajl_lex_error) -1; 822 | return lexer->error; 823 | } 824 | 825 | /*+ 826 | * A helper for finding the line number of error in the input. 827 | * 828 | * Returns the number of lines lexed by this lexer instance 829 | +*/ 830 | size_t 831 | yajl_lex_current_line(yajl_lexer lexer) /*+ the current lexer context +*/ 832 | { 833 | return lexer->lineOff; 834 | } 835 | 836 | /*+ 837 | * A helper for finding the exact context of an error in the input. 838 | * 839 | * get the number of chars lexed by this lexer instance since the last 840 | * \n or \r 841 | +*/ 842 | size_t 843 | yajl_lex_current_char(yajl_lexer lexer) /*+ the current lexer context +*/ 844 | { 845 | return lexer->charOff; 846 | } 847 | 848 | /*+ 849 | * have a peek at the next token, but don't move the lexer forward 850 | * 851 | * Returns the next token yagl_lex_lex() will return. 852 | +*/ 853 | yajl_tok 854 | yajl_lex_peek(yajl_lexer lexer, /*+ the current lexer context +*/ 855 | const unsigned char * jsonText, 856 | size_t jsonTextLen, 857 | size_t offset) 858 | { 859 | const unsigned char * outBuf; 860 | size_t outLen; 861 | size_t bufLen = yajl_buf_len(lexer->buf); 862 | size_t bufOff = lexer->bufOff; 863 | /* bool */ int bufInUse = lexer->bufInUse; 864 | yajl_tok tok; 865 | 866 | tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset, 867 | &outBuf, &outLen); 868 | 869 | lexer->bufOff = bufOff; 870 | lexer->bufInUse = bufInUse; 871 | yajl_buf_truncate(lexer->buf, bufLen); 872 | 873 | return tok; 874 | }