File src/yajl_lex.c

A JSON text lexical analyzer.
The implementation.

Included Files

#include "src/yajl_lex.h"
- #include "src/yajl/yajl_common.h"
  - #include </usr/include/stddef.h>

#include "src/yajl_buf.h"
- #include "src/yajl/yajl_common.h"
- #include "src/yajl_alloc.h"
  - #include "src/yajl/yajl_common.h"

#include </usr/include/stdlib.h>

#include </usr/include/stdio.h>

#include </usr/include/assert.h>

#include </usr/include/string.h>

Type struct yajl_lexer_t

the (private) lexer context

struct yajl_lexer_t

`struct yajl_lexer_t`
`{`
`size_t lineOff;`	the current line count
`size_t charOff;`
`yajl_lex_error error;`	error
`yajl_buf buf;`	an input buffer to handle the case where a token is spread over multiple chunks
`size_t bufOff;`	in the case where we have data in the lexBuf, bufOff holds the current offset into the lexBuf.
`int bufInUse;`	are we using the lex buf?
`int allowComments;`	shall we allow comments?
`int validateUTF8;`	shall we validate utf8 inside strings?
`yajl_alloc_funcs* alloc;`
`}`

Local Variables

charLookupTable
a lookup table which lets us quickly determine five things:
VEC - valid escaped control char
IJC - invalid json char
VHC - valid hex char
NFP - needs further processing (from a string scanning perspective)
NUC - needs utf8 checking when enabled (from a string scanning perspective)
note. the solidus '/' may be escaped or not.

static const char charLookupTable[256]

Used in:	yajl_lex_string()
	yajl_string_scan()

Global Function yajl_lex_alloc()

allocate a lexer context

yajl_lexer yajl_lex_alloc ( yajl_alloc_funcs* alloc, int allowComments, int validateUTF8 )

yajl_alloc_funcs* alloc: allocator functions, e.g. from yajl_set_default_alloc_funcs()
int allowComments: should this lexer handle comments embedded in the JSON text?
int validateUTF8: should this lexer validate UTF8 characters?

Returns a lexer context object that must be passed to calls to yajl_lex_lex(), etc., and which must be passed to yajl_lex_free() when lexing is complete (successfully or not).

Prototyped in:	src/yajl_lex.h
Calls:	yajl_buf_alloc()	src/yajl_buf.c
	memset()
Called by:	yajl_complete_parse()	src/yajl.c
	yajl_parse()	src/yajl.c

Global Function yajl_lex_current_char()

A helper for finding the exact context of an error in the input.

size_t yajl_lex_current_char ( yajl_lexer lexer )

yajl_lexer lexer: the current lexer context

Returns the number of chars lexed by this lexer instance since the last \n or \r

Prototyped in:

src/yajl_lex.h

Global Function yajl_lex_current_line()

A helper for finding the line number of an error in the input.

size_t yajl_lex_current_line ( yajl_lexer lexer )

yajl_lexer lexer: the current lexer context

Returns the number of lines lexed by this lexer instance

Prototyped in:

src/yajl_lex.h

Global Function yajl_lex_error_to_string()

convert a lexer error value returned by yajl_lex_get_error() to a descriptive string

const char* yajl_lex_error_to_string ( yajl_lex_error error )

yajl_lex_error error: lexer error value

Prototyped in:	src/yajl_lex.h
Called by:	yajl_render_error_string()	src/yajl_parser.c

Global Function yajl_lex_free()

free a lexer context

void yajl_lex_free ( yajl_lexer lxr )

yajl_lexer lxr: the lexer context to free

Prototyped in:	src/yajl_lex.h
Calls:	yajl_buf_free()	src/yajl_buf.c
Called by:	yajl_free()	src/yajl.c

Global Function yajl_lex_get_error()

allows access to more specific information about the lexical error when yajl_lex_lex() returns yajl_tok_error.

yajl_lex_error yajl_lex_get_error ( yajl_lexer lexer )

yajl_lexer lexer: the current lexer context

Returns a value that may be passed to yajl_lex_error_to_string() to convert it into descriptive error message text.

Prototyped in:	src/yajl_lex.h
Called by:	yajl_render_error_string()	src/yajl_parser.c

Global Function yajl_lex_lex()

Begin or continue a lexer.

yajl_tok yajl_lex_lex ( yajl_lexer lexer, const unsigned char* jsonText, size_t jsonTextLen, size_t* offset, const unsigned char** outBuf, size_t* outLen )

yajl_lexer lexer: the current lexer context
const unsigned char* jsonText: a chunk of JSON text to be analysed
size_t jsonTextLen: length of this chunk
size_t* offset: Offset is both input & output! It should be initialized to zero for a new chunk of target text, and upon subsequent calls with the same target text it should be passed with the value from the previous invocation.
The caller may be interested in the value of offset when an error is returned from the lexer. This allows the caller to render useful error messages.
const unsigned char** outBuf: Finally, the output buffer is usually just a pointer into the jsonText, however in cases where the entity being lexed spans multiple chunks, the lexer will buffer the entity and the data returned will be a pointer into that buffer.
size_t* outLen: receives the length of the data that outBuf points to. The buffering behavior described for outBuf is abstracted from client code, except for the performance implications, which require that the client choose a reasonable chunk size to get adequate performance.

Returns a JSON lexical token for the parser.
When you pass the next chunk of data, context should be reinitialized to zero. xxx ???

Prototyped in:	src/yajl_lex.h
Calls:	yajl_buf_append()	src/yajl_buf.c
	yajl_buf_clear()	src/yajl_buf.c
	yajl_buf_data()	src/yajl_buf.c
	yajl_buf_len()	src/yajl_buf.c
	yajl_lex_comment()	src/yajl_lex.c
	yajl_lex_number()	src/yajl_lex.c
	yajl_lex_string()	src/yajl_lex.c
	__assert13()
Called by:	yajl_do_parse()	src/yajl_parser.c
	yajl_lex_peek()	src/yajl_lex.c

Global Function yajl_lex_peek()

have a peek at the next token, but don't move the lexer forward

yajl_tok yajl_lex_peek ( yajl_lexer lexer, const unsigned char* jsonText, size_t jsonTextLen, size_t offset )

yajl_lexer lexer: the current lexer context
const unsigned char* jsonText
size_t jsonTextLen
size_t offset

Returns the next token yajl_lex_lex() will return.

Prototyped in:	src/yajl_lex.h
Calls:	yajl_buf_len()	src/yajl_buf.c
	yajl_buf_truncate()	src/yajl_buf.c
	yajl_lex_lex()	src/yajl_lex.c

Local Function yajl_lex_comment()

static yajl_tok yajl_lex_comment ( yajl_lexer lexer, const unsigned char* jsonText, size_t jsonTextLen, size_t* offset )

Calls:	yajl_buf_data()	src/yajl_buf.c
	yajl_buf_len()	src/yajl_buf.c
Called by:	yajl_lex_lex()	src/yajl_lex.c

Local Function yajl_lex_number()

static yajl_tok yajl_lex_number ( yajl_lexer lexer, const unsigned char* jsonText, size_t jsonTextLen, size_t* offset )

Calls:	yajl_buf_data()	src/yajl_buf.c
	yajl_buf_len()	src/yajl_buf.c
Called by:	yajl_lex_lex()	src/yajl_lex.c

Local Function yajl_lex_string()

lex a string.

static yajl_tok yajl_lex_string ( yajl_lexer lexer, const unsigned char* jsonText, size_t jsonTextLen, size_t* offset )

yajl_lexer lexer: the current lexer context
const unsigned char* jsonText: a pointer to the beginning of the JSON text
size_t jsonTextLen: length of the JSON text
size_t* offset: offset of the string to be lexed

a token is returned which has the following meanings:
yajl_tok_string: lex of string was successful. offset points to terminating '"'.
yajl_tok_eof: end of text was encountered before we could complete the lex.
yajl_tok_error: embedded in the string were unallowable chars. offset points to the offending char

Calls:	yajl_buf_data()	src/yajl_buf.c
	yajl_buf_len()	src/yajl_buf.c
	yajl_lex_utf8_char()	src/yajl_lex.c
	yajl_string_scan()	src/yajl_lex.c
Called by:	yajl_lex_lex()	src/yajl_lex.c
References Variables:	charLookupTable	src/yajl_lex.c

Local Function yajl_lex_utf8_char()

process a variable length utf8 encoded codepoint.

static yajl_tok yajl_lex_utf8_char ( yajl_lexer lexer, const unsigned char* jsonText, size_t jsonTextLen, size_t* offset, unsigned int curChar )
returns:
yajl_tok_string - if valid utf8 char was parsed and offset was advanced
yajl_tok_eof - if end of input was hit before validation could complete
yajl_tok_error - if invalid utf8 was encountered
NOTE: on error the offset will point to the first char of the invalid utf8

Calls:	yajl_buf_data()	src/yajl_buf.c
	yajl_buf_len()	src/yajl_buf.c
Called by:	yajl_lex_string()	src/yajl_lex.c

Local Function yajl_string_scan()

scan a string for interesting characters that might need further review.

static size_t yajl_string_scan ( const unsigned char* buf, size_t len, int utf8check )
returns the number of chars that are uninteresting and can be skipped.
(lth) hi world, any thoughts on how to make this routine faster?

Called by:	yajl_lex_string()	src/yajl_lex.c
References Variables:	charLookupTable	src/yajl_lex.c