1 // Package jlexer contains a JSON lexer implementation.
3 // It is expected that it is mostly used with generated parser code, so the interface is tuned
4 // for a parser that knows what kind of data is expected.
19 // tokenKind determines the type of a token.
23 tokenUndef tokenKind = iota // No token scanned yet.
24 tokenDelim // Delimiter: one of '{', '}', '[' or ']'.
25 tokenString // A string literal, e.g. "abc\u1234".
26 tokenNumber // Number literal, e.g. 1.5e5.
27 tokenBool // Boolean literal: true or false.
28 tokenNull // null keyword.
31 // token describes a single token: type, position in the input and value.
33 kind tokenKind // Type of a token.
35 boolValue bool // Value if a boolean literal token.
// byteValue may alias the input buffer (fast path in fetchString/fetchNumber)
// or hold freshly unescaped bytes (escape path in fetchString).
36 byteValue []byte // Raw value of a token.
40 // Lexer is a JSON lexer: it iterates over JSON tokens in a byte slice.
42 Data []byte // Input data given to the lexer.
44 start int // Start of the current token.
45 pos int // Current unscanned position in the input stream.
46 token token // Last scanned token, if token.kind != tokenUndef.
48 firstElement bool // Whether current element is the first in array or an object.
49 wantSep byte // A comma or a colon character, which needs to occur before a token.
51 UseMultipleErrors bool // If true, semantic errors are collected instead of stopping at the first one.
52 fatalError error // Fatal error occurred during lexing. It is usually a syntax error.
53 multipleErrors []*LexerError // Semantic errors occurred during lexing. Marshalling will be continued after these errors are found.
56 // FetchToken scans the input for the next token. It dispatches on the first
56 // non-whitespace byte: '"' starts a string, '{'/'[' and '}'/']' are delimiters,
56 // digits/'-' start a number, and 'n'/'t'/'f' start keyword literals.
57 func (r *Lexer) FetchToken() {
58 r.token.kind = tokenUndef
61 // Check if r.Data has r.pos element
62 // If it doesn't, it means corrupted input data
63 if len(r.Data) < r.pos {
64 r.errParse("Unexpected end of data")
67 // Determine the type of a token by skipping whitespace and reading the first byte of the token.
69 for _, c := range r.Data[r.pos:] {
80 case ' ', '\t', '\r', '\n':
89 r.token.kind = tokenString
98 r.token.kind = tokenDelim
99 r.token.delimValue = r.Data[r.pos]
// A non-first element must have been preceded by the expected separator.
104 if !r.firstElement && (r.wantSep != ',') {
108 r.token.kind = tokenDelim
109 r.token.delimValue = r.Data[r.pos]
113 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-':
117 r.token.kind = tokenNumber
126 r.token.kind = tokenNull
135 r.token.kind = tokenBool
136 r.token.boolValue = true
145 r.token.kind = tokenBool
146 r.token.boolValue = false
// Running out of input without finding a token is reported as io.EOF.
155 r.fatalError = io.EOF
// isTokenEnd reports whether c may legally follow a non-delimiter token:
// JSON whitespace, a structural delimiter ('[', ']', '{', '}'), a comma,
// or a colon.
func isTokenEnd(c byte) bool {
	switch c {
	case ' ', '\t', '\r', '\n', '[', ']', '{', '}', ',', ':':
		return true
	default:
		return false
	}
}
164 // fetchNull fetches and checks remaining bytes of null keyword.
165 func (r *Lexer) fetchNull() {
// The check looks backwards from r.pos, so r.pos is presumed to have been
// advanced past the keyword already — TODO(review): confirm the elided lines
// advance r.pos by 4 before this condition.
167 if r.pos > len(r.Data) ||
168 r.Data[r.pos-3] != 'u' ||
169 r.Data[r.pos-2] != 'l' ||
170 r.Data[r.pos-1] != 'l' ||
// The keyword must be followed by end of input or a token-terminating byte.
171 (r.pos != len(r.Data) && !isTokenEnd(r.Data[r.pos])) {
178 // fetchTrue fetches and checks remaining bytes of true keyword.
179 func (r *Lexer) fetchTrue() {
// Same backwards-from-r.pos validation scheme as fetchNull.
181 if r.pos > len(r.Data) ||
182 r.Data[r.pos-3] != 'r' ||
183 r.Data[r.pos-2] != 'u' ||
184 r.Data[r.pos-1] != 'e' ||
185 (r.pos != len(r.Data) && !isTokenEnd(r.Data[r.pos])) {
192 // fetchFalse fetches and checks remaining bytes of false keyword.
193 func (r *Lexer) fetchFalse() {
// "false" is five bytes, hence the deeper -4 lookback compared to fetchTrue.
195 if r.pos > len(r.Data) ||
196 r.Data[r.pos-4] != 'a' ||
197 r.Data[r.pos-3] != 'l' ||
198 r.Data[r.pos-2] != 's' ||
199 r.Data[r.pos-1] != 'e' ||
200 (r.pos != len(r.Data) && !isTokenEnd(r.Data[r.pos])) {
207 // fetchNumber scans a number literal token.
208 func (r *Lexer) fetchNumber() {
// Accept digits, at most one '.', at most one exponent marker, and a sign
// only immediately after 'e'/'E'. No full syntax validation is performed;
// strconv parsing later rejects malformed numbers.
214 for i, c := range r.Data[r.pos:] {
216 case c >= '0' && c <= '9':
218 case c == '.' && !hasDot:
220 case (c == 'e' || c == 'E') && !hasE:
224 case (c == '+' || c == '-') && afterE:
// A non-number byte terminates the literal within the loop...
231 r.token.byteValue = r.Data[r.start:r.pos]
// ...whereas hitting end of input captures the remaining tail.
238 r.token.byteValue = r.Data[r.start:]
// findStringLen tries to scan into the string literal for ending quote char to determine required size.
// The size will be exact if no escapes are present and may be inexact if there are escaped chars.
func findStringLen(data []byte) (isValid, hasEscapes bool, length int) {
	// delta counts bytes of escape overhead seen so far; subtracting it from
	// the index of the closing quote yields a lower bound on the decoded size.
	delta := 0

	for i := 0; i < len(data); i++ {
		switch data[i] {
		case '\\':
			// Skip the escaped byte so an escaped quote is not mistaken
			// for the terminator.
			i++
			delta++
			// \uXXXX shrinks by at least two more bytes when decoded.
			if i < len(data) && data[i] == 'u' {
				delta++
			}
		case '"':
			return true, (delta > 0), (i - delta)
		}
	}

	// No closing quote found: the literal is unterminated.
	return false, false, len(data)
}
// getu4 decodes \uXXXX from the beginning of s, returning the hex value,
// or -1 if the sequence is too short or contains a non-hex digit.
func getu4(s []byte) rune {
	if len(s) < 6 || s[0] != '\\' || s[1] != 'u' {
		return -1
	}
	var val rune
	// Fold the four hex digits into val, most significant first.
	for i := 2; i < len(s) && i < 6; i++ {
		var v byte
		c := s[i]
		switch {
		case c >= '0' && c <= '9':
			v = c - '0'
		case c >= 'a' && c <= 'f':
			v = c - 'a' + 10
		case c >= 'A' && c <= 'F':
			v = c - 'A' + 10
		default:
			return -1
		}
		val <<= 4
		val |= rune(v)
	}
	return val
}
289 // processEscape processes a single escape sequence and returns number of bytes processed.
289 // Decoded bytes are appended to r.token.byteValue.
290 func (r *Lexer) processEscape(data []byte) (int, error) {
292 return 0, fmt.Errorf("syntax error at %v", string(data))
// Single-character escapes map directly to one output byte.
298 r.token.byteValue = append(r.token.byteValue, c)
301 r.token.byteValue = append(r.token.byteValue, '\b')
304 r.token.byteValue = append(r.token.byteValue, '\f')
307 r.token.byteValue = append(r.token.byteValue, '\n')
310 r.token.byteValue = append(r.token.byteValue, '\r')
313 r.token.byteValue = append(r.token.byteValue, '\t')
318 return 0, errors.New("syntax error")
// \uXXXX path: a high surrogate is combined with the following \uXXXX low
// surrogate; an invalid pair degrades to U+FFFD per the WHATWG/encoding rules.
322 if utf16.IsSurrogate(rr) {
323 rr1 := getu4(data[read:])
324 if dec := utf16.DecodeRune(rr, rr1); dec != unicode.ReplacementChar {
328 rr = unicode.ReplacementChar
// Encode the resulting rune as UTF-8 into the token buffer.
332 s := utf8.EncodeRune(d[:], rr)
333 r.token.byteValue = append(r.token.byteValue, d[:s]...)
337 return 0, errors.New("syntax error")
340 // fetchString scans a string literal token.
341 func (r *Lexer) fetchString() {
343 data := r.Data[r.pos:]
345 isValid, hasEscapes, length := findStringLen(data)
348 r.errParse("unterminated string literal")
// Fast path: no escapes, so the token value can alias the input buffer.
352 r.token.byteValue = data[:length]
// Slow path: allocate a buffer and unescape into it; length is a lower
// bound on the decoded size from findStringLen.
357 r.token.byteValue = make([]byte, 0, length)
359 for i := 0; i < len(data); {
363 r.token.byteValue = append(r.token.byteValue, data[p:i]...)
// Copy the literal run preceding the escape, then decode the escape itself.
368 r.token.byteValue = append(r.token.byteValue, data[p:i]...)
369 off, err := r.processEscape(data[i:])
371 r.errParse(err.Error())
381 r.errParse("unterminated string literal")
384 // scanToken scans the next token if no token is currently available in the lexer.
385 func (r *Lexer) scanToken() {
// Nothing to do if a token is already buffered or lexing has failed fatally.
386 if r.token.kind != tokenUndef || r.fatalError != nil {
393 // consume resets the current token to allow scanning the next one.
394 func (r *Lexer) consume() {
395 r.token.kind = tokenUndef
396 r.token.delimValue = 0
399 // Ok returns true if no error (including io.EOF) was encountered during scanning.
400 func (r *Lexer) Ok() bool {
401 return r.fatalError == nil
// maxErrorContextLen limits how many input bytes are echoed into error
// messages before truncation with "...".
404 const maxErrorContextLen = 13
// errParse records a fatal parse error at the current position, quoting up to
// maxErrorContextLen bytes of the remaining input. Only the first fatal error
// is kept.
406 func (r *Lexer) errParse(what string) {
407 if r.fatalError == nil {
409 if len(r.Data)-r.pos <= maxErrorContextLen {
// Truncate long context, reserving 3 bytes for the "..." suffix.
412 str = string(r.Data[r.pos:r.pos+maxErrorContextLen-3]) + "..."
414 r.fatalError = &LexerError{
// errSyntax records a generic fatal syntax error at the current position.
422 func (r *Lexer) errSyntax() {
423 r.errParse("syntax error")
// errInvalidToken reports that the current token is not what the caller
// expected. With UseMultipleErrors the error is recorded as non-fatal and the
// token is rewritten into a closing delimiter so parsing can continue;
// otherwise a fatal error with truncated token context is stored.
426 func (r *Lexer) errInvalidToken(expected string) {
// Keep only the first fatal error.
427 if r.fatalError != nil {
430 if r.UseMultipleErrors {
// Repair the token into a plausible closing delimiter to keep going.
436 r.token.delimValue = ']'
437 r.token.kind = tokenDelim
439 r.token.delimValue = '}'
440 r.token.kind = tokenDelim
442 r.addNonfatalError(&LexerError{
443 Reason: fmt.Sprintf("expected %s", expected),
445 Data: string(r.Data[r.start:r.pos]),
// Fatal path: quote the offending token value, truncated like errParse.
451 if len(r.token.byteValue) <= maxErrorContextLen {
452 str = string(r.token.byteValue)
454 str = string(r.token.byteValue[:maxErrorContextLen-3]) + "..."
456 r.fatalError = &LexerError{
457 Reason: fmt.Sprintf("expected %s", expected),
// GetPos returns the current scanning position in the input data.
463 func (r *Lexer) GetPos() int {
467 // Delim consumes a token and verifies that it is the given delimiter.
468 func (r *Lexer) Delim(c byte) {
// Lazily fetch the next token if none is buffered.
469 if r.token.kind == tokenUndef && r.Ok() {
473 if !r.Ok() || r.token.delimValue != c {
474 r.consume() // errInvalidToken can change token if UseMultipleErrors is enabled.
475 r.errInvalidToken(string([]byte{c}))
481 // IsDelim returns true if there was no scanning error and next token is the given delimiter.
482 func (r *Lexer) IsDelim(c byte) bool {
483 if r.token.kind == tokenUndef && r.Ok() {
// A scan failure also reports true, so caller loops terminate on errors.
486 return !r.Ok() || r.token.delimValue == c
489 // Null verifies that the next token is null and consumes it.
490 func (r *Lexer) Null() {
491 if r.token.kind == tokenUndef && r.Ok() {
494 if !r.Ok() || r.token.kind != tokenNull {
495 r.errInvalidToken("null")
500 // IsNull returns true if the next token is a null keyword. The token is not consumed.
501 func (r *Lexer) IsNull() bool {
502 if r.token.kind == tokenUndef && r.Ok() {
505 return r.Ok() && r.token.kind == tokenNull
508 // Skip skips a single token.
509 func (r *Lexer) Skip() {
510 if r.token.kind == tokenUndef && r.Ok() {
516 // SkipRecursive skips next array or object completely, or just skips a single token if not
516 // positioned on an opening delimiter.
519 // Note: no syntax validation is performed on the skipped data.
520 func (r *Lexer) SkipRecursive() {
// Choose the matching closing delimiter for the nesting-depth counter.
524 switch r.token.delimValue {
526 start, end = '{', '}'
528 start, end = '[', ']'
// Raw byte scan: track nesting depth, quote state, and backslash escapes so
// that delimiters inside string literals are not counted.
540 for i, c := range r.Data[r.pos:] {
542 case c == start && !inQuotes:
544 case c == end && !inQuotes:
// Toggling handles runs of backslashes ("\\\\" is not an escape of '"').
550 case c == '\\' && inQuotes:
551 wasEscape = !wasEscape
553 case c == '"' && inQuotes:
// Input exhausted before the structure closed.
561 r.fatalError = &LexerError{
562 Reason: "EOF reached while skipping array/object or token",
564 Data: string(r.Data[r.pos:]),
568 // Raw fetches the next item recursively as a data slice
568 // (a subslice of Data covering the whole skipped item).
569 func (r *Lexer) Raw() []byte {
574 return r.Data[r.start:r.pos]
577 // IsStart returns whether the lexer is positioned at the start
578 // of an input string.
579 func (r *Lexer) IsStart() bool {
583 // Consumed reads all remaining bytes from the input, publishing an error if
584 // there is anything but whitespace remaining.
585 func (r *Lexer) Consumed() {
586 if r.pos > len(r.Data) || !r.Ok() {
// Only JSON whitespace is permitted after the top-level value.
590 for _, c := range r.Data[r.pos:] {
591 if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
592 r.AddError(&LexerError{
593 Reason: "invalid character '" + string(c) + "' after top-level value",
595 Data: string(r.Data[r.pos:]),
// unsafeString returns the current string token both as a string and as its
// underlying byte slice. The string may alias the input buffer (see
// bytesToStr), so it must not outlive Data.
605 func (r *Lexer) unsafeString() (string, []byte) {
606 if r.token.kind == tokenUndef && r.Ok() {
609 if !r.Ok() || r.token.kind != tokenString {
610 r.errInvalidToken("string")
613 bytes := r.token.byteValue
// Zero-copy conversion; this is what makes the result "unsafe".
614 ret := bytesToStr(r.token.byteValue)
619 // UnsafeString returns the string value if the token is a string literal.
619 //
621 // Warning: returned string may point to the input buffer, so the string should not outlive
622 // the input buffer. Intended pattern of usage is as an argument to a switch statement.
623 func (r *Lexer) UnsafeString() string {
624 ret, _ := r.unsafeString()
628 // UnsafeBytes returns the byte slice if the token is a string literal.
628 // The slice may alias the input buffer; see UnsafeString for the caveats.
629 func (r *Lexer) UnsafeBytes() []byte {
630 _, ret := r.unsafeString()
634 // String reads a string literal. Unlike UnsafeString, the result is an
634 // independent copy and is safe to retain.
635 func (r *Lexer) String() string {
636 if r.token.kind == tokenUndef && r.Ok() {
639 if !r.Ok() || r.token.kind != tokenString {
640 r.errInvalidToken("string")
// string(...) copies, detaching the result from the input buffer.
643 ret := string(r.token.byteValue)
648 // Bytes reads a string literal and base64 decodes it into a byte slice.
649 func (r *Lexer) Bytes() []byte {
650 if r.token.kind == tokenUndef && r.Ok() {
653 if !r.Ok() || r.token.kind != tokenString {
654 r.errInvalidToken("string")
// Decode with standard (padded) base64; a decode failure is fatal.
657 ret := make([]byte, base64.StdEncoding.DecodedLen(len(r.token.byteValue)))
658 n, err := base64.StdEncoding.Decode(ret, r.token.byteValue)
660 r.fatalError = &LexerError{
670 // Bool reads a true or false boolean keyword.
671 func (r *Lexer) Bool() bool {
672 if r.token.kind == tokenUndef && r.Ok() {
675 if !r.Ok() || r.token.kind != tokenBool {
676 r.errInvalidToken("bool")
679 ret := r.token.boolValue
// number verifies the current token is a number literal and returns its raw
// text. The string may alias the input buffer (bytesToStr is zero-copy).
684 func (r *Lexer) number() string {
685 if r.token.kind == tokenUndef && r.Ok() {
688 if !r.Ok() || r.token.kind != tokenNumber {
689 r.errInvalidToken("number")
692 ret := bytesToStr(r.token.byteValue)
// Unsigned integer readers. Each parses the current number token via
// strconv.ParseUint at the matching bit size; parse failures are recorded as
// non-fatal errors so decoding can continue under UseMultipleErrors.

// Uint8 reads a uint8 from a JSON number token.
697 func (r *Lexer) Uint8() uint8 {
703 n, err := strconv.ParseUint(s, 10, 8)
705 r.addNonfatalError(&LexerError{
// Uint16 reads a uint16 from a JSON number token.
714 func (r *Lexer) Uint16() uint16 {
720 n, err := strconv.ParseUint(s, 10, 16)
722 r.addNonfatalError(&LexerError{
// Uint32 reads a uint32 from a JSON number token.
731 func (r *Lexer) Uint32() uint32 {
737 n, err := strconv.ParseUint(s, 10, 32)
739 r.addNonfatalError(&LexerError{
// Uint64 reads a uint64 from a JSON number token.
748 func (r *Lexer) Uint64() uint64 {
754 n, err := strconv.ParseUint(s, 10, 64)
756 r.addNonfatalError(&LexerError{
// Uint reads a uint; it delegates to Uint64 and narrows the result.
765 func (r *Lexer) Uint() uint {
766 return uint(r.Uint64())
// Signed integer readers, mirroring the unsigned family above but using
// strconv.ParseInt.

// Int8 reads an int8 from a JSON number token.
769 func (r *Lexer) Int8() int8 {
775 n, err := strconv.ParseInt(s, 10, 8)
777 r.addNonfatalError(&LexerError{
// Int16 reads an int16 from a JSON number token.
786 func (r *Lexer) Int16() int16 {
792 n, err := strconv.ParseInt(s, 10, 16)
794 r.addNonfatalError(&LexerError{
// Int32 reads an int32 from a JSON number token.
803 func (r *Lexer) Int32() int32 {
809 n, err := strconv.ParseInt(s, 10, 32)
811 r.addNonfatalError(&LexerError{
// Int64 reads an int64 from a JSON number token.
820 func (r *Lexer) Int64() int64 {
826 n, err := strconv.ParseInt(s, 10, 64)
828 r.addNonfatalError(&LexerError{
// Int reads an int; it delegates to Int64 and narrows the result.
837 func (r *Lexer) Int() int {
838 return int(r.Int64())
// "*Str" unsigned readers: same as the numeric family, but the value arrives
// as a JSON string literal (e.g. {"n": "42"}), read via unsafeString.

// Uint8Str reads a uint8 encoded as a JSON string.
841 func (r *Lexer) Uint8Str() uint8 {
842 s, b := r.unsafeString()
847 n, err := strconv.ParseUint(s, 10, 8)
849 r.addNonfatalError(&LexerError{
// Uint16Str reads a uint16 encoded as a JSON string.
858 func (r *Lexer) Uint16Str() uint16 {
859 s, b := r.unsafeString()
864 n, err := strconv.ParseUint(s, 10, 16)
866 r.addNonfatalError(&LexerError{
// Uint32Str reads a uint32 encoded as a JSON string.
875 func (r *Lexer) Uint32Str() uint32 {
876 s, b := r.unsafeString()
881 n, err := strconv.ParseUint(s, 10, 32)
883 r.addNonfatalError(&LexerError{
// Uint64Str reads a uint64 encoded as a JSON string.
892 func (r *Lexer) Uint64Str() uint64 {
893 s, b := r.unsafeString()
898 n, err := strconv.ParseUint(s, 10, 64)
900 r.addNonfatalError(&LexerError{
// UintStr reads a uint encoded as a JSON string.
909 func (r *Lexer) UintStr() uint {
910 return uint(r.Uint64Str())
// UintptrStr reads a uintptr encoded as a JSON string.
913 func (r *Lexer) UintptrStr() uintptr {
914 return uintptr(r.Uint64Str())
// "*Str" signed readers: signed integers encoded as JSON string literals.

// Int8Str reads an int8 encoded as a JSON string.
917 func (r *Lexer) Int8Str() int8 {
918 s, b := r.unsafeString()
923 n, err := strconv.ParseInt(s, 10, 8)
925 r.addNonfatalError(&LexerError{
// Int16Str reads an int16 encoded as a JSON string.
934 func (r *Lexer) Int16Str() int16 {
935 s, b := r.unsafeString()
940 n, err := strconv.ParseInt(s, 10, 16)
942 r.addNonfatalError(&LexerError{
// Int32Str reads an int32 encoded as a JSON string.
951 func (r *Lexer) Int32Str() int32 {
952 s, b := r.unsafeString()
957 n, err := strconv.ParseInt(s, 10, 32)
959 r.addNonfatalError(&LexerError{
// Int64Str reads an int64 encoded as a JSON string.
968 func (r *Lexer) Int64Str() int64 {
969 s, b := r.unsafeString()
974 n, err := strconv.ParseInt(s, 10, 64)
976 r.addNonfatalError(&LexerError{
// IntStr reads an int encoded as a JSON string.
985 func (r *Lexer) IntStr() int {
986 return int(r.Int64Str())
// Floating-point readers: number tokens (Float32/Float64) and string-encoded
// variants (Float32Str/Float64Str), all via strconv.ParseFloat.

// Float32 reads a float32 from a JSON number token.
989 func (r *Lexer) Float32() float32 {
995 n, err := strconv.ParseFloat(s, 32)
997 r.addNonfatalError(&LexerError{
// Float32Str reads a float32 encoded as a JSON string.
1006 func (r *Lexer) Float32Str() float32 {
1007 s, b := r.unsafeString()
1011 n, err := strconv.ParseFloat(s, 32)
1013 r.addNonfatalError(&LexerError{
1015 Reason: err.Error(),
// Float64 reads a float64 from a JSON number token.
1022 func (r *Lexer) Float64() float64 {
1028 n, err := strconv.ParseFloat(s, 64)
1030 r.addNonfatalError(&LexerError{
1032 Reason: err.Error(),
// Float64Str reads a float64 encoded as a JSON string.
1039 func (r *Lexer) Float64Str() float64 {
1040 s, b := r.unsafeString()
1044 n, err := strconv.ParseFloat(s, 64)
1046 r.addNonfatalError(&LexerError{
1048 Reason: err.Error(),
// Error returns the fatal lexing error, if any.
1055 func (r *Lexer) Error() error {
// AddError records e as the fatal error unless one is already set.
1059 func (r *Lexer) AddError(e error) {
1060 if r.fatalError == nil {
// AddNonFatalError records a semantic error over the current token's span
// without stopping lexing.
1065 func (r *Lexer) AddNonFatalError(e error) {
1066 r.addNonfatalError(&LexerError{
1068 Data: string(r.Data[r.start:r.pos]),
// addNonfatalError appends err to multipleErrors when UseMultipleErrors is
// set; otherwise the behavior of the elided branch applies (presumably the
// error becomes fatal — TODO(review): confirm).
1073 func (r *Lexer) addNonfatalError(err *LexerError) {
1074 if r.UseMultipleErrors {
1075 // We don't want to add errors with the same offset.
1076 if len(r.multipleErrors) != 0 && r.multipleErrors[len(r.multipleErrors)-1].Offset == err.Offset {
1079 r.multipleErrors = append(r.multipleErrors, err)
// GetNonFatalErrors returns all semantic errors collected so far.
1085 func (r *Lexer) GetNonFatalErrors() []*LexerError {
1086 return r.multipleErrors
1089 // JsonNumber fetches a json.Number from the 'encoding/json' package.
1090 // Number literals, and string literals containing a number, are both
1091 // accepted as valid values.
1091 func (r *Lexer) JsonNumber() json.Number {
1092 if r.token.kind == tokenUndef && r.Ok() {
1096 r.errInvalidToken("json.Number")
1097 return json.Number("")
1100 switch r.token.kind {
// Strings go through String() (unescaped copy); numbers use the raw bytes.
1102 return json.Number(r.String())
1104 return json.Number(r.Raw())
1107 return json.Number("")
1110 return json.Number("")
1114 // Interface fetches an interface{} analogous to the 'encoding/json' package:
1114 // objects become map[string]interface{}, arrays []interface{}.
1115 func (r *Lexer) Interface() interface{} {
1116 if r.token.kind == tokenUndef && r.Ok() {
1123 switch r.token.kind {
1135 if r.token.delimValue == '{' {
// Recursively decode each "key": value pair until the closing brace.
1138 ret := map[string]interface{}{}
1139 for !r.IsDelim('}') {
1142 ret[key] = r.Interface()
1152 } else if r.token.delimValue == '[' {
// Recursively decode array elements until the closing bracket.
// NOTE: an empty array yields a nil slice here (encodes back as null).
1155 var ret []interface{}
1156 for !r.IsDelim(']') {
1157 ret = append(ret, r.Interface())
1172 // WantComma requires a comma to be present before fetching next token.
1173 func (r *Lexer) WantComma() {
1175 r.firstElement = false
1178 // WantColon requires a colon to be present before fetching next token.
1179 func (r *Lexer) WantColon() {
1181 r.firstElement = false