1
0
mirror of https://github.com/json-iterator/go.git synced 2024-11-24 08:22:14 +02:00

optimize read string

This commit is contained in:
Tao Wen 2017-01-16 23:43:20 +08:00
parent b9e3f01bfd
commit e7ff7339b2
3 changed files with 221 additions and 185 deletions

211
feature_iter_string.go Normal file
View File

@ -0,0 +1,211 @@
package jsoniter
import (
"unicode/utf16"
"unsafe"
)
// TODO: avoid append
func (iter *Iterator) ReadString() (ret string) {
c := iter.nextToken()
if c == '"' {
copied := make([]byte, 32)
j := 0
fast_loop:
for {
i := iter.head
for ; i < iter.tail && j < len(copied); i++ {
c := iter.buf[i]
if c == '"' {
iter.head = i + 1
copied = copied[:j]
return *(*string)(unsafe.Pointer(&copied))
} else if c == '\\' {
iter.head = i
break fast_loop
}
copied[j] = c
j++
}
iter.head = i
if j == len(copied) {
newBuf := make([]byte, len(copied) * 2)
copy(newBuf, copied)
copied = newBuf
}
}
return iter.readStringSlowPath(copied[:j])
}
iter.reportError("ReadString", `expects " or n`)
return
}
func (iter *Iterator) readStringSlowPath(str []byte) (ret string) {
var c byte
for iter.Error == nil {
c = iter.readByte()
if c == '"' {
return *(*string)(unsafe.Pointer(&str))
}
if c == '\\' {
c = iter.readByte()
switch c {
case 'u':
r := iter.readU4()
if utf16.IsSurrogate(r) {
c = iter.readByte()
if iter.Error != nil {
return
}
if c != '\\' {
iter.reportError("ReadString",
`expects \u after utf16 surrogate, but \ not found`)
return
}
c = iter.readByte()
if iter.Error != nil {
return
}
if c != 'u' {
iter.reportError("ReadString",
`expects \u after utf16 surrogate, but \u not found`)
return
}
r2 := iter.readU4()
if iter.Error != nil {
return
}
combined := utf16.DecodeRune(r, r2)
str = appendRune(str, combined)
} else {
str = appendRune(str, r)
}
case '"':
str = append(str, '"')
case '\\':
str = append(str, '\\')
case '/':
str = append(str, '/')
case 'b':
str = append(str, '\b')
case 'f':
str = append(str, '\f')
case 'n':
str = append(str, '\n')
case 'r':
str = append(str, '\r')
case 't':
str = append(str, '\t')
default:
iter.reportError("ReadString",
`invalid escape char after \`)
return
}
} else {
str = append(str, c)
}
}
return
}
func (iter *Iterator) readStringAsBytes() (ret []byte) {
c := iter.nextToken()
if c == '"' {
for i := iter.head; i < iter.tail; i++ {
c := iter.buf[i]
if c == '"' {
// fast path: reuse the underlying buffer
ret = iter.buf[iter.head : i]
iter.head = i + 1
return ret
}
}
iter.head = iter.tail
readLen := iter.tail - iter.head
copied := make([]byte, readLen, readLen * 2)
copy(copied, iter.buf[iter.head:iter.tail])
for iter.Error == nil {
c := iter.readByte()
if c == '"' {
return copied
}
copied = append(copied, c)
}
return
}
if c == 'n' {
iter.skipUntilBreak()
return
}
iter.reportError("ReadString", `expects " or n`)
return
}
func (iter *Iterator) readU4() (ret rune) {
for i := 0; i < 4; i++ {
c := iter.readByte()
if iter.Error != nil {
return
}
if c >= '0' && c <= '9' {
ret = ret * 16 + rune(c - '0')
} else if c >= 'a' && c <= 'f' {
ret = ret * 16 + rune(c - 'a' + 10)
} else {
iter.reportError("readU4", "expects 0~9 or a~f")
return
}
}
return ret
}
const (
t1 = 0x00 // 0000 0000
tx = 0x80 // 1000 0000
t2 = 0xC0 // 1100 0000
t3 = 0xE0 // 1110 0000
t4 = 0xF0 // 1111 0000
t5 = 0xF8 // 1111 1000
maskx = 0x3F // 0011 1111
mask2 = 0x1F // 0001 1111
mask3 = 0x0F // 0000 1111
mask4 = 0x07 // 0000 0111
rune1Max = 1 << 7 - 1
rune2Max = 1 << 11 - 1
rune3Max = 1 << 16 - 1
surrogateMin = 0xD800
surrogateMax = 0xDFFF
maxRune = '\U0010FFFF' // Maximum valid Unicode code point.
runeError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
)
func appendRune(p []byte, r rune) []byte {
// Negative values are erroneous. Making it unsigned addresses the problem.
switch i := uint32(r); {
case i <= rune1Max:
p = append(p, byte(r))
return p
case i <= rune2Max:
p = append(p, t2 | byte(r >> 6))
p = append(p, tx | byte(r) & maskx)
return p
case i > maxRune, surrogateMin <= i && i <= surrogateMax:
r = runeError
fallthrough
case i <= rune3Max:
p = append(p, t3 | byte(r >> 12))
p = append(p, tx | byte(r >> 6) & maskx)
p = append(p, tx | byte(r) & maskx)
return p
default:
p = append(p, t4 | byte(r >> 18))
p = append(p, tx | byte(r >> 12) & maskx)
p = append(p, tx | byte(r >> 6) & maskx)
p = append(p, tx | byte(r) & maskx)
return p
}
}

View File

@ -4,7 +4,6 @@ import (
"encoding/base64"
"fmt"
"io"
"unicode/utf16"
)
type ValueType int
@ -224,175 +223,6 @@ func (iter *Iterator) unreadByte() {
return
}
// ReadString reads a json object as String
func (iter *Iterator) ReadString() (ret string) {
return string(iter.readStringAsBytes())
}
func (iter *Iterator) readStringAsBytes() (ret []byte) {
c := iter.nextToken()
if c == '"' {
end := iter.findStringEndWithoutEscape()
if end != -1 {
// fast path: reuse the underlying buffer
ret = iter.buf[iter.head : end-1]
iter.head = end
return ret
}
return iter.readStringAsBytesSlowPath()
}
if c == 'n' {
iter.skipUntilBreak()
return
}
iter.reportError("ReadString", `expects " or n`)
return
}
func (iter *Iterator) readStringAsBytesSlowPath() (ret []byte) {
str := make([]byte, 0, 8)
var c byte
for iter.Error == nil {
c = iter.readByte()
if c == '"' {
return str
}
if c == '\\' {
c = iter.readByte()
if iter.Error != nil {
return
}
switch c {
case 'u':
r := iter.readU4()
if iter.Error != nil {
return
}
if utf16.IsSurrogate(r) {
c = iter.readByte()
if iter.Error != nil {
return
}
if c != '\\' {
iter.reportError("ReadString",
`expects \u after utf16 surrogate, but \ not found`)
return
}
c = iter.readByte()
if iter.Error != nil {
return
}
if c != 'u' {
iter.reportError("ReadString",
`expects \u after utf16 surrogate, but \u not found`)
return
}
r2 := iter.readU4()
if iter.Error != nil {
return
}
combined := utf16.DecodeRune(r, r2)
str = appendRune(str, combined)
} else {
str = appendRune(str, r)
}
case '"':
str = append(str, '"')
case '\\':
str = append(str, '\\')
case '/':
str = append(str, '/')
case 'b':
str = append(str, '\b')
case 'f':
str = append(str, '\f')
case 'n':
str = append(str, '\n')
case 'r':
str = append(str, '\r')
case 't':
str = append(str, '\t')
default:
iter.reportError("ReadString",
`invalid escape char after \`)
return
}
} else {
str = append(str, c)
}
}
return
}
func (iter *Iterator) readU4() (ret rune) {
for i := 0; i < 4; i++ {
c := iter.readByte()
if iter.Error != nil {
return
}
if c >= '0' && c <= '9' {
ret = ret*16 + rune(c-'0')
} else if c >= 'a' && c <= 'f' {
ret = ret*16 + rune(c-'a'+10)
} else {
iter.reportError("readU4", "expects 0~9 or a~f")
return
}
}
return ret
}
const (
t1 = 0x00 // 0000 0000
tx = 0x80 // 1000 0000
t2 = 0xC0 // 1100 0000
t3 = 0xE0 // 1110 0000
t4 = 0xF0 // 1111 0000
t5 = 0xF8 // 1111 1000
maskx = 0x3F // 0011 1111
mask2 = 0x1F // 0001 1111
mask3 = 0x0F // 0000 1111
mask4 = 0x07 // 0000 0111
rune1Max = 1<<7 - 1
rune2Max = 1<<11 - 1
rune3Max = 1<<16 - 1
surrogateMin = 0xD800
surrogateMax = 0xDFFF
MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
)
func appendRune(p []byte, r rune) []byte {
// Negative values are erroneous. Making it unsigned addresses the problem.
switch i := uint32(r); {
case i <= rune1Max:
p = append(p, byte(r))
return p
case i <= rune2Max:
p = append(p, t2|byte(r>>6))
p = append(p, tx|byte(r)&maskx)
return p
case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
r = RuneError
fallthrough
case i <= rune3Max:
p = append(p, t3|byte(r>>12))
p = append(p, tx|byte(r>>6)&maskx)
p = append(p, tx|byte(r)&maskx)
return p
default:
p = append(p, t4|byte(r>>18))
p = append(p, tx|byte(r>>12)&maskx)
p = append(p, tx|byte(r>>6)&maskx)
p = append(p, tx|byte(r)&maskx)
return p
}
}
// ReadArray reads a json object as Array
func (iter *Iterator) ReadArray() (ret bool) {
c := iter.nextToken()
@ -418,7 +248,7 @@ func (iter *Iterator) ReadArray() (ret bool) {
case ',':
return true
default:
iter.reportError("ReadArray", "expect [ or , or ] or n, but found: "+string([]byte{c}))
iter.reportError("ReadArray", "expect [ or , or ] or n, but found: " + string([]byte{c}))
return
}
}
@ -555,18 +385,6 @@ func (iter *Iterator) findStringEnd() (int, bool) {
return -1, true // end with \
}
func (iter *Iterator) findStringEndWithoutEscape() int {
for i := iter.head; i < iter.tail; i++ {
c := iter.buf[i]
if c == '"' {
return i + 1
} else if c == '\\' {
return -1
}
}
return -1
}
func (iter *Iterator) skipArray() {
level := 1
for {

View File

@ -7,6 +7,12 @@ import (
"github.com/json-iterator/go/require"
)
func Test_read_large_string(t *testing.T) {
should := require.New(t)
iter := ParseString(`"0123456789012345678901234567890123456789"`)
should.Equal("0123456789012345678901234567890123456789", iter.ReadString())
}
func Test_decode_string_empty(t *testing.T) {
iter := Parse(bytes.NewBufferString(`""`), 4096)
val := iter.ReadString()
@ -113,10 +119,11 @@ func Benchmark_jsoniter_unicode(b *testing.B) {
}
func Benchmark_jsoniter_ascii(b *testing.B) {
iter := ParseString(`"hello, world!"`)
iter := NewIterator()
input := []byte(`"hello, world! hello, world!"`)
b.ResetTimer()
for n := 0; n < b.N; n++ {
iter.ResetBytes(iter.buf)
iter.ResetBytes(input)
iter.ReadString()
}
}