optimize read string

2024-11-24 08:22:14 +02:00 · 2017-01-16 23:43:20 +08:00 · 2017-01-16 23:43:20 +08:00 · e7ff7339b2
commit e7ff7339b2
parent b9e3f01bfd
3 changed files with 221 additions and 185 deletions
--- a/feature_iter_string.go
+++ b/feature_iter_string.go
@ -0,0 +1,211 @@
+package jsoniter
+
+import (
+	"unicode/utf16"
+	"unsafe"
+)
+
+// ReadString reads the next JSON value as a string.
+//
+// A leading 'n' (JSON null) is handed to skipUntilBreak, the same way
+// readStringAsBytes treats it, and yields "". Any other token that is not an
+// opening quote is reported through reportError and also yields "".
+//
+// The fast path copies plain bytes from iter.buf into a private scratch
+// slice until the closing quote. On the first backslash, or when the buffered
+// bytes run out before the closing quote, the bytes copied so far are passed
+// to readStringSlowPath, which continues byte by byte through readByte.
+//
+// TODO: avoid append
+func (iter *Iterator) ReadString() (ret string) {
+	c := iter.nextToken()
+	if c == 'n' {
+		iter.skipUntilBreak()
+		return ""
+	}
+	if c != '"' {
+		iter.reportError("ReadString", `expects " or n`)
+		return ""
+	}
+	copied := make([]byte, 32)
+	j := 0
+	for i := iter.head; i < iter.tail; i++ {
+		c := iter.buf[i]
+		if c == '"' {
+			iter.head = i + 1
+			copied = copied[:j]
+			// copied is private and never written again, so its header is
+			// reinterpreted as a string instead of copying the bytes once more.
+			// This relies on a slice header starting with the same
+			// pointer/length pair as a string header.
+			return *(*string)(unsafe.Pointer(&copied))
+		}
+		if c == '\\' {
+			// Leave head on the backslash; the slow path decodes the escape.
+			iter.head = i
+			return iter.readStringSlowPath(copied[:j])
+		}
+		if j == len(copied) {
+			// Scratch slice is full: double it before storing the next byte.
+			grown := make([]byte, len(copied)*2)
+			copy(grown, copied)
+			copied = grown
+		}
+		copied[j] = c
+		j++
+	}
+	// Every buffered byte was consumed without seeing the closing quote.
+	// Continue in the slow path rather than rescanning an empty window.
+	iter.head = iter.tail
+	return iter.readStringSlowPath(copied[:j])
+}
+
+// readStringSlowPath finishes reading a JSON string one byte at a time,
+// decoding backslash escapes as it goes.
+//
+// str holds the bytes the caller has already collected; decoded bytes are
+// appended to it. On the closing quote the accumulated bytes are returned as
+// a string that shares str's backing array (no extra copy). If iter.Error is
+// set at any point, or an escape is malformed, the result is "".
+func (iter *Iterator) readStringSlowPath(str []byte) (ret string) {
+	var c byte
+	for iter.Error == nil {
+		c = iter.readByte()
+		if c == '"' {
+			// str is not touched after this point, so its header is
+			// reinterpreted as a string header instead of copying.
+			return *(*string)(unsafe.Pointer(&str))
+		}
+		if c != '\\' {
+			str = append(str, c)
+			continue
+		}
+		c = iter.readByte()
+		if iter.Error != nil {
+			// Do not interpret the byte returned by a failed read as an escape.
+			return ""
+		}
+		switch c {
+		case 'u':
+			r := iter.readU4()
+			if iter.Error != nil {
+				return ""
+			}
+			if !utf16.IsSurrogate(r) {
+				str = appendRune(str, r)
+				break
+			}
+			// A surrogate must be followed by a second \uXXXX escape; the two
+			// halves are combined into one code point.
+			c = iter.readByte()
+			if iter.Error != nil {
+				return ""
+			}
+			if c != '\\' {
+				iter.reportError("ReadString",
+					`expects \u after utf16 surrogate, but \ not found`)
+				return ""
+			}
+			c = iter.readByte()
+			if iter.Error != nil {
+				return ""
+			}
+			if c != 'u' {
+				iter.reportError("ReadString",
+					`expects \u after utf16 surrogate, but \u not found`)
+				return ""
+			}
+			r2 := iter.readU4()
+			if iter.Error != nil {
+				return ""
+			}
+			str = appendRune(str, utf16.DecodeRune(r, r2))
+		case '"':
+			str = append(str, '"')
+		case '\\':
+			str = append(str, '\\')
+		case '/':
+			str = append(str, '/')
+		case 'b':
+			str = append(str, '\b')
+		case 'f':
+			str = append(str, '\f')
+		case 'n':
+			str = append(str, '\n')
+		case 'r':
+			str = append(str, '\r')
+		case 't':
+			str = append(str, '\t')
+		default:
+			iter.reportError("ReadString",
+				`invalid escape char after \`)
+			return ""
+		}
+	}
+	return ""
+}
+
+// readStringAsBytes reads the next JSON string and returns its raw bytes.
+//
+// No escape processing is done: bytes are taken verbatim up to the first '"'.
+// When that quote is already buffered, the result is a sub-slice of iter.buf
+// (no copy), so it is only valid while the buffer is unchanged. Otherwise the
+// buffered prefix is copied and the rest is read through readByte.
+//
+// A leading 'n' is handed to skipUntilBreak and yields nil. Any other token
+// is reported through reportError and yields nil.
+func (iter *Iterator) readStringAsBytes() (ret []byte) {
+	c := iter.nextToken()
+	if c == '"' {
+		for i := iter.head; i < iter.tail; i++ {
+			if iter.buf[i] == '"' {
+				// fast path: reuse the underlying buffer
+				ret = iter.buf[iter.head:i]
+				iter.head = i + 1
+				return ret
+			}
+		}
+		// The closing quote is not buffered yet. Save what has been scanned
+		// BEFORE moving head, otherwise the prefix of the string is lost.
+		readLen := iter.tail - iter.head
+		copied := make([]byte, readLen, readLen*2)
+		copy(copied, iter.buf[iter.head:iter.tail])
+		iter.head = iter.tail
+		for iter.Error == nil {
+			c := iter.readByte()
+			if c == '"' {
+				return copied
+			}
+			copied = append(copied, c)
+		}
+		return nil
+	}
+	if c == 'n' {
+		iter.skipUntilBreak()
+		return nil
+	}
+	iter.reportError("ReadString", `expects " or n`)
+	return nil
+}
+
+// readU4 reads the four hexadecimal digits that follow `\u` and returns the
+// 16-bit value they spell.
+//
+// Digits may be 0-9, a-f or A-F; JSON allows either letter case. If a read
+// fails (iter.Error set) or a non-hex byte is found, the value accumulated so
+// far is returned; callers must check iter.Error before using the result.
+func (iter *Iterator) readU4() (ret rune) {
+	for i := 0; i < 4; i++ {
+		c := iter.readByte()
+		if iter.Error != nil {
+			return ret
+		}
+		switch {
+		case c >= '0' && c <= '9':
+			ret = ret*16 + rune(c-'0')
+		case c >= 'a' && c <= 'f':
+			ret = ret*16 + rune(c-'a'+10)
+		case c >= 'A' && c <= 'F':
+			ret = ret*16 + rune(c-'A'+10)
+		default:
+			iter.reportError("readU4", "expects 0~9 or a~f")
+			return ret
+		}
+	}
+	return ret
+}
+
+const ( // bit patterns and limits used by appendRune to emit UTF-8
+	t1 = 0x00 // 0000 0000 (not referenced in the visible code)
+	tx = 0x80 // 1000 0000: high bits OR-ed into every continuation byte
+	t2 = 0xC0 // 1100 0000: high bits of the first byte of a 2-byte sequence
+	t3 = 0xE0 // 1110 0000: high bits of the first byte of a 3-byte sequence
+	t4 = 0xF0 // 1111 0000: high bits of the first byte of a 4-byte sequence
+	t5 = 0xF8 // 1111 1000 (not referenced in the visible code)
+
+	maskx = 0x3F // 0011 1111: keeps the low 6 payload bits of a continuation byte
+	mask2 = 0x1F // 0001 1111 (not referenced in the visible code)
+	mask3 = 0x0F // 0000 1111 (not referenced in the visible code)
+	mask4 = 0x07 // 0000 0111 (not referenced in the visible code)
+
+	rune1Max = 1 << 7 - 1 // largest value appendRune writes as 1 byte
+	rune2Max = 1 << 11 - 1 // largest value appendRune writes as 2 bytes
+	rune3Max = 1 << 16 - 1 // largest value appendRune writes as 3 bytes
+
+	surrogateMin = 0xD800 // first value of the range appendRune replaces with runeError
+	surrogateMax = 0xDFFF // last value of the range appendRune replaces with runeError
+
+	maxRune = '\U0010FFFF' // Maximum valid Unicode code point.
+	runeError = '\uFFFD'     // The "error" rune, i.e. the Unicode replacement character.
+)
+
+// appendRune appends the UTF-8 encoding of r to p and returns the extended
+// slice.
+//
+// Values above maxRune, values inside the surrogate range, and negative
+// values (which become large once converted to uint32) are written as
+// runeError instead.
+func appendRune(p []byte, r rune) []byte {
+	// Compare as unsigned so that a negative r lands in the "too large" case.
+	u := uint32(r)
+	if u <= rune1Max {
+		return append(p, byte(r))
+	}
+	if u <= rune2Max {
+		return append(p,
+			t2 | byte(r >> 6),
+			tx | byte(r) & maskx)
+	}
+	invalid := u > maxRune || (surrogateMin <= u && u <= surrogateMax)
+	if invalid {
+		// runeError itself is encoded with the 3-byte form below.
+		r = runeError
+	}
+	if invalid || u <= rune3Max {
+		return append(p,
+			t3 | byte(r >> 12),
+			tx | byte(r >> 6) & maskx,
+			tx | byte(r) & maskx)
+	}
+	return append(p,
+		t4 | byte(r >> 18),
+		tx | byte(r >> 12) & maskx,
+		tx | byte(r >> 6) & maskx,
+		tx | byte(r) & maskx)
+}
--- a/iterator.go
+++ b/iterator.go
@ -4,7 +4,6 @@ import (
 	"encoding/base64"
 	"fmt"
 	"io"
-	"unicode/utf16"
 )

 type ValueType int
@ -224,175 +223,6 @@ func (iter *Iterator) unreadByte() {
 	return
 }

-// ReadString reads a json object as String
-func (iter *Iterator) ReadString() (ret string) {
-	return string(iter.readStringAsBytes())
-}
-
-func (iter *Iterator) readStringAsBytes() (ret []byte) {
-	c := iter.nextToken()
-	if c == '"' {
-		end := iter.findStringEndWithoutEscape()
-		if end != -1 {
-			// fast path: reuse the underlying buffer
-			ret = iter.buf[iter.head : end-1]
-			iter.head = end
-			return ret
-		}
-		return iter.readStringAsBytesSlowPath()
-	}
-	if c == 'n' {
-		iter.skipUntilBreak()
-		return
-	}
-	iter.reportError("ReadString", `expects " or n`)
-	return
-}
-
-func (iter *Iterator) readStringAsBytesSlowPath() (ret []byte) {
-	str := make([]byte, 0, 8)
-	var c byte
-	for iter.Error == nil {
-		c = iter.readByte()
-		if c == '"' {
-			return str
-		}
-		if c == '\\' {
-			c = iter.readByte()
-			if iter.Error != nil {
-				return
-			}
-			switch c {
-			case 'u':
-				r := iter.readU4()
-				if iter.Error != nil {
-					return
-				}
-				if utf16.IsSurrogate(r) {
-					c = iter.readByte()
-					if iter.Error != nil {
-						return
-					}
-					if c != '\\' {
-						iter.reportError("ReadString",
-							`expects \u after utf16 surrogate, but \ not found`)
-						return
-					}
-					c = iter.readByte()
-					if iter.Error != nil {
-						return
-					}
-					if c != 'u' {
-						iter.reportError("ReadString",
-							`expects \u after utf16 surrogate, but \u not found`)
-						return
-					}
-					r2 := iter.readU4()
-					if iter.Error != nil {
-						return
-					}
-					combined := utf16.DecodeRune(r, r2)
-					str = appendRune(str, combined)
-				} else {
-					str = appendRune(str, r)
-				}
-			case '"':
-				str = append(str, '"')
-			case '\\':
-				str = append(str, '\\')
-			case '/':
-				str = append(str, '/')
-			case 'b':
-				str = append(str, '\b')
-			case 'f':
-				str = append(str, '\f')
-			case 'n':
-				str = append(str, '\n')
-			case 'r':
-				str = append(str, '\r')
-			case 't':
-				str = append(str, '\t')
-			default:
-				iter.reportError("ReadString",
-					`invalid escape char after \`)
-				return
-			}
-		} else {
-			str = append(str, c)
-		}
-	}
-	return
-}
-
-func (iter *Iterator) readU4() (ret rune) {
-	for i := 0; i < 4; i++ {
-		c := iter.readByte()
-		if iter.Error != nil {
-			return
-		}
-		if c >= '0' && c <= '9' {
-			ret = ret*16 + rune(c-'0')
-		} else if c >= 'a' && c <= 'f' {
-			ret = ret*16 + rune(c-'a'+10)
-		} else {
-			iter.reportError("readU4", "expects 0~9 or a~f")
-			return
-		}
-	}
-	return ret
-}
-
-const (
-	t1 = 0x00 // 0000 0000
-	tx = 0x80 // 1000 0000
-	t2 = 0xC0 // 1100 0000
-	t3 = 0xE0 // 1110 0000
-	t4 = 0xF0 // 1111 0000
-	t5 = 0xF8 // 1111 1000
-
-	maskx = 0x3F // 0011 1111
-	mask2 = 0x1F // 0001 1111
-	mask3 = 0x0F // 0000 1111
-	mask4 = 0x07 // 0000 0111
-
-	rune1Max = 1<<7 - 1
-	rune2Max = 1<<11 - 1
-	rune3Max = 1<<16 - 1
-
-	surrogateMin = 0xD800
-	surrogateMax = 0xDFFF
-
-	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
-	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
-)
-
-func appendRune(p []byte, r rune) []byte {
-	// Negative values are erroneous. Making it unsigned addresses the problem.
-	switch i := uint32(r); {
-	case i <= rune1Max:
-		p = append(p, byte(r))
-		return p
-	case i <= rune2Max:
-		p = append(p, t2|byte(r>>6))
-		p = append(p, tx|byte(r)&maskx)
-		return p
-	case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
-		r = RuneError
-		fallthrough
-	case i <= rune3Max:
-		p = append(p, t3|byte(r>>12))
-		p = append(p, tx|byte(r>>6)&maskx)
-		p = append(p, tx|byte(r)&maskx)
-		return p
-	default:
-		p = append(p, t4|byte(r>>18))
-		p = append(p, tx|byte(r>>12)&maskx)
-		p = append(p, tx|byte(r>>6)&maskx)
-		p = append(p, tx|byte(r)&maskx)
-		return p
-	}
-}
-
 // ReadArray reads a json object as Array
 func (iter *Iterator) ReadArray() (ret bool) {
 	c := iter.nextToken()
@ -418,7 +248,7 @@ func (iter *Iterator) ReadArray() (ret bool) {
 	case ',':
 		return true
 	default:
-		iter.reportError("ReadArray", "expect [ or , or ] or n, but found: "+string([]byte{c}))
+		iter.reportError("ReadArray", "expect [ or , or ] or n, but found: " + string([]byte{c}))
 		return
 	}
 }
@ -555,18 +385,6 @@ func (iter *Iterator) findStringEnd() (int, bool) {
 	return -1, true // end with \
 }

-func (iter *Iterator) findStringEndWithoutEscape() int {
-	for i := iter.head; i < iter.tail; i++ {
-		c := iter.buf[i]
-		if c == '"' {
-			return i + 1
-		} else if c == '\\' {
-			return -1
-		}
-	}
-	return -1
-}
-
 func (iter *Iterator) skipArray() {
 	level := 1
 	for {
--- a/jsoniter_string_test.go
+++ b/jsoniter_string_test.go
@ -7,6 +7,12 @@ import (
 	"github.com/json-iterator/go/require"
 )

+func Test_read_large_string(t *testing.T) {
+	should := require.New(t)
+	iter := ParseString(`"0123456789012345678901234567890123456789"`)
+	should.Equal("0123456789012345678901234567890123456789", iter.ReadString())
+}
+
 func Test_decode_string_empty(t *testing.T) {
 	iter := Parse(bytes.NewBufferString(`""`), 4096)
 	val := iter.ReadString()
@ -113,10 +119,11 @@ func Benchmark_jsoniter_unicode(b *testing.B) {
 }

 func Benchmark_jsoniter_ascii(b *testing.B) {
-	iter := ParseString(`"hello, world!"`)
+	iter := NewIterator()
+	input := []byte(`"hello, world! hello, world!"`)
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
-		iter.ResetBytes(iter.buf)
+		iter.ResetBytes(input)
 		iter.ReadString()
 	}
 }