Replace go-runewidth with uniseg

2026-05-22 10:15:43 +02:00 · 2025-11-22 15:37:46 +01:00
parent fec7e9ce61
commit 5423e7459c
30 changed files with 32 additions and 3134 deletions
@@ -1,2 +0,0 @@
-.DS_Store
-*.test
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2025 Matt Sherman
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
@@ -1,64 +0,0 @@
-# stringish
-
-A small Go module that provides a generic type constraint for “string-like”
-data, and a utf8 package that works with both strings and byte slices
-without conversions.
-
-```go
-type Interface interface {
-	~[]byte | ~string
-}
-```
-
-[![Go Reference](https://pkg.go.dev/badge/github.com/clipperhouse/stringish/utf8.svg)](https://pkg.go.dev/github.com/clipperhouse/stringish/utf8)
-[![Test Status](https://github.com/clipperhouse/stringish/actions/workflows/gotest.yml/badge.svg)](https://github.com/clipperhouse/stringish/actions/workflows/gotest.yml)
-
-## Install
-
-```
-go get github.com/clipperhouse/stringish
-```
-
-## Examples
-
-```go
-import (
-    "github.com/clipperhouse/stringish"
-    "github.com/clipperhouse/stringish/utf8"
-)
-
-s := "Hello, 世界"
-r, size := utf8.DecodeRune(s)   // not DecodeRuneInString 🎉
-
-b := []byte("Hello, 世界")
-r, size = utf8.DecodeRune(b)    // same API!
-
-func MyFoo[T stringish.Interface](s T) T {
-    // pass a string or a []byte
-    // iterate, slice, transform, whatever
-}
-```
-
-## Motivation
-
-Sometimes we want APIs to accept `string` or `[]byte` without having to convert
-between those types. That conversion usually allocates!
-
-By implementing with `stringish.Interface`, we can have a single API, and
-single implementation for both types: one `Foo` instead of `Foo` and
-`FooString`.
-
-We have converted the
-[`unicode/utf8` package](https://github.com/clipperhouse/stringish/blob/main/utf8/utf8.go)
-as an example -- note the absence of `*InString` funcs. We might look at `x/text`
-next.
-
-## Used by
-
- clipperhouse/uax29: [stringish trie](https://github.com/clipperhouse/uax29/blob/master/graphemes/trie.go#L27), [stringish iterator](https://github.com/clipperhouse/uax29/blob/master/internal/iterators/iterator.go#L9), [stringish SplitFunc](https://github.com/clipperhouse/uax29/blob/master/graphemes/splitfunc.go#L21)
-
- [clipperhouse/displaywidth](https://github.com/clipperhouse/displaywidth)
-
-## Prior discussion
-
- [Consideration of similar by the Go team](https://github.com/golang/go/issues/48643)
@@ -1,5 +0,0 @@
-package stringish
-
-// Interface is a type constraint satisfied by any type whose underlying type
-// is []byte or string.
-type Interface interface {
-	~[]byte | ~string
-}
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2020 Matt Sherman
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
@@ -1,94 +0,0 @@
-An implementation of grapheme cluster boundaries from [Unicode text segmentation](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) (UAX 29), for Unicode version 15.0.0.
-
-[![Documentation](https://pkg.go.dev/badge/github.com/clipperhouse/uax29/v2/graphemes.svg)](https://pkg.go.dev/github.com/clipperhouse/uax29/v2/graphemes)
-![Tests](https://github.com/clipperhouse/uax29/actions/workflows/gotest.yml/badge.svg)
-![Fuzz](https://github.com/clipperhouse/uax29/actions/workflows/gofuzz.yml/badge.svg)
-
-## Quick start
-
-```
-go get "github.com/clipperhouse/uax29/v2/graphemes"
-```
-
-```go
-import "github.com/clipperhouse/uax29/v2/graphemes"
-
-text := "Hello, 世界. Nice dog! 👍🐶"
-
-tokens := graphemes.FromString(text)
-
-for tokens.Next() {                     // Next() returns true until end of data
-	fmt.Println(tokens.Value())         // Do something with the current grapheme
-}
-```
-
-_A grapheme is a “single visible character”, which might be as simple as a single letter, or a complex emoji that consists of several Unicode code points._
-
-## Conformance
-
-We use the Unicode [test suite](https://unicode.org/reports/tr41/tr41-26.html#Tests29).
-
-![Tests](https://github.com/clipperhouse/uax29/actions/workflows/gotest.yml/badge.svg)
-![Fuzz](https://github.com/clipperhouse/uax29/actions/workflows/gofuzz.yml/badge.svg)
-
-## APIs
-
-### If you have a `string`
-
-```go
-text := "Hello, 世界. Nice dog! 👍🐶"
-
-tokens := graphemes.FromString(text)
-
-for tokens.Next() {                     // Next() returns true until end of data
-	fmt.Println(tokens.Value())         // Do something with the current grapheme
-}
-```
-
-### If you have an `io.Reader`
-
-`FromReader` returns a `Scanner` that embeds a [`bufio.Scanner`](https://pkg.go.dev/bufio#Scanner), so just use those methods.
-
-```go
-r := getYourReader()                        // from a file or network maybe
-tokens := graphemes.FromReader(r)
-
-for tokens.Scan() {                         // Scan() returns true until error or EOF
-	fmt.Println(tokens.Text())              // Do something with the current grapheme
-}
-
-if tokens.Err() != nil {                    // Check the error
-	log.Fatal(tokens.Err())
-}
-```
-
-### If you have a `[]byte`
-
-```go
-b := []byte("Hello, 世界. Nice dog! 👍🐶")
-
-tokens := graphemes.FromBytes(b)
-
-for tokens.Next() {                     // Next() returns true until end of data
-	fmt.Println(tokens.Value())         // Do something with the current grapheme
-}
-```
-
-### Benchmarks
-
-On a Mac M2 laptop, we see around 200MB/s, or around 100 million graphemes per second, and no allocations.
-
-```
-goos: darwin
-goarch: arm64
-pkg: github.com/clipperhouse/uax29/graphemes/comparative
-cpu: Apple M2
-BenchmarkGraphemes/clipperhouse/uax29-8    	    173805 ns/op	 201.16 MB/s      0 B/op	   0 allocs/op
-BenchmarkGraphemes/rivo/uniseg-8           	   2045128 ns/op	  17.10 MB/s      0 B/op	   0 allocs/op
-```
-
-### Invalid inputs
-
-Invalid UTF-8 input is considered undefined behavior. We test to ensure that bad inputs will not cause pathological outcomes, such as a panic or infinite loop. Callers should expect “garbage-in, garbage-out”.
-
-Your pipeline should probably include a call to [`utf8.Valid()`](https://pkg.go.dev/unicode/utf8#Valid).
@@ -1,31 +0,0 @@
-package graphemes
-
-import (
-	"github.com/clipperhouse/stringish"
-	"github.com/clipperhouse/uax29/v2/internal/iterators"
-)
-
-// Iterator iterates over the grapheme clusters of a string or []byte.
-// It embeds *iterators.Iterator[T], so that type's methods are promoted onto it.
-type Iterator[T stringish.Interface] struct {
-	*iterators.Iterator[T]
-}
-
-// splitFuncString and splitFuncBytes are the string and []byte
-// instantiations of the generic splitFunc. They are handed to iterators.New
-// by FromString and FromBytes respectively.
-var (
-	splitFuncString = splitFunc[string]
-	splitFuncBytes  = splitFunc[[]byte]
-)
-
-// FromString returns an iterator for the grapheme clusters in the input string.
-// Iterate while Next() is true, and access the grapheme via Value().
-//
-// Next() and Value() are provided by the embedded *iterators.Iterator, which
-// is created here with splitFuncString.
-func FromString(s string) Iterator[string] {
-	return Iterator[string]{
-		iterators.New(splitFuncString, s),
-	}
-}
-
-// FromBytes returns an iterator for the grapheme clusters in the input bytes.
-// Iterate while Next() is true, and access the grapheme via Value().
-//
-// Next() and Value() are provided by the embedded *iterators.Iterator, which
-// is created here with splitFuncBytes.
-func FromBytes(b []byte) Iterator[[]byte] {
-	return Iterator[[]byte]{
-		iterators.New(splitFuncBytes, b),
-	}
-}
@@ -1,25 +0,0 @@
-// Package graphemes implements Unicode grapheme cluster boundaries: https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
-package graphemes
-
-import (
-	"bufio"
-	"io"
-)
-
-// Scanner embeds a *bufio.Scanner, so the bufio.Scanner methods can be called
-// on it directly. FromReader returns one configured to split on SplitFunc.
-type Scanner struct {
-	*bufio.Scanner
-}
-
-// FromReader returns a Scanner, to split graphemes per
-// https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
-//
-// It embeds a [bufio.Scanner], so you can use its methods.
-//
-// Iterate through graphemes by calling Scan() until false, then check Err().
-func FromReader(r io.Reader) *Scanner {
-	sc := bufio.NewScanner(r)
-	// Tokenize with the package's grapheme SplitFunc instead of bufio's default.
-	sc.Split(SplitFunc)
-	return &Scanner{
-		Scanner: sc,
-	}
-}
@@ -1,174 +0,0 @@
-package graphemes
-
-import (
-	"bufio"
-
-	"github.com/clipperhouse/stringish"
-)
-
-// is reports whether lookup shares at least one set bit with properties,
-// i.e. whether lookup has any of the given properties.
-func (lookup property) is(properties property) bool {
-	return (lookup & properties) != 0
-}
-
-// _Ignore is the set of properties that splitFunc skips when recording
-// lastExIgnore and lastLastExIgnore; it is an alias for _Extend.
-const _Ignore = _Extend
-
-// SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner.
-// It is the []byte instantiation of the generic splitFunc.
-//
-// See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
-var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
-
-// splitFunc finds the first grapheme cluster at the start of data. It has the
-// shape of a bufio.SplitFunc, but is generic over string and []byte.
-//
-// It returns (0, empty, nil) when data is empty, or when atEOF is false and
-// the rune or cluster may extend past the end of data; the latter requests
-// more data from the caller. Otherwise advance is the byte length of the
-// cluster and token is data[:advance]. The returned error is always nil.
-func splitFunc[T stringish.Interface](data T, atEOF bool) (advance int, token T, err error) {
-	var empty T
-	if len(data) == 0 {
-		return 0, empty, nil
-	}
-
-	// These vars are stateful across loop iterations
-	var pos int
-	var lastExIgnore property = 0     // "last excluding ignored categories"
-	var lastLastExIgnore property = 0 // "last one before that"
-	var regionalIndicatorCount int
-
-	// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
-	// to the right of the ×, from which we look back or forward
-
-	current, w := lookup(data[pos:])
-	if w == 0 {
-		if !atEOF {
-			// Rune extends past current data, request more
-			return 0, empty, nil
-		}
-		pos = len(data)
-		return pos, data[:pos], nil
-	}
-
-	// https://unicode.org/reports/tr29/#GB1
-	// Start of text always advances
-	pos += w
-
-	for {
-		eot := pos == len(data) // "end of text"
-
-		if eot {
-			if !atEOF {
-				// Token extends past current data, request more
-				return 0, empty, nil
-			}
-
-			// https://unicode.org/reports/tr29/#GB2
-			break
-		}
-
-		/*
-			We've switched the evaluation order of GB1↓ and GB2↑. It's ok:
-			because we've checked for len(data) at the top of this function,
-			sot and eot are mutually exclusive, order doesn't matter.
-		*/
-
-		// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
-		// to the right of the ×, from which we look back or forward
-
-		// Remember previous properties to avoid lookups/lookbacks
-		last := current
-		if !last.is(_Ignore) {
-			lastLastExIgnore = lastExIgnore
-			lastExIgnore = last
-		}
-
-		current, w = lookup(data[pos:])
-		if w == 0 {
-			if atEOF {
-				// Just return the bytes, we can't do anything with them
-				pos = len(data)
-				break
-			}
-			// Rune extends past current data, request more
-			return 0, empty, nil
-		}
-
-		// Optimization: no rule can possibly apply
-		if current|last == 0 { // i.e. both are zero
-			break
-		}
-
-		// https://unicode.org/reports/tr29/#GB3
-		if current.is(_LF) && last.is(_CR) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB4
-		// https://unicode.org/reports/tr29/#GB5
-		if (current | last).is(_Control | _CR | _LF) {
-			break
-		}
-
-		// https://unicode.org/reports/tr29/#GB6
-		if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB7
-		if current.is(_V|_T) && last.is(_LV|_V) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB8
-		if current.is(_T) && last.is(_LVT|_T) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB9
-		if current.is(_Extend | _ZWJ) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB9a
-		if current.is(_SpacingMark) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB9b
-		if last.is(_Prepend) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB9c
-		// TODO(clipperhouse):
-		// It appears to be added in Unicode 15.1.0:
-		// https://unicode.org/versions/Unicode15.1.0/#Migration
-		// This package currently supports Unicode 15.0.0, so
-		// out of scope for now
-
-		// https://unicode.org/reports/tr29/#GB11
-		if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
-			pos += w
-			continue
-		}
-
-		// https://unicode.org/reports/tr29/#GB12
-		// https://unicode.org/reports/tr29/#GB13
-		if (current & last).is(_RegionalIndicator) {
-			regionalIndicatorCount++
-
-			odd := regionalIndicatorCount%2 == 1
-			if odd {
-				pos += w
-				continue
-			}
-		}
-
-		// If we fall through all the above rules, it's a grapheme cluster break
-		break
-	}
-
-	// Return token
-	return pos, data[:pos], nil
-}
@@ -1,100 +0,0 @@
-package iterators
-
-import "github.com/clipperhouse/stringish"
-
-// SplitFunc is a function that splits a token off the front of its input.
-// Iterator calls it with the remaining data and true, and reads the first
-// result as the number of bytes to advance and the last as an error.
-type SplitFunc[T stringish.Interface] func(T, bool) (int, T, error)
-
-// Iterator is a generic iterator over tokens that are either []byte or string.
-// Iterate while Next() is true, and access the current token via Value().
-//
-// start and pos are byte offsets into data that delimit the current token:
-// Value returns data[start:pos].
-type Iterator[T stringish.Interface] struct {
-	split SplitFunc[T]
-	data  T
-	start int
-	pos   int
-}
-
-// New creates a new Iterator for the given data and SplitFunc.
-// The iterator's start and pos offsets begin at zero; call Next to advance
-// to the first token.
-func New[T stringish.Interface](split SplitFunc[T], data T) *Iterator[T] {
-	return &Iterator[T]{
-		split: split,
-		data:  data,
-	}
-}
-
-// SetText sets the text for the iterator to operate on, and resets the
-// start and pos offsets to zero. The SplitFunc is left unchanged.
-func (iter *Iterator[T]) SetText(data T) {
-	iter.data = data
-	iter.start = 0
-	iter.pos = 0
-}
-
-// Split sets the SplitFunc for the Iterator.
-// It does not change the data or the current position.
-func (iter *Iterator[T]) Split(split SplitFunc[T]) {
-	iter.split = split
-}
-
-// Next advances the iterator to the next token. It returns false once the
-// end of the data has been reached, and true otherwise.
-//
-// Next always calls the SplitFunc with its bool argument set to true. It
-// panics if the SplitFunc returns an error, returns a zero or negative
-// advance, or advances beyond the end of the data.
-func (iter *Iterator[T]) Next() bool {
-	if iter.pos == len(iter.data) {
-		return false
-	}
-	if iter.pos > len(iter.data) {
-		panic("SplitFunc advanced beyond the end of the data")
-	}
-
-	// The new token begins where the previous one ended.
-	iter.start = iter.pos
-
-	advance, _, err := iter.split(iter.data[iter.pos:], true)
-	if err != nil {
-		panic(err)
-	}
-	if advance <= 0 {
-		panic("SplitFunc returned a zero or negative advance")
-	}
-
-	iter.pos += advance
-	if iter.pos > len(iter.data) {
-		panic("SplitFunc advanced beyond the end of the data")
-	}
-
-	return true
-}
-
-// Value returns the current token, i.e. data[start:pos].
-// Before the first call to Next, or after Reset, this is an empty slice.
-func (iter *Iterator[T]) Value() T {
-	return iter.data[iter.start:iter.pos]
-}
-
-// Start returns the byte position of the current token in the original data.
-// It is the offset at which Value begins.
-func (iter *Iterator[T]) Start() int {
-	return iter.start
-}
-
-// End returns the byte position after the current token in the original data.
-// It is the (exclusive) offset at which Value ends.
-func (iter *Iterator[T]) End() int {
-	return iter.pos
-}
-
-// Reset resets the iterator to the beginning of the data.
-// The data and the SplitFunc are left unchanged.
-func (iter *Iterator[T]) Reset() {
-	iter.start = 0
-	iter.pos = 0
-}
-
-// First returns the first token in the data, without changing the iterator's
-// current position. If the data is empty, it returns the data as is.
-//
-// It panics if the SplitFunc returns an error, returns a zero or negative
-// advance, or advances beyond the end of the data.
-func (iter *Iterator[T]) First() T {
-	if len(iter.data) == 0 {
-		return iter.data
-	}
-	advance, _, err := iter.split(iter.data, true)
-	if err != nil {
-		panic(err)
-	}
-	if advance <= 0 {
-		panic("SplitFunc returned a zero or negative advance")
-	}
-	if advance > len(iter.data) {
-		panic("SplitFunc advanced beyond the end of the data")
-	}
-	return iter.data[:advance]
-}