mirror of
https://github.com/jesseduffield/lazygit.git
synced 2026-05-22 10:15:43 +02:00
fec7e9ce61
and adapt lazygit's client code accordingly.
175 lines
4.0 KiB
Go
175 lines
4.0 KiB
Go
package graphemes
|
|
|
|
import (
|
|
"bufio"
|
|
|
|
"github.com/clipperhouse/stringish"
|
|
)
|
|
|
|
// is determines if lookup intersects propert(ies)
|
|
func (lookup property) is(properties property) bool {
|
|
return (lookup & properties) != 0
|
|
}
|
|
|
|
// _Ignore is the set of properties that splitFunc skips when it remembers the
// previous non-ignored properties for lookback; it currently aliases _Extend.
const _Ignore = _Extend
|
|
|
|
// SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner.
|
|
//
|
|
// See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
|
|
var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
|
|
|
|
func splitFunc[T stringish.Interface](data T, atEOF bool) (advance int, token T, err error) {
|
|
var empty T
|
|
if len(data) == 0 {
|
|
return 0, empty, nil
|
|
}
|
|
|
|
// These vars are stateful across loop iterations
|
|
var pos int
|
|
var lastExIgnore property = 0 // "last excluding ignored categories"
|
|
var lastLastExIgnore property = 0 // "last one before that"
|
|
var regionalIndicatorCount int
|
|
|
|
// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
|
|
// to the right of the ×, from which we look back or forward
|
|
|
|
current, w := lookup(data[pos:])
|
|
if w == 0 {
|
|
if !atEOF {
|
|
// Rune extends past current data, request more
|
|
return 0, empty, nil
|
|
}
|
|
pos = len(data)
|
|
return pos, data[:pos], nil
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB1
|
|
// Start of text always advances
|
|
pos += w
|
|
|
|
for {
|
|
eot := pos == len(data) // "end of text"
|
|
|
|
if eot {
|
|
if !atEOF {
|
|
// Token extends past current data, request more
|
|
return 0, empty, nil
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB2
|
|
break
|
|
}
|
|
|
|
/*
|
|
We've switched the evaluation order of GB1↓ and GB2↑. It's ok:
|
|
because we've checked for len(data) at the top of this function,
|
|
sot and eot are mutually exclusive, order doesn't matter.
|
|
*/
|
|
|
|
// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
|
|
// to the right of the ×, from which we look back or forward
|
|
|
|
// Remember previous properties to avoid lookups/lookbacks
|
|
last := current
|
|
if !last.is(_Ignore) {
|
|
lastLastExIgnore = lastExIgnore
|
|
lastExIgnore = last
|
|
}
|
|
|
|
current, w = lookup(data[pos:])
|
|
if w == 0 {
|
|
if atEOF {
|
|
// Just return the bytes, we can't do anything with them
|
|
pos = len(data)
|
|
break
|
|
}
|
|
// Rune extends past current data, request more
|
|
return 0, empty, nil
|
|
}
|
|
|
|
// Optimization: no rule can possibly apply
|
|
if current|last == 0 { // i.e. both are zero
|
|
break
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB3
|
|
if current.is(_LF) && last.is(_CR) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB4
|
|
// https://unicode.org/reports/tr29/#GB5
|
|
if (current | last).is(_Control | _CR | _LF) {
|
|
break
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB6
|
|
if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB7
|
|
if current.is(_V|_T) && last.is(_LV|_V) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB8
|
|
if current.is(_T) && last.is(_LVT|_T) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB9
|
|
if current.is(_Extend | _ZWJ) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB9a
|
|
if current.is(_SpacingMark) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB9b
|
|
if last.is(_Prepend) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB9c
|
|
// TODO(clipperhouse):
|
|
// It appears to be added in Unicode 15.1.0:
|
|
// https://unicode.org/versions/Unicode15.1.0/#Migration
|
|
// This package currently supports Unicode 15.0.0, so
|
|
// out of scope for now
|
|
|
|
// https://unicode.org/reports/tr29/#GB11
|
|
if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
|
|
pos += w
|
|
continue
|
|
}
|
|
|
|
// https://unicode.org/reports/tr29/#GB12
|
|
// https://unicode.org/reports/tr29/#GB13
|
|
if (current & last).is(_RegionalIndicator) {
|
|
regionalIndicatorCount++
|
|
|
|
odd := regionalIndicatorCount%2 == 1
|
|
if odd {
|
|
pos += w
|
|
continue
|
|
}
|
|
}
|
|
|
|
// If we fall through all the above rules, it's a grapheme cluster break
|
|
break
|
|
}
|
|
|
|
// Return token
|
|
return pos, data[:pos], nil
|
|
}
|