1
0
mirror of https://github.com/jesseduffield/lazygit.git synced 2025-06-21 00:30:00 +02:00

various changes to improve integration tests

This commit is contained in:
Jesse Duffield
2022-09-09 21:11:05 -07:00
parent 7b757d1cfe
commit 7af7af27c6
103 changed files with 2883 additions and 1525 deletions

View File

@ -1,16 +1,11 @@
<a href="https://stand-with-ukraine.pp.ua">
<img src="https://upload.wikimedia.org/wikipedia/commons/d/d2/Flag_of_Ukraine.png" height="20px" width="100%"/>
</a>
<img src="logos/tcell.png" style="float: right"/>
Please see [here](UKRAINE.md) for an important message for the people of Russia.
# Tcell
_Tcell_ is a _Go_ package that provides a cell based view for text terminals, like _XTerm_.
It was inspired by _termbox_, but includes many additional improvements.
[![Stand With Ukraine](https://raw.githubusercontent.com/vshymanskyy/StandWithUkraine/main/badges/StandWithUkraine.svg)](https://stand-with-ukraine.pp.ua)
[![Linux](https://img.shields.io/github/workflow/status/gdamore/tcell/linux?logoColor=grey&logo=linux&label=)](https://github.com/gdamore/tcell/actions/workflows/linux.yml)
[![Windows](https://img.shields.io/github/workflow/status/gdamore/tcell/windows?logoColor=grey&logo=windows&label=)](https://github.com/gdamore/tcell/actions/workflows/windows.yml)
[![Apache License](https://img.shields.io/github/license/gdamore/tcell.svg?logoColor=silver&logo=opensourceinitiative&color=blue&label=)](https://github.com/gdamore/tcell/blob/master/LICENSE)
@ -18,6 +13,8 @@ It was inspired by _termbox_, but includes many additional improvements.
[![Discord](https://img.shields.io/discord/639503822733180969?label=&logo=discord)](https://discord.gg/urTTxDN)
[![Coverage](https://img.shields.io/codecov/c/github/gdamore/tcell?logoColor=grey&logo=codecov&label=)](https://codecov.io/gh/gdamore/tcell)
Please see [here](UKRAINE.md) for an important message for the people of Russia.
NOTE: This is version 2 of _Tcell_. There are breaking changes relative to version 1.
Version 1.x remains available using the import `github.com/gdamore/tcell`.

View File

@ -206,6 +206,16 @@ func NewGui(mode OutputMode, supportOverlaps bool, playMode PlayMode, headless b
return nil, err
}
if headless || runtime.GOOS == "windows" {
g.maxX, g.maxY = g.screen.Size()
} else {
// TODO: find out if we actually need this bespoke logic for linux
g.maxX, g.maxY, err = g.getTermWindowSize()
if err != nil {
return nil, err
}
}
g.outputMode = mode
g.stop = make(chan struct{})
@ -225,15 +235,6 @@ func NewGui(mode OutputMode, supportOverlaps bool, playMode PlayMode, headless b
}
}
if runtime.GOOS != "windows" {
g.maxX, g.maxY, err = g.getTermWindowSize()
if err != nil {
return nil, err
}
} else {
g.maxX, g.maxY = Screen.Size()
}
g.BgColor, g.FgColor, g.FrameColor = ColorDefault, ColorDefault, ColorDefault
g.SelBgColor, g.SelFgColor, g.SelFrameColor = ColorDefault, ColorDefault, ColorDefault

View File

@ -84,6 +84,10 @@ func (g *Gui) tcellInitSimulation() error {
} else {
g.screen = s
Screen = s
// setting to a larger value than the typical terminal size
// so that during a test we're more likely to see an item to select in a view.
s.SetSize(100, 100)
s.Sync()
return nil
}
}

View File

@ -1245,6 +1245,9 @@ func (v *View) SelectedLineIdx() int {
// expected to only be used in tests
func (v *View) SelectedLine() string {
if len(v.lines) == 0 {
return ""
}
line := v.lines[v.SelectedLineIdx()]
str := lineType(line).String()
return strings.Replace(str, "\x00", " ", -1)

View File

@ -3,13 +3,13 @@
[![Go Reference](https://pkg.go.dev/badge/github.com/rivo/uniseg.svg)](https://pkg.go.dev/github.com/rivo/uniseg)
[![Go Report](https://img.shields.io/badge/go%20report-A%2B-brightgreen.svg)](https://goreportcard.com/report/github.com/rivo/uniseg)
This Go package implements Unicode Text Segmentation according to [Unicode Standard Annex #29](https://unicode.org/reports/tr29/) and Unicode Line Breaking according to [Unicode Standard Annex #14](https://unicode.org/reports/tr14/) (Unicode version 14.0.0).
This Go package implements Unicode Text Segmentation according to [Unicode Standard Annex #29](https://unicode.org/reports/tr29/), Unicode Line Breaking according to [Unicode Standard Annex #14](https://unicode.org/reports/tr14/) (Unicode version 14.0.0), and monospace font string width calculation similar to [wcwidth](https://man7.org/linux/man-pages/man3/wcwidth.3.html).
## Background
### Grapheme Clusters
In Go, [strings are read-only slices of bytes](https://blog.golang.org/strings). They can be turned into Unicode code points using the `for` loop or by casting: `[]rune(str)`. However, multiple code points may be combined into one user-perceived character or what the Unicode specification calls "grapheme cluster". Here are some examples:
In Go, [strings are read-only slices of bytes](https://go.dev/blog/strings). They can be turned into Unicode code points using the `for` loop or by casting: `[]rune(str)`. However, multiple code points may be combined into one user-perceived character or what the Unicode specification calls "grapheme cluster". Here are some examples:
|String|Bytes (UTF-8)|Code points (runes)|Grapheme clusters|
|-|-|-|-|
@ -31,6 +31,10 @@ Sentence boundaries are often used for triple-click or some other method of sele
Line breaking, also known as word wrapping, is the process of breaking a section of text into lines such that it will fit in the available width of a page, window or other display area. This package provides tools to determine where a string may or may not be broken and where it must be broken (for example after newline characters).
### Monospace Width
Most terminals or text displays / text editors using a monospace font (for example source code editors) use a fixed width for each character. Some characters such as emojis or characters found in Asian and other languages may take up more than one character cell. This package provides tools to determine the number of cells a string will take up when displayed in a monospace font. See [here](https://pkg.go.dev/github.com/rivo/uniseg#hdr-Monospace_Width) for more information.
## Installation
```bash
@ -47,6 +51,14 @@ fmt.Println(n)
// 2
```
### Calculating the Monospace String Width
```go
width := uniseg.StringWidth("🇩🇪🏳️‍🌈!")
fmt.Println(width)
// 5
```
### Using the [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) Class
This is the most convenient method of iterating over grapheme clusters:

77
vendor/github.com/rivo/uniseg/doc.go generated vendored
View File

@ -1,8 +1,9 @@
/*
Package uniseg implements Unicode Text Segmentation and Unicode Line Breaking.
Unicode Text Segmentation conforms to Unicode Standard Annex #29
(https://unicode.org/reports/tr29/) and Unicode Line Breaking conforms to
Unicode Standard Annex #14 (https://unicode.org/reports/tr14/).
Package uniseg implements Unicode Text Segmentation, Unicode Line Breaking, and
string width calculation for monospace fonts. Unicode Text Segmentation conforms
to Unicode Standard Annex #29 (https://unicode.org/reports/tr29/) and Unicode
Line Breaking conforms to Unicode Standard Annex #14
(https://unicode.org/reports/tr14/).
In short, using this package, you can split a string into grapheme clusters
(what people would usually refer to as a "character"), into words, and into
@ -12,8 +13,23 @@ as emojis, combining characters, or characters from Asian, Arabic, Hebrew, or
other languages. Additionally, you can use it to implement line breaking (or
"word wrapping"), that is, to determine where text can be broken over to the
next line when the width of the line is not big enough to fit the entire text.
Finally, you can use it to calculate the display width of a string for monospace
fonts.
Grapheme Clusters
# Getting Started
If you just want to count the number of characters in a string, you can use
[GraphemeClusterCount]. If you want to determine the display width of a string,
you can use [StringWidth]. If you want to iterate over a string, you can use
[Step], [StepString], or the [Graphemes] class (more convenient but less
performant). This will provide you with all information: grapheme clusters,
word boundaries, sentence boundaries, line breaks, and monospace character
widths. The specialized functions [FirstGraphemeCluster],
[FirstGraphemeClusterInString], [FirstWord], [FirstWordInString],
[FirstSentence], and [FirstSentenceInString] can be used if only one type of
information is needed.
# Grapheme Clusters
Consider the rainbow flag emoji: 🏳️‍🌈. On most modern systems, it appears as one
character. But its string representation actually has 14 bytes, so counting
@ -21,11 +37,11 @@ bytes (or using len("🏳️‍🌈")) will not work as expected. Counting runes
either: The flag has 4 Unicode code points, thus 4 runes. The stdlib function
utf8.RuneCountInString("🏳️‍🌈") and len([]rune("🏳️‍🌈")) will both return 4.
The uniseg.GraphemeClusterCount(str) function will return 1 for the rainbow flag
emoji. The Graphemes class and a variety of functions in this package will allow
you to split strings into its grapheme clusters.
The [GraphemeClusterCount] function will return 1 for the rainbow flag emoji.
The Graphemes class and a variety of functions in this package will allow you to
split strings into its grapheme clusters.
Word Boundaries
# Word Boundaries
Word boundaries are used in a number of different contexts. The most familiar
ones are selection (double-click mouse selection), cursor movement ("move to
@ -33,7 +49,7 @@ next word" control-arrow keys), and the dialog option "Whole Word Search" for
search and replace. This package provides methods for determining word
boundaries.
Sentence Boundaries
# Sentence Boundaries
Sentence boundaries are often used for triple-click or some other method of
selecting or iterating through blocks of text that are larger than single words.
@ -41,7 +57,7 @@ They are also used to determine whether words occur within the same sentence in
database queries. This package provides methods for determining sentence
boundaries.
Line Breaking
# Line Breaking
Line breaking, also known as word wrapping, is the process of breaking a section
of text into lines such that it will fit in the available width of a page,
@ -49,5 +65,44 @@ window or other display area. This package provides methods to determine the
positions in a string where a line must be broken, may be broken, or must not be
broken.
# Monospace Width
Monospace width, as referred to in this package, is the width of a string in a
monospace font. This is commonly used in terminal user interfaces or text
displays or editors that don't support proportional fonts. A width of 1
corresponds to a single character cell. The C function [wcwidth()] and its
implementation in other programming languages is in widespread use for the same
purpose. However, there is no standard for the calculation of such widths, and
this package differs from wcwidth() in a number of ways, presumably to generate
more visually pleasing results.
To start, we assume that every code point has a width of 1, with the following
exceptions:
- Code points with grapheme cluster break properties Control, CR, LF, Extend,
and ZWJ have a width of 0.
- U+2E3A, Two-Em Dash, has a width of 3.
- U+2E3B, Three-Em Dash, has a width of 4.
- Characters with the East-Asian Width properties "Fullwidth" (F) and "Wide"
(W) have a width of 2. (Properties "Ambiguous" (A) and "Neutral" (N) both
have a width of 1.)
- Code points with grapheme cluster break property Regional Indicator have a
width of 2.
- Code points with grapheme cluster break property Extended Pictographic have
a width of 2, unless their Emoji Presentation flag is "No", in which case
the width is 1.
For Hangul grapheme clusters composed of conjoining Jamo and for Regional
Indicators (flags), all code points except the first one have a width of 0. For
grapheme clusters starting with an Extended Pictographic, any additional code
point will force a total width of 2, except if the Variation Selector-15
(U+FE0E) is included, in which case the total width is always 1. Grapheme
clusters ending with Variation Selector-16 (U+FE0F) have a width of 2.
Note that whether these widths appear correct depends on your application's
render engine, to which extent it conforms to the Unicode Standard, and its
choice of font.
[wcwidth()]: https://man7.org/linux/man-pages/man3/wcwidth.3.html
*/
package uniseg

View File

@ -4,7 +4,10 @@ package uniseg
// eastAsianWidth are taken from
// https://www.unicode.org/Public/14.0.0/ucd/EastAsianWidth.txt
// on July 25, 2022. See https://www.unicode.org/license.html for the Unicode
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var eastAsianWidth = [][3]int{
{0x0000, 0x001F, prN}, // Cc [32] <control-0000>..<control-001F>

285
vendor/github.com/rivo/uniseg/emojipresentation.go generated vendored Normal file
View File

@ -0,0 +1,285 @@
package uniseg
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
// emojiPresentation are taken from
//
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var emojiPresentation = [][3]int{
{0x231A, 0x231B, prEmojiPresentation}, // E0.6 [2] (⌚..⌛) watch..hourglass done
{0x23E9, 0x23EC, prEmojiPresentation}, // E0.6 [4] (⏩..⏬) fast-forward button..fast down button
{0x23F0, 0x23F0, prEmojiPresentation}, // E0.6 [1] (⏰) alarm clock
{0x23F3, 0x23F3, prEmojiPresentation}, // E0.6 [1] (⏳) hourglass not done
{0x25FD, 0x25FE, prEmojiPresentation}, // E0.6 [2] (◽..◾) white medium-small square..black medium-small square
{0x2614, 0x2615, prEmojiPresentation}, // E0.6 [2] (☔..☕) umbrella with rain drops..hot beverage
{0x2648, 0x2653, prEmojiPresentation}, // E0.6 [12] (♈..♓) Aries..Pisces
{0x267F, 0x267F, prEmojiPresentation}, // E0.6 [1] (♿) wheelchair symbol
{0x2693, 0x2693, prEmojiPresentation}, // E0.6 [1] (⚓) anchor
{0x26A1, 0x26A1, prEmojiPresentation}, // E0.6 [1] (⚡) high voltage
{0x26AA, 0x26AB, prEmojiPresentation}, // E0.6 [2] (⚪..⚫) white circle..black circle
{0x26BD, 0x26BE, prEmojiPresentation}, // E0.6 [2] (⚽..⚾) soccer ball..baseball
{0x26C4, 0x26C5, prEmojiPresentation}, // E0.6 [2] (⛄..⛅) snowman without snow..sun behind cloud
{0x26CE, 0x26CE, prEmojiPresentation}, // E0.6 [1] (⛎) Ophiuchus
{0x26D4, 0x26D4, prEmojiPresentation}, // E0.6 [1] (⛔) no entry
{0x26EA, 0x26EA, prEmojiPresentation}, // E0.6 [1] (⛪) church
{0x26F2, 0x26F3, prEmojiPresentation}, // E0.6 [2] (⛲..⛳) fountain..flag in hole
{0x26F5, 0x26F5, prEmojiPresentation}, // E0.6 [1] (⛵) sailboat
{0x26FA, 0x26FA, prEmojiPresentation}, // E0.6 [1] (⛺) tent
{0x26FD, 0x26FD, prEmojiPresentation}, // E0.6 [1] (⛽) fuel pump
{0x2705, 0x2705, prEmojiPresentation}, // E0.6 [1] (✅) check mark button
{0x270A, 0x270B, prEmojiPresentation}, // E0.6 [2] (✊..✋) raised fist..raised hand
{0x2728, 0x2728, prEmojiPresentation}, // E0.6 [1] (✨) sparkles
{0x274C, 0x274C, prEmojiPresentation}, // E0.6 [1] (❌) cross mark
{0x274E, 0x274E, prEmojiPresentation}, // E0.6 [1] (❎) cross mark button
{0x2753, 0x2755, prEmojiPresentation}, // E0.6 [3] (❓..❕) red question mark..white exclamation mark
{0x2757, 0x2757, prEmojiPresentation}, // E0.6 [1] (❗) red exclamation mark
{0x2795, 0x2797, prEmojiPresentation}, // E0.6 [3] (➕..➗) plus..divide
{0x27B0, 0x27B0, prEmojiPresentation}, // E0.6 [1] (➰) curly loop
{0x27BF, 0x27BF, prEmojiPresentation}, // E1.0 [1] (➿) double curly loop
{0x2B1B, 0x2B1C, prEmojiPresentation}, // E0.6 [2] (⬛..⬜) black large square..white large square
{0x2B50, 0x2B50, prEmojiPresentation}, // E0.6 [1] (⭐) star
{0x2B55, 0x2B55, prEmojiPresentation}, // E0.6 [1] (⭕) hollow red circle
{0x1F004, 0x1F004, prEmojiPresentation}, // E0.6 [1] (🀄) mahjong red dragon
{0x1F0CF, 0x1F0CF, prEmojiPresentation}, // E0.6 [1] (🃏) joker
{0x1F18E, 0x1F18E, prEmojiPresentation}, // E0.6 [1] (🆎) AB button (blood type)
{0x1F191, 0x1F19A, prEmojiPresentation}, // E0.6 [10] (🆑..🆚) CL button..VS button
{0x1F1E6, 0x1F1FF, prEmojiPresentation}, // E0.0 [26] (🇦..🇿) regional indicator symbol letter a..regional indicator symbol letter z
{0x1F201, 0x1F201, prEmojiPresentation}, // E0.6 [1] (🈁) Japanese “here” button
{0x1F21A, 0x1F21A, prEmojiPresentation}, // E0.6 [1] (🈚) Japanese “free of charge” button
{0x1F22F, 0x1F22F, prEmojiPresentation}, // E0.6 [1] (🈯) Japanese “reserved” button
{0x1F232, 0x1F236, prEmojiPresentation}, // E0.6 [5] (🈲..🈶) Japanese “prohibited” button..Japanese “not free of charge” button
{0x1F238, 0x1F23A, prEmojiPresentation}, // E0.6 [3] (🈸..🈺) Japanese “application” button..Japanese “open for business” button
{0x1F250, 0x1F251, prEmojiPresentation}, // E0.6 [2] (🉐..🉑) Japanese “bargain” button..Japanese “acceptable” button
{0x1F300, 0x1F30C, prEmojiPresentation}, // E0.6 [13] (🌀..🌌) cyclone..milky way
{0x1F30D, 0x1F30E, prEmojiPresentation}, // E0.7 [2] (🌍..🌎) globe showing Europe-Africa..globe showing Americas
{0x1F30F, 0x1F30F, prEmojiPresentation}, // E0.6 [1] (🌏) globe showing Asia-Australia
{0x1F310, 0x1F310, prEmojiPresentation}, // E1.0 [1] (🌐) globe with meridians
{0x1F311, 0x1F311, prEmojiPresentation}, // E0.6 [1] (🌑) new moon
{0x1F312, 0x1F312, prEmojiPresentation}, // E1.0 [1] (🌒) waxing crescent moon
{0x1F313, 0x1F315, prEmojiPresentation}, // E0.6 [3] (🌓..🌕) first quarter moon..full moon
{0x1F316, 0x1F318, prEmojiPresentation}, // E1.0 [3] (🌖..🌘) waning gibbous moon..waning crescent moon
{0x1F319, 0x1F319, prEmojiPresentation}, // E0.6 [1] (🌙) crescent moon
{0x1F31A, 0x1F31A, prEmojiPresentation}, // E1.0 [1] (🌚) new moon face
{0x1F31B, 0x1F31B, prEmojiPresentation}, // E0.6 [1] (🌛) first quarter moon face
{0x1F31C, 0x1F31C, prEmojiPresentation}, // E0.7 [1] (🌜) last quarter moon face
{0x1F31D, 0x1F31E, prEmojiPresentation}, // E1.0 [2] (🌝..🌞) full moon face..sun with face
{0x1F31F, 0x1F320, prEmojiPresentation}, // E0.6 [2] (🌟..🌠) glowing star..shooting star
{0x1F32D, 0x1F32F, prEmojiPresentation}, // E1.0 [3] (🌭..🌯) hot dog..burrito
{0x1F330, 0x1F331, prEmojiPresentation}, // E0.6 [2] (🌰..🌱) chestnut..seedling
{0x1F332, 0x1F333, prEmojiPresentation}, // E1.0 [2] (🌲..🌳) evergreen tree..deciduous tree
{0x1F334, 0x1F335, prEmojiPresentation}, // E0.6 [2] (🌴..🌵) palm tree..cactus
{0x1F337, 0x1F34A, prEmojiPresentation}, // E0.6 [20] (🌷..🍊) tulip..tangerine
{0x1F34B, 0x1F34B, prEmojiPresentation}, // E1.0 [1] (🍋) lemon
{0x1F34C, 0x1F34F, prEmojiPresentation}, // E0.6 [4] (🍌..🍏) banana..green apple
{0x1F350, 0x1F350, prEmojiPresentation}, // E1.0 [1] (🍐) pear
{0x1F351, 0x1F37B, prEmojiPresentation}, // E0.6 [43] (🍑..🍻) peach..clinking beer mugs
{0x1F37C, 0x1F37C, prEmojiPresentation}, // E1.0 [1] (🍼) baby bottle
{0x1F37E, 0x1F37F, prEmojiPresentation}, // E1.0 [2] (🍾..🍿) bottle with popping cork..popcorn
{0x1F380, 0x1F393, prEmojiPresentation}, // E0.6 [20] (🎀..🎓) ribbon..graduation cap
{0x1F3A0, 0x1F3C4, prEmojiPresentation}, // E0.6 [37] (🎠..🏄) carousel horse..person surfing
{0x1F3C5, 0x1F3C5, prEmojiPresentation}, // E1.0 [1] (🏅) sports medal
{0x1F3C6, 0x1F3C6, prEmojiPresentation}, // E0.6 [1] (🏆) trophy
{0x1F3C7, 0x1F3C7, prEmojiPresentation}, // E1.0 [1] (🏇) horse racing
{0x1F3C8, 0x1F3C8, prEmojiPresentation}, // E0.6 [1] (🏈) american football
{0x1F3C9, 0x1F3C9, prEmojiPresentation}, // E1.0 [1] (🏉) rugby football
{0x1F3CA, 0x1F3CA, prEmojiPresentation}, // E0.6 [1] (🏊) person swimming
{0x1F3CF, 0x1F3D3, prEmojiPresentation}, // E1.0 [5] (🏏..🏓) cricket game..ping pong
{0x1F3E0, 0x1F3E3, prEmojiPresentation}, // E0.6 [4] (🏠..🏣) house..Japanese post office
{0x1F3E4, 0x1F3E4, prEmojiPresentation}, // E1.0 [1] (🏤) post office
{0x1F3E5, 0x1F3F0, prEmojiPresentation}, // E0.6 [12] (🏥..🏰) hospital..castle
{0x1F3F4, 0x1F3F4, prEmojiPresentation}, // E1.0 [1] (🏴) black flag
{0x1F3F8, 0x1F407, prEmojiPresentation}, // E1.0 [16] (🏸..🐇) badminton..rabbit
{0x1F408, 0x1F408, prEmojiPresentation}, // E0.7 [1] (🐈) cat
{0x1F409, 0x1F40B, prEmojiPresentation}, // E1.0 [3] (🐉..🐋) dragon..whale
{0x1F40C, 0x1F40E, prEmojiPresentation}, // E0.6 [3] (🐌..🐎) snail..horse
{0x1F40F, 0x1F410, prEmojiPresentation}, // E1.0 [2] (🐏..🐐) ram..goat
{0x1F411, 0x1F412, prEmojiPresentation}, // E0.6 [2] (🐑..🐒) ewe..monkey
{0x1F413, 0x1F413, prEmojiPresentation}, // E1.0 [1] (🐓) rooster
{0x1F414, 0x1F414, prEmojiPresentation}, // E0.6 [1] (🐔) chicken
{0x1F415, 0x1F415, prEmojiPresentation}, // E0.7 [1] (🐕) dog
{0x1F416, 0x1F416, prEmojiPresentation}, // E1.0 [1] (🐖) pig
{0x1F417, 0x1F429, prEmojiPresentation}, // E0.6 [19] (🐗..🐩) boar..poodle
{0x1F42A, 0x1F42A, prEmojiPresentation}, // E1.0 [1] (🐪) camel
{0x1F42B, 0x1F43E, prEmojiPresentation}, // E0.6 [20] (🐫..🐾) two-hump camel..paw prints
{0x1F440, 0x1F440, prEmojiPresentation}, // E0.6 [1] (👀) eyes
{0x1F442, 0x1F464, prEmojiPresentation}, // E0.6 [35] (👂..👤) ear..bust in silhouette
{0x1F465, 0x1F465, prEmojiPresentation}, // E1.0 [1] (👥) busts in silhouette
{0x1F466, 0x1F46B, prEmojiPresentation}, // E0.6 [6] (👦..👫) boy..woman and man holding hands
{0x1F46C, 0x1F46D, prEmojiPresentation}, // E1.0 [2] (👬..👭) men holding hands..women holding hands
{0x1F46E, 0x1F4AC, prEmojiPresentation}, // E0.6 [63] (👮..💬) police officer..speech balloon
{0x1F4AD, 0x1F4AD, prEmojiPresentation}, // E1.0 [1] (💭) thought balloon
{0x1F4AE, 0x1F4B5, prEmojiPresentation}, // E0.6 [8] (💮..💵) white flower..dollar banknote
{0x1F4B6, 0x1F4B7, prEmojiPresentation}, // E1.0 [2] (💶..💷) euro banknote..pound banknote
{0x1F4B8, 0x1F4EB, prEmojiPresentation}, // E0.6 [52] (💸..📫) money with wings..closed mailbox with raised flag
{0x1F4EC, 0x1F4ED, prEmojiPresentation}, // E0.7 [2] (📬..📭) open mailbox with raised flag..open mailbox with lowered flag
{0x1F4EE, 0x1F4EE, prEmojiPresentation}, // E0.6 [1] (📮) postbox
{0x1F4EF, 0x1F4EF, prEmojiPresentation}, // E1.0 [1] (📯) postal horn
{0x1F4F0, 0x1F4F4, prEmojiPresentation}, // E0.6 [5] (📰..📴) newspaper..mobile phone off
{0x1F4F5, 0x1F4F5, prEmojiPresentation}, // E1.0 [1] (📵) no mobile phones
{0x1F4F6, 0x1F4F7, prEmojiPresentation}, // E0.6 [2] (📶..📷) antenna bars..camera
{0x1F4F8, 0x1F4F8, prEmojiPresentation}, // E1.0 [1] (📸) camera with flash
{0x1F4F9, 0x1F4FC, prEmojiPresentation}, // E0.6 [4] (📹..📼) video camera..videocassette
{0x1F4FF, 0x1F502, prEmojiPresentation}, // E1.0 [4] (📿..🔂) prayer beads..repeat single button
{0x1F503, 0x1F503, prEmojiPresentation}, // E0.6 [1] (🔃) clockwise vertical arrows
{0x1F504, 0x1F507, prEmojiPresentation}, // E1.0 [4] (🔄..🔇) counterclockwise arrows button..muted speaker
{0x1F508, 0x1F508, prEmojiPresentation}, // E0.7 [1] (🔈) speaker low volume
{0x1F509, 0x1F509, prEmojiPresentation}, // E1.0 [1] (🔉) speaker medium volume
{0x1F50A, 0x1F514, prEmojiPresentation}, // E0.6 [11] (🔊..🔔) speaker high volume..bell
{0x1F515, 0x1F515, prEmojiPresentation}, // E1.0 [1] (🔕) bell with slash
{0x1F516, 0x1F52B, prEmojiPresentation}, // E0.6 [22] (🔖..🔫) bookmark..water pistol
{0x1F52C, 0x1F52D, prEmojiPresentation}, // E1.0 [2] (🔬..🔭) microscope..telescope
{0x1F52E, 0x1F53D, prEmojiPresentation}, // E0.6 [16] (🔮..🔽) crystal ball..downwards button
{0x1F54B, 0x1F54E, prEmojiPresentation}, // E1.0 [4] (🕋..🕎) kaaba..menorah
{0x1F550, 0x1F55B, prEmojiPresentation}, // E0.6 [12] (🕐..🕛) one o’clock..twelve o’clock
{0x1F55C, 0x1F567, prEmojiPresentation}, // E0.7 [12] (🕜..🕧) one-thirty..twelve-thirty
{0x1F57A, 0x1F57A, prEmojiPresentation}, // E3.0 [1] (🕺) man dancing
{0x1F595, 0x1F596, prEmojiPresentation}, // E1.0 [2] (🖕..🖖) middle finger..vulcan salute
{0x1F5A4, 0x1F5A4, prEmojiPresentation}, // E3.0 [1] (🖤) black heart
{0x1F5FB, 0x1F5FF, prEmojiPresentation}, // E0.6 [5] (🗻..🗿) mount fuji..moai
{0x1F600, 0x1F600, prEmojiPresentation}, // E1.0 [1] (😀) grinning face
{0x1F601, 0x1F606, prEmojiPresentation}, // E0.6 [6] (😁..😆) beaming face with smiling eyes..grinning squinting face
{0x1F607, 0x1F608, prEmojiPresentation}, // E1.0 [2] (😇..😈) smiling face with halo..smiling face with horns
{0x1F609, 0x1F60D, prEmojiPresentation}, // E0.6 [5] (😉..😍) winking face..smiling face with heart-eyes
{0x1F60E, 0x1F60E, prEmojiPresentation}, // E1.0 [1] (😎) smiling face with sunglasses
{0x1F60F, 0x1F60F, prEmojiPresentation}, // E0.6 [1] (😏) smirking face
{0x1F610, 0x1F610, prEmojiPresentation}, // E0.7 [1] (😐) neutral face
{0x1F611, 0x1F611, prEmojiPresentation}, // E1.0 [1] (😑) expressionless face
{0x1F612, 0x1F614, prEmojiPresentation}, // E0.6 [3] (😒..😔) unamused face..pensive face
{0x1F615, 0x1F615, prEmojiPresentation}, // E1.0 [1] (😕) confused face
{0x1F616, 0x1F616, prEmojiPresentation}, // E0.6 [1] (😖) confounded face
{0x1F617, 0x1F617, prEmojiPresentation}, // E1.0 [1] (😗) kissing face
{0x1F618, 0x1F618, prEmojiPresentation}, // E0.6 [1] (😘) face blowing a kiss
{0x1F619, 0x1F619, prEmojiPresentation}, // E1.0 [1] (😙) kissing face with smiling eyes
{0x1F61A, 0x1F61A, prEmojiPresentation}, // E0.6 [1] (😚) kissing face with closed eyes
{0x1F61B, 0x1F61B, prEmojiPresentation}, // E1.0 [1] (😛) face with tongue
{0x1F61C, 0x1F61E, prEmojiPresentation}, // E0.6 [3] (😜..😞) winking face with tongue..disappointed face
{0x1F61F, 0x1F61F, prEmojiPresentation}, // E1.0 [1] (😟) worried face
{0x1F620, 0x1F625, prEmojiPresentation}, // E0.6 [6] (😠..😥) angry face..sad but relieved face
{0x1F626, 0x1F627, prEmojiPresentation}, // E1.0 [2] (😦..😧) frowning face with open mouth..anguished face
{0x1F628, 0x1F62B, prEmojiPresentation}, // E0.6 [4] (😨..😫) fearful face..tired face
{0x1F62C, 0x1F62C, prEmojiPresentation}, // E1.0 [1] (😬) grimacing face
{0x1F62D, 0x1F62D, prEmojiPresentation}, // E0.6 [1] (😭) loudly crying face
{0x1F62E, 0x1F62F, prEmojiPresentation}, // E1.0 [2] (😮..😯) face with open mouth..hushed face
{0x1F630, 0x1F633, prEmojiPresentation}, // E0.6 [4] (😰..😳) anxious face with sweat..flushed face
{0x1F634, 0x1F634, prEmojiPresentation}, // E1.0 [1] (😴) sleeping face
{0x1F635, 0x1F635, prEmojiPresentation}, // E0.6 [1] (😵) face with crossed-out eyes
{0x1F636, 0x1F636, prEmojiPresentation}, // E1.0 [1] (😶) face without mouth
{0x1F637, 0x1F640, prEmojiPresentation}, // E0.6 [10] (😷..🙀) face with medical mask..weary cat
{0x1F641, 0x1F644, prEmojiPresentation}, // E1.0 [4] (🙁..🙄) slightly frowning face..face with rolling eyes
{0x1F645, 0x1F64F, prEmojiPresentation}, // E0.6 [11] (🙅..🙏) person gesturing NO..folded hands
{0x1F680, 0x1F680, prEmojiPresentation}, // E0.6 [1] (🚀) rocket
{0x1F681, 0x1F682, prEmojiPresentation}, // E1.0 [2] (🚁..🚂) helicopter..locomotive
{0x1F683, 0x1F685, prEmojiPresentation}, // E0.6 [3] (🚃..🚅) railway car..bullet train
{0x1F686, 0x1F686, prEmojiPresentation}, // E1.0 [1] (🚆) train
{0x1F687, 0x1F687, prEmojiPresentation}, // E0.6 [1] (🚇) metro
{0x1F688, 0x1F688, prEmojiPresentation}, // E1.0 [1] (🚈) light rail
{0x1F689, 0x1F689, prEmojiPresentation}, // E0.6 [1] (🚉) station
{0x1F68A, 0x1F68B, prEmojiPresentation}, // E1.0 [2] (🚊..🚋) tram..tram car
{0x1F68C, 0x1F68C, prEmojiPresentation}, // E0.6 [1] (🚌) bus
{0x1F68D, 0x1F68D, prEmojiPresentation}, // E0.7 [1] (🚍) oncoming bus
{0x1F68E, 0x1F68E, prEmojiPresentation}, // E1.0 [1] (🚎) trolleybus
{0x1F68F, 0x1F68F, prEmojiPresentation}, // E0.6 [1] (🚏) bus stop
{0x1F690, 0x1F690, prEmojiPresentation}, // E1.0 [1] (🚐) minibus
{0x1F691, 0x1F693, prEmojiPresentation}, // E0.6 [3] (🚑..🚓) ambulance..police car
{0x1F694, 0x1F694, prEmojiPresentation}, // E0.7 [1] (🚔) oncoming police car
{0x1F695, 0x1F695, prEmojiPresentation}, // E0.6 [1] (🚕) taxi
{0x1F696, 0x1F696, prEmojiPresentation}, // E1.0 [1] (🚖) oncoming taxi
{0x1F697, 0x1F697, prEmojiPresentation}, // E0.6 [1] (🚗) automobile
{0x1F698, 0x1F698, prEmojiPresentation}, // E0.7 [1] (🚘) oncoming automobile
{0x1F699, 0x1F69A, prEmojiPresentation}, // E0.6 [2] (🚙..🚚) sport utility vehicle..delivery truck
{0x1F69B, 0x1F6A1, prEmojiPresentation}, // E1.0 [7] (🚛..🚡) articulated lorry..aerial tramway
{0x1F6A2, 0x1F6A2, prEmojiPresentation}, // E0.6 [1] (🚢) ship
{0x1F6A3, 0x1F6A3, prEmojiPresentation}, // E1.0 [1] (🚣) person rowing boat
{0x1F6A4, 0x1F6A5, prEmojiPresentation}, // E0.6 [2] (🚤..🚥) speedboat..horizontal traffic light
{0x1F6A6, 0x1F6A6, prEmojiPresentation}, // E1.0 [1] (🚦) vertical traffic light
{0x1F6A7, 0x1F6AD, prEmojiPresentation}, // E0.6 [7] (🚧..🚭) construction..no smoking
{0x1F6AE, 0x1F6B1, prEmojiPresentation}, // E1.0 [4] (🚮..🚱) litter in bin sign..non-potable water
{0x1F6B2, 0x1F6B2, prEmojiPresentation}, // E0.6 [1] (🚲) bicycle
{0x1F6B3, 0x1F6B5, prEmojiPresentation}, // E1.0 [3] (🚳..🚵) no bicycles..person mountain biking
{0x1F6B6, 0x1F6B6, prEmojiPresentation}, // E0.6 [1] (🚶) person walking
{0x1F6B7, 0x1F6B8, prEmojiPresentation}, // E1.0 [2] (🚷..🚸) no pedestrians..children crossing
{0x1F6B9, 0x1F6BE, prEmojiPresentation}, // E0.6 [6] (🚹..🚾) men’s room..water closet
{0x1F6BF, 0x1F6BF, prEmojiPresentation}, // E1.0 [1] (🚿) shower
{0x1F6C0, 0x1F6C0, prEmojiPresentation}, // E0.6 [1] (🛀) person taking bath
{0x1F6C1, 0x1F6C5, prEmojiPresentation}, // E1.0 [5] (🛁..🛅) bathtub..left luggage
{0x1F6CC, 0x1F6CC, prEmojiPresentation}, // E1.0 [1] (🛌) person in bed
{0x1F6D0, 0x1F6D0, prEmojiPresentation}, // E1.0 [1] (🛐) place of worship
{0x1F6D1, 0x1F6D2, prEmojiPresentation}, // E3.0 [2] (🛑..🛒) stop sign..shopping cart
{0x1F6D5, 0x1F6D5, prEmojiPresentation}, // E12.0 [1] (🛕) hindu temple
{0x1F6D6, 0x1F6D7, prEmojiPresentation}, // E13.0 [2] (🛖..🛗) hut..elevator
{0x1F6DD, 0x1F6DF, prEmojiPresentation}, // E14.0 [3] (🛝..🛟) playground slide..ring buoy
{0x1F6EB, 0x1F6EC, prEmojiPresentation}, // E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
{0x1F6F4, 0x1F6F6, prEmojiPresentation}, // E3.0 [3] (🛴..🛶) kick scooter..canoe
{0x1F6F7, 0x1F6F8, prEmojiPresentation}, // E5.0 [2] (🛷..🛸) sled..flying saucer
{0x1F6F9, 0x1F6F9, prEmojiPresentation}, // E11.0 [1] (🛹) skateboard
{0x1F6FA, 0x1F6FA, prEmojiPresentation}, // E12.0 [1] (🛺) auto rickshaw
{0x1F6FB, 0x1F6FC, prEmojiPresentation}, // E13.0 [2] (🛻..🛼) pickup truck..roller skate
{0x1F7E0, 0x1F7EB, prEmojiPresentation}, // E12.0 [12] (🟠..🟫) orange circle..brown square
{0x1F7F0, 0x1F7F0, prEmojiPresentation}, // E14.0 [1] (🟰) heavy equals sign
{0x1F90C, 0x1F90C, prEmojiPresentation}, // E13.0 [1] (🤌) pinched fingers
{0x1F90D, 0x1F90F, prEmojiPresentation}, // E12.0 [3] (🤍..🤏) white heart..pinching hand
{0x1F910, 0x1F918, prEmojiPresentation}, // E1.0 [9] (🤐..🤘) zipper-mouth face..sign of the horns
{0x1F919, 0x1F91E, prEmojiPresentation}, // E3.0 [6] (🤙..🤞) call me hand..crossed fingers
{0x1F91F, 0x1F91F, prEmojiPresentation}, // E5.0 [1] (🤟) love-you gesture
{0x1F920, 0x1F927, prEmojiPresentation}, // E3.0 [8] (🤠..🤧) cowboy hat face..sneezing face
{0x1F928, 0x1F92F, prEmojiPresentation}, // E5.0 [8] (🤨..🤯) face with raised eyebrow..exploding head
{0x1F930, 0x1F930, prEmojiPresentation}, // E3.0 [1] (🤰) pregnant woman
{0x1F931, 0x1F932, prEmojiPresentation}, // E5.0 [2] (🤱..🤲) breast-feeding..palms up together
{0x1F933, 0x1F93A, prEmojiPresentation}, // E3.0 [8] (🤳..🤺) selfie..person fencing
{0x1F93C, 0x1F93E, prEmojiPresentation}, // E3.0 [3] (🤼..🤾) people wrestling..person playing handball
{0x1F93F, 0x1F93F, prEmojiPresentation}, // E12.0 [1] (🤿) diving mask
{0x1F940, 0x1F945, prEmojiPresentation}, // E3.0 [6] (🥀..🥅) wilted flower..goal net
{0x1F947, 0x1F94B, prEmojiPresentation}, // E3.0 [5] (🥇..🥋) 1st place medal..martial arts uniform
{0x1F94C, 0x1F94C, prEmojiPresentation}, // E5.0 [1] (🥌) curling stone
{0x1F94D, 0x1F94F, prEmojiPresentation}, // E11.0 [3] (🥍..🥏) lacrosse..flying disc
{0x1F950, 0x1F95E, prEmojiPresentation}, // E3.0 [15] (🥐..🥞) croissant..pancakes
{0x1F95F, 0x1F96B, prEmojiPresentation}, // E5.0 [13] (🥟..🥫) dumpling..canned food
{0x1F96C, 0x1F970, prEmojiPresentation}, // E11.0 [5] (🥬..🥰) leafy green..smiling face with hearts
{0x1F971, 0x1F971, prEmojiPresentation}, // E12.0 [1] (🥱) yawning face
{0x1F972, 0x1F972, prEmojiPresentation}, // E13.0 [1] (🥲) smiling face with tear
{0x1F973, 0x1F976, prEmojiPresentation}, // E11.0 [4] (🥳..🥶) partying face..cold face
{0x1F977, 0x1F978, prEmojiPresentation}, // E13.0 [2] (🥷..🥸) ninja..disguised face
{0x1F979, 0x1F979, prEmojiPresentation}, // E14.0 [1] (🥹) face holding back tears
{0x1F97A, 0x1F97A, prEmojiPresentation}, // E11.0 [1] (🥺) pleading face
{0x1F97B, 0x1F97B, prEmojiPresentation}, // E12.0 [1] (🥻) sari
{0x1F97C, 0x1F97F, prEmojiPresentation}, // E11.0 [4] (🥼..🥿) lab coat..flat shoe
{0x1F980, 0x1F984, prEmojiPresentation}, // E1.0 [5] (🦀..🦄) crab..unicorn
{0x1F985, 0x1F991, prEmojiPresentation}, // E3.0 [13] (🦅..🦑) eagle..squid
{0x1F992, 0x1F997, prEmojiPresentation}, // E5.0 [6] (🦒..🦗) giraffe..cricket
{0x1F998, 0x1F9A2, prEmojiPresentation}, // E11.0 [11] (🦘..🦢) kangaroo..swan
{0x1F9A3, 0x1F9A4, prEmojiPresentation}, // E13.0 [2] (🦣..🦤) mammoth..dodo
{0x1F9A5, 0x1F9AA, prEmojiPresentation}, // E12.0 [6] (🦥..🦪) sloth..oyster
{0x1F9AB, 0x1F9AD, prEmojiPresentation}, // E13.0 [3] (🦫..🦭) beaver..seal
{0x1F9AE, 0x1F9AF, prEmojiPresentation}, // E12.0 [2] (🦮..🦯) guide dog..white cane
{0x1F9B0, 0x1F9B9, prEmojiPresentation}, // E11.0 [10] (🦰..🦹) red hair..supervillain
{0x1F9BA, 0x1F9BF, prEmojiPresentation}, // E12.0 [6] (🦺..🦿) safety vest..mechanical leg
{0x1F9C0, 0x1F9C0, prEmojiPresentation}, // E1.0 [1] (🧀) cheese wedge
{0x1F9C1, 0x1F9C2, prEmojiPresentation}, // E11.0 [2] (🧁..🧂) cupcake..salt
{0x1F9C3, 0x1F9CA, prEmojiPresentation}, // E12.0 [8] (🧃..🧊) beverage box..ice
{0x1F9CB, 0x1F9CB, prEmojiPresentation}, // E13.0 [1] (🧋) bubble tea
{0x1F9CC, 0x1F9CC, prEmojiPresentation}, // E14.0 [1] (🧌) troll
{0x1F9CD, 0x1F9CF, prEmojiPresentation}, // E12.0 [3] (🧍..🧏) person standing..deaf person
{0x1F9D0, 0x1F9E6, prEmojiPresentation}, // E5.0 [23] (🧐..🧦) face with monocle..socks
{0x1F9E7, 0x1F9FF, prEmojiPresentation}, // E11.0 [25] (🧧..🧿) red envelope..nazar amulet
{0x1FA70, 0x1FA73, prEmojiPresentation}, // E12.0 [4] (🩰..🩳) ballet shoes..shorts
{0x1FA74, 0x1FA74, prEmojiPresentation}, // E13.0 [1] (🩴) thong sandal
{0x1FA78, 0x1FA7A, prEmojiPresentation}, // E12.0 [3] (🩸..🩺) drop of blood..stethoscope
{0x1FA7B, 0x1FA7C, prEmojiPresentation}, // E14.0 [2] (🩻..🩼) x-ray..crutch
{0x1FA80, 0x1FA82, prEmojiPresentation}, // E12.0 [3] (🪀..🪂) yo-yo..parachute
{0x1FA83, 0x1FA86, prEmojiPresentation}, // E13.0 [4] (🪃..🪆) boomerang..nesting dolls
{0x1FA90, 0x1FA95, prEmojiPresentation}, // E12.0 [6] (🪐..🪕) ringed planet..banjo
{0x1FA96, 0x1FAA8, prEmojiPresentation}, // E13.0 [19] (🪖..🪨) military helmet..rock
{0x1FAA9, 0x1FAAC, prEmojiPresentation}, // E14.0 [4] (🪩..🪬) mirror ball..hamsa
{0x1FAB0, 0x1FAB6, prEmojiPresentation}, // E13.0 [7] (🪰..🪶) fly..feather
{0x1FAB7, 0x1FABA, prEmojiPresentation}, // E14.0 [4] (🪷..🪺) lotus..nest with eggs
{0x1FAC0, 0x1FAC2, prEmojiPresentation}, // E13.0 [3] (🫀..🫂) anatomical heart..people hugging
{0x1FAC3, 0x1FAC5, prEmojiPresentation}, // E14.0 [3] (🫃..🫅) pregnant man..person with crown
{0x1FAD0, 0x1FAD6, prEmojiPresentation}, // E13.0 [7] (🫐..🫖) blueberries..teapot
{0x1FAD7, 0x1FAD9, prEmojiPresentation}, // E14.0 [3] (🫗..🫙) pouring liquid..jar
{0x1FAE0, 0x1FAE7, prEmojiPresentation}, // E14.0 [8] (🫠..🫧) melting face..bubbles
{0x1FAF0, 0x1FAF6, prEmojiPresentation}, // E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
}

View File

@ -3,19 +3,22 @@
// This program generates a property file in Go file from Unicode Character
// Database auxiliary data files. The command line arguments are as follows:
//
// 1. The name of the Unicode data file (just the filename, without extension).
// 2. The name of the locally generated Go file.
// 3. The name of the slice mapping code points to properties.
// 4. The name of the generator, for logging purposes.
// 5. (Optional) Flags, comma-separated. The following flags are available:
// - "emojis": include emoji properties (Extended Pictographic only).
// - "gencat": include general category properties.
// 1. The name of the Unicode data file (just the filename, without extension).
// Can be "-" (to skip) if the emoji flag is included.
// 2. The name of the locally generated Go file.
// 3. The name of the slice mapping code points to properties.
// 4. The name of the generator, for logging purposes.
// 5. (Optional) Flags, comma-separated. The following flags are available:
// - "emojis=<property>": include the specified emoji properties (e.g.
// "Extended_Pictographic").
// - "gencat": include general category properties.
//
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
package main
import (
@ -38,8 +41,8 @@ import (
// We want to test against a specific version rather than the latest. When the
// package is upgraded to a new version, change these to generate new tests.
const (
gbpURL = `https://www.unicode.org/Public/14.0.0/ucd/%s.txt`
emojiURL = `https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt`
propertyURL = `https://www.unicode.org/Public/14.0.0/ucd/%s.txt`
emojiURL = `https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt`
)
// The regular expression for a line containing a code point range property.
@ -55,20 +58,25 @@ func main() {
log.SetFlags(0)
// Parse flags.
flags := make(map[string]struct{})
flags := make(map[string]string)
if len(os.Args) >= 6 {
for _, flag := range strings.Split(os.Args[5], ",") {
flags[flag] = struct{}{}
flagFields := strings.Split(flag, "=")
if len(flagFields) == 1 {
flags[flagFields[0]] = "yes"
} else {
flags[flagFields[0]] = flagFields[1]
}
}
}
// Parse the text file and generate Go source code from it.
var emojis string
if _, ok := flags["emojis"]; ok {
emojis = emojiURL
}
_, includeGeneralCategory := flags["gencat"]
src, err := parse(fmt.Sprintf(gbpURL, os.Args[1]), emojis, includeGeneralCategory)
var mainURL string
if os.Args[1] != "-" {
mainURL = fmt.Sprintf(propertyURL, os.Args[1])
}
src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
if err != nil {
log.Fatal(err)
}
@ -88,49 +96,57 @@ func main() {
// parse parses the Unicode Properties text files located at the given URLs and
// returns their equivalent Go source code to be used in the uniseg package. If
// "emojiURL" is an empty string, no emoji code points will be included. If
// "emojiProperty" is not an empty string, emoji code points for that emoji
// property (e.g. "Extended_Pictographic") will be included. In those cases, you
// may pass an empty "propertyURL" to skip parsing the main properties file. If
// "includeGeneralCategory" is true, the Unicode General Category property will
// be extracted from the comments and included in the output.
func parse(gbpURL, emojiURL string, includeGeneralCategory bool) (string, error) {
func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
if propertyURL == "" && emojiProperty == "" {
return "", errors.New("no properties to parse")
}
// Temporary buffer to hold properties.
var properties [][4]string
// Open the first URL.
log.Printf("Parsing %s", gbpURL)
res, err := http.Get(gbpURL)
if err != nil {
return "", err
}
in1 := res.Body
defer in1.Close()
// Parse it.
scanner := bufio.NewScanner(in1)
num := 0
for scanner.Scan() {
num++
line := strings.TrimSpace(scanner.Text())
// Skip comments and empty lines.
if strings.HasPrefix(line, "#") || line == "" {
continue
}
// Everything else must be a code point range, a property and a comment.
from, to, property, comment, err := parseProperty(line)
if propertyURL != "" {
log.Printf("Parsing %s", propertyURL)
res, err := http.Get(propertyURL)
if err != nil {
return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
return "", err
}
in1 := res.Body
defer in1.Close()
// Parse it.
scanner := bufio.NewScanner(in1)
num := 0
for scanner.Scan() {
num++
line := strings.TrimSpace(scanner.Text())
// Skip comments and empty lines.
if strings.HasPrefix(line, "#") || line == "" {
continue
}
// Everything else must be a code point range, a property and a comment.
from, to, property, comment, err := parseProperty(line)
if err != nil {
return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
}
properties = append(properties, [4]string{from, to, property, comment})
}
if err := scanner.Err(); err != nil {
return "", err
}
properties = append(properties, [4]string{from, to, property, comment})
}
if err := scanner.Err(); err != nil {
return "", err
}
// Open the second URL.
if emojiURL != "" {
if emojiProperty != "" {
log.Printf("Parsing %s", emojiURL)
res, err = http.Get(emojiURL)
res, err := http.Get(emojiURL)
if err != nil {
return "", err
}
@ -138,15 +154,15 @@ func parse(gbpURL, emojiURL string, includeGeneralCategory bool) (string, error)
defer in2.Close()
// Parse it.
scanner = bufio.NewScanner(in2)
num = 0
scanner := bufio.NewScanner(in2)
num := 0
for scanner.Scan() {
num++
line := scanner.Text()
// Skip comments, empty lines, and everything not containing
// "Extended_Pictographic".
if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, "Extended_Pictographic") {
if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
continue
}
@ -189,7 +205,7 @@ func parse(gbpURL, emojiURL string, includeGeneralCategory bool) (string, error)
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
// ` + os.Args[3] + ` are taken from
// ` + gbpURL + emojiComment + `
// ` + propertyURL + emojiComment + `
// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{

View File

@ -4,12 +4,14 @@ import "unicode/utf8"
// Graphemes implements an iterator over Unicode grapheme clusters, or
// user-perceived characters. While iterating, it also provides information
// about word boundaries, sentence boundaries, and line breaks.
// about word boundaries, sentence boundaries, line breaks, and monospace
// character widths.
//
// After constructing the class via [NewGraphemes] for a given string "str",
// [Next] is called for every grapheme cluster in a loop until it returns false.
// Inside the loop, information about the grapheme cluster as well as boundary
// information is available via the various methods (see examples below).
// [Graphemes.Next] is called for every grapheme cluster in a loop until it
// returns false. Inside the loop, information about the grapheme cluster as
// well as boundary information and character width is available via the various
// methods (see examples below).
//
// Using this class to iterate over a string is convenient but it is much slower
// than using this package's [Step] or [StepString] functions or any of the
@ -28,18 +30,18 @@ type Graphemes struct {
// string.
offset int
// The current boundary information of the Step() parser.
// The current boundary information of the [Step] parser.
boundaries int
// The current state of the Step() parser.
// The current state of the [Step] parser.
state int
}
// NewGraphemes returns a new grapheme cluster iterator.
func NewGraphemes(s string) *Graphemes {
func NewGraphemes(str string) *Graphemes {
return &Graphemes{
original: s,
remaining: s,
original: str,
remaining: str,
state: -1,
}
}
@ -60,8 +62,8 @@ func (g *Graphemes) Next() bool {
}
// Runes returns a slice of runes (code points) which corresponds to the current
// grapheme cluster. If the iterator is already past the end or [Next] has not
// yet been called, nil is returned.
// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
// has not yet been called, nil is returned.
func (g *Graphemes) Runes() []rune {
if g.state < 0 {
return nil
@ -70,15 +72,15 @@ func (g *Graphemes) Runes() []rune {
}
// Str returns a substring of the original string which corresponds to the
// current grapheme cluster. If the iterator is already past the end or [Next]
// has not yet been called, an empty string is returned.
// current grapheme cluster. If the iterator is already past the end or
// [Graphemes.Next] has not yet been called, an empty string is returned.
func (g *Graphemes) Str() string {
return g.cluster
}
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
// If the iterator is already past the end or [Next] has not yet been called,
// nil is returned.
// If the iterator is already past the end or [Graphemes.Next] has not yet been
// called, nil is returned.
func (g *Graphemes) Bytes() []byte {
if g.state < 0 {
return nil
@ -90,8 +92,8 @@ func (g *Graphemes) Bytes() []byte {
// positions into the original string. The first returned value "from" indexes
// the first byte and the second returned value "to" indexes the first byte that
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
// the original string "str". If [Next] has not yet been called, both values are
// 0. If the iterator is already past the end, both values are 1.
// the original string "str". If [Graphemes.Next] has not yet been called, both
// values are 0. If the iterator is already past the end, both values are 1.
func (g *Graphemes) Positions() (int, int) {
if g.state == -1 {
return 0, 0
@ -133,8 +135,16 @@ func (g *Graphemes) LineBreak() int {
return g.boundaries & MaskLine
}
// Width returns the monospace width of the current grapheme cluster.
func (g *Graphemes) Width() int {
if g.state < 0 {
return 0
}
return g.boundaries >> ShiftWidth
}
// Reset puts the iterator into its initial state such that the next call to
// [Next] sets it to the first grapheme cluster again.
// [Graphemes.Next] sets it to the first grapheme cluster again.
func (g *Graphemes) Reset() {
g.state = -1
g.offset = 0
@ -153,6 +163,10 @@ func GraphemeClusterCount(s string) (n int) {
return
}
// The number of bits the grapheme property must be shifted to make place for
// grapheme states.
const shiftGraphemePropState = 4
// FirstGraphemeCluster returns the first grapheme cluster found in the given
// byte slice according to the rules of Unicode Standard Annex #29, Grapheme
// Cluster Boundaries. This function can be called continuously to extract all
@ -168,15 +182,15 @@ func GraphemeClusterCount(s string) (n int) {
// "cluster" byte slice is the sub-slice of the input slice containing the
// identified grapheme cluster.
//
// The returned width is the width of the grapheme cluster for most monospace
// fonts where a value of 1 represents one character cell.
//
// Given an empty byte slice "b", the function returns nil values.
//
// While slightly less convenient than using the Graphemes class, this function
// has much better performance and makes no allocations. It lends itself well to
// large byte slices.
//
// The "reserved" return value is a placeholder for future functionality and may
// be ignored for the time being.
func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, reserved, newState int) {
func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
return
@ -185,34 +199,60 @@ func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, reserved,
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
return b, nil, 0, grAny
var prop int
if state < 0 {
prop = property(graphemeCodePoints, r)
} else {
prop = state >> shiftGraphemePropState
}
return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var firstProp int
if state < 0 {
state, _ = transitionGraphemeState(state, r)
state, firstProp, _ = transitionGraphemeState(state, r)
} else {
firstProp = state >> shiftGraphemePropState
}
width += runeWidth(r, firstProp)
// Transition until we find a boundary.
var boundary bool
for {
var (
prop int
boundary bool
)
r, l := utf8.DecodeRune(b[length:])
state, boundary = transitionGraphemeState(state, r)
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
if boundary {
return b[:length], b[length:], 0, state
return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
}
if r == vs16 {
width = 2
} else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
} else if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else {
width = 2
}
}
length += l
if len(b) <= length {
return b, nil, 0, grAny
return b, nil, width, grAny | (prop << shiftGraphemePropState)
}
}
}
// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
// outputs are strings.
func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, reserved, newState int) {
func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
// An empty string returns nothing.
if len(str) == 0 {
return
@ -221,27 +261,53 @@ func FirstGraphemeClusterInString(str string, state int) (cluster, rest string,
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
return str, "", 0, grAny
var prop int
if state < 0 {
prop = property(graphemeCodePoints, r)
} else {
prop = state >> shiftGraphemePropState
}
return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var firstProp int
if state < 0 {
state, _ = transitionGraphemeState(state, r)
state, firstProp, _ = transitionGraphemeState(state, r)
} else {
firstProp = state >> shiftGraphemePropState
}
width += runeWidth(r, firstProp)
// Transition until we find a boundary.
var boundary bool
for {
var (
prop int
boundary bool
)
r, l := utf8.DecodeRuneInString(str[length:])
state, boundary = transitionGraphemeState(state, r)
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
if boundary {
return str[:length], str[length:], 0, state
return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
}
if r == vs16 {
width = 2
} else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
} else if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else {
width = 2
}
}
length += l
if len(str) <= length {
return str, "", 0, grAny
return str, "", width, grAny | (prop << shiftGraphemePropState)
}
}
}

View File

@ -7,7 +7,7 @@ package uniseg
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on July 25, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var graphemeCodePoints = [][3]int{
{0x0000, 0x0009, prControl}, // Cc [10] <control-0000>..<control-0009>

View File

@ -27,14 +27,14 @@ const (
//
// This map is queried as follows:
//
// 1. Find specific state + specific property. Stop if found.
// 2. Find specific state + any property.
// 3. Find any state + specific property.
// 4. If only (2) or (3) (but not both) was found, stop.
// 5. If both (2) and (3) were found, use state from (3) and breaking instruction
// from the transition with the lower rule number, prefer (3) if rule numbers
// are equal. Stop.
// 6. Assume grAny and grBoundary.
// 1. Find specific state + specific property. Stop if found.
// 2. Find specific state + any property.
// 3. Find any state + specific property.
// 4. If only (2) or (3) (but not both) was found, stop.
// 5. If both (2) and (3) were found, use state from (3) and breaking instruction
// from the transition with the lower rule number, prefer (3) if rule numbers
// are equal. Stop.
// 6. Assume grAny and grBoundary.
//
// Unicode version 14.0.0.
var grTransitions = map[[2]int][3]int{
@ -92,22 +92,23 @@ var grTransitions = map[[2]int][3]int{
}
// transitionGraphemeState determines the new state of the grapheme cluster
// parser given the current state and the next code point. It also returns
// whether a cluster boundary was detected.
func transitionGraphemeState(state int, r rune) (newState int, boundary bool) {
// parser given the current state and the next code point. It also returns the
// code point's grapheme property (the value mapped by the [graphemeCodePoints]
// table) and whether a cluster boundary was detected.
func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
// Determine the property of the next character.
nextProperty := property(graphemeCodePoints, r)
prop = property(graphemeCodePoints, r)
// Find the applicable transition.
transition, ok := grTransitions[[2]int{state, nextProperty}]
transition, ok := grTransitions[[2]int{state, prop}]
if ok {
// We have a specific transition. We'll use it.
return transition[0], transition[1] == grBoundary
return transition[0], prop, transition[1] == grBoundary
}
// No specific transition found. Try the less specific ones.
transAnyProp, okAnyProp := grTransitions[[2]int{state, prAny}]
transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
transAnyState, okAnyState := grTransitions[[2]int{grAny, prop}]
if okAnyProp && okAnyState {
// Both apply. We'll use a mix (see comments for grTransitions).
newState = transAnyState[0]
@ -120,7 +121,7 @@ func transitionGraphemeState(state int, r rune) (newState int, boundary bool) {
if okAnyProp {
// We only have a specific state.
return transAnyProp[0], transAnyProp[1] == grBoundary
return transAnyProp[0], prop, transAnyProp[1] == grBoundary
// This branch will probably never be reached because okAnyState will
// always be true given the current transition map. But we keep it here
// for future modifications to the transition map where this may not be
@ -129,9 +130,9 @@ func transitionGraphemeState(state int, r rune) (newState int, boundary bool) {
if okAnyState {
// We only have a specific property.
return transAnyState[0], transAnyState[1] == grBoundary
return transAnyState[0], prop, transAnyState[1] == grBoundary
}
// No known transition. GB999: Any ÷ Any.
return grAny, true
return grAny, prop, true
}

View File

@ -13,7 +13,7 @@ import "unicode/utf8"
//
// The returned "segment" may not be broken into smaller parts, unless no other
// breaking opportunities present themselves, in which case you may break by
// grapheme clusters (using the FirstGraphemeCluster() function to determine the
// grapheme clusters (using the [FirstGraphemeCluster] function to determine the
// grapheme clusters).
//
// The "mustBreak" flag indicates whether you MUST break the line after the
@ -42,7 +42,7 @@ import "unicode/utf8"
//
// Note also that this algorithm may break within grapheme clusters. This is
// addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use
// the Step() function instead.
// the [Step] function instead.
func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
@ -114,7 +114,9 @@ func FirstLineSegmentInString(str string, state int) (segment, rest string, must
}
// HasTrailingLineBreak returns true if the last rune in the given byte slice is
// one of the hard line break code points as defined in LB4 and LB5 of UAX #14.
// one of the hard line break code points defined in LB4 and LB5 of [UAX #14].
//
// [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm
func HasTrailingLineBreak(b []byte) bool {
r, _ := utf8.DecodeLastRune(b)
property, _ := propertyWithGenCat(lineBreakCodePoints, r)

View File

@ -4,7 +4,10 @@ package uniseg
// lineBreakCodePoints are taken from
// https://www.unicode.org/Public/14.0.0/ucd/LineBreak.txt
// on July 25, 2022. See https://www.unicode.org/license.html for the Unicode
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var lineBreakCodePoints = [][4]int{
{0x0000, 0x0008, prCM, gcCc}, // [9] <control-0000>..<control-0008>

View File

@ -3,9 +3,9 @@ package uniseg
// The Unicode properties as used in the various parsers. Only the ones needed
// in the context of this package are included.
const (
prXX = 0 // Same as prAny.
prAny = iota // prAny must be 0.
prPrepend
prXX = 0 // Same as prAny.
prAny = iota // prAny must be 0.
prPrepend // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
prCR
prLF
prControl
@ -86,6 +86,7 @@ const (
prW
prH
prF
prEmojiPresentation
)
// Unicode General Categories. Only the ones needed in the context of this
@ -124,6 +125,12 @@ const (
gcCo
)
// Special code points.
const (
vs15 = 0xfe0e // Variation Selector-15 (text presentation)
vs16 = 0xfe0f // Variation Selector-16 (emoji presentation)
)
// propertySearch performs a binary search on a property slice and returns the
// entry whose range (start = first array element, end = second array element)
// includes r, or an array of 0's if no such entry was found.

View File

@ -4,7 +4,10 @@ package uniseg
// sentenceBreakCodePoints are taken from
// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakProperty.txt
// on July 25, 2022. See https://www.unicode.org/license.html for the Unicode
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var sentenceBreakCodePoints = [][3]int{
{0x0009, 0x0009, prSp}, // Cc <control-0009>

112
vendor/github.com/rivo/uniseg/step.go generated vendored
View File

@ -2,31 +2,37 @@ package uniseg
import "unicode/utf8"
// The bit masks used to extract boundary information returned by the Step()
// function.
// The bit masks used to extract boundary information returned by [Step].
const (
MaskLine = 3
MaskWord = 4
MaskSentence = 8
)
// The bit positions by which boundary flags are shifted by the Step() function.
// This must correspond to the Mask constants.
// The number of bits to shift the boundary information returned by [Step] to
// obtain the monospace width of the grapheme cluster.
const ShiftWidth = 4
// The bit positions by which boundary flags are shifted by the [Step] function.
// These must correspond to the Mask constants.
const (
shiftWord = 2
shiftSentence = 3
// shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits.
)
// The bit positions by which states are shifted by the Step() function. These
// The bit positions by which states are shifted by the [Step] function. These
// values must ensure state values defined for each of the boundary algorithms
// don't overlap (and that they all still fit in a single int).
// don't overlap (and that they all still fit in a single int). These must
// correspond to the Mask constants.
const (
shiftWordState = 4
shiftSentenceState = 9
shiftLineState = 13
shiftPropState = 21 // No mask as these are always the remaining bits.
)
// The bit mask used to extract the state returned by the Step() function, after
// The bit mask used to extract the state returned by the [Step] function, after
// shifting. These values must correspond to the shift constants.
const (
maskGraphemeState = 0xf
@ -37,10 +43,11 @@ const (
// Step returns the first grapheme cluster (user-perceived character) found in
// the given byte slice. It also returns information about the boundary between
// that grapheme cluster and the one following it. There are three types of
// boundary information: word boundaries, sentence boundaries, and line breaks.
// This function is therefore a combination of FirstGraphemeCluster(),
// FirstWord(), FirstSentence(), and FirstLineSegment().
// that grapheme cluster and the one following it as well as the monospace width
// of the grapheme cluster. There are three types of boundary information: word
// boundaries, sentence boundaries, and line breaks. This function is therefore
// a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and
// [FirstLineSegment].
//
// The "boundaries" return value can be evaluated as follows:
//
@ -54,6 +61,8 @@ const (
// boundary.
// - boundaries&MaskLine == LineCanBreak: You may or may not break the line at
// the boundary.
// - boundaries >> ShiftWidth: The width of the grapheme cluster for most
// monospace fonts where a value of 1 represents one character cell.
//
// This function can be called continuously to extract all grapheme clusters
// from a byte slice, as illustrated in the examples below.
@ -87,14 +96,20 @@ func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState i
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
var prop int
if state < 0 {
prop = property(graphemeCodePoints, r)
} else {
prop = state >> shiftPropState
}
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
}
// If we don't know the state, determine it now.
var graphemeState, wordState, sentenceState, lineState int
var graphemeState, wordState, sentenceState, lineState, firstProp int
remainder := b[length:]
if state < 0 {
graphemeState, _ = transitionGraphemeState(state, r)
graphemeState, firstProp, _ = transitionGraphemeState(state, r)
wordState, _ = transitionWordBreakState(state, r, remainder, "")
sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "")
lineState, _ = transitionLineBreakState(state, r, remainder, "")
@ -103,36 +118,51 @@ func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState i
wordState = (state >> shiftWordState) & maskWordState
sentenceState = (state >> shiftSentenceState) & maskSentenceState
lineState = (state >> shiftLineState) & maskLineState
firstProp = state >> shiftPropState
}
// Transition until we find a grapheme cluster boundary.
var (
graphemeBoundary, wordBoundary, sentenceBoundary bool
lineBreak int
)
width := runeWidth(r, firstProp)
for {
var (
graphemeBoundary, wordBoundary, sentenceBoundary bool
lineBreak, prop int
)
r, l := utf8.DecodeRune(remainder)
remainder = b[length+l:]
graphemeState, graphemeBoundary = transitionGraphemeState(graphemeState, r)
graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "")
sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "")
lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "")
if graphemeBoundary {
boundary := lineBreak
boundary := lineBreak | (width << ShiftWidth)
if wordBoundary {
boundary |= 1 << shiftWord
}
if sentenceBoundary {
boundary |= 1 << shiftSentence
}
return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState)
return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
}
if r == vs16 {
width = 2
} else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
} else if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else {
width = 2
}
}
length += l
if len(b) <= length {
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
}
}
}
@ -147,14 +177,15 @@ func StepString(str string, state int) (cluster, rest string, boundaries int, ne
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
prop := property(graphemeCodePoints, r)
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
}
// If we don't know the state, determine it now.
var graphemeState, wordState, sentenceState, lineState int
var graphemeState, wordState, sentenceState, lineState, firstProp int
remainder := str[length:]
if state < 0 {
graphemeState, _ = transitionGraphemeState(state, r)
graphemeState, firstProp, _ = transitionGraphemeState(state, r)
wordState, _ = transitionWordBreakState(state, r, nil, remainder)
sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder)
lineState, _ = transitionLineBreakState(state, r, nil, remainder)
@ -163,36 +194,51 @@ func StepString(str string, state int) (cluster, rest string, boundaries int, ne
wordState = (state >> shiftWordState) & maskWordState
sentenceState = (state >> shiftSentenceState) & maskSentenceState
lineState = (state >> shiftLineState) & maskLineState
firstProp = state >> shiftPropState
}
// Transition until we find a grapheme cluster boundary.
var (
graphemeBoundary, wordBoundary, sentenceBoundary bool
lineBreak int
)
width := runeWidth(r, firstProp)
for {
var (
graphemeBoundary, wordBoundary, sentenceBoundary bool
lineBreak, prop int
)
r, l := utf8.DecodeRuneInString(remainder)
remainder = str[length+l:]
graphemeState, graphemeBoundary = transitionGraphemeState(graphemeState, r)
graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder)
sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder)
lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder)
if graphemeBoundary {
boundary := lineBreak
boundary := lineBreak | (width << ShiftWidth)
if wordBoundary {
boundary |= 1 << shiftWord
}
if sentenceBoundary {
boundary |= 1 << shiftSentence
}
return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState)
return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
}
if r == vs16 {
width = 2
} else if firstProp != prExtendedPictographic && firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
} else if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else {
width = 2
}
}
length += l
if len(str) <= length {
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
}
}
}

54
vendor/github.com/rivo/uniseg/width.go generated vendored Normal file
View File

@ -0,0 +1,54 @@
package uniseg
// runeWidth returns the monospace width for the given rune. The provided
// grapheme property is a value mapped by the [graphemeCodePoints] table.
//
// Every rune has a width of 1, except for runes with the following properties
// (evaluated in this order):
//
// - Control, CR, LF, Extend, ZWJ: Width of 0
// - \u2e3a, TWO-EM DASH: Width of 3
// - \u2e3b, THREE-EM DASH: Width of 4
// - East-Asian width Fullwidth and Wide: Width of 2 (Ambiguous and Neutral
// have a width of 1)
// - Regional Indicator: Width of 2
// - Extended Pictographic: Width of 2, unless Emoji Presentation is "No".
func runeWidth(r rune, graphemeProperty int) int {
switch graphemeProperty {
case prControl, prCR, prLF, prExtend, prZWJ:
return 0
case prRegionalIndicator:
return 2
case prExtendedPictographic:
if property(emojiPresentation, r) == prEmojiPresentation {
return 2
}
return 1
}
switch r {
case 0x2e3a:
return 3
case 0x2e3b:
return 4
}
switch property(eastAsianWidth, r) {
case prW, prF:
return 2
}
return 1
}
// StringWidth returns the monospace width for the given string, that is, the
// number of same-size cells to be occupied by the string.
func StringWidth(s string) (width int) {
state := -1
for len(s) > 0 {
var w int
_, s, w, state = FirstGraphemeClusterInString(s, state)
width += w
}
return
}

View File

@ -7,7 +7,7 @@ package uniseg
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on July 25, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var workBreakCodePoints = [][3]int{
{0x000A, 0x000A, prLF}, // Cc <control-000A>
@ -624,8 +624,8 @@ var workBreakCodePoints = [][3]int{
{0x212A, 0x212D, prALetter}, // L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C
{0x212F, 0x2134, prALetter}, // L& [6] SCRIPT SMALL E..SCRIPT SMALL O
{0x2135, 0x2138, prALetter}, // Lo [4] ALEF SYMBOL..DALET SYMBOL
{0x2139, 0x2139, prALetter}, // L& INFORMATION SOURCE
{0x2139, 0x2139, prExtendedPictographic}, // E0.6 [1] (ℹ️) information
{0x2139, 0x2139, prALetter}, // L& INFORMATION SOURCE
{0x213C, 0x213F, prALetter}, // L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI
{0x2145, 0x2149, prALetter}, // L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J
{0x214E, 0x214E, prALetter}, // L& TURNED SMALL F