1
0
mirror of https://github.com/pbnjay/grate.git synced 2024-12-14 06:06:17 +02:00

why not? initial implementation of xlsx

This commit is contained in:
Jeremy Jay 2021-02-11 01:16:02 -05:00
parent ee3b4224e0
commit 8812c44704
6 changed files with 795 additions and 0 deletions

82
xlsx/formats.go Normal file
View File

@ -0,0 +1,82 @@
package xlsx
// builtInFormats maps built-in number format ids to their format codes.
// parseStyles resolves a cell style's numFmtId through this table; ids that
// are not listed here yield the empty string.
var builtInFormats = map[uint16]string{
	0:  `General`,
	1:  `0`,
	2:  `0.00`,
	3:  `#,##0`,
	4:  `#,##0.00`,
	9:  `0%`,
	10: `0.00%`,
	11: `0.00E+00`,
	12: `# ?/?`,
	13: `# ??/??`,
	14: `mm-dd-yy`,
	15: `d-mmm-yy`,
	16: `d-mmm`,
	17: `mmm-yy`,
	18: `h:mm AM/PM`,
	19: `h:mm:ss AM/PM`,
	20: `h:mm`,
	21: `h:mm:ss`,
	22: `m/d/yy h:mm`,
	37: `#,##0 ;(#,##0)`,
	38: `#,##0 ;[Red](#,##0)`,
	39: `#,##0.00;(#,##0.00)`,
	40: `#,##0.00;[Red](#,##0.00)`,
	41: `_(* #,##0_);_(* \(#,##0\);_(* "-"_);_(@_)`,
	42: `_("$"* #,##0_);_("$"* \(#,##0\);_("$"* "-"_);_(@_)`,
	43: `_(* #,##0.00_);_(* \(#,##0.00\);_(* "-"??_);_(@_)`,
	44: `_("$"* #,##0.00_);_("$"* \(#,##0.00\);_("$"* "-"??_);_(@_)`,
	45: `mm:ss`,
	46: `[h]:mm:ss`,
	47: `mmss.0`,
	48: `##0.0E+0`,
	49: `@`,

	// zh-cn format codes
	27: `yyyy"年"m"月"`,
	28: `m"月"d"日"`,
	29: `m"月"d"日"`,
	30: `m-d-yy`,
	31: `yyyy"年"m"月"d"日"`,
	32: `h"时"mm"分"`,
	33: `h"时"mm"分"ss"秒"`,
	34: `上午/下午 h"时"mm"分"`,
	35: `上午/下午 h"时"mm"分"ss"秒"`,
	36: `yyyy"年"m"月"`,
	50: `yyyy"年"m"月"`,
	51: `m"月"d"日"`,
	52: `yyyy"年"m"月"`,
	53: `m"月"d"日"`,
	54: `m"月"d"日"`,
	55: `上午/下午 h"时"mm"分"`,
	// 56 carries the same code as 35; the closing quote after 秒 is required
	// for the literal to be well-formed.
	56: `上午/下午 h"时"mm"分"ss"秒"`,
	57: `yyyy"年"m"月"`,
	58: `m"月"d"日"`,

	// th-th format codes
	59: `t0`,
	60: `t0.00`,
	61: `t#,##0`,
	62: `t#,##0.00`,
	67: `t0%`,
	68: `t0.00%`,
	69: `t# ?/?`,
	70: `t# ??/??`,

	// th format code, but translated to aid the parser
	71: `d/m/yyyy`,      // `ว/ด/ปปปป`,
	72: `d-mmm-yy`,      // `ว-ดดด-ปป`,
	73: `d-mmm`,         // `ว-ดดด`,
	74: `mmm-yy`,        // `ดดด-ปป`,
	75: `h:mm`,          // `ช:นน`,
	76: `h:mm:ss`,       // `ช:นน:ทท`,
	77: `d/m/yyyy h:mm`, // `ว/ด/ปปปป ช:นน`,
	78: `mm:ss`,         // `นน:ทท`,
	79: `[h]:mm:ss`,     // `[ช]:นน:ทท`,
	80: `mm:ss.0`,       // `นน:ทท.0`,
	81: `d/m/bb`,        // `d/m/bb`,
}

269
xlsx/sheets.go Normal file
View File

@ -0,0 +1,269 @@
package xlsx
import (
"encoding/xml"
"errors"
"fmt"
"io"
"log"
"path/filepath"
"strconv"
"strings"
"time"
)
// Sheet holds the parsed contents of one worksheet of a Document.
// parseSheet materializes the rows; Next, Strings and Scan then walk them.
type Sheet struct {
// d is the owning document; parseSheet reads archive parts, shared
// strings and cell formats through it.
d *Document
// relID is the "id" attribute of the workbook's <sheet> element, used to
// look up docname in the document relationships.
relID string
// name is the "name" attribute of the <sheet> element; Document.Get
// matches on it and Document.Sheets returns it.
name string
// docname is the archive path of the worksheet part.
docname string
// err is what Err returns.
// NOTE(review): nothing in the visible code assigns it — confirm.
err error
// minRow, maxRow, minCol and maxCol are taken from the sheet's
// <dimension> element; placeValue ignores cells beyond the max bounds.
minRow int
maxRow int
minCol int
maxCol int
// rows is indexed by the row value returned by refToIndexes. For A1-style
// references that is the row number itself, so index 0 is not addressed
// by ordinary cell references.
rows []*row
// empty is set when the dimension is exactly "A1" and cleared as soon as
// placeValue stores a value.
empty bool
// iterRow is the index into rows of the current row; Next advances it.
iterRow int
}
// row is one materialized worksheet row.
type row struct {
// cols holds one value per column. parseSheet stores string, bool and
// float64 values, plus staticCellType markers for blank and merged cells.
// Scan can additionally extract int and time.Time values, although
// nothing in the visible parser stores those types.
cols []interface{}
}
// parseSheet loads the worksheet part named by s.docname and fills s.rows.
//
// It first reads the sheet's optional relationships part to map the ids of
// external hyperlink relationships to their targets, then streams the
// worksheet XML and handles:
//   - <dimension>: sets the row/column bounds that placeValue enforces
//   - <c> followed by character data: decodes the text according to the
//     cell's "t" attribute and stores it at the cell's "r" reference
//   - <mergeCell>: marks every non-origin cell of the merged range
//   - <hyperlink>: stores "text <target>" (or just the target) in the cell
//
// Malformed input is tolerated where possible: single-cell dimension and
// merge references are accepted, unknown style ids resolve to an empty
// number format, and invalid hyperlink or merge references are skipped. A
// shared-string cell whose index is not a valid position in the shared
// string table is reported as an error. It returns nil once the part has
// been read to the end.
func (s *Sheet) parseSheet() error {
	// Zip entry names always use forward slashes, whatever the host OS.
	docname := filepath.ToSlash(s.docname)
	base := filepath.Base(docname)
	sub := strings.TrimSuffix(docname, base)
	relsname := filepath.ToSlash(filepath.Join(sub, "_rels", base+".rels"))

	// relationship id => external hyperlink target
	linkmap := make(map[string]string)
	dec, clo, err := s.d.openXML(relsname)
	if err == nil {
		// rels might not exist for every sheet, and links are best-effort:
		// a read error here just ends the scan.
		tok, err := dec.Token()
		for ; err == nil; tok, err = dec.Token() {
			if v, ok := tok.(xml.StartElement); ok && v.Name.Local == "Relationship" {
				ax := attrMap(v.Attr)
				if ax["TargetMode"] == "External" && ax["Type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" {
					linkmap[ax["Id"]] = ax["Target"]
				}
			}
		}
		clo.Close()
	}

	dec, clo, err = s.d.openXML(docname)
	if err != nil {
		return err
	}
	defer clo.Close()

	currentCellType := BlankCellType
	currentCell := ""
	numFormat := ""

	tok, err := dec.Token()
	for ; err == nil; tok, err = dec.Token() {
		switch v := tok.(type) {
		case xml.CharData:
			if currentCell == "" {
				// text outside of any <c> element
				continue
			}
			c, r := refToIndexes(currentCell)
			if c < 0 || r < 0 {
				// unparsable cell reference
				continue
			}

			var val interface{} = string(v)
			switch currentCellType {
			case BooleanCellType:
				val = len(v) > 0 && v[0] == '1'
			case DateCellType:
				log.Println("CELL DATE", val, numFormat)
			case NumberCellType:
				// text that is not a number is kept as a string
				if fval, perr := strconv.ParseFloat(string(v), 64); perr == nil {
					val = fval
				}
			case SharedStringCellType:
				si, perr := strconv.ParseInt(string(v), 10, 64)
				if perr != nil || si < 0 || si >= int64(len(s.d.strings)) {
					return fmt.Errorf("xlsx: invalid shared string index %q in cell %s", string(v), currentCell)
				}
				val = s.d.strings[si]
			case BlankCellType:
				// don't place any values
				continue
			case ErrorCellType, FormulaStringCellType, InlineStringCellType:
				// stored as the raw text
			default:
				log.Println("CELL UNKNOWN", val, currentCellType, numFormat)
			}
			s.placeValue(r, c, val)

		case xml.StartElement:
			ax := attrMap(v.Attr)
			switch v.Name.Local {
			case "dimension":
				if ax["ref"] == "A1" {
					// short-circuit empty sheet
					s.minCol, s.minRow = 0, 0
					s.maxCol, s.maxRow = 1, 1
					s.empty = true
					continue
				}
				// A sheet with a single used cell has a ref without ":"
				// (e.g. "B2"); both corners are then the same cell.
				dims := strings.Split(ax["ref"], ":")
				s.minCol, s.minRow = refToIndexes(dims[0])
				s.maxCol, s.maxRow = refToIndexes(dims[len(dims)-1])

			case "c":
				currentCellType = CellType(ax["t"])
				if currentCellType == BlankCellType {
					// a missing "t" attribute means a numeric cell
					currentCellType = NumberCellType
				}
				currentCell = ax["r"] // always an A1 style reference

				// A missing or unparsable "s" attribute selects style 0;
				// ids without a parsed style resolve to no format.
				sid, _ := strconv.ParseInt(ax["s"], 10, 64)
				numFormat = ""
				if sid >= 0 && sid < int64(len(s.d.xfs)) {
					numFormat = s.d.xfs[sid]
				}

			case "mergeCell":
				dims := strings.Split(ax["ref"], ":")
				startCol, startRow := refToIndexes(dims[0])
				endCol, endRow := refToIndexes(dims[len(dims)-1])
				if startCol < 0 || startRow < 0 || endCol < 0 || endRow < 0 {
					continue
				}
				// placeValue drops anything beyond the sheet bounds, so
				// there is no point iterating past them.
				lastRow, lastCol := endRow, endCol
				if lastRow > s.maxRow {
					lastRow = s.maxRow
				}
				if lastCol > s.maxCol {
					lastCol = s.maxCol
				}
				for r := startRow; r <= lastRow; r++ {
					for c := startCol; c <= lastCol; c++ {
						switch {
						case r == startRow && c == startCol:
							// has data already!
						case c == startCol:
							// first and last column MAY be the same
							if r == endRow {
								s.placeValue(r, c, endRowMerged)
							} else {
								s.placeValue(r, c, continueRowMerged)
							}
						case c == endCol:
							// first and last column are NOT the same
							s.placeValue(r, c, endColumnMerged)
						default:
							s.placeValue(r, c, continueColumnMerged)
						}
					}
				}

			case "hyperlink":
				col, row := refToIndexes(ax["ref"])
				if col < 0 || row < 0 {
					continue
				}
				link := linkmap[ax["id"]]
				if len(s.rows) > row && len(s.rows[row].cols) > col {
					if sstr, ok := s.rows[row].cols[col].(string); ok {
						link = sstr + " <" + link + ">"
					}
				}
				s.placeValue(row, col, link)

			default:
				// "row", "v", "f", "mergeCells", "hyperlinks" and all other
				// elements need no handling of their own.
			}

		case xml.EndElement:
			if v.Name.Local == "c" {
				currentCell = ""
			}
		}
	}
	if err == io.EOF {
		err = nil
	}
	return err
}
// placeValue stores val at the given row and column index.
//
// Indexes that are negative or beyond s.maxRow / s.maxCol are ignored.
// Missing rows up to rowIndex are created and filled with staticBlank so
// that s.rows stays a complete matrix, and a row that was allocated while
// the column bound was smaller is widened before the value is stored.
// Storing a value clears the sheet's empty flag.
func (s *Sheet) placeValue(rowIndex, colIndex int, val interface{}) {
	if rowIndex < 0 || colIndex < 0 || colIndex > s.maxCol || rowIndex > s.maxRow {
		// invalid
		return
	}

	// ensure we always have a complete matrix
	for len(s.rows) <= rowIndex {
		emptyRow := make([]interface{}, s.maxCol+1)
		for i := range emptyRow {
			emptyRow[i] = staticBlank
		}
		s.rows = append(s.rows, &row{cols: emptyRow})
	}

	target := s.rows[rowIndex]
	for len(target.cols) <= s.maxCol {
		// the row predates a larger column bound
		target.cols = append(target.cols, staticBlank)
	}

	s.empty = false
	target.cols[colIndex] = val
}
// Next moves the row cursor forward by one and reports whether the cursor
// now points at an existing row.
// It MUST be called prior to any Scan().
func (s *Sheet) Next() bool {
	cursor := s.iterRow + 1
	s.iterRow = cursor
	return cursor < len(s.rows)
}
// Strings returns the current row with every column formatted by fmt.Sprint.
// It returns nil when there is no current row, i.e. the sheet has no rows or
// Next has already reported false.
func (s *Sheet) Strings() []string {
	if s.iterRow < 0 || s.iterRow >= len(s.rows) {
		return nil
	}
	currow := s.rows[s.iterRow]
	res := make([]string, len(currow.cols))
	for i, col := range currow.cols {
		res[i] = fmt.Sprint(col)
	}
	return res
}
// Scan extracts values from the current row into the provided arguments.
// Arguments must be pointers to one of 5 supported types:
// bool, int, float64, string, or time.Time
//
// The i-th argument receives the i-th column. Scan returns ErrInvalidType
// for an unsupported argument type, and a descriptive error when there is no
// current row, when more arguments than columns are given, or when a column
// does not hold a value of the requested type. A destination is only written
// when its column has the matching type.
func (s *Sheet) Scan(args ...interface{}) error {
	if s.iterRow < 0 || s.iterRow >= len(s.rows) {
		return errors.New("xlsx: Scan called without a current row")
	}
	cols := s.rows[s.iterRow].cols
	if len(args) > len(cols) {
		return fmt.Errorf("xlsx: Scan got %d arguments but the row has %d columns", len(args), len(cols))
	}

	for i, a := range args {
		ok := false
		switch dst := a.(type) {
		case *bool:
			var x bool
			if x, ok = cols[i].(bool); ok {
				*dst = x
			}
		case *int:
			var x int
			if x, ok = cols[i].(int); ok {
				*dst = x
			}
		case *float64:
			var x float64
			if x, ok = cols[i].(float64); ok {
				*dst = x
			}
		case *string:
			var x string
			if x, ok = cols[i].(string); ok {
				*dst = x
			}
		case *time.Time:
			var x time.Time
			if x, ok = cols[i].(time.Time); ok {
				*dst = x
			}
		default:
			return ErrInvalidType
		}
		if !ok {
			return fmt.Errorf("xlsx: cannot scan column %d (%T) into %T", i, cols[i], a)
		}
	}
	return nil
}
// IsEmpty reports the sheet's empty flag. parseSheet sets the flag when the
// sheet's dimension is exactly "A1", and placeValue clears it as soon as any
// value is stored.
func (s *Sheet) IsEmpty() bool {
return s.empty
}
// Err returns the last error that occurred.
// NOTE(review): nothing in the visible code assigns s.err, so this appears
// to always return nil at present — confirm.
func (s *Sheet) Err() error {
return s.err
}
// ErrInvalidType is returned by Scan for invalid arguments, i.e. when an
// argument is not a pointer to one of the types named in the message.
var ErrInvalidType = errors.New("xlsx: Scan only supports *bool, *int, *float64, *string, *time.Time arguments")

32
xlsx/simple_test.go Normal file
View File

@ -0,0 +1,32 @@
package xlsx
import (
"log"
"testing"
)
// noTestOpen opens test.xlsx and aborts on failure. Its name lacks the Test
// prefix, so "go test" does not run it.
func noTestOpen(t *testing.T) {
	if _, err := Open("test.xlsx"); err != nil {
		log.Fatal(err)
	}
}
// TestOpen2 opens test2.xlsx and walks every row of every sheet, checking
// that the workbook parses and that iteration does not fail.
func TestOpen2(t *testing.T) {
	wb, err := Open("test2.xlsx")
	if err != nil {
		// t.Fatal rather than log.Fatal: the latter exits the whole test
		// binary, so the failure would not be attributed to this test.
		t.Fatal(err)
	}

	for _, s := range wb.Sheets() {
		sheet, err := wb.Get(s)
		if err != nil {
			t.Fatal(err)
		}

		for sheet.Next() {
			sheet.Strings()
		}
	}
}

88
xlsx/types.go Normal file
View File

@ -0,0 +1,88 @@
package xlsx
import (
"encoding/xml"
"strconv"
"strings"
)
// CellType is the value of a cell's "t" attribute; parseSheet uses it to
// decide how the cell's text is decoded.
type CellType string
// CellTypes define data type in section 18.18.11
// (presumably of the Office Open XML specification — confirm).
const (
// BlankCellType is the absence of a "t" attribute; parseSheet treats such
// cells as NumberCellType.
BlankCellType CellType = ""
BooleanCellType CellType = "b"
DateCellType CellType = "d"
ErrorCellType CellType = "e"
NumberCellType CellType = "n"
// SharedStringCellType cells hold an index into the shared string table.
SharedStringCellType CellType = "s"
FormulaStringCellType CellType = "str"
InlineStringCellType CellType = "inlineStr"
)
// staticCellType is a placeholder cell value: either a blank cell or one of
// the marker runes used for the non-origin cells of a merged range.
type staticCellType rune

const (
	// staticBlank is the value of a cell that holds no content.
	staticBlank = staticCellType(0)

	// continueColumnMerged marks a continuation column within a merged cell.
	continueColumnMerged = staticCellType('→')

	// endColumnMerged marks the last column of a merged cell.
	endColumnMerged = staticCellType('⇥')

	// continueRowMerged marks a continuation row within a merged cell.
	continueRowMerged = staticCellType('↓')

	// endRowMerged marks the last row of a merged cell.
	endRowMerged = staticCellType('⤓')
)

// String renders the marker as its single rune; the blank value renders as
// the empty string.
func (s staticCellType) String() string {
	switch s {
	case staticBlank:
		return ""
	default:
		return string(rune(s))
	}
}
// col2int converts a column name made of upper-case letters into its
// 0-based index, reading the name as a bijective base-26 number:
// "A"=0, "B"=1, "AA"=26, "BB"=53. The empty string yields -1.
func col2int(col string) int {
	n := 0
	for _, letter := range col {
		// 'A' counts as 1, 'B' as 2, ...
		n = n*26 + int(letter-'A') + 1
	}
	return n - 1
}
// refToIndexes converts a cell reference into a 0-based column index and
// the row number as written in the reference (so "A1" is column 0, row 1).
//
// Both A1 style ("BB12") and R1C1 style ("R12C54", row first) references are
// understood and yield the same result for the same cell. For a range such
// as "A1:C3" the first cell is used. It returns -1, -1 when r cannot be a
// reference: shorter than two bytes, not starting with a letter prefix,
// or a malformed R1C1 form.
func refToIndexes(r string) (column, row int) {
	if i := strings.IndexByte(r, ':'); i >= 0 {
		// a range: anchor on its first cell
		r = r[:i]
	}
	if len(r) < 2 {
		return -1, -1
	}

	i1 := strings.IndexAny(r, "0123456789")
	if i1 <= 0 {
		return -1, -1
	}
	prefix, rest := r[:i1], r[i1:]

	i2 := strings.IndexByte(rest, 'C')
	if i2 == -1 {
		// A1 Reference mode; an unparsable row number is reported as 0
		rn, _ := strconv.ParseInt(rest, 10, 64)
		return col2int(prefix), int(rn)
	}

	// R1C1 Reference Mode: "R" <row> "C" <column>, both 1-based
	if prefix != "R" {
		return -1, -1
	}
	rn, err := strconv.ParseInt(rest[:i2], 10, 64)
	if err != nil {
		return -1, -1
	}
	cn, err := strconv.ParseInt(rest[i2+1:], 10, 64)
	if err != nil || cn < 1 {
		return -1, -1
	}
	return int(cn) - 1, int(rn)
}
// attrMap indexes XML attributes by their local name, discarding the
// namespace. When several attributes share a local name, the last one wins.
func attrMap(attrs []xml.Attr) map[string]string {
	byName := make(map[string]string, len(attrs))
	for i := range attrs {
		byName[attrs[i].Name.Local] = attrs[i].Value
	}
	return byName
}

167
xlsx/workbook.go Normal file
View File

@ -0,0 +1,167 @@
package xlsx
import (
"encoding/xml"
"errors"
"io"
"path/filepath"
"strconv"
)
// parseRels reads a relationships (.rels) part from dec and records every
// <Relationship> element in d.rels as type => id => target.
//
// basedir is the archive directory that relative targets are resolved
// against. Targets are stored as zip entry names (forward slashes): a target
// starting with "/" is taken relative to the archive root instead of
// basedir, and a target with TargetMode="External" is kept verbatim because
// it is not an archive path. The target of the officeDocument relationship
// is also remembered as d.primaryDoc. It returns nil once the part has been
// read to the end.
func (d *Document) parseRels(dec *xml.Decoder, basedir string) error {
	const officeDocumentRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"

	tok, err := dec.Token()
	for ; err == nil; tok, err = dec.Token() {
		// the tags we're interested in are all self-closing
		v, ok := tok.(xml.StartElement)
		if !ok || v.Name.Local != "Relationship" {
			continue
		}

		vals := make(map[string]string, 5)
		for _, a := range v.Attr {
			vals[a.Name.Local] = a.Value
		}

		target := vals["Target"]
		switch {
		case vals["TargetMode"] == "External":
			// an external resource, not an archive path
		case len(target) > 0 && target[0] == '/':
			// absolute part name: relative to the archive root
			target = filepath.ToSlash(filepath.Clean(target[1:]))
		default:
			target = filepath.ToSlash(filepath.Join(basedir, target))
		}

		relType := vals["Type"]
		if _, ok := d.rels[relType]; !ok {
			d.rels[relType] = make(map[string]string)
		}
		d.rels[relType][vals["Id"]] = target
		if relType == officeDocumentRel {
			d.primaryDoc = target
		}
	}
	if err == io.EOF {
		err = nil
	}
	return err
}
// parseWorkbook scans the workbook part for <sheet> elements and appends a
// Sheet to d.sheets for each, in document order. A sheet's part name is
// looked up in d.rels by its "id" attribute under the worksheet relationship
// type. A <sheet> lacking an "id" or "name" attribute is an error. It
// returns nil once the part has been read to the end.
func (d *Document) parseWorkbook(dec *xml.Decoder) error {
	for {
		tok, err := dec.Token()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		start, isStart := tok.(xml.StartElement)
		if !isStart || start.Name.Local != "sheet" {
			continue
		}

		var (
			sheetID, sheetName string
			hasID, hasName     bool
		)
		for _, attr := range start.Attr {
			switch attr.Name.Local {
			case "id":
				sheetID, hasID = attr.Value, true
			case "name":
				sheetName, hasName = attr.Value, true
			}
		}
		if !hasID || !hasName {
			return errors.New("xlsx: invalid sheet definition")
		}

		d.sheets = append(d.sheets, &Sheet{
			d:       d,
			relID:   sheetID,
			name:    sheetName,
			docname: d.rels["http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"][sheetID],
		})
	}
}
// parseStyles reads the styles part from dec and rebuilds d.xfs, the number
// format string of every <xf> entry in <cellXfs>, in document order (the
// index a cell's "s" attribute refers to).
//
// An entry that carries an applyNumberFormat attribute uses its numFmtId;
// any other entry gets format 0. The format of the base style named by xfId
// is not inherited, so <cellStyleXfs> entries need not be recorded. Ids are
// resolved through builtInFormats only, so an id missing from that table
// yields "".
//
// An <xf> element outside <cellStyleXfs> and <cellXfs> is reported as an
// error. It returns nil once the part has been read to the end.
func (d *Document) parseStyles(dec *xml.Decoder) error {
	const (
		sectionNone = iota
		sectionCellStyleXfs
		sectionCellXfs
	)
	// The "count" attribute is only a capacity hint; never trust it with
	// more than this many entries up front.
	const maxPrealloc = 1 << 16

	d.xfs = d.xfs[:0]
	section := sectionNone

	tok, err := dec.Token()
	for ; err == nil; tok, err = dec.Token() {
		switch v := tok.(type) {
		case xml.StartElement:
			attrs := attrMap(v.Attr)
			switch v.Name.Local {
			case "cellStyleXfs":
				section = sectionCellStyleXfs
			case "cellXfs":
				section = sectionCellXfs
				n, _ := strconv.ParseInt(attrs["count"], 10, 64)
				if n < 0 {
					n = 0
				} else if n > maxPrealloc {
					n = maxPrealloc
				}
				d.xfs = make([]string, 0, n)
			case "xf":
				switch section {
				case sectionCellStyleXfs:
					// base styles do not influence the result
				case sectionCellXfs:
					// nfid stays 0 when the entry declares no format of its
					// own or when the id cannot be parsed
					var nfid int64
					if _, ok := attrs["applyNumberFormat"]; ok {
						nfid, _ = strconv.ParseInt(attrs["numFmtId"], 10, 16)
					}
					d.xfs = append(d.xfs, builtInFormats[uint16(nfid)])
				default:
					return errors.New("xlsx: xf element outside of cellStyleXfs and cellXfs")
				}
			}
		case xml.EndElement:
			switch v.Name.Local {
			case "cellStyleXfs", "cellXfs":
				section = sectionNone
			}
		}
	}
	if err == io.EOF {
		err = nil
	}
	return err
}
// parseSharedStrings reads the shared string table from dec and appends one
// entry to d.strings per <si> element, in document order.
//
// An entry's text is the concatenation of the character data of its <t>
// elements, which covers both plain strings and rich-text runs. Text inside
// phonetic runs (<rPh>) and character data outside of <t> (such as
// whitespace between elements) is not part of the string. It returns nil
// once the part has been read to the end.
func (d *Document) parseSharedStrings(dec *xml.Decoder) error {
	var (
		buf      []byte // text of the current <si>
		inText   bool   // inside a <t> element
		phonetic int    // nesting depth of <rPh> elements
	)

	tok, err := dec.Token()
	for ; err == nil; tok, err = dec.Token() {
		switch v := tok.(type) {
		case xml.CharData:
			if inText && phonetic == 0 {
				// append copies: the token's bytes are only valid until
				// the next call to Token
				buf = append(buf, v...)
			}
		case xml.StartElement:
			switch v.Name.Local {
			case "si":
				buf = buf[:0]
			case "rPh":
				phonetic++
			case "t":
				inText = true
			}
		case xml.EndElement:
			switch v.Name.Local {
			case "si":
				d.strings = append(d.strings, string(buf))
			case "rPh":
				if phonetic > 0 {
					phonetic--
				}
			case "t":
				inText = false
			}
		}
	}
	if err == io.EOF {
		err = nil
	}
	return err
}

157
xlsx/xlsx.go Normal file
View File

@ -0,0 +1,157 @@
package xlsx
import (
"archive/zip"
"encoding/xml"
"errors"
"io"
"os"
"path/filepath"
"strings"
)
// Document contains an Office Open XML document.
type Document struct {
// filename is the path that was passed to Open.
filename string
// r is the zip archive the document parts are read from.
r *zip.Reader
// primaryDoc is the target of the officeDocument relationship; Open
// parses it as the workbook.
primaryDoc string
// type => id => filename
rels map[string]map[string]string
// sheets lists the worksheets in the order their <sheet> elements appear
// in the workbook.
sheets []*Sheet
// strings is the shared string table; shared-string cells index into it.
strings []string
// xfs holds one number format string per cellXfs style entry; a cell's
// "s" attribute indexes into it.
xfs []string
}
// Open reads the xlsx workbook stored in the named file and parses it
// completely: the package and workbook relationships, the workbook's sheet
// list, the styles, the shared strings and finally every sheet.
//
// The returned Document needs no further access to the file, which is
// closed before Open returns. Any failure to open or parse a part is
// returned as the error.
func Open(filename string) (*Document, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	// Everything is parsed eagerly below and nothing reads from the archive
	// after Open returns, so the file is released on every path.
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		return nil, err
	}
	z, err := zip.NewReader(f, info.Size())
	if err != nil {
		return nil, err
	}
	d := &Document{
		filename: filename,
		r:        z,
		rels:     make(map[string]map[string]string, 4),
	}

	// parsePart opens the named archive part, hands its decoder to parse
	// and closes the part again.
	parsePart := func(name string, parse func(*xml.Decoder) error) error {
		dec, c, err := d.openXML(name)
		if err != nil {
			return err
		}
		defer c.Close()
		return parse(dec)
	}

	// parse the primary relationships
	err = parsePart("_rels/.rels", func(dec *xml.Decoder) error {
		return d.parseRels(dec, "")
	})
	if err != nil {
		return nil, err
	}
	if d.primaryDoc == "" {
		return nil, errors.New("xlsx: invalid document")
	}

	// parse the secondary relationships to primary doc
	base := filepath.Base(d.primaryDoc)
	sub := strings.TrimSuffix(d.primaryDoc, base)
	// zip entry names always use forward slashes, whatever the host OS
	relfn := filepath.ToSlash(filepath.Join(sub, "_rels", base+".rels"))
	err = parsePart(relfn, func(dec *xml.Decoder) error {
		return d.parseRels(dec, sub)
	})
	if err != nil {
		return nil, err
	}

	// parse the workbook structure
	if err = parsePart(d.primaryDoc, d.parseWorkbook); err != nil {
		return nil, err
	}

	// parse the styles (cell number formats)
	styn := d.rels["http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"]
	for _, sty := range styn {
		if err = parsePart(sty, d.parseStyles); err != nil {
			return nil, err
		}
	}

	// parse the shared string table
	ssn := d.rels["http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"]
	for _, sst := range ssn {
		if err = parsePart(sst, d.parseSharedStrings); err != nil {
			return nil, err
		}
	}

	// parse every sheet; they depend on the styles and shared strings above
	for _, s := range d.sheets {
		if err = s.parseSheet(); err != nil {
			return nil, err
		}
	}

	return d, nil
}
// openXML opens the archive entry whose name equals name exactly and returns
// an XML decoder reading from it, together with the closer the caller must
// close when done. It returns an error naming the part when the archive has
// no such entry, or the zip error when the entry cannot be opened.
func (d *Document) openXML(name string) (*xml.Decoder, io.Closer, error) {
	for _, zf := range d.r.File {
		if zf.Name != name {
			continue
		}
		zfr, err := zf.Open()
		if err != nil {
			return nil, nil, err
		}
		return xml.NewDecoder(zfr), zfr, nil
	}
	// A dedicated message instead of a bare io.EOF, which reads like a
	// truncated file rather than a missing part.
	return nil, nil, errors.New("xlsx: part not found in archive: " + name)
}
// Sheets returns the names of all sheets of the document, in workbook order.
// Every sheet is listed; no filtering by visibility is applied.
func (d *Document) Sheets() []string {
	names := make([]string, len(d.sheets))
	for i := range d.sheets {
		names[i] = d.sheets[i].name
	}
	return names
}
// Get returns the first sheet whose name equals sheetName, or an error when
// the document has no such sheet.
func (d *Document) Get(sheetName string) (*Sheet, error) {
	for i := 0; i < len(d.sheets); i++ {
		if candidate := d.sheets[i]; candidate.name == sheetName {
			return candidate, nil
		}
	}
	return nil, errors.New("xlsx: sheet not found")
}