mirror of
https://github.com/pbnjay/grate.git
synced 2024-12-12 21:49:14 +02:00
a5be267bf7
did not reduce total allocations much (bytes.Reader is more efficient than I thought), but reduced walltime from 99s to 55s for a large collection
513 lines
13 KiB
Go
513 lines
13 KiB
Go
package xls
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"math"
|
|
"time"
|
|
"unicode/utf16"
|
|
|
|
"github.com/pbnjay/grate"
|
|
)
|
|
|
|
func (b *WorkBook) List() ([]string, error) {
|
|
res := make([]string, 0, len(b.sheets))
|
|
for _, s := range b.sheets {
|
|
if (s.HiddenState & 0x03) == 0 {
|
|
res = append(res, s.Name)
|
|
}
|
|
}
|
|
return res, nil
|
|
}
|
|
|
|
func (b *WorkBook) Get(sheetName string) (grate.Collection, error) {
|
|
for _, s := range b.sheets {
|
|
if s.Name == sheetName {
|
|
ss := b.pos2substream[int64(s.Position)]
|
|
ws := &WorkSheet{
|
|
b: b, s: s, ss: ss,
|
|
iterRow: -1,
|
|
}
|
|
return ws, ws.parse()
|
|
}
|
|
}
|
|
return nil, errors.New("xls: sheet not found")
|
|
}
|
|
|
|
type WorkSheet struct {
|
|
b *WorkBook
|
|
s *boundSheet
|
|
ss int
|
|
err error
|
|
|
|
minRow int
|
|
maxRow int // maximum valid row index (0xFFFF)
|
|
minCol int
|
|
maxCol int // maximum valid column index (0xFF)
|
|
rows []*row
|
|
empty bool
|
|
|
|
iterRow int
|
|
iterMC int
|
|
}
|
|
|
|
// staticCellType is a rune-valued placeholder stored in cells that belong
// to a merged range but do not carry the range's value.
type staticCellType rune

const (
	// staticBlank is the zero marker; it renders as the empty string.
	staticBlank staticCellType = 0

	// continueColumnMerged marks a continuation column within a merged cell.
	continueColumnMerged staticCellType = '→'
	// endColumnMerged marks the last column of a merged cell.
	endColumnMerged staticCellType = '⇥'

	// continueRowMerged marks a continuation row within a merged cell.
	continueRowMerged staticCellType = '↓'
	// endRowMerged marks the last row of a merged cell.
	endRowMerged staticCellType = '⤓'
)

// String renders the marker as a one-rune string, or as "" for staticBlank.
func (s staticCellType) String() string {
	if s == staticBlank {
		return ""
	}
	return string(rune(s))
}
|
|
|
|
// row is one line of the worksheet's cell matrix.
type row struct {
	// cols holds one entry per column. Each value must be one of:
	// int, float64, string, or time.Time.
	cols []interface{}
}
|
|
|
|
func (s *WorkSheet) placeValue(rowIndex, colIndex int, val interface{}) {
|
|
if colIndex > s.maxCol || rowIndex > s.maxRow {
|
|
// invalid
|
|
return
|
|
}
|
|
|
|
// ensure we always have a complete matrix
|
|
for len(s.rows) <= rowIndex {
|
|
emptyRow := make([]interface{}, s.maxCol+1)
|
|
s.rows = append(s.rows, &row{emptyRow})
|
|
}
|
|
|
|
s.rows[rowIndex].cols[colIndex] = val
|
|
}
|
|
|
|
func (s *WorkSheet) IsEmpty() bool {
|
|
return s.empty
|
|
}
|
|
|
|
func (s *WorkSheet) parse() error {
|
|
inSubstream := 0
|
|
for idx, r := range s.b.substreams[s.ss] {
|
|
if inSubstream > 0 {
|
|
if r.RecType == RecTypeEOF {
|
|
inSubstream--
|
|
}
|
|
continue
|
|
}
|
|
switch r.RecType {
|
|
case RecTypeBOF:
|
|
if idx > 0 {
|
|
inSubstream++
|
|
continue
|
|
}
|
|
case RecTypeWsBool:
|
|
if (r.Data[1] & 0x10) != 0 {
|
|
// it's a dialog
|
|
return nil
|
|
}
|
|
|
|
case RecTypeDimensions:
|
|
// max = 0-based index of the row AFTER the last valid index
|
|
minRow := binary.LittleEndian.Uint32(r.Data[:4])
|
|
maxRow := binary.LittleEndian.Uint32(r.Data[4:8]) // max = 0x010000
|
|
minCol := binary.LittleEndian.Uint16(r.Data[8:10])
|
|
maxCol := binary.LittleEndian.Uint16(r.Data[10:12]) // max = 0x000100
|
|
if grate.Debug {
|
|
log.Printf(" Sheet dimensions (%d, %d) - (%d,%d)",
|
|
minCol, minRow, maxCol, maxRow)
|
|
}
|
|
if minRow > 0x0000FFFF || maxRow > 0x00010000 {
|
|
log.Println("invalid dimensions")
|
|
}
|
|
if minCol > 0x00FF || maxCol > 0x0100 {
|
|
log.Println("invalid dimensions")
|
|
}
|
|
s.minRow = int(uint64(minRow) & 0x0FFFF)
|
|
s.maxRow = int(uint64(maxRow)&0x1FFFF) - 1 // translate to last valid index
|
|
s.minCol = int(uint64(minCol) & 0x000FF)
|
|
s.maxCol = int(uint64(maxCol)&0x001FF) - 1 // translate to last valid index
|
|
if (maxRow-minRow) == 0 || (maxCol-minCol) == 0 {
|
|
s.empty = true
|
|
} else {
|
|
// pre-allocate cells
|
|
s.placeValue(s.maxRow, s.maxCol, nil)
|
|
}
|
|
}
|
|
}
|
|
inSubstream = 0
|
|
|
|
var formulaRow, formulaCol uint16
|
|
for ridx, r := range s.b.substreams[s.ss] {
|
|
if inSubstream > 0 {
|
|
if r.RecType == RecTypeEOF {
|
|
inSubstream--
|
|
} else if grate.Debug {
|
|
log.Println(" Unhandled sheet substream record type:", r.RecType, ridx)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// sec 2.1.7.20.6 Common Productions ABNF:
|
|
/*
|
|
CELLTABLE = 1*(1*Row *CELL 1*DBCell) *EntExU2
|
|
CELL = FORMULA / Blank / MulBlank / RK / MulRk / BoolErr / Number / LabelSst
|
|
FORMULA = [Uncalced] Formula [Array / Table / ShrFmla / SUB] [String *Continue]
|
|
|
|
Not parsed form the list above:
|
|
DBCell, EntExU2, Uncalced, Array, Table,ShrFmla
|
|
NB: no idea what "SUB" is
|
|
*/
|
|
|
|
switch r.RecType {
|
|
case RecTypeBOF:
|
|
if ridx > 0 {
|
|
inSubstream++
|
|
continue
|
|
}
|
|
|
|
case RecTypeBoolErr:
|
|
rowIndex := binary.LittleEndian.Uint16(r.Data[:2])
|
|
colIndex := binary.LittleEndian.Uint16(r.Data[2:4])
|
|
//ixfe := binary.LittleEndian.Uint16(r.Data[4:6])
|
|
if r.Data[7] == 0 {
|
|
bv := false
|
|
if r.Data[6] == 1 {
|
|
bv = true
|
|
}
|
|
s.placeValue(int(rowIndex), int(colIndex), bv)
|
|
//log.Printf("bool/error spec: %d %d %+v", rowIndex, colIndex, bv)
|
|
} else {
|
|
be, ok := berrLookup[r.Data[6]]
|
|
if !ok {
|
|
be = "<unknown error>"
|
|
}
|
|
s.placeValue(int(rowIndex), int(colIndex), be)
|
|
//log.Printf("bool/error spec: %d %d %s", rowIndex, colIndex, be)
|
|
}
|
|
|
|
case RecTypeMulRk:
|
|
mr := &shMulRK{}
|
|
nrk := int((r.RecSize - 6) / 6)
|
|
mr.RowIndex = binary.LittleEndian.Uint16(r.Data[:2])
|
|
mr.FirstCol = binary.LittleEndian.Uint16(r.Data[2:4])
|
|
mr.Values = make([]RkRec, nrk)
|
|
for i := 0; i < nrk; i++ {
|
|
off := 4 + i*6
|
|
rr := RkRec{}
|
|
rr.IXFCell = binary.LittleEndian.Uint16(r.Data[off:])
|
|
rr.Value = RKNumber(binary.LittleEndian.Uint32(r.Data[off:]))
|
|
mr.Values[i] = rr
|
|
|
|
var rval interface{}
|
|
if rr.Value.IsInteger() {
|
|
rval = rr.Value.Int()
|
|
} else {
|
|
rval = rr.Value.Float64()
|
|
fno := s.b.xfs[rr.IXFCell]
|
|
rval, _ = s.b.nfmt.Apply(fno, rval)
|
|
}
|
|
s.placeValue(int(mr.RowIndex), int(mr.FirstCol)+i, rval)
|
|
}
|
|
//log.Printf("mulrow spec: %+v", *mr)
|
|
|
|
case RecTypeNumber:
|
|
rowIndex := binary.LittleEndian.Uint16(r.Data[:2])
|
|
colIndex := binary.LittleEndian.Uint16(r.Data[2:4])
|
|
ixfe := binary.LittleEndian.Uint16(r.Data[4:6])
|
|
xnum := binary.LittleEndian.Uint64(r.Data[6:])
|
|
|
|
value := math.Float64frombits(xnum)
|
|
fno := s.b.xfs[ixfe]
|
|
rval, _ := s.b.nfmt.Apply(fno, value)
|
|
|
|
s.placeValue(int(rowIndex), int(colIndex), rval)
|
|
//log.Printf("Number spec: %d %d = %f", rowIndex, colIndex, value)
|
|
|
|
case RecTypeRK:
|
|
rowIndex := binary.LittleEndian.Uint16(r.Data[:2])
|
|
colIndex := binary.LittleEndian.Uint16(r.Data[2:4])
|
|
rr := RkRec{}
|
|
rr.IXFCell = binary.LittleEndian.Uint16(r.Data[4:])
|
|
rr.Value = RKNumber(binary.LittleEndian.Uint32(r.Data[6:]))
|
|
|
|
var rval interface{}
|
|
if rr.Value.IsInteger() {
|
|
rval = rr.Value.Int()
|
|
} else {
|
|
rval = rr.Value.Float64()
|
|
fno := s.b.xfs[rr.IXFCell]
|
|
rval, _ = s.b.nfmt.Apply(fno, rval)
|
|
}
|
|
s.placeValue(int(rowIndex), int(colIndex), rval)
|
|
//log.Printf("RK spec: %d %d = %s", rowIndex, colIndex, rr.Value.String())
|
|
|
|
case RecTypeFormula:
|
|
formulaRow = binary.LittleEndian.Uint16(r.Data[:2])
|
|
formulaCol = binary.LittleEndian.Uint16(r.Data[2:4])
|
|
ixfe := binary.LittleEndian.Uint16(r.Data[4:6])
|
|
fdata := r.Data[6:]
|
|
if fdata[6] == 0xFF && r.Data[7] == 0xFF {
|
|
switch fdata[0] {
|
|
case 0:
|
|
// string in next record
|
|
case 1:
|
|
// boolean
|
|
bv := false
|
|
if fdata[2] != 0 {
|
|
bv = true
|
|
}
|
|
s.placeValue(int(formulaRow), int(formulaCol), bv)
|
|
case 2:
|
|
// error value
|
|
be, ok := berrLookup[fdata[2]]
|
|
if !ok {
|
|
be = "<unknown error>"
|
|
}
|
|
s.placeValue(int(formulaRow), int(formulaCol), be)
|
|
case 3:
|
|
// blank string
|
|
default:
|
|
log.Println("unknown formula value type")
|
|
}
|
|
} else {
|
|
xnum := binary.LittleEndian.Uint64(r.Data[6:])
|
|
value := math.Float64frombits(xnum)
|
|
fno := s.b.xfs[ixfe]
|
|
rval, _ := s.b.nfmt.Apply(fno, value)
|
|
s.placeValue(int(formulaRow), int(formulaCol), rval)
|
|
}
|
|
//log.Printf("formula spec: %d %d ~~ %+v", formulaRow, formulaCol, r.Data)
|
|
|
|
case RecTypeString:
|
|
charCount := binary.LittleEndian.Uint16(r.Data[:2])
|
|
flags := r.Data[2]
|
|
fstr := ""
|
|
if (flags & 1) == 0 {
|
|
fstr = string(r.Data[3:])
|
|
} else {
|
|
raw := r.Data[3:]
|
|
us := make([]uint16, charCount)
|
|
for i := 0; i < int(charCount); i++ {
|
|
us[i] = binary.LittleEndian.Uint16(raw)
|
|
raw = raw[2:]
|
|
}
|
|
fstr = string(utf16.Decode(us))
|
|
}
|
|
|
|
if (ridx + 1) < len(s.b.substreams[s.ss]) {
|
|
ridx2 := ridx + 1
|
|
nrecs := len(s.b.substreams[s.ss])
|
|
for ridx2 < nrecs {
|
|
r2 := s.b.substreams[s.ss][ridx2]
|
|
if r2.RecType != RecTypeContinue {
|
|
break
|
|
}
|
|
if (r2.Data[0] & 1) == 0 {
|
|
fstr += string(r2.Data[1:])
|
|
} else {
|
|
raw := r2.Data[1:]
|
|
slen := len(raw) / 2
|
|
us := make([]uint16, slen)
|
|
for i := 0; i < slen; i++ {
|
|
us[i] = binary.LittleEndian.Uint16(raw)
|
|
raw = raw[2:]
|
|
}
|
|
fstr += string(utf16.Decode(us))
|
|
}
|
|
ridx2++
|
|
}
|
|
}
|
|
// TODO: does formula record formatted dates as pre-computed strings?
|
|
s.placeValue(int(formulaRow), int(formulaCol), fstr)
|
|
|
|
case RecTypeLabelSst:
|
|
rowIndex := binary.LittleEndian.Uint16(r.Data[:2])
|
|
colIndex := binary.LittleEndian.Uint16(r.Data[2:4])
|
|
//ixfe := binary.LittleEndian.Uint16(r.Data[4:6])
|
|
sstIndex := binary.LittleEndian.Uint32(r.Data[6:])
|
|
if int(sstIndex) > len(s.b.strings) {
|
|
return errors.New("xls: invalid sst index")
|
|
}
|
|
s.placeValue(int(rowIndex), int(colIndex), s.b.strings[sstIndex])
|
|
//log.Printf("SST spec: %d %d = [%d] %s", rowIndex, colIndex, sstIndex, s.b.strings[sstIndex])
|
|
|
|
case RecTypeHLink:
|
|
loc := &shRef8{}
|
|
loc.FirstRow = binary.LittleEndian.Uint16(r.Data[:2])
|
|
loc.LastRow = binary.LittleEndian.Uint16(r.Data[2:4])
|
|
loc.FirstCol = binary.LittleEndian.Uint16(r.Data[4:6])
|
|
loc.LastCol = binary.LittleEndian.Uint16(r.Data[6:])
|
|
if int(loc.FirstCol) > s.maxCol {
|
|
//log.Println("invalid hyperlink column")
|
|
continue
|
|
}
|
|
if int(loc.FirstRow) > s.maxRow {
|
|
//log.Println("invalid hyperlink row")
|
|
continue
|
|
}
|
|
if loc.LastRow == 0xFFFF {
|
|
loc.LastRow = uint16(s.maxRow)
|
|
}
|
|
if loc.LastCol == 0xFF {
|
|
loc.LastCol = uint16(s.maxCol)
|
|
}
|
|
|
|
displayText, linkText, err := decodeHyperlinks(r.Data[8:])
|
|
if err != nil {
|
|
log.Println(err)
|
|
continue
|
|
}
|
|
|
|
// apply merge cell rules
|
|
for rn := int(loc.FirstRow); rn <= int(loc.LastRow); rn++ {
|
|
for cn := int(loc.FirstCol); cn <= int(loc.LastCol); cn++ {
|
|
if rn == int(loc.FirstRow) && cn == int(loc.FirstCol) {
|
|
s.placeValue(rn, cn, displayText+" <"+linkText+">")
|
|
} else if cn == int(loc.FirstCol) {
|
|
// first and last column MAY be the same
|
|
if rn == int(loc.LastRow) {
|
|
s.placeValue(rn, cn, endRowMerged)
|
|
} else {
|
|
s.placeValue(rn, cn, continueRowMerged)
|
|
}
|
|
} else if cn == int(loc.LastCol) {
|
|
// first and last column are NOT the same
|
|
s.placeValue(rn, cn, endColumnMerged)
|
|
} else {
|
|
s.placeValue(rn, cn, continueColumnMerged)
|
|
}
|
|
}
|
|
}
|
|
|
|
case RecTypeMergeCells:
|
|
cmcs := binary.LittleEndian.Uint16(r.Data[:2])
|
|
raw := r.Data[2:]
|
|
loc := shRef8{}
|
|
for i := 0; i < int(cmcs); i++ {
|
|
loc.FirstRow = binary.LittleEndian.Uint16(raw[:2])
|
|
loc.LastRow = binary.LittleEndian.Uint16(raw[2:4])
|
|
loc.FirstCol = binary.LittleEndian.Uint16(raw[4:6])
|
|
loc.LastCol = binary.LittleEndian.Uint16(raw[6:])
|
|
raw = raw[8:]
|
|
|
|
if loc.LastRow == 0xFFFF {
|
|
loc.LastRow = uint16(s.maxRow)
|
|
}
|
|
if loc.LastCol == 0xFF {
|
|
loc.LastCol = uint16(s.maxCol)
|
|
}
|
|
for rn := int(loc.FirstRow); rn <= int(loc.LastRow); rn++ {
|
|
for cn := int(loc.FirstCol); cn <= int(loc.LastCol); cn++ {
|
|
if rn == int(loc.FirstRow) && cn == int(loc.FirstCol) {
|
|
// should be a value there already!
|
|
} else if cn == int(loc.FirstCol) {
|
|
// first and last column MAY be the same
|
|
if rn == int(loc.LastRow) {
|
|
s.placeValue(rn, cn, endRowMerged)
|
|
} else {
|
|
s.placeValue(rn, cn, continueRowMerged)
|
|
}
|
|
} else if cn == int(loc.LastCol) {
|
|
// first and last column are NOT the same
|
|
s.placeValue(rn, cn, endColumnMerged)
|
|
} else {
|
|
s.placeValue(rn, cn, continueColumnMerged)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
case RecTypeBlank, RecTypeMulBlank:
|
|
// cells default value is blank, no need for these
|
|
|
|
case RecTypeContinue:
|
|
// the only situation so far is when used in RecTypeString above
|
|
|
|
case RecTypeRow, RecTypeDimensions, RecTypeEOF, RecTypeWsBool:
|
|
// handled in initial pass
|
|
|
|
default:
|
|
if grate.Debug {
|
|
log.Println(" Unhandled sheet record type:", r.RecType, ridx)
|
|
}
|
|
*/
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Err returns the last error that occured.
|
|
func (s *WorkSheet) Err() error {
|
|
return s.err
|
|
}
|
|
|
|
// Next advances to the next row of content.
|
|
// It MUST be called prior to any Scan().
|
|
func (s *WorkSheet) Next() bool {
|
|
s.iterRow++
|
|
return s.iterRow < len(s.rows)
|
|
}
|
|
|
|
func (s *WorkSheet) Strings() []string {
|
|
currow := s.rows[s.iterRow]
|
|
res := make([]string, len(currow.cols))
|
|
for i, col := range currow.cols {
|
|
if col == nil || col == "" {
|
|
continue
|
|
}
|
|
res[i] = fmt.Sprint(col)
|
|
}
|
|
return res
|
|
}
|
|
|
|
// Scan extracts values from the row into the provided arguments
|
|
// Arguments must be pointers to one of 5 supported types:
|
|
// bool, int, float64, string, or time.Time
|
|
func (s *WorkSheet) Scan(args ...interface{}) error {
|
|
currow := s.rows[s.iterRow]
|
|
|
|
for i, a := range args {
|
|
switch v := a.(type) {
|
|
case *bool:
|
|
*v = currow.cols[i].(bool)
|
|
case *int:
|
|
*v = currow.cols[i].(int)
|
|
case *float64:
|
|
*v = currow.cols[i].(float64)
|
|
case *string:
|
|
*v = currow.cols[i].(string)
|
|
case *time.Time:
|
|
*v = currow.cols[i].(time.Time)
|
|
default:
|
|
return grate.ErrInvalidScanType
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// berrLookup maps the error codes found in BoolErr records and in cached
// formula values to the text stored in the cell for them. Codes missing
// from the table are reported by the callers as "<unknown error>".
var berrLookup = map[byte]string{
	0x00: "#NULL!",
	0x07: "#DIV/0!",
	0x0F: "#VALUE!",
	0x17: "#REF!",
	0x1D: "#NAME?",
	0x24: "#NUM!",
	0x2A: "#N/A",
	0x2B: "#GETTING_DATA",
}
|