mirror of
https://github.com/pbnjay/grate.git
synced 2025-03-04 16:16:03 +02:00
On a test dataset, memory usage goes from 6.0GB => 3.3GB and walltime improves from 31.6s => 19.4s. The largest remaining driver is the slow/hungry xml.Decoder.
278 lines
6.7 KiB
Go
278 lines
6.7 KiB
Go
package xlsx
|
|
|
|
import (
|
|
"encoding/xml"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/pbnjay/grate"
|
|
"github.com/pbnjay/grate/commonxl"
|
|
)
|
|
|
|
type Sheet struct {
|
|
d *Document
|
|
relID string
|
|
name string
|
|
docname string
|
|
|
|
err error
|
|
|
|
minRow int
|
|
maxRow int
|
|
minCol int
|
|
maxCol int
|
|
rows []*row
|
|
empty bool
|
|
|
|
iterRow int
|
|
}
|
|
|
|
// errNotLoaded is a sentinel error for a sheet whose contents have not been
// loaded. NOTE(review): it is not referenced in this part of the file;
// confirm where it is returned.
var errNotLoaded = errors.New("xlsx: sheet not loaded")
|
|
|
|
// row is a single line of the sheet's cell matrix.
type row struct {
	// cols holds one entry per column. parseSheet stores bool, float64 and
	// string values here; Scan can additionally extract int and time.Time.
	cols []interface{}
}
|
|
|
|
func (s *Sheet) parseSheet() error {
|
|
linkmap := make(map[string]string)
|
|
base := filepath.Base(s.docname)
|
|
sub := strings.TrimSuffix(s.docname, base)
|
|
relsname := filepath.Join(sub, "_rels", base+".rels")
|
|
dec, clo, err := s.d.openXML(relsname)
|
|
if err == nil {
|
|
// rels might not exist for every sheet
|
|
tok, err := dec.RawToken()
|
|
for ; err == nil; tok, err = dec.RawToken() {
|
|
if v, ok := tok.(xml.StartElement); ok && v.Name.Local == "Relationship" {
|
|
ax := getAttrs(v.Attr, "Id", "Type", "Target", "TargetMode")
|
|
if ax[3] == "External" && ax[1] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" {
|
|
linkmap[ax[0]] = ax[2]
|
|
}
|
|
}
|
|
}
|
|
clo.Close()
|
|
}
|
|
|
|
dec, clo, err = s.d.openXML(s.docname)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer clo.Close()
|
|
|
|
currentCellType := BlankCellType
|
|
currentCell := ""
|
|
var numFormat commonxl.FmtFunc
|
|
tok, err := dec.RawToken()
|
|
for ; err == nil; tok, err = dec.RawToken() {
|
|
switch v := tok.(type) {
|
|
case xml.CharData:
|
|
if currentCell == "" {
|
|
continue
|
|
}
|
|
c, r := refToIndexes(currentCell)
|
|
if c >= 0 && r >= 0 {
|
|
var val interface{} = string(v)
|
|
switch currentCellType {
|
|
case BooleanCellType:
|
|
if v[0] == '1' {
|
|
val = true
|
|
} else {
|
|
val = false
|
|
}
|
|
case DateCellType:
|
|
log.Println("CELL DATE", val, numFormat)
|
|
case NumberCellType:
|
|
fval, err := strconv.ParseFloat(string(v), 64)
|
|
if err == nil {
|
|
val = fval
|
|
}
|
|
//log.Println("CELL NUMBER", val, numFormat)
|
|
case SharedStringCellType:
|
|
//log.Println("CELL SHSTR", val, currentCellType, numFormat)
|
|
si, _ := strconv.ParseInt(string(v), 10, 64)
|
|
val = s.d.strings[si]
|
|
case BlankCellType:
|
|
//log.Println("CELL BLANK")
|
|
// don't place any values
|
|
continue
|
|
case ErrorCellType, FormulaStringCellType, InlineStringCellType:
|
|
//log.Println("CELL ERR/FORM/INLINE", val, currentCellType)
|
|
default:
|
|
log.Println("CELL UNKNOWN", val, currentCellType, numFormat)
|
|
}
|
|
s.placeValue(r, c, val)
|
|
} else {
|
|
//log.Println("FAIL row/col: ", currentCell)
|
|
}
|
|
case xml.StartElement:
|
|
switch v.Name.Local {
|
|
case "dimension":
|
|
ax := getAttrs(v.Attr, "ref")
|
|
if ax[0] == "A1" {
|
|
// short-circuit empty sheet
|
|
s.minCol, s.minRow = 0, 0
|
|
s.maxCol, s.maxRow = 1, 1
|
|
s.empty = true
|
|
continue
|
|
}
|
|
dims := strings.Split(ax[0], ":")
|
|
s.minCol, s.minRow = refToIndexes(dims[0])
|
|
s.maxCol, s.maxRow = refToIndexes(dims[1])
|
|
//log.Println("DIMENSION:", s.minRow, s.minCol, ">", s.maxRow, s.maxCol)
|
|
case "row":
|
|
//currentRow = ax["r"] // unsigned int row index
|
|
//log.Println("ROW", currentRow)
|
|
case "c":
|
|
ax := getAttrs(v.Attr, "t", "r", "s")
|
|
currentCellType = CellType(ax[0])
|
|
if currentCellType == BlankCellType {
|
|
currentCellType = NumberCellType
|
|
}
|
|
currentCell = ax[1] // always an A1 style reference
|
|
style := ax[2]
|
|
sid, _ := strconv.ParseInt(style, 10, 64)
|
|
numFormat = s.d.xfs[sid] // unsigned integer lookup
|
|
//log.Println("CELL", currentCell, sid, numFormat, currentCellType)
|
|
case "v":
|
|
//log.Println("CELL VALUE", ax)
|
|
|
|
case "mergeCell":
|
|
ax := getAttrs(v.Attr, "ref")
|
|
dims := strings.Split(ax[0], ":")
|
|
startCol, startRow := refToIndexes(dims[0])
|
|
endCol, endRow := refToIndexes(dims[1])
|
|
for r := startRow; r <= endRow; r++ {
|
|
for c := startCol; c <= endCol; c++ {
|
|
if r == startRow && c == startCol {
|
|
// has data already!
|
|
} else if c == startCol {
|
|
// first and last column MAY be the same
|
|
if r == endRow {
|
|
s.placeValue(r, c, endRowMerged)
|
|
} else {
|
|
s.placeValue(r, c, continueRowMerged)
|
|
}
|
|
} else if c == endCol {
|
|
// first and last column are NOT the same
|
|
s.placeValue(r, c, endColumnMerged)
|
|
} else {
|
|
s.placeValue(r, c, continueColumnMerged)
|
|
}
|
|
}
|
|
}
|
|
|
|
case "hyperlink":
|
|
ax := getAttrs(v.Attr, "ref", "id")
|
|
col, row := refToIndexes(ax[0])
|
|
link := linkmap[ax[1]]
|
|
if len(s.rows) > row && len(s.rows[row].cols) > col {
|
|
if sstr, ok := s.rows[row].cols[col].(string); ok {
|
|
link = sstr + " <" + link + ">"
|
|
}
|
|
}
|
|
s.placeValue(row, col, link)
|
|
|
|
case "worksheet", "mergeCells", "hyperlinks":
|
|
// containers
|
|
case "f":
|
|
//log.Println("start: ", v.Name.Local, v.Attr)
|
|
default:
|
|
if grate.Debug {
|
|
log.Println(" Unhandled sheet xml tag", v.Name.Local, v.Attr)
|
|
}
|
|
}
|
|
case xml.EndElement:
|
|
|
|
switch v.Name.Local {
|
|
case "c":
|
|
currentCell = ""
|
|
case "row":
|
|
//currentRow = ""
|
|
}
|
|
default:
|
|
if grate.Debug {
|
|
log.Printf(" Unhandled sheet xml tokens %T %+v", tok, tok)
|
|
}
|
|
}
|
|
}
|
|
if err == io.EOF {
|
|
err = nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (s *Sheet) placeValue(rowIndex, colIndex int, val interface{}) {
|
|
if colIndex > s.maxCol || rowIndex > s.maxRow {
|
|
// invalid
|
|
return
|
|
}
|
|
|
|
// ensure we always have a complete matrix
|
|
for len(s.rows) <= rowIndex {
|
|
emptyRow := make([]interface{}, s.maxCol+1)
|
|
for i := 0; i <= s.maxCol; i++ {
|
|
emptyRow[i] = staticBlank
|
|
}
|
|
s.rows = append(s.rows, &row{emptyRow})
|
|
}
|
|
s.empty = false
|
|
s.rows[rowIndex].cols[colIndex] = val
|
|
}
|
|
|
|
// Next advances to the next row of content.
|
|
// It MUST be called prior to any Scan().
|
|
func (s *Sheet) Next() bool {
|
|
s.iterRow++
|
|
return s.iterRow < len(s.rows)
|
|
}
|
|
|
|
func (s *Sheet) Strings() []string {
|
|
currow := s.rows[s.iterRow]
|
|
res := make([]string, len(currow.cols))
|
|
for i, col := range currow.cols {
|
|
res[i] = fmt.Sprint(col)
|
|
}
|
|
return res
|
|
}
|
|
|
|
// Scan extracts values from the row into the provided arguments
|
|
// Arguments must be pointers to one of 5 supported types:
|
|
// bool, int, float64, string, or time.Time
|
|
func (s *Sheet) Scan(args ...interface{}) error {
|
|
currow := s.rows[s.iterRow]
|
|
|
|
for i, a := range args {
|
|
switch v := a.(type) {
|
|
case *bool:
|
|
*v = currow.cols[i].(bool)
|
|
case *int:
|
|
*v = currow.cols[i].(int)
|
|
case *float64:
|
|
*v = currow.cols[i].(float64)
|
|
case *string:
|
|
*v = currow.cols[i].(string)
|
|
case *time.Time:
|
|
*v = currow.cols[i].(time.Time)
|
|
default:
|
|
return grate.ErrInvalidScanType
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Sheet) IsEmpty() bool {
|
|
return s.empty
|
|
}
|
|
|
|
// Err returns the last error that occured.
|
|
func (s *Sheet) Err() error {
|
|
return s.err
|
|
}
|