1
0
mirror of https://github.com/pbnjay/grate.git synced 2025-03-04 16:16:03 +02:00
grate/xlsx/sheets.go
Jeremy Jay a5be267bf7 more tweaks to memory usage in xls this time
did not reduce total allocations much (bytes.Reader is more efficient
than I thought), but reduced walltime from 99s to 55s for a large collection
2021-02-13 00:06:04 -05:00

278 lines
6.7 KiB
Go

package xlsx
import (
"encoding/xml"
"errors"
"fmt"
"io"
"log"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/pbnjay/grate"
"github.com/pbnjay/grate/commonxl"
)
// Sheet holds the parsed contents of a single worksheet of a Document and
// the cursor state used to iterate over its rows.
type Sheet struct {
// d is the Document this sheet belongs to. parseSheet uses it to open XML
// parts and to look up shared strings and cell formats.
d *Document
// NOTE(review): relID is not referenced in this part of the file;
// presumably the relationship Id of the sheet in the workbook - confirm.
relID string
// NOTE(review): name is not referenced in this part of the file;
// presumably the user-visible sheet name - confirm.
name string
// docname is the name of the sheet's XML part, as passed to d.openXML.
// The matching "_rels/<base>.rels" part name is derived from it.
docname string
// err is the value returned by Err.
err error
// minRow, maxRow, minCol and maxCol are the sheet bounds as indexes
// returned by refToIndexes, taken from the <dimension> element.
// placeValue drops any value beyond maxRow or maxCol.
minRow int
maxRow int
minCol int
maxCol int
// rows is the value matrix filled by placeValue; every row is allocated
// with maxCol+1 columns.
rows []*row
// empty is set when the <dimension> ref is exactly "A1" and cleared as
// soon as placeValue stores a value. It is returned by IsEmpty.
empty bool
// iterRow is the index into rows of the row read by Strings and Scan.
// It is advanced by Next.
iterRow int
}
// errNotLoaded is the sentinel error for a sheet whose contents have not been
// loaded.
// NOTE(review): it is not referenced in this part of the file; presumably it
// is the initial Sheet.err until parseSheet has run - confirm against callers.
var errNotLoaded = errors.New("xlsx: sheet not loaded")
// row is one row of a Sheet's value matrix.
type row struct {
// each value must be one of: int, float64, string, or time.Time
//
// In addition, entries are nil until placeValue stores something, and
// parseSheet in this file also stores bool values (boolean cells) and the
// merged-cell marker values (endRowMerged, continueRowMerged,
// endColumnMerged, continueColumnMerged).
cols []interface{}
}
// parseSheet reads the worksheet XML part s.docname and fills s.rows.
//
// The sheet's relationships part ("_rels/<base>.rels" beside the sheet) is
// read first, when it can be opened, to map relationship Ids to the targets
// of external hyperlinks. The sheet XML is then streamed token by token:
//
//   - <dimension> sets the min/max row and column bounds; a ref of exactly
//     "A1" marks the sheet as empty until a value is placed,
//   - <c> records the reference, type and style of the current cell,
//   - character data seen while a cell is open is converted according to the
//     cell type and stored with placeValue,
//   - <mergeCell> fills every cell of the range except its first one with the
//     merged-cell marker values,
//   - <hyperlink> appends " <target>" to a string cell, or stores the target
//     on its own for any other cell.
//
// Malformed or unusual input is tolerated rather than causing an
// index-out-of-range panic: single-cell references without ":" are treated
// as a one-cell range, shared-string and style indexes outside their tables
// are ignored, empty boolean text is read as false, and merge ranges or
// hyperlinks whose reference cannot be parsed are skipped. Hyperlinks with no
// external target in the relationships part are skipped as well, so they do
// not overwrite the cell content.
//
// It returns the error from opening the sheet part or the first error
// reported by the XML decoder; io.EOF is treated as success.
func (s *Sheet) parseSheet() error {
	linkmap := make(map[string]string)
	base := filepath.Base(s.docname)
	sub := strings.TrimSuffix(s.docname, base)
	relsname := filepath.Join(sub, "_rels", base+".rels")
	dec, clo, err := s.d.openXML(relsname)
	if err == nil {
		// rels might not exist for every sheet
		tok, err := dec.RawToken()
		for ; err == nil; tok, err = dec.RawToken() {
			if v, ok := tok.(xml.StartElement); ok && v.Name.Local == "Relationship" {
				ax := getAttrs(v.Attr, "Id", "Type", "Target", "TargetMode")
				if ax[3] == "External" && ax[1] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" {
					linkmap[ax[0]] = ax[2]
				}
			}
		}
		clo.Close()
	}

	dec, clo, err = s.d.openXML(s.docname)
	if err != nil {
		return err
	}
	defer clo.Close()

	currentCellType := BlankCellType
	currentCell := ""
	// noFormat is the zero format, used when a style index has no entry.
	var noFormat commonxl.FmtFunc
	var numFormat commonxl.FmtFunc

	tok, err := dec.RawToken()
	for ; err == nil; tok, err = dec.RawToken() {
		switch v := tok.(type) {
		case xml.CharData:
			if currentCell == "" {
				continue
			}
			c, r := refToIndexes(currentCell)
			if c < 0 || r < 0 {
				//log.Println("FAIL row/col: ", currentCell)
				continue
			}

			var val interface{} = string(v)
			switch currentCellType {
			case BooleanCellType:
				// empty text counts as false instead of indexing past the end
				val = len(v) > 0 && v[0] == '1'
			case DateCellType:
				log.Println("CELL DATE", val, numFormat)
			case NumberCellType:
				// text that is not a number is kept as a string
				if fval, perr := strconv.ParseFloat(string(v), 64); perr == nil {
					val = fval
				}
				//log.Println("CELL NUMBER", val, numFormat)
			case SharedStringCellType:
				//log.Println("CELL SHSTR", val, currentCellType, numFormat)
				si, perr := strconv.ParseInt(string(v), 10, 64)
				if perr != nil || si < 0 || si >= int64(len(s.d.strings)) {
					// Not a usable index into the shared string table
					// (e.g. whitespace between elements): place nothing.
					if grate.Debug {
						log.Printf("  Invalid shared string index %q in cell %s", v, currentCell)
					}
					continue
				}
				val = s.d.strings[si]
			case BlankCellType:
				//log.Println("CELL BLANK")
				// don't place any values
				continue
			case ErrorCellType, FormulaStringCellType, InlineStringCellType:
				//log.Println("CELL ERR/FORM/INLINE", val, currentCellType)
			default:
				log.Println("CELL UNKNOWN", val, currentCellType, numFormat)
			}
			s.placeValue(r, c, val)

		case xml.StartElement:
			switch v.Name.Local {
			case "dimension":
				ax := getAttrs(v.Attr, "ref")
				if ax[0] == "A1" {
					// short-circuit empty sheet
					s.minCol, s.minRow = 0, 0
					s.maxCol, s.maxRow = 1, 1
					s.empty = true
					continue
				}
				// A ref without ":" names a single cell, which is both the
				// first and the last cell of the sheet.
				dims := strings.Split(ax[0], ":")
				first, last := dims[0], dims[0]
				if len(dims) > 1 {
					last = dims[1]
				}
				s.minCol, s.minRow = refToIndexes(first)
				s.maxCol, s.maxRow = refToIndexes(last)
				//log.Println("DIMENSION:", s.minRow, s.minCol, ">", s.maxRow, s.maxCol)

			case "row":
				//currentRow = ax["r"] // unsigned int row index
				//log.Println("ROW", currentRow)

			case "c":
				ax := getAttrs(v.Attr, "t", "r", "s")
				currentCellType = CellType(ax[0])
				if currentCellType == BlankCellType {
					currentCellType = NumberCellType
				}
				currentCell = ax[1] // always an A1 style reference
				style := ax[2]
				// A missing or malformed style attribute falls back to
				// index 0, so the parse error is deliberately ignored.
				sid, _ := strconv.ParseInt(style, 10, 64)
				numFormat = noFormat
				if sid >= 0 && sid < int64(len(s.d.xfs)) {
					numFormat = s.d.xfs[sid] // unsigned integer lookup
				}
				//log.Println("CELL", currentCell, sid, numFormat, currentCellType)

			case "v":
				//log.Println("CELL VALUE", ax)

			case "mergeCell":
				ax := getAttrs(v.Attr, "ref")
				dims := strings.Split(ax[0], ":")
				first, last := dims[0], dims[0]
				if len(dims) > 1 {
					last = dims[1]
				}
				startCol, startRow := refToIndexes(first)
				endCol, endRow := refToIndexes(last)
				if startCol < 0 || startRow < 0 || endCol < 0 || endRow < 0 {
					// unparseable range: nothing can be placed
					continue
				}
				for r := startRow; r <= endRow; r++ {
					for c := startCol; c <= endCol; c++ {
						switch {
						case r == startRow && c == startCol:
							// has data already!
						case c == startCol:
							// first and last column MAY be the same
							if r == endRow {
								s.placeValue(r, c, endRowMerged)
							} else {
								s.placeValue(r, c, continueRowMerged)
							}
						case c == endCol:
							// first and last column are NOT the same
							s.placeValue(r, c, endColumnMerged)
						default:
							s.placeValue(r, c, continueColumnMerged)
						}
					}
				}

			case "hyperlink":
				ax := getAttrs(v.Attr, "ref", "id")
				col, row := refToIndexes(ax[0])
				if col < 0 || row < 0 {
					// unparseable reference: no cell to attach the link to
					continue
				}
				link, ok := linkmap[ax[1]]
				if !ok {
					// No external target is known for this relationship Id,
					// so leave the cell content untouched.
					continue
				}
				if len(s.rows) > row && len(s.rows[row].cols) > col {
					if sstr, ok := s.rows[row].cols[col].(string); ok {
						link = sstr + " <" + link + ">"
					}
				}
				s.placeValue(row, col, link)

			case "worksheet", "mergeCells", "hyperlinks":
				// containers

			case "f":
				//log.Println("start: ", v.Name.Local, v.Attr)

			default:
				if grate.Debug {
					log.Println("  Unhandled sheet xml tag", v.Name.Local, v.Attr)
				}
			}

		case xml.EndElement:
			// leaving a cell: later character data belongs to no cell
			if v.Name.Local == "c" {
				currentCell = ""
			}

		default:
			if grate.Debug {
				log.Printf("  Unhandled sheet xml tokens %T %+v", tok, tok)
			}
		}
	}
	if err == io.EOF {
		err = nil
	}
	return err
}
// placeValue stores val at the given zero-based row and column of the sheet
// matrix and marks the sheet as non-empty.
//
// Coordinates outside the sheet are ignored: negative indexes (a negative
// index would otherwise panic when indexing s.rows) and indexes beyond
// s.maxRow or s.maxCol. Rows are appended as needed so that s.rows always
// forms a complete matrix; each new row is allocated with s.maxCol+1 columns.
func (s *Sheet) placeValue(rowIndex, colIndex int, val interface{}) {
	if rowIndex < 0 || colIndex < 0 || colIndex > s.maxCol || rowIndex > s.maxRow {
		// invalid
		return
	}

	// ensure we always have a complete matrix
	for len(s.rows) <= rowIndex {
		emptyRow := make([]interface{}, s.maxCol+1)
		s.rows = append(s.rows, &row{emptyRow})
	}
	s.empty = false
	s.rows[rowIndex].cols[colIndex] = val
}
// Next moves the row cursor forward by one row and reports whether the
// cursor now points at a loaded row.
// It MUST be called prior to any Scan().
func (s *Sheet) Next() bool {
	next := s.iterRow + 1
	s.iterRow = next
	return len(s.rows) > next
}
// Strings returns the current row with every cell formatted as a string.
//
// The result has one entry per column of the row. Cells that are nil or the
// empty string yield ""; every other value is formatted with fmt.Sprint.
// When the cursor is not on a loaded row (for example after Next has
// returned false) it returns nil instead of panicking.
func (s *Sheet) Strings() []string {
	if s.iterRow < 0 || s.iterRow >= len(s.rows) {
		return nil
	}
	currow := s.rows[s.iterRow]
	res := make([]string, len(currow.cols))
	for i, col := range currow.cols {
		if col == nil || col == "" {
			continue
		}
		res[i] = fmt.Sprint(col)
	}
	return res
}
// Scan extracts values from the current row into the provided arguments.
// Arguments must be pointers to one of 5 supported types:
//
//	bool, int, float64, string, or time.Time
//
// The i-th argument receives the value of the i-th column. A *int argument
// also accepts a float64 cell that holds a whole number, because numeric
// cells are stored as float64.
//
// Scan returns grate.ErrInvalidScanType for an unsupported argument type, and
// a descriptive error (rather than panicking) when there is no current row,
// when more arguments than columns are given, or when a cell is empty or
// holds a value of a different type than its argument. Arguments preceding
// the failing one have already been assigned when an error is returned.
func (s *Sheet) Scan(args ...interface{}) error {
	if s.iterRow < 0 || s.iterRow >= len(s.rows) {
		return errors.New("xlsx: Scan called without a current row")
	}
	currow := s.rows[s.iterRow]
	if len(args) > len(currow.cols) {
		return fmt.Errorf("xlsx: Scan got %d arguments but the row has %d columns", len(args), len(currow.cols))
	}

	mismatch := func(i int, cell, dst interface{}) error {
		return fmt.Errorf("xlsx: cannot scan column %d of type %T into %T", i, cell, dst)
	}

	for i, a := range args {
		cell := currow.cols[i]
		switch dst := a.(type) {
		case *bool:
			x, ok := cell.(bool)
			if !ok {
				return mismatch(i, cell, a)
			}
			*dst = x
		case *int:
			switch n := cell.(type) {
			case int:
				*dst = n
			case float64:
				// Accept only whole numbers that survive the round trip, so
				// no fractional part is silently dropped.
				if float64(int(n)) != n {
					return mismatch(i, cell, a)
				}
				*dst = int(n)
			default:
				return mismatch(i, cell, a)
			}
		case *float64:
			x, ok := cell.(float64)
			if !ok {
				return mismatch(i, cell, a)
			}
			*dst = x
		case *string:
			x, ok := cell.(string)
			if !ok {
				return mismatch(i, cell, a)
			}
			*dst = x
		case *time.Time:
			x, ok := cell.(time.Time)
			if !ok {
				return mismatch(i, cell, a)
			}
			*dst = x
		default:
			return grate.ErrInvalidScanType
		}
	}
	return nil
}
// IsEmpty reports the sheet's empty flag. parseSheet sets the flag when the
// sheet's <dimension> ref is exactly "A1", and placeValue clears it as soon as
// any value is stored.
func (s *Sheet) IsEmpty() bool {
return s.empty
}
// Err returns the last error that occurred.
// NOTE(review): nothing in this part of the file assigns s.err; presumably it
// is set by the code that calls parseSheet - confirm.
func (s *Sheet) Err() error {
return s.err
}