1
0
mirror of https://github.com/pbnjay/grate.git synced 2025-03-04 08:08:05 +02:00

tweaks to reduce memory usage in xlsx

On a test dataset, memory usage drops from 6.0GB to 3.3GB
and wall time improves from 31.6s to 19.4s.

The largest remaining cost is the slow, memory-hungry xml.Decoder.
This commit is contained in:
Jeremy Jay 2021-02-12 13:44:46 -05:00
parent e244917a51
commit b953163a8d
4 changed files with 62 additions and 39 deletions

View File

@ -10,6 +10,7 @@ import (
"os"
"path/filepath"
"regexp"
"runtime"
"runtime/pprof"
"strings"
"time"
@ -28,12 +29,27 @@ var (
trimSpaces = flag.Bool("w", true, "trim whitespace from cell contents")
skipBlanks = flag.Bool("b", true, "discard blank rows from the output")
cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
memprofile = flag.String("memprofile", "", "write memory profile to file")
)
func main() {
timeFormat := "2006-01-02 15:04:05"
flag.Parse()
if *memprofile != "" {
f, err := os.Create(*memprofile)
if err != nil {
log.Fatal(err)
}
defer func() {
runtime.GC() // get up-to-date statistics
if err := pprof.WriteHeapProfile(f); err != nil {
log.Fatal("could not write memory profile: ", err)
}
f.Close() // error handling omitted for example
}()
}
if *cpuprofile != "" {
f, err := os.Create(*cpuprofile)
if err != nil {

View File

@ -48,12 +48,12 @@ func (s *Sheet) parseSheet() error {
dec, clo, err := s.d.openXML(relsname)
if err == nil {
// rels might not exist for every sheet
tok, err := dec.Token()
for ; err == nil; tok, err = dec.Token() {
tok, err := dec.RawToken()
for ; err == nil; tok, err = dec.RawToken() {
if v, ok := tok.(xml.StartElement); ok && v.Name.Local == "Relationship" {
ax := attrMap(v.Attr)
if ax["TargetMode"] == "External" && ax["Type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" {
linkmap[ax["Id"]] = ax["Target"]
ax := getAttrs(v.Attr, "Id", "Type", "Target", "TargetMode")
if ax[3] == "External" && ax[1] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" {
linkmap[ax[0]] = ax[2]
}
}
}
@ -69,8 +69,8 @@ func (s *Sheet) parseSheet() error {
currentCellType := BlankCellType
currentCell := ""
var numFormat commonxl.FmtFunc
tok, err := dec.Token()
for ; err == nil; tok, err = dec.Token() {
tok, err := dec.RawToken()
for ; err == nil; tok, err = dec.RawToken() {
switch v := tok.(type) {
case xml.CharData:
if currentCell == "" {
@ -112,17 +112,17 @@ func (s *Sheet) parseSheet() error {
//log.Println("FAIL row/col: ", currentCell)
}
case xml.StartElement:
ax := attrMap(v.Attr)
switch v.Name.Local {
case "dimension":
if ax["ref"] == "A1" {
ax := getAttrs(v.Attr, "ref")
if ax[0] == "A1" {
// short-circuit empty sheet
s.minCol, s.minRow = 0, 0
s.maxCol, s.maxRow = 1, 1
s.empty = true
continue
}
dims := strings.Split(ax["ref"], ":")
dims := strings.Split(ax[0], ":")
s.minCol, s.minRow = refToIndexes(dims[0])
s.maxCol, s.maxRow = refToIndexes(dims[1])
//log.Println("DIMENSION:", s.minRow, s.minCol, ">", s.maxRow, s.maxCol)
@ -130,12 +130,13 @@ func (s *Sheet) parseSheet() error {
//currentRow = ax["r"] // unsigned int row index
//log.Println("ROW", currentRow)
case "c":
currentCellType = CellType(ax["t"])
ax := getAttrs(v.Attr, "t", "r", "s")
currentCellType = CellType(ax[0])
if currentCellType == BlankCellType {
currentCellType = NumberCellType
}
currentCell = ax["r"] // always an A1 style reference
style := ax["s"]
currentCell = ax[1] // always an A1 style reference
style := ax[2]
sid, _ := strconv.ParseInt(style, 10, 64)
numFormat = s.d.xfs[sid] // unsigned integer lookup
//log.Println("CELL", currentCell, sid, numFormat, currentCellType)
@ -143,7 +144,8 @@ func (s *Sheet) parseSheet() error {
//log.Println("CELL VALUE", ax)
case "mergeCell":
dims := strings.Split(ax["ref"], ":")
ax := getAttrs(v.Attr, "ref")
dims := strings.Split(ax[0], ":")
startCol, startRow := refToIndexes(dims[0])
endCol, endRow := refToIndexes(dims[1])
for r := startRow; r <= endRow; r++ {
@ -167,8 +169,9 @@ func (s *Sheet) parseSheet() error {
}
case "hyperlink":
col, row := refToIndexes(ax["ref"])
link := linkmap[ax["id"]]
ax := getAttrs(v.Attr, "ref", "id")
col, row := refToIndexes(ax[0])
link := linkmap[ax[1]]
if len(s.rows) > row && len(s.rows[row].cols) > col {
if sstr, ok := s.rows[row].cols[col].(string); ok {
link = sstr + " <" + link + ">"
@ -193,7 +196,6 @@ func (s *Sheet) parseSheet() error {
case "row":
//currentRow = ""
}
default:
if grate.Debug {
log.Printf(" Unhandled sheet xml tokens %T %+v", tok, tok)

View File

@ -79,10 +79,14 @@ func refToIndexes(r string) (column, row int) {
return int(cn), int(rn)
}
// NOTE(review): this hunk shows the old attrMap helper and the new getAttrs
// helper interleaved, with the diff's +/- markers lost. Going by the hunk
// header (10 old lines, 14 new lines), the attrMap lines are the removed ones
// and the getAttrs lines are their replacement — confirm against the repository.

// attrMap (old version) copies every attribute into a freshly allocated map
// keyed by the attribute's local name.
func attrMap(attrs []xml.Attr) map[string]string {
m := make(map[string]string, len(attrs))
// getAttrs (new version) returns one value per requested key, in the same
// order as keys. Only the attribute's local name is compared. A key with no
// matching attribute keeps the zero value "", and if several attributes share
// a local name the last one wins.
func getAttrs(attrs []xml.Attr, keys ...string) []string {
res := make([]string, len(keys))
for _, a := range attrs {
// old body: store every attribute in the map
m[a.Name.Local] = a.Value
// new body: keep only the attributes the caller asked for
for i, k := range keys {
if a.Name.Local == k {
res[i] = a.Value
}
}
}
// old return value (map) followed by the new return value (slice)
return m
return res
}

View File

@ -13,8 +13,8 @@ import (
)
func (d *Document) parseRels(dec *xml.Decoder, basedir string) error {
tok, err := dec.Token()
for ; err == nil; tok, err = dec.Token() {
tok, err := dec.RawToken()
for ; err == nil; tok, err = dec.RawToken() {
switch v := tok.(type) {
case xml.StartElement:
switch v.Name.Local {
@ -52,8 +52,8 @@ func (d *Document) parseRels(dec *xml.Decoder, basedir string) error {
}
func (d *Document) parseWorkbook(dec *xml.Decoder) error {
tok, err := dec.Token()
for ; err == nil; tok, err = dec.Token() {
tok, err := dec.RawToken()
for ; err == nil; tok, err = dec.RawToken() {
switch v := tok.(type) {
case xml.StartElement:
switch v.Name.Local {
@ -101,43 +101,44 @@ func (d *Document) parseStyles(dec *xml.Decoder) error {
d.xfs = d.xfs[:0]
section := 0
tok, err := dec.Token()
for ; err == nil; tok, err = dec.Token() {
tok, err := dec.RawToken()
for ; err == nil; tok, err = dec.RawToken() {
switch v := tok.(type) {
case xml.StartElement:
attrs := attrMap(v.Attr)
switch v.Name.Local {
case "styleSheet":
// container
case "numFmt":
fmtNo, _ := strconv.ParseInt(attrs["numFmtId"], 10, 16)
d.fmt.Add(uint16(fmtNo), attrs["formatCode"])
ax := getAttrs(v.Attr, "numFmtId", "formatCode")
fmtNo, _ := strconv.ParseInt(ax[0], 10, 16)
d.fmt.Add(uint16(fmtNo), ax[1])
case "cellStyleXfs":
section = 1
case "cellXfs":
section = 2
n, _ := strconv.ParseInt(attrs["count"], 10, 64)
ax := getAttrs(v.Attr, "count")
n, _ := strconv.ParseInt(ax[0], 10, 64)
d.xfs = make([]commonxl.FmtFunc, 0, n)
case "xf":
ax := getAttrs(v.Attr, "numFmtId", "applyNumberFormat", "xfId")
if section == 1 {
// load base styles, but only save number format
if _, ok := attrs["applyNumberFormat"]; ok {
baseNumFormats = append(baseNumFormats, attrs["numFmtId"])
if ax[1] != "1" {
baseNumFormats = append(baseNumFormats, ax[0])
} else {
baseNumFormats = append(baseNumFormats, "0")
}
} else if section == 2 {
// actual referencable cell styles
// 1) get base style so we can inherit format properly
baseID, _ := strconv.ParseInt(attrs["xfId"], 10, 64)
baseID, _ := strconv.ParseInt(ax[2], 10, 64)
numFmtID := baseNumFormats[baseID]
// 2) check if this XF overrides the base format
if _, ok := attrs["applyNumberFormat"]; ok {
numFmtID = attrs["numFmtId"]
if ax[1] == "1" {
numFmtID = ax[0]
} else {
// remove the format (if it was inherited)
numFmtID = "0"
@ -178,8 +179,8 @@ func (d *Document) parseStyles(dec *xml.Decoder) error {
func (d *Document) parseSharedStrings(dec *xml.Decoder) error {
val := ""
tok, err := dec.Token()
for ; err == nil; tok, err = dec.Token() {
tok, err := dec.RawToken()
for ; err == nil; tok, err = dec.RawToken() {
switch v := tok.(type) {
case xml.CharData:
val += string(v)