From b953163a8d3ea47ee82b0eea0a611969306f0522 Mon Sep 17 00:00:00 2001 From: Jeremy Jay Date: Fri, 12 Feb 2021 13:44:46 -0500 Subject: [PATCH] tweaks to reduce memory usage in xlsx on a test dataset, usage goes from 6.0GB => 3.3GB and walltime improves from 31.6s => 19.4s largest remaining driver is the slow/hungry xml.Decoder --- cmd/grate2tsv/main.go | 16 ++++++++++++++++ xlsx/sheets.go | 36 +++++++++++++++++++----------------- xlsx/types.go | 12 ++++++++---- xlsx/workbook.go | 37 +++++++++++++++++++------------------ 4 files changed, 62 insertions(+), 39 deletions(-) diff --git a/cmd/grate2tsv/main.go b/cmd/grate2tsv/main.go index c31be67..8f013ca 100644 --- a/cmd/grate2tsv/main.go +++ b/cmd/grate2tsv/main.go @@ -10,6 +10,7 @@ import ( "os" "path/filepath" "regexp" + "runtime" "runtime/pprof" "strings" "time" @@ -28,12 +29,27 @@ var ( trimSpaces = flag.Bool("w", true, "trim whitespace from cell contents") skipBlanks = flag.Bool("b", true, "discard blank rows from the output") cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") + memprofile = flag.String("memprofile", "", "write memory profile to file") ) func main() { timeFormat := "2006-01-02 15:04:05" flag.Parse() + if *memprofile != "" { + f, err := os.Create(*memprofile) + if err != nil { + log.Fatal(err) + } + defer func() { + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + log.Fatal("could not write memory profile: ", err) + } + f.Close() // error handling omitted for example + }() + } + if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { diff --git a/xlsx/sheets.go b/xlsx/sheets.go index 4c02fcc..eda2e8a 100644 --- a/xlsx/sheets.go +++ b/xlsx/sheets.go @@ -48,12 +48,12 @@ func (s *Sheet) parseSheet() error { dec, clo, err := s.d.openXML(relsname) if err == nil { // rels might not exist for every sheet - tok, err := dec.Token() - for ; err == nil; tok, err = dec.Token() { + tok, err := dec.RawToken() + for ; err == nil; tok, err = dec.RawToken() { if v, ok := tok.(xml.StartElement); ok && v.Name.Local == "Relationship" { - ax := attrMap(v.Attr) - if ax["TargetMode"] == "External" && ax["Type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" { - linkmap[ax["Id"]] = ax["Target"] + ax := getAttrs(v.Attr, "Id", "Type", "Target", "TargetMode") + if ax[3] == "External" && ax[1] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" { + linkmap[ax[0]] = ax[2] } } } @@ -69,8 +69,8 @@ func (s *Sheet) parseSheet() error { currentCellType := BlankCellType currentCell := "" var numFormat commonxl.FmtFunc - tok, err := dec.Token() - for ; err == nil; tok, err = dec.Token() { + tok, err := dec.RawToken() + for ; err == nil; tok, err = dec.RawToken() { switch v := tok.(type) { case xml.CharData: if currentCell == "" { @@ -112,17 +112,17 @@ func (s *Sheet) parseSheet() error { //log.Println("FAIL row/col: ", currentCell) } case xml.StartElement: - ax := attrMap(v.Attr) switch v.Name.Local { case "dimension": - if ax["ref"] == "A1" { + ax := getAttrs(v.Attr, "ref") + if ax[0] == "A1" { // short-circuit empty sheet s.minCol, s.minRow = 0, 0 s.maxCol, s.maxRow = 1, 1 s.empty = true continue } - dims := strings.Split(ax["ref"], ":") + dims := strings.Split(ax[0], ":") s.minCol, s.minRow = refToIndexes(dims[0]) s.maxCol, s.maxRow = refToIndexes(dims[1]) //log.Println("DIMENSION:", s.minRow, s.minCol, ">", s.maxRow, s.maxCol) @@ -130,12 +130,13 @@ func (s *Sheet) parseSheet() error { //currentRow = ax["r"] // unsigned int row index //log.Println("ROW", currentRow) case "c": - currentCellType = CellType(ax["t"]) + ax := getAttrs(v.Attr, "t", "r", "s") + currentCellType = CellType(ax[0]) if currentCellType == BlankCellType { currentCellType = NumberCellType } - currentCell = ax["r"] // always an A1 style reference - style := ax["s"] + currentCell = ax[1] // always an A1 style reference + style := ax[2] sid, _ := strconv.ParseInt(style, 10, 64) numFormat = s.d.xfs[sid] // unsigned integer lookup //log.Println("CELL", currentCell, sid, numFormat, currentCellType) @@ -143,7 +144,8 @@ func (s *Sheet) parseSheet() error { //log.Println("CELL VALUE", ax) case "mergeCell": - dims := strings.Split(ax["ref"], ":") + ax := getAttrs(v.Attr, "ref") + dims := strings.Split(ax[0], ":") startCol, startRow := refToIndexes(dims[0]) endCol, endRow := refToIndexes(dims[1]) for r := startRow; r <= endRow; r++ { @@ -167,8 +169,9 @@ func (s *Sheet) parseSheet() error { } case "hyperlink": - col, row := refToIndexes(ax["ref"]) - link := linkmap[ax["id"]] + ax := getAttrs(v.Attr, "ref", "id") + col, row := refToIndexes(ax[0]) + link := linkmap[ax[1]] if len(s.rows) > row && len(s.rows[row].cols) > col { if sstr, ok := s.rows[row].cols[col].(string); ok { link = sstr + " <" + link + ">" @@ -193,7 +196,6 @@ func (s *Sheet) parseSheet() error { case "row": //currentRow = "" } - default: if grate.Debug { log.Printf(" Unhandled sheet xml tokens %T %+v", tok, tok) diff --git a/xlsx/types.go b/xlsx/types.go index ef11d73..f43bfef 100644 --- a/xlsx/types.go +++ b/xlsx/types.go @@ -79,10 +79,14 @@ func refToIndexes(r string) (column, row int) { return int(cn), int(rn) } -func attrMap(attrs []xml.Attr) map[string]string { - m := make(map[string]string, len(attrs)) +func getAttrs(attrs []xml.Attr, keys ...string) []string { + res := make([]string, len(keys)) for _, a := range attrs { - m[a.Name.Local] = a.Value + for i, k := range keys { + if a.Name.Local == k { + res[i] = a.Value + } + } } - return m + return res } diff --git a/xlsx/workbook.go b/xlsx/workbook.go index 04dfb8e..8784036 100644 --- a/xlsx/workbook.go +++ b/xlsx/workbook.go @@ -13,8 +13,8 @@ import ( ) func (d *Document) parseRels(dec *xml.Decoder, basedir string) error { - tok, err := dec.Token() - for ; err == nil; tok, err = dec.Token() { + tok, err := dec.RawToken() + for ; err == nil; tok, err = dec.RawToken() { switch v := tok.(type) { case xml.StartElement: switch v.Name.Local { @@ -52,8 +52,8 @@ func (d *Document) parseRels(dec *xml.Decoder, basedir string) error { } func (d *Document) parseWorkbook(dec *xml.Decoder) error { - tok, err := dec.Token() - for ; err == nil; tok, err = dec.Token() { + tok, err := dec.RawToken() + for ; err == nil; tok, err = dec.RawToken() { switch v := tok.(type) { case xml.StartElement: switch v.Name.Local { @@ -101,43 +101,44 @@ func (d *Document) parseStyles(dec *xml.Decoder) error { d.xfs = d.xfs[:0] section := 0 - tok, err := dec.Token() - for ; err == nil; tok, err = dec.Token() { + tok, err := dec.RawToken() + for ; err == nil; tok, err = dec.RawToken() { switch v := tok.(type) { case xml.StartElement: - attrs := attrMap(v.Attr) - switch v.Name.Local { case "styleSheet": // container case "numFmt": - fmtNo, _ := strconv.ParseInt(attrs["numFmtId"], 10, 16) - d.fmt.Add(uint16(fmtNo), attrs["formatCode"]) + ax := getAttrs(v.Attr, "numFmtId", "formatCode") + fmtNo, _ := strconv.ParseInt(ax[0], 10, 16) + d.fmt.Add(uint16(fmtNo), ax[1]) case "cellStyleXfs": section = 1 case "cellXfs": section = 2 - n, _ := strconv.ParseInt(attrs["count"], 10, 64) + ax := getAttrs(v.Attr, "count") + n, _ := strconv.ParseInt(ax[0], 10, 64) d.xfs = make([]commonxl.FmtFunc, 0, n) case "xf": + ax := getAttrs(v.Attr, "numFmtId", "applyNumberFormat", "xfId") if section == 1 { // load base styles, but only save number format - if _, ok := attrs["applyNumberFormat"]; ok { - baseNumFormats = append(baseNumFormats, attrs["numFmtId"]) + if ax[1] != "1" { + baseNumFormats = append(baseNumFormats, ax[0]) } else { baseNumFormats = append(baseNumFormats, "0") } } else if section == 2 { // actual referencable cell styles // 1) get base style so we can inherit format properly - baseID, _ := strconv.ParseInt(attrs["xfId"], 10, 64) + baseID, _ := strconv.ParseInt(ax[2], 10, 64) numFmtID := baseNumFormats[baseID] // 2) check if this XF overrides the base format - if _, ok := attrs["applyNumberFormat"]; ok { - numFmtID = attrs["numFmtId"] + if ax[1] == "1" { + numFmtID = ax[0] } else { // remove the format (if it was inherited) numFmtID = "0" @@ -178,8 +179,8 @@ func (d *Document) parseStyles(dec *xml.Decoder) error { func (d *Document) parseSharedStrings(dec *xml.Decoder) error { val := "" - tok, err := dec.Token() - for ; err == nil; tok, err = dec.Token() { + tok, err := dec.RawToken() + for ; err == nil; tok, err = dec.RawToken() { switch v := tok.(type) { case xml.CharData: val += string(v)