mirror of
https://github.com/pbnjay/grate.git
synced 2025-03-04 08:08:05 +02:00
tweaks to reduce memory usage in xlsx
On a test dataset, memory usage goes from 6.0GB => 3.3GB and walltime improves from 31.6s => 19.4s. The largest remaining driver is the slow/hungry xml.Decoder.
This commit is contained in:
parent
e244917a51
commit
b953163a8d
@ -10,6 +10,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"runtime/pprof"
|
||||
"strings"
|
||||
"time"
|
||||
@ -28,12 +29,27 @@ var (
|
||||
trimSpaces = flag.Bool("w", true, "trim whitespace from cell contents")
|
||||
skipBlanks = flag.Bool("b", true, "discard blank rows from the output")
|
||||
cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
|
||||
memprofile = flag.String("memprofile", "", "write memory profile to file")
|
||||
)
|
||||
|
||||
func main() {
|
||||
timeFormat := "2006-01-02 15:04:05"
|
||||
flag.Parse()
|
||||
|
||||
if *memprofile != "" {
|
||||
f, err := os.Create(*memprofile)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer func() {
|
||||
runtime.GC() // get up-to-date statistics
|
||||
if err := pprof.WriteHeapProfile(f); err != nil {
|
||||
log.Fatal("could not write memory profile: ", err)
|
||||
}
|
||||
f.Close() // error handling omitted for example
|
||||
}()
|
||||
}
|
||||
|
||||
if *cpuprofile != "" {
|
||||
f, err := os.Create(*cpuprofile)
|
||||
if err != nil {
|
||||
|
@ -48,12 +48,12 @@ func (s *Sheet) parseSheet() error {
|
||||
dec, clo, err := s.d.openXML(relsname)
|
||||
if err == nil {
|
||||
// rels might not exist for every sheet
|
||||
tok, err := dec.Token()
|
||||
for ; err == nil; tok, err = dec.Token() {
|
||||
tok, err := dec.RawToken()
|
||||
for ; err == nil; tok, err = dec.RawToken() {
|
||||
if v, ok := tok.(xml.StartElement); ok && v.Name.Local == "Relationship" {
|
||||
ax := attrMap(v.Attr)
|
||||
if ax["TargetMode"] == "External" && ax["Type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" {
|
||||
linkmap[ax["Id"]] = ax["Target"]
|
||||
ax := getAttrs(v.Attr, "Id", "Type", "Target", "TargetMode")
|
||||
if ax[3] == "External" && ax[1] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" {
|
||||
linkmap[ax[0]] = ax[2]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -69,8 +69,8 @@ func (s *Sheet) parseSheet() error {
|
||||
currentCellType := BlankCellType
|
||||
currentCell := ""
|
||||
var numFormat commonxl.FmtFunc
|
||||
tok, err := dec.Token()
|
||||
for ; err == nil; tok, err = dec.Token() {
|
||||
tok, err := dec.RawToken()
|
||||
for ; err == nil; tok, err = dec.RawToken() {
|
||||
switch v := tok.(type) {
|
||||
case xml.CharData:
|
||||
if currentCell == "" {
|
||||
@ -112,17 +112,17 @@ func (s *Sheet) parseSheet() error {
|
||||
//log.Println("FAIL row/col: ", currentCell)
|
||||
}
|
||||
case xml.StartElement:
|
||||
ax := attrMap(v.Attr)
|
||||
switch v.Name.Local {
|
||||
case "dimension":
|
||||
if ax["ref"] == "A1" {
|
||||
ax := getAttrs(v.Attr, "ref")
|
||||
if ax[0] == "A1" {
|
||||
// short-circuit empty sheet
|
||||
s.minCol, s.minRow = 0, 0
|
||||
s.maxCol, s.maxRow = 1, 1
|
||||
s.empty = true
|
||||
continue
|
||||
}
|
||||
dims := strings.Split(ax["ref"], ":")
|
||||
dims := strings.Split(ax[0], ":")
|
||||
s.minCol, s.minRow = refToIndexes(dims[0])
|
||||
s.maxCol, s.maxRow = refToIndexes(dims[1])
|
||||
//log.Println("DIMENSION:", s.minRow, s.minCol, ">", s.maxRow, s.maxCol)
|
||||
@ -130,12 +130,13 @@ func (s *Sheet) parseSheet() error {
|
||||
//currentRow = ax["r"] // unsigned int row index
|
||||
//log.Println("ROW", currentRow)
|
||||
case "c":
|
||||
currentCellType = CellType(ax["t"])
|
||||
ax := getAttrs(v.Attr, "t", "r", "s")
|
||||
currentCellType = CellType(ax[0])
|
||||
if currentCellType == BlankCellType {
|
||||
currentCellType = NumberCellType
|
||||
}
|
||||
currentCell = ax["r"] // always an A1 style reference
|
||||
style := ax["s"]
|
||||
currentCell = ax[1] // always an A1 style reference
|
||||
style := ax[2]
|
||||
sid, _ := strconv.ParseInt(style, 10, 64)
|
||||
numFormat = s.d.xfs[sid] // unsigned integer lookup
|
||||
//log.Println("CELL", currentCell, sid, numFormat, currentCellType)
|
||||
@ -143,7 +144,8 @@ func (s *Sheet) parseSheet() error {
|
||||
//log.Println("CELL VALUE", ax)
|
||||
|
||||
case "mergeCell":
|
||||
dims := strings.Split(ax["ref"], ":")
|
||||
ax := getAttrs(v.Attr, "ref")
|
||||
dims := strings.Split(ax[0], ":")
|
||||
startCol, startRow := refToIndexes(dims[0])
|
||||
endCol, endRow := refToIndexes(dims[1])
|
||||
for r := startRow; r <= endRow; r++ {
|
||||
@ -167,8 +169,9 @@ func (s *Sheet) parseSheet() error {
|
||||
}
|
||||
|
||||
case "hyperlink":
|
||||
col, row := refToIndexes(ax["ref"])
|
||||
link := linkmap[ax["id"]]
|
||||
ax := getAttrs(v.Attr, "ref", "id")
|
||||
col, row := refToIndexes(ax[0])
|
||||
link := linkmap[ax[1]]
|
||||
if len(s.rows) > row && len(s.rows[row].cols) > col {
|
||||
if sstr, ok := s.rows[row].cols[col].(string); ok {
|
||||
link = sstr + " <" + link + ">"
|
||||
@ -193,7 +196,6 @@ func (s *Sheet) parseSheet() error {
|
||||
case "row":
|
||||
//currentRow = ""
|
||||
}
|
||||
|
||||
default:
|
||||
if grate.Debug {
|
||||
log.Printf(" Unhandled sheet xml tokens %T %+v", tok, tok)
|
||||
|
@ -79,10 +79,14 @@ func refToIndexes(r string) (column, row int) {
|
||||
return int(cn), int(rn)
|
||||
}
|
||||
|
||||
func attrMap(attrs []xml.Attr) map[string]string {
|
||||
m := make(map[string]string, len(attrs))
|
||||
func getAttrs(attrs []xml.Attr, keys ...string) []string {
|
||||
res := make([]string, len(keys))
|
||||
for _, a := range attrs {
|
||||
m[a.Name.Local] = a.Value
|
||||
for i, k := range keys {
|
||||
if a.Name.Local == k {
|
||||
res[i] = a.Value
|
||||
}
|
||||
}
|
||||
}
|
||||
return m
|
||||
return res
|
||||
}
|
||||
|
@ -13,8 +13,8 @@ import (
|
||||
)
|
||||
|
||||
func (d *Document) parseRels(dec *xml.Decoder, basedir string) error {
|
||||
tok, err := dec.Token()
|
||||
for ; err == nil; tok, err = dec.Token() {
|
||||
tok, err := dec.RawToken()
|
||||
for ; err == nil; tok, err = dec.RawToken() {
|
||||
switch v := tok.(type) {
|
||||
case xml.StartElement:
|
||||
switch v.Name.Local {
|
||||
@ -52,8 +52,8 @@ func (d *Document) parseRels(dec *xml.Decoder, basedir string) error {
|
||||
}
|
||||
|
||||
func (d *Document) parseWorkbook(dec *xml.Decoder) error {
|
||||
tok, err := dec.Token()
|
||||
for ; err == nil; tok, err = dec.Token() {
|
||||
tok, err := dec.RawToken()
|
||||
for ; err == nil; tok, err = dec.RawToken() {
|
||||
switch v := tok.(type) {
|
||||
case xml.StartElement:
|
||||
switch v.Name.Local {
|
||||
@ -101,43 +101,44 @@ func (d *Document) parseStyles(dec *xml.Decoder) error {
|
||||
d.xfs = d.xfs[:0]
|
||||
|
||||
section := 0
|
||||
tok, err := dec.Token()
|
||||
for ; err == nil; tok, err = dec.Token() {
|
||||
tok, err := dec.RawToken()
|
||||
for ; err == nil; tok, err = dec.RawToken() {
|
||||
switch v := tok.(type) {
|
||||
case xml.StartElement:
|
||||
attrs := attrMap(v.Attr)
|
||||
|
||||
switch v.Name.Local {
|
||||
case "styleSheet":
|
||||
// container
|
||||
case "numFmt":
|
||||
fmtNo, _ := strconv.ParseInt(attrs["numFmtId"], 10, 16)
|
||||
d.fmt.Add(uint16(fmtNo), attrs["formatCode"])
|
||||
ax := getAttrs(v.Attr, "numFmtId", "formatCode")
|
||||
fmtNo, _ := strconv.ParseInt(ax[0], 10, 16)
|
||||
d.fmt.Add(uint16(fmtNo), ax[1])
|
||||
|
||||
case "cellStyleXfs":
|
||||
section = 1
|
||||
case "cellXfs":
|
||||
section = 2
|
||||
n, _ := strconv.ParseInt(attrs["count"], 10, 64)
|
||||
ax := getAttrs(v.Attr, "count")
|
||||
n, _ := strconv.ParseInt(ax[0], 10, 64)
|
||||
d.xfs = make([]commonxl.FmtFunc, 0, n)
|
||||
|
||||
case "xf":
|
||||
ax := getAttrs(v.Attr, "numFmtId", "applyNumberFormat", "xfId")
|
||||
if section == 1 {
|
||||
// load base styles, but only save number format
|
||||
if _, ok := attrs["applyNumberFormat"]; ok {
|
||||
baseNumFormats = append(baseNumFormats, attrs["numFmtId"])
|
||||
if ax[1] != "1" {
|
||||
baseNumFormats = append(baseNumFormats, ax[0])
|
||||
} else {
|
||||
baseNumFormats = append(baseNumFormats, "0")
|
||||
}
|
||||
} else if section == 2 {
|
||||
// actual referencable cell styles
|
||||
// 1) get base style so we can inherit format properly
|
||||
baseID, _ := strconv.ParseInt(attrs["xfId"], 10, 64)
|
||||
baseID, _ := strconv.ParseInt(ax[2], 10, 64)
|
||||
numFmtID := baseNumFormats[baseID]
|
||||
|
||||
// 2) check if this XF overrides the base format
|
||||
if _, ok := attrs["applyNumberFormat"]; ok {
|
||||
numFmtID = attrs["numFmtId"]
|
||||
if ax[1] == "1" {
|
||||
numFmtID = ax[0]
|
||||
} else {
|
||||
// remove the format (if it was inherited)
|
||||
numFmtID = "0"
|
||||
@ -178,8 +179,8 @@ func (d *Document) parseStyles(dec *xml.Decoder) error {
|
||||
|
||||
func (d *Document) parseSharedStrings(dec *xml.Decoder) error {
|
||||
val := ""
|
||||
tok, err := dec.Token()
|
||||
for ; err == nil; tok, err = dec.Token() {
|
||||
tok, err := dec.RawToken()
|
||||
for ; err == nil; tok, err = dec.RawToken() {
|
||||
switch v := tok.(type) {
|
||||
case xml.CharData:
|
||||
val += string(v)
|
||||
|
Loading…
x
Reference in New Issue
Block a user