From d9793eb9dd7ecd48c1ff2728269cedbe7ad339c3 Mon Sep 17 00:00:00 2001 From: Jeremy Jay Date: Sat, 13 Feb 2021 23:36:42 -0500 Subject: [PATCH] reuse memory as often as we can. update tests --- grate.go | 3 + simple/simple.go | 4 ++ xls/comp_test.go | 44 ++++++++++++ xls/sheets.go | 163 +++++++++++++++++++++++--------------------- xls/simple_test.go | 59 ++++++++++++---- xls/strings.go | 8 ++- xls/xls.go | 42 ++++++++++-- xlsx/comp_test.go | 44 ++++++++++++ xlsx/simple_test.go | 38 ++++++++--- xlsx/xlsx.go | 12 ++++ 10 files changed, 309 insertions(+), 108 deletions(-) create mode 100644 xls/comp_test.go create mode 100644 xlsx/comp_test.go diff --git a/grate.go b/grate.go index f879755..4176617 100644 --- a/grate.go +++ b/grate.go @@ -15,6 +15,9 @@ type Source interface { // Get a Collection from the source by name. Get(name string) (Collection, error) + + // Close the source and discard memory. + Close() error } // Collection represents an iterable collection of records. diff --git a/simple/simple.go b/simple/simple.go index 817e2e7..f18052e 100644 --- a/simple/simple.go +++ b/simple/simple.go @@ -22,6 +22,10 @@ func (t *simpleFile) List() ([]string, error) { return []string{t.filename}, nil } +func (t *simpleFile) Close() error { + return nil +} + // Get a Collection from the source by name. 
func (t *simpleFile) Get(name string) (grate.Collection, error) { return t, nil diff --git a/xls/comp_test.go b/xls/comp_test.go new file mode 100644 index 0000000..7fd51c4 --- /dev/null +++ b/xls/comp_test.go @@ -0,0 +1,44 @@ +package xls + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestManyFiles(t *testing.T) { + err := filepath.Walk("../testdata", func(p string, info os.FileInfo, err error) error { + if info.IsDir() { + return nil + } + if !strings.HasSuffix(info.Name(), ".xls") { + return nil + } + wb, err := Open(p) + if err != nil { + return nil //err + } + + sheets, err := wb.List() + if err != nil { + return err + } + for _, s := range sheets { + //log.Println(s) + sheet, err := wb.Get(s) + if err != nil { + return err + } + + for sheet.Next() { + sheet.Strings() + } + } + + return wb.Close() + }) + if err != nil { + t.Fatal(err) + } +} diff --git a/xls/sheets.go b/xls/sheets.go index 0a390f2..7cac03d 100644 --- a/xls/sheets.go +++ b/xls/sheets.go @@ -81,17 +81,26 @@ type row struct { cols []interface{} } +func (s *WorkSheet) makeCells() { + // ensure we always have a complete matrix + for len(s.rows) <= s.maxRow { + emptyRow := make([]interface{}, s.maxCol+1) + s.rows = append(s.rows, &row{emptyRow}) + } +} + func (s *WorkSheet) placeValue(rowIndex, colIndex int, val interface{}) { if colIndex > s.maxCol || rowIndex > s.maxRow { // invalid return } - - // ensure we always have a complete matrix - for len(s.rows) <= rowIndex { - emptyRow := make([]interface{}, s.maxCol+1) - s.rows = append(s.rows, &row{emptyRow}) - } + /* + // ensure we always have a complete matrix + for len(s.rows) <= rowIndex { + emptyRow := make([]interface{}, s.maxCol+1) + s.rows = append(s.rows, &row{emptyRow}) + } + */ s.rows[rowIndex].cols[colIndex] = val } @@ -101,6 +110,9 @@ func (s *WorkSheet) IsEmpty() bool { } func (s *WorkSheet) parse() error { + // temporary string buffer + us := make([]uint16, 8224) + inSubstream := 0 for idx, r := range 
s.b.substreams[s.ss] { if inSubstream > 0 { @@ -145,7 +157,8 @@ func (s *WorkSheet) parse() error { s.empty = true } else { // pre-allocate cells - s.placeValue(s.maxRow, s.maxCol, nil) + s.makeCells() + //s.placeValue(s.maxRow, s.maxCol, nil) } } } @@ -181,79 +194,74 @@ func (s *WorkSheet) parse() error { } case RecTypeBoolErr: - rowIndex := binary.LittleEndian.Uint16(r.Data[:2]) - colIndex := binary.LittleEndian.Uint16(r.Data[2:4]) + rowIndex := int(binary.LittleEndian.Uint16(r.Data[:2])) + colIndex := int(binary.LittleEndian.Uint16(r.Data[2:4])) //ixfe := binary.LittleEndian.Uint16(r.Data[4:6]) if r.Data[7] == 0 { bv := false if r.Data[6] == 1 { bv = true } - s.placeValue(int(rowIndex), int(colIndex), bv) + s.placeValue(rowIndex, colIndex, bv) //log.Printf("bool/error spec: %d %d %+v", rowIndex, colIndex, bv) } else { be, ok := berrLookup[r.Data[6]] if !ok { be = "" } - s.placeValue(int(rowIndex), int(colIndex), be) + s.placeValue(rowIndex, colIndex, be) //log.Printf("bool/error spec: %d %d %s", rowIndex, colIndex, be) } case RecTypeMulRk: - mr := &shMulRK{} nrk := int((r.RecSize - 6) / 6) - mr.RowIndex = binary.LittleEndian.Uint16(r.Data[:2]) - mr.FirstCol = binary.LittleEndian.Uint16(r.Data[2:4]) - mr.Values = make([]RkRec, nrk) + rowIndex := int(binary.LittleEndian.Uint16(r.Data[:2])) + colIndex := int(binary.LittleEndian.Uint16(r.Data[2:4])) for i := 0; i < nrk; i++ { off := 4 + i*6 - rr := RkRec{} - rr.IXFCell = binary.LittleEndian.Uint16(r.Data[off:]) - rr.Value = RKNumber(binary.LittleEndian.Uint32(r.Data[off:])) - mr.Values[i] = rr + ixfe := binary.LittleEndian.Uint16(r.Data[off:]) + value := RKNumber(binary.LittleEndian.Uint32(r.Data[off+2:])) var rval interface{} - if rr.Value.IsInteger() { - rval = rr.Value.Int() + if value.IsInteger() { + rval = value.Int() } else { - rval = rr.Value.Float64() - fno := s.b.xfs[rr.IXFCell] + rval = value.Float64() + fno := s.b.xfs[ixfe] rval, _ = s.b.nfmt.Apply(fno, rval) } - s.placeValue(int(mr.RowIndex), 
int(mr.FirstCol)+i, rval) + s.placeValue(rowIndex, colIndex+i, rval) } //log.Printf("mulrow spec: %+v", *mr) case RecTypeNumber: - rowIndex := binary.LittleEndian.Uint16(r.Data[:2]) - colIndex := binary.LittleEndian.Uint16(r.Data[2:4]) - ixfe := binary.LittleEndian.Uint16(r.Data[4:6]) + rowIndex := int(binary.LittleEndian.Uint16(r.Data[:2])) + colIndex := int(binary.LittleEndian.Uint16(r.Data[2:4])) + ixfe := int(binary.LittleEndian.Uint16(r.Data[4:6])) xnum := binary.LittleEndian.Uint64(r.Data[6:]) value := math.Float64frombits(xnum) fno := s.b.xfs[ixfe] rval, _ := s.b.nfmt.Apply(fno, value) - s.placeValue(int(rowIndex), int(colIndex), rval) + s.placeValue(rowIndex, colIndex, rval) //log.Printf("Number spec: %d %d = %f", rowIndex, colIndex, value) case RecTypeRK: - rowIndex := binary.LittleEndian.Uint16(r.Data[:2]) - colIndex := binary.LittleEndian.Uint16(r.Data[2:4]) - rr := RkRec{} - rr.IXFCell = binary.LittleEndian.Uint16(r.Data[4:]) - rr.Value = RKNumber(binary.LittleEndian.Uint32(r.Data[6:])) + rowIndex := int(binary.LittleEndian.Uint16(r.Data[:2])) + colIndex := int(binary.LittleEndian.Uint16(r.Data[2:4])) + ixfe := int(binary.LittleEndian.Uint16(r.Data[4:])) + value := RKNumber(binary.LittleEndian.Uint32(r.Data[6:])) var rval interface{} - if rr.Value.IsInteger() { - rval = rr.Value.Int() + if value.IsInteger() { + rval = value.Int() } else { - rval = rr.Value.Float64() - fno := s.b.xfs[rr.IXFCell] + rval = value.Float64() + fno := s.b.xfs[ixfe] rval, _ = s.b.nfmt.Apply(fno, rval) } - s.placeValue(int(rowIndex), int(colIndex), rval) + s.placeValue(rowIndex, colIndex, rval) //log.Printf("RK spec: %d %d = %s", rowIndex, colIndex, rr.Value.String()) case RecTypeFormula: @@ -301,7 +309,10 @@ func (s *WorkSheet) parse() error { fstr = string(r.Data[3:]) } else { raw := r.Data[3:] - us := make([]uint16, charCount) + if int(charCount) > cap(us) { + us = make([]uint16, charCount) + } + us = us[:charCount] for i := 0; i < int(charCount); i++ { us[i] = 
binary.LittleEndian.Uint16(raw) raw = raw[2:] @@ -322,7 +333,7 @@ func (s *WorkSheet) parse() error { } else { raw := r2.Data[1:] slen := len(raw) / 2 - us := make([]uint16, slen) + us = us[:slen] for i := 0; i < slen; i++ { us[i] = binary.LittleEndian.Uint16(raw) raw = raw[2:] @@ -336,35 +347,34 @@ func (s *WorkSheet) parse() error { s.placeValue(int(formulaRow), int(formulaCol), fstr) case RecTypeLabelSst: - rowIndex := binary.LittleEndian.Uint16(r.Data[:2]) - colIndex := binary.LittleEndian.Uint16(r.Data[2:4]) + rowIndex := int(binary.LittleEndian.Uint16(r.Data[:2])) + colIndex := int(binary.LittleEndian.Uint16(r.Data[2:4])) //ixfe := binary.LittleEndian.Uint16(r.Data[4:6]) - sstIndex := binary.LittleEndian.Uint32(r.Data[6:]) - if int(sstIndex) > len(s.b.strings) { + sstIndex := int(binary.LittleEndian.Uint32(r.Data[6:])) + if sstIndex >= len(s.b.strings) { return errors.New("xls: invalid sst index") } - s.placeValue(int(rowIndex), int(colIndex), s.b.strings[sstIndex]) + s.placeValue(rowIndex, colIndex, s.b.strings[sstIndex]) //log.Printf("SST spec: %d %d = [%d] %s", rowIndex, colIndex, sstIndex, s.b.strings[sstIndex]) case RecTypeHLink: - loc := &shRef8{} - loc.FirstRow = binary.LittleEndian.Uint16(r.Data[:2]) - loc.LastRow = binary.LittleEndian.Uint16(r.Data[2:4]) - loc.FirstCol = binary.LittleEndian.Uint16(r.Data[4:6]) - loc.LastCol = binary.LittleEndian.Uint16(r.Data[6:]) - if int(loc.FirstCol) > s.maxCol { + firstRow := binary.LittleEndian.Uint16(r.Data[:2]) + lastRow := binary.LittleEndian.Uint16(r.Data[2:4]) + firstCol := binary.LittleEndian.Uint16(r.Data[4:6]) + lastCol := binary.LittleEndian.Uint16(r.Data[6:]) + if int(firstCol) > s.maxCol { //log.Println("invalid hyperlink column") continue } - if int(loc.FirstRow) > s.maxRow { + if int(firstRow) > s.maxRow { //log.Println("invalid hyperlink row") continue } - if loc.LastRow == 0xFFFF { - loc.LastRow = uint16(s.maxRow) + if lastRow == 0xFFFF { + lastRow = uint16(s.maxRow) } - if loc.LastCol == 0xFF { - 
loc.LastCol = uint16(s.maxCol) + if lastCol == 0xFF { + lastCol = uint16(s.maxCol) } displayText, linkText, err := decodeHyperlinks(r.Data[8:]) @@ -374,18 +384,18 @@ func (s *WorkSheet) parse() error { } // apply merge cell rules - for rn := int(loc.FirstRow); rn <= int(loc.LastRow); rn++ { - for cn := int(loc.FirstCol); cn <= int(loc.LastCol); cn++ { - if rn == int(loc.FirstRow) && cn == int(loc.FirstCol) { + for rn := int(firstRow); rn <= int(lastRow); rn++ { + for cn := int(firstCol); cn <= int(lastCol); cn++ { + if rn == int(firstRow) && cn == int(firstCol) { s.placeValue(rn, cn, displayText+" <"+linkText+">") - } else if cn == int(loc.FirstCol) { + } else if cn == int(firstCol) { // first and last column MAY be the same - if rn == int(loc.LastRow) { + if rn == int(lastRow) { s.placeValue(rn, cn, endRowMerged) } else { s.placeValue(rn, cn, continueRowMerged) } - } else if cn == int(loc.LastCol) { + } else if cn == int(lastCol) { // first and last column are NOT the same s.placeValue(rn, cn, endColumnMerged) } else { @@ -397,32 +407,31 @@ func (s *WorkSheet) parse() error { case RecTypeMergeCells: cmcs := binary.LittleEndian.Uint16(r.Data[:2]) raw := r.Data[2:] - loc := shRef8{} for i := 0; i < int(cmcs); i++ { - loc.FirstRow = binary.LittleEndian.Uint16(raw[:2]) - loc.LastRow = binary.LittleEndian.Uint16(raw[2:4]) - loc.FirstCol = binary.LittleEndian.Uint16(raw[4:6]) - loc.LastCol = binary.LittleEndian.Uint16(raw[6:]) + firstRow := binary.LittleEndian.Uint16(raw[:2]) + lastRow := binary.LittleEndian.Uint16(raw[2:4]) + firstCol := binary.LittleEndian.Uint16(raw[4:6]) + lastCol := binary.LittleEndian.Uint16(raw[6:]) raw = raw[8:] - if loc.LastRow == 0xFFFF { - loc.LastRow = uint16(s.maxRow) + if lastRow == 0xFFFF { + lastRow = uint16(s.maxRow) } - if loc.LastCol == 0xFF { - loc.LastCol = uint16(s.maxCol) + if lastCol == 0xFF { + lastCol = uint16(s.maxCol) } - for rn := int(loc.FirstRow); rn <= int(loc.LastRow); rn++ { - for cn := int(loc.FirstCol); cn 
<= int(loc.LastCol); cn++ { - if rn == int(loc.FirstRow) && cn == int(loc.FirstCol) { + for rn := int(firstRow); rn <= int(lastRow); rn++ { + for cn := int(firstCol); cn <= int(lastCol); cn++ { + if rn == int(firstRow) && cn == int(firstCol) { // should be a value there already! - } else if cn == int(loc.FirstCol) { + } else if cn == int(firstCol) { // first and last column MAY be the same - if rn == int(loc.LastRow) { + if rn == int(lastRow) { s.placeValue(rn, cn, endRowMerged) } else { s.placeValue(rn, cn, continueRowMerged) } - } else if cn == int(loc.LastCol) { + } else if cn == int(lastCol) { // first and last column are NOT the same s.placeValue(rn, cn, endColumnMerged) } else { diff --git a/xls/simple_test.go b/xls/simple_test.go index 87a25a2..64bccb0 100644 --- a/xls/simple_test.go +++ b/xls/simple_test.go @@ -1,19 +1,20 @@ package xls import ( - "context" - "log" "testing" ) func TestHeader(t *testing.T) { - wb, err := Open(context.Background(), "testdata/test.xls") + wb, err := Open("../testdata/test.xls") if err != nil { t.Fatal(err) } - log.Println(wb.filename) - for _, s := range wb.Sheets() { + sheets, err := wb.List() + if err != nil { + t.Fatal(err) + } + for _, s := range sheets { //log.Println(s) sheet, err := wb.Get(s) if err != nil { @@ -24,16 +25,24 @@ func TestHeader(t *testing.T) { sheet.Strings() } } + + err = wb.Close() + if err != nil { + t.Fatal(err) + } } func TestHeader2(t *testing.T) { - wb, err := Open(context.Background(), "testdata/test2.xls") + wb, err := Open("../testdata/test2.xls") if err != nil { t.Fatal(err) } - log.Println(wb.filename) - for _, s := range wb.Sheets() { + sheets, err := wb.List() + if err != nil { + t.Fatal(err) + } + for _, s := range sheets { //log.Println(s) sheet, err := wb.Get(s) if err != nil { @@ -44,16 +53,24 @@ func TestHeader2(t *testing.T) { sheet.Strings() } } + + err = wb.Close() + if err != nil { + t.Fatal(err) + } } func TestHeader3(t *testing.T) { - wb, err := Open(context.Background(), 
"testdata/test3.xls") + wb, err := Open("../testdata/test3.xls") if err != nil { t.Fatal(err) } - log.Println(wb.filename) - for _, s := range wb.Sheets() { + sheets, err := wb.List() + if err != nil { + t.Fatal(err) + } + for _, s := range sheets { //log.Println(s) sheet, err := wb.Get(s) if err != nil { @@ -64,17 +81,24 @@ func TestHeader3(t *testing.T) { sheet.Strings() } } + + err = wb.Close() + if err != nil { + t.Fatal(err) + } } func TestHeader4(t *testing.T) { - - wb, err := Open(context.Background(), "testdata/test4.xls") + wb, err := Open("../testdata/test4.xls") if err != nil { t.Fatal(err) } - log.Println(wb.filename) - for _, s := range wb.Sheets() { + sheets, err := wb.List() + if err != nil { + t.Fatal(err) + } + for _, s := range sheets { //log.Println(s) sheet, err := wb.Get(s) if err != nil { @@ -85,4 +109,9 @@ func TestHeader4(t *testing.T) { sheet.Strings() } } + + err = wb.Close() + if err != nil { + t.Fatal(err) + } } diff --git a/xls/strings.go b/xls/strings.go index e4a7059..28db75f 100644 --- a/xls/strings.go +++ b/xls/strings.go @@ -139,12 +139,12 @@ func parseSST(recs []*rec) ([]string, error) { numStrings := binary.LittleEndian.Uint32(recs[0].Data[4:8]) all := make([]string, 0, numStrings) + current := make([]uint16, 32*1024) buf := recs[0].Data[8:] for i := 0; i < len(recs); { var cRunBytes int var flags byte - var current []uint16 var cbExtRs uint32 for len(buf) > 0 { @@ -177,7 +177,11 @@ func parseSST(recs []*rec) ([]string, error) { // this block will read the string data, but transparently // handle continuing across records - current = make([]uint16, slen) + if int(slen) > cap(current) { + current = make([]uint16, slen) + } else { + current = current[:slen] + } for j := 0; j < int(slen); j++ { if len(buf) == 0 { i++ diff --git a/xls/xls.go b/xls/xls.go index 0adb920..02f0486 100644 --- a/xls/xls.go +++ b/xls/xls.go @@ -12,6 +12,7 @@ import ( "errors" "io" "log" + "sync" "github.com/pbnjay/grate" "github.com/pbnjay/grate/commonxl" 
@@ -146,14 +147,35 @@ func (b *WorkBook) loadFromStreamWithDecryptor(raw []byte, dec crypto.Decryptor) return b.loadFromStream2(alldata, true) } +func (b *WorkBook) Close() error { + // return records to the pool for reuse + for i, sub := range b.substreams { + for _, r := range sub { + r.Data = nil // allow GC + recPool.Put(r) + } + b.substreams[i] = b.substreams[i][:0] + } + b.substreams = b.substreams[:0] + return nil +} + func (b *WorkBook) loadFromStream2(raw []byte, isDecrypted bool) error { b.h = &header{} substr := -1 nestedBOF := 0 - b.substreams = b.substreams[:0] b.pos2substream = make(map[int64]int, 10) b.fpos = 0 + // IMPORTANT: if there are any existing records, we need to return them to the pool + for i, sub := range b.substreams { + for _, r := range sub { + recPool.Put(r) + } + b.substreams[i] = b.substreams[i][:0] + } + b.substreams = b.substreams[:0] + rawfull := raw nr, no, err := b.nextRecord(raw) for err == nil { @@ -290,14 +312,24 @@ func (b *WorkBook) loadFromStream2(raw []byte, isDecrypted bool) error { return err } +var recPool = sync.Pool{ + New: func() interface{} { + return &rec{} + }, +} + func (b *WorkBook) nextRecord(raw []byte) (*rec, int, error) { if len(raw) < 4 { return nil, 0, io.EOF } - rt := recordType(binary.LittleEndian.Uint16(raw[:2])) - rs := binary.LittleEndian.Uint16(raw[2:4]) - if len(raw[4:]) < int(rs) { + rec := recPool.Get().(*rec) + + rec.RecType = recordType(binary.LittleEndian.Uint16(raw[:2])) + rec.RecSize = binary.LittleEndian.Uint16(raw[2:4]) + if len(raw[4:]) < int(rec.RecSize) { + recPool.Put(rec) return nil, 4, io.ErrUnexpectedEOF } - return &rec{rt, rs, raw[4 : 4+rs]}, int(4 + rs), nil + rec.Data = raw[4 : 4+int(rec.RecSize)] + return rec, 4 + int(rec.RecSize), nil } diff --git a/xlsx/comp_test.go b/xlsx/comp_test.go new file mode 100644 index 0000000..b6d2a3d --- /dev/null +++ b/xlsx/comp_test.go @@ -0,0 +1,44 @@ +package xlsx + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func 
TestManyFiles(t *testing.T) { + err := filepath.Walk("../testdata", func(p string, info os.FileInfo, err error) error { + if info.IsDir() { + return nil + } + if !strings.HasSuffix(info.Name(), ".xlsx") { + return nil + } + wb, err := Open(p) + if err != nil { + return nil //err + } + + sheets, err := wb.List() + if err != nil { + return err + } + for _, s := range sheets { + //log.Println(s) + sheet, err := wb.Get(s) + if err != nil { + return err + } + + for sheet.Next() { + sheet.Strings() + } + } + + return wb.Close() + }) + if err != nil { + t.Fatal(err) + } +} diff --git a/xlsx/simple_test.go b/xlsx/simple_test.go index 4e0ee22..dad0873 100644 --- a/xlsx/simple_test.go +++ b/xlsx/simple_test.go @@ -6,19 +6,39 @@ import ( ) func noTestOpen(t *testing.T) { - _, err := Open("test.xlsx") - if err != nil { - log.Fatal(err) - } -} - -func TestOpen2(t *testing.T) { - wb, err := Open("test2.xlsx") + wb, err := Open("test.xlsx") if err != nil { log.Fatal(err) } - for _, s := range wb.Sheets() { + sheets, err := wb.List() + if err != nil { + t.Fatal(err) + } + for _, s := range sheets { + //log.Println(s) + sheet, err := wb.Get(s) + if err != nil { + t.Fatal(err) + } + + for sheet.Next() { + sheet.Strings() + } + } +} + +func TestOpen2(t *testing.T) { + wb, err := Open("test2.xlsx") + if err != nil { + log.Fatal(err) + } + + sheets, err := wb.List() + if err != nil { + t.Fatal(err) + } + for _, s := range sheets { //log.Println(s) sheet, err := wb.Get(s) if err != nil { diff --git a/xlsx/xlsx.go b/xlsx/xlsx.go index c83a965..e08dd5c 100644 --- a/xlsx/xlsx.go +++ b/xlsx/xlsx.go @@ -19,6 +19,7 @@ var _ = grate.Register("xlsx", 5, Open) // Document contains an Office Open XML document. 
type Document struct { filename string + f *os.File r *zip.Reader primaryDoc string @@ -30,6 +31,16 @@ type Document struct { fmt commonxl.Formatter } +func (d *Document) Close() error { + d.xfs = d.xfs[:0] + d.xfs = nil + d.strings = d.strings[:0] + d.strings = nil + d.sheets = d.sheets[:0] + d.sheets = nil + return d.f.Close() +} + func Open(filename string) (grate.Source, error) { f, err := os.Open(filename) if err != nil { @@ -45,6 +56,7 @@ func Open(filename string) (grate.Source, error) { } d := &Document{ filename: filename, + f: f, r: z, }