improve error handling/drop panics

2025-03-04 16:16:03 +02:00 · 2021-02-08 15:36:08 -05:00 · 2021-02-08 15:36:08 -05:00 · bf6d144fa3
commit bf6d144fa3
parent 80c3b4cc81
5 changed files with 93 additions and 29 deletions
--- a/cmd/xls2tsv/main.go
+++ b/cmd/xls2tsv/main.go
@ -4,6 +4,8 @@ import (
 	"context"
 	"flag"
 	"fmt"
+	"io"
+	"io/ioutil"
 	"log"
 	"os"
 	"path/filepath"
@ -14,6 +16,7 @@ import (
 )

 func main() {
+	pretend := flag.Bool("p", false, "pretend to output .tsv")
 	//infoOnly := flag.Bool("i", false, "show info/stats ONLY")
 	removeNewlines := flag.Bool("r", true, "remove embedded tabs, newlines, and condense spaces in cell contents")
 	trimSpaces := flag.Bool("w", true, "trim whitespace from cell contents")
@ -45,9 +48,14 @@ func main() {
 				continue
 			}
 			s2 := sanitize.ReplaceAllString(s, "_")
-			f, err := os.Create(fn2 + "." + s2 + ".tsv")
-			if err != nil {
-				log.Fatal(err)
+			var w io.Writer = ioutil.Discard
+			if !*pretend {
+				f, err := os.Create(fn2 + "." + s2 + ".tsv")
+				if err != nil {
+					log.Fatal(err)
+				}
+				defer f.Close()
+				w = f
 			}

 			for sheet.Next() {
@ -66,10 +74,12 @@ func main() {
 					}
 				}
 				if nonblank || !*skipBlanks {
-					fmt.Fprintln(f, strings.Join(row, "\t"))
+					fmt.Fprintln(w, strings.Join(row, "\t"))
 				}
 			}
-			f.Close()
+			if c, ok := w.(io.Closer); ok {
+				c.Close()
+			}
 		}
 	}
 }
--- a/xls/cfb/cfb.go
+++ b/xls/cfb/cfb.go
@ -76,7 +76,7 @@ type directory struct {

 func (d *directory) String() string {
 	if (d.NameByteLen&1) == 1 || d.NameByteLen > 64 {
-		panic("invalid utf16 string")
+		return "<invalid utf16 string>"
 	}
 	r16 := utf16.Decode(d.Name[:int(d.NameByteLen)/2])
 	// trim off null terminator
@ -117,39 +117,39 @@ func (d *doc) load(rx io.ReadSeeker) error {
 	}
 	if fullAssertions {
 		if h.ClassID[0] != 0 || h.ClassID[1] != 0 {
-			panic("invalid CLSID")
+			return errors.New("ole2: invalid CLSID")
 		}
 		if h.MajorVersion != 3 && h.MajorVersion != 4 {
-			panic("unknown major version")
+			return errors.New("ole2: unknown major version")
 		}
 		if h.MinorVersion != 0x3E {
 			log.Printf("WARNING MinorVersion = 0x%02x NOT 0x3E", h.MinorVersion)
-			//panic("unknown minor version")
+			//return errors.New("ole2: unknown minor version")
 		}

 		for _, v := range h.Reserved1 {
 			if v != 0 {
-				panic("reserved section is non-zero")
+				return errors.New("ole2: reserved section is non-zero")
 			}
 		}
 		if h.MajorVersion == 3 {
 			if h.SectorShift != 9 {
-				panic("invalid sector size")
+				return errors.New("ole2: invalid sector size")
 			}
 			if h.NumDirectorySectors != 0 {
-				panic("version 3 does not support directory sectors")
+				return errors.New("ole2: version 3 does not support directory sectors")
 			}
 		}
 		if h.MajorVersion == 4 {
 			if h.SectorShift != 12 {
-				panic("invalid sector size")
+				return errors.New("ole2: invalid sector size")
 			}
 		}
 		if h.MiniSectorShift != 6 {
-			panic("invalid mini sector size")
+			return errors.New("ole2: invalid mini sector size")
 		}
 		if h.MiniStreamCutoffSize != 0x00001000 {
-			panic("invalid mini sector cutoff")
+			return errors.New("ole2: invalid mini sector cutoff")
 		}
 	}
 	d.header = h
@ -166,6 +166,9 @@ func (d *doc) load(rx io.ReadSeeker) error {
 			break
 		}
 		offs := int64(1+sid) << int32(h.SectorShift)
+		if offs >= int64(len(d.data)) {
+			return errors.New("xls/cfb: unable to load file")
+		}
 		sector := d.data[offs:]
 		for j := 0; j < numFATentries; j++ {
 			sid2 := le.Uint32(sector)
@ -188,7 +191,7 @@ func (d *doc) load(rx io.ReadSeeker) error {
 				}

 				offs := int64(1+sid2) << int32(h.SectorShift)
-				if offs > int64(len(d.data)) {
+				if offs >= int64(len(d.data)) {
 					return errors.New("xls/cfb: unable to load file")
 				}
 				sector := d.data[offs:]
@ -209,18 +212,22 @@ func (d *doc) load(rx io.ReadSeeker) error {
 	sid := h.FirstMiniFATSectorLocation
 	for sid != secEndOfChain {
 		offs := int64(1+sid) << int32(h.SectorShift)
+		if offs >= int64(len(d.data)) {
+			return errors.New("xls/cfb: unable to load file")
+		}
 		sector := d.data[offs:]
 		for j := 0; j < numFATentries; j++ {
 			sid = le.Uint32(sector)
 			d.minifat = append(d.minifat, sid)
 			sector = sector[4:]
 		}
-		// chain the next mini FAT sector
-		sid = le.Uint32(sector)

 		if len(d.minifat) >= int(h.NumMiniFATSectors) {
 			break
 		}
+
+		// chain the next mini FAT sector
+		sid = le.Uint32(sector)
 	}

 	// step 3: read the Directory Entries
@ -270,7 +277,7 @@ func (d *doc) buildDirs(br *bytes.Reader) error {
 	return nil
 }

-func (d *doc) getStreamReader(sid uint32, size uint64) io.ReadSeeker {
+func (d *doc) getStreamReader(sid uint32, size uint64) (io.ReadSeeker, error) {
 	// NB streamData is a slice of slices of the raw data, so this is the
 	// only allocation - for the (much smaller) list of sector slices
 	streamData := make([][]byte, 1+(size>>d.header.SectorShift))
@ -294,13 +301,13 @@ func (d *doc) getStreamReader(sid uint32, size uint64) io.ReadSeeker {
 		x++
 	}
 	if size != 0 {
-		panic("incomplete read")
+		return nil, errors.New("ole2: incomplete read")
 	}

-	return &SliceReader{Data: streamData}
+	return &SliceReader{Data: streamData}, nil
 }

-func (d *doc) getMiniStreamReader(sid uint32, size uint64) io.ReadSeeker {
+func (d *doc) getMiniStreamReader(sid uint32, size uint64) (io.ReadSeeker, error) {
 	// TODO: move into a separate cache so we don't recalculate it each time
 	fatStreamData := make([][]byte, 1+(d.ministreamsize>>d.header.SectorShift))

@ -346,5 +353,5 @@ func (d *doc) getMiniStreamReader(sid uint32, size uint64) io.ReadSeeker {
 		sid = d.minifat[sid]
 	}

-	return &SliceReader{Data: streamData}
+	return &SliceReader{Data: streamData}, nil
 }
--- a/xls/cfb/interface.go
+++ b/xls/cfb/interface.go
@ -45,9 +45,9 @@ func (d *doc) Open(name string) (io.ReadSeeker, error) {
 	for _, e := range d.dir {
 		if e.String() == name && e.ObjectType == typeStream {
 			if e.StreamSize < uint64(d.header.MiniStreamCutoffSize) {
-				return d.getMiniStreamReader(uint32(e.StartingSectorLocation), e.StreamSize), nil
+				return d.getMiniStreamReader(uint32(e.StartingSectorLocation), e.StreamSize)
 			} else if e.StreamSize != 0 {
-				return d.getStreamReader(uint32(e.StartingSectorLocation), e.StreamSize), nil
+				return d.getStreamReader(uint32(e.StartingSectorLocation), e.StreamSize)
 			}
 		}
 	}
--- a/xls/sheets.go
+++ b/xls/sheets.go
@ -99,8 +99,20 @@ func (s *WorkSheet) parse() error {
 	}

 	var formulaRow, formulaCol uint16
-	for _, r := range s.b.substreams[s.ss] {
+	for ridx, r := range s.b.substreams[s.ss] {
 		bb := bytes.NewReader(r.Data)
+		//log.Println(ridx, r.RecType)
+
+		// sec 2.1.7.20.6 Common Productions ABNF:
+		/*
+			CELLTABLE = 1*(1*Row *CELL 1*DBCell) *EntExU2
+			CELL = FORMULA / Blank / MulBlank / RK / MulRk / BoolErr / Number / LabelSst
+			FORMULA = [Uncalced] Formula [Array / Table / ShrFmla / SUB] [String *Continue]
+
+			Not parsed from the list above:
+				DBCell, EntExU2, Uncalced, Array, Table, ShrFmla
+				NB: no idea what "SUB" is
+		*/

 		switch r.RecType {
 		//case RecTypeWindow2:
@ -266,6 +278,27 @@ func (s *WorkSheet) parse() error {
 				binary.Read(bb, binary.LittleEndian, us)
 				fstr = string(utf16.Decode(us))
 			}
+
+			if (ridx + 1) < len(s.b.substreams[s.ss]) {
+				ridx2 := ridx + 1
+				nrecs := len(s.b.substreams[s.ss])
+				for ridx2 < nrecs {
+					r2 := s.b.substreams[s.ss][ridx2]
+					if r2.RecType != RecTypeContinue {
+						break
+					}
+					if (r2.Data[0] & 1) == 0 {
+						fstr += string(r2.Data[1:])
+					} else {
+						bb2 := bytes.NewReader(r2.Data[1:])
+						us := make([]uint16, len(r2.Data)-1)
+						binary.Read(bb2, binary.LittleEndian, us)
+						fstr += string(utf16.Decode(us))
+					}
+					ridx2++
+				}
+			}
+
 			s.placeValue(int(formulaRow), int(formulaCol), fstr)

 		case RecTypeLabelSst:
@ -276,7 +309,7 @@ func (s *WorkSheet) parse() error {
 			binary.Read(bb, binary.LittleEndian, &ixfe)
 			binary.Read(bb, binary.LittleEndian, &sstIndex)
 			if int(sstIndex) > len(s.b.strings) {
-				panic("invalid sst")
+				return errors.New("xls: invalid sst index")
 			}
 			s.placeValue(int(rowIndex), int(colIndex), s.b.strings[sstIndex])
 			//log.Printf("SST spec: %d %d = [%d] %s", rowIndex, colIndex, sstIndex, s.b.strings[sstIndex])
@ -284,6 +317,8 @@ func (s *WorkSheet) parse() error {
 		case RecTypeHLink:
 			loc := &shRef8{}
 			binary.Read(bb, binary.LittleEndian, loc)
+			loc.FirstCol &= 0x00FF // spec doesn't say what to do when MUST is disregarded...
+			loc.LastCol &= 0x00FF
 			var x uint64
 			binary.Read(bb, binary.LittleEndian, &x) // skip and discard classid
 			binary.Read(bb, binary.LittleEndian, &x)
@ -301,10 +336,18 @@ func (s *WorkSheet) parse() error {
 				binary.Read(bb, binary.LittleEndian, us)
 				str = string(utf16.Decode(us))
 			}
+			//log.Printf("hyperlink spec: %+v = %s", loc, str)
+			if loc.FirstCol > maxCol {
+				//log.Println("invalid hyperlink column")
+				continue
+			}
+			if uint32(loc.FirstRow) > maxRow {
+				//log.Println("invalid hyperlink row")
+				continue
+			}

 			// TODO: apply merge cell rules
 			s.placeValue(int(loc.FirstRow), int(loc.FirstCol), str)
-			log.Printf("hyperlink spec: %+v = %s", loc, str)

 		case RecTypeMergeCells:
 			var cmcs uint16
@ -317,6 +360,9 @@ func (s *WorkSheet) parse() error {
 			// 	log.Printf("    %d: %+v", j, mc)
 			// }

+		case RecTypeContinue:
+			// the only situation so far is when used in RecTypeString above
+
 		default:
 			//log.Println("worksheet", r.RecType, r.RecSize)

--- a/xls/strings.go
+++ b/xls/strings.go
@ -2,6 +2,7 @@ package xls

 import (
 	"encoding/binary"
+	"errors"
 	"io"
 	"io/ioutil"
 	"unicode/utf16"
@ -173,7 +174,7 @@ func parseSST(recs []*rec) ([]string, error) {
 					current[j] = uint16(binary.LittleEndian.Uint16(buf[:2]))
 					buf = buf[2:]
 					if len(buf) == 1 {
-						panic("off by one")
+						return nil, errors.New("xls: off by one")
 					}
 				}
 			}