1
0
mirror of https://github.com/pbnjay/grate.git synced 2024-12-13 13:58:27 +02:00
grate/xls/xls.go

361 lines
9.1 KiB
Go

// Package xls implements the Microsoft Excel Binary File Format (.xls) Structure.
// More specifically, it contains just enough detail to extract cell contents,
// data types, and last-calculated formula values. In particular, it does NOT
// implement formatting or formula calculations.
package xls
// https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-xls/cd03cb5f-ca02-4934-a391-bb674cb8aa06
import (
"context"
"encoding/binary"
"errors"
"io"
"log"
"sync"
"github.com/pbnjay/grate"
"github.com/pbnjay/grate/commonxl"
"github.com/pbnjay/grate/xls/cfb"
"github.com/pbnjay/grate/xls/crypto"
)
// Register this package's Open function with grate under the name "xls" when
// the package is initialised.
// NOTE(review): the meaning of the second argument (1) is defined by
// grate.Register, which is not visible here - presumably a priority; confirm.
var _ = grate.Register("xls", 1, Open)
// WorkBook represents an Excel workbook containing 1 or more sheets.
//
// It holds the raw BIFF records of the "Workbook" stream, grouped into
// substreams, together with the workbook-level metadata extracted from them.
type WorkBook struct {
	// Where the workbook came from.
	filename string          // path handed to Open
	ctx      context.Context // not assigned anywhere in this file
	doc      *cfb.Document   // compound file the Workbook stream was read from

	// Flags and header information.
	prot bool    // value reported by IsProtected
	h    *header // fields of the most recently parsed BOF record

	// Workbook globals gathered while parsing.
	sheets   []*boundSheet // one entry per BoundSheet8 record
	codepage uint16        // value of the CodePage record
	dateMode uint16        // value of the Date1904 record
	strings  []string      // shared string table (SST plus its Continue records)
	password string        // not referenced in this file

	// Raw record storage and position bookkeeping.
	substreams    [][]*rec      // records grouped by top-level BOF/EOF pair
	fpos          int64         // running byte offset while splitting the stream
	pos2substream map[int64]int // stream offset of a top-level BOF -> substream index

	// Number formatting.
	nfmt commonxl.Formatter // format ID -> format code, filled from Format records
	xfs  []uint16           // format ID of each XF record, in file order
}
// IsProtected reports the workbook's protection flag.
// NOTE(review): nothing in this file sets the underlying field; confirm where
// (or whether) it is populated.
func (b *WorkBook) IsProtected() bool {
	return b.prot
}
// Open reads the named Excel .xls file and returns it as a grate.Source.
//
// The file is opened as a compound file, its "Workbook" stream is read fully
// into memory and then parsed. If that stream cannot be opened, the error is
// wrapped with grate.ErrNotInFormat. On any failure the returned Source is nil.
func Open(filename string) (grate.Source, error) {
	doc, err := cfb.Open(filename)
	if err != nil {
		return nil, err
	}

	b := &WorkBook{
		filename: filename,
		doc:      doc,

		pos2substream: make(map[int64]int, 16),
		xfs:           make([]uint16, 0, 128),
	}

	rdr, err := doc.Open("Workbook")
	if err != nil {
		return nil, grate.WrapErr(err, grate.ErrNotInFormat)
	}
	raw, err := io.ReadAll(rdr)
	if err != nil {
		return nil, err
	}

	if err := b.loadFromStream(raw); err != nil {
		// Do not hand a half-initialised workbook to the caller. Return the
		// records parsed so far to the pool first; Close always returns nil.
		_ = b.Close()
		return nil, err
	}
	return b, nil
}
// loadFromStream parses the raw bytes of a Workbook stream that has not been
// through decryption yet. It simply delegates to loadFromStream2.
func (b *WorkBook) loadFromStream(raw []byte) error {
	const alreadyDecrypted = false
	return b.loadFromStream2(raw, alreadyDecrypted)
}
// loadFromStreamWithDecryptor decrypts an encrypted Workbook stream with dec
// and then parses the result with loadFromStream2.
//
// Interestingly (insecurely) BIFF8 keeps record types and sizes in the clear,
// has a few records that are not encrypted, and has 1 record type that does
// not encrypt the 32bit integer position at the beginning (while encrypting
// the rest). It also resets the encryption block counter every 1024 bytes
// (counting all the "skipped" bytes described above).
//
// So this code streams the records through the decryptor, but also records a
// set of overlays which are applied to the final result to restore the
// cleartext contents in line with the decrypted content.
//
// It returns io.ErrUnexpectedEOF if a record claims more bytes than remain in
// raw, any error reported by the decryptor, or the result of loadFromStream2.
func (b *WorkBook) loadFromStreamWithDecryptor(raw []byte, dec crypto.Decryptor) error {
	if grate.Debug {
		log.Println(" Decrypting xls stream with standard RC4")
	}

	// overlay is a run of bytes copied verbatim from the input over the
	// decryptor's output.
	type overlay struct {
		pos  int    // offset of the run, identical in raw and in the output
		data []byte // cleartext bytes, starting with the 4-byte record header
	}

	var (
		zeros    [8224]byte
		overlays []overlay
	)

	// skip pushes n placeholder bytes through the decryptor so that its
	// position stays in step with raw. The output for these bytes is replaced
	// by an overlay later. Writing in chunks keeps sizes above len(zeros) safe.
	skip := func(n int) error {
		for n > 0 {
			chunk := n
			if chunk > len(zeros) {
				chunk = len(zeros)
			}
			if _, err := dec.Write(zeros[:chunk]); err != nil {
				return err
			}
			n -= chunk
		}
		return nil
	}

	pos := 0
	for len(raw)-pos >= 4 {
		recType := recordType(binary.LittleEndian.Uint16(raw[pos:]))
		size := int(binary.LittleEndian.Uint16(raw[pos+2:]))
		if len(raw)-pos-4 < size {
			return io.ErrUnexpectedEOF
		}

		// plain counts the leading bytes of this record (header included)
		// that are stored unencrypted.
		plain := 4
		switch recType {
		case RecTypeBOF, RecTypeFilePass, RecTypeUsrExcl, RecTypeFileLock, RecTypeInterfaceHdr, RecTypeRRDInfo, RecTypeRRDHead:
			// the whole record is stored in the clear
			plain += size
		case RecTypeBoundSheet8:
			// only the leading 32-bit stream position is stored in the clear
			if size < 4 {
				return errors.New("xls: BoundSheet8 record is too short")
			}
			plain += 4
		}

		// The header goes through the decryptor as-is; its output is restored
		// from the overlay below, only the byte count matters here.
		if _, err := dec.Write(raw[pos : pos+4]); err != nil {
			return err
		}
		if err := skip(plain - 4); err != nil {
			return err
		}
		if encrypted := raw[pos+plain : pos+4+size]; len(encrypted) > 0 {
			if _, err := dec.Write(encrypted); err != nil {
				return err
			}
		}

		overlays = append(overlays, overlay{pos: pos, data: raw[pos : pos+plain]})
		pos += 4 + size
	}

	dec.Flush()
	alldata := dec.Bytes()
	for _, o := range overlays {
		if o.pos+len(o.data) > len(alldata) {
			return errors.New("xls: decrypted stream is shorter than its input")
		}
		copy(alldata[o.pos:], o.data)
	}

	// recurse into the stream parser now that things are decrypted
	return b.loadFromStream2(alldata, true)
}
// Close hands every parsed record back to recPool and empties the workbook's
// substream lists. It always returns nil.
func (b *WorkBook) Close() error {
	for idx := range b.substreams {
		records := b.substreams[idx]
		for _, record := range records {
			// drop the payload reference so the stream bytes can be collected
			record.Data = nil
			recPool.Put(record)
		}
		// keep the backing array, forget the contents
		b.substreams[idx] = records[:0]
	}
	b.substreams = b.substreams[:0]
	return nil
}
// loadFromStream2 splits raw into records, groups them into substreams and
// then extracts the workbook-level metadata (header, shared strings, number
// formats, XF format IDs and the sheet list) from them.
//
// Any records kept from a previous call are returned to recPool first. If a
// FilePass record is found and isDecrypted is false, parsing restarts on the
// decrypted stream via loadFromStreamWithDecryptor; only encryption type 1 is
// supported. Malformed input (a record before the first BOF, or a record too
// short for the fields read from it) yields an error.
func (b *WorkBook) loadFromStream2(raw []byte, isDecrypted bool) error {
	errShort := errors.New("xls: record is too short")

	b.h = &header{}
	substr := -1
	nestedBOF := 0
	b.pos2substream = make(map[int64]int, 10)
	b.fpos = 0

	// IMPORTANT: if there are any existing records, we need to return them to the pool
	for i, sub := range b.substreams {
		for _, r := range sub {
			recPool.Put(r)
		}
		b.substreams[i] = b.substreams[i][:0]
	}
	b.substreams = b.substreams[:0]

	// Pass 1: split the stream into records and group them by substream.
	rawfull := raw
	nr, no, err := b.nextRecord(raw)
	for err == nil {
		raw = raw[no:]

		switch nr.RecType {
		case RecTypeEOF:
			nestedBOF--
		case RecTypeBOF:
			// when substreams are nested, keep them in the same grouping
			if nestedBOF == 0 {
				substr = len(b.substreams)
				b.substreams = append(b.substreams, []*rec{})
				b.pos2substream[b.fpos] = substr
			}
			nestedBOF++
		}
		b.fpos += int64(4 + len(nr.Data))

		// if there's a FilePass record, the data is encrypted
		if nr.RecType == RecTypeFilePass && !isDecrypted {
			if len(nr.Data) < 2 {
				return errShort
			}
			etype := binary.LittleEndian.Uint16(nr.Data)
			switch etype {
			case 1:
				dec, err := crypto.NewBasicRC4(nr.Data[2:])
				if err != nil {
					log.Println("xls: rc4 encryption failed to set up", err)
					return err
				}
				return b.loadFromStreamWithDecryptor(rawfull, dec)
			case 2, 3, 4:
				log.Println("need Crypto API RC4 decryptor")
				return errors.New("xls: unsupported Crypto API encryption method")
			default:
				return errors.New("xls: unsupported encryption method")
			}
		}

		// No substream exists until the first BOF has been seen; indexing
		// with -1 would panic.
		if substr < 0 {
			return errors.New("xls: record found before the first BOF")
		}
		b.substreams[substr] = append(b.substreams[substr], nr)
		nr, no, err = b.nextRecord(raw)
	}
	if err == io.EOF {
		err = nil
	}
	if err != nil {
		return err
	}

	// Pass 2: interpret the records that carry workbook-level information.
	for ss, records := range b.substreams {
		if grate.Debug {
			log.Printf(" Processing substream %d/%d (%d records)", ss, len(b.substreams), len(records))
		}

		for i, nr := range records {
			if len(nr.Data) == 0 {
				continue
			}

			switch nr.RecType {
			case RecTypeSST:
				// Shared String Table is often continued across multiple records,
				// so we want to gather them all before starting to parse (some
				// strings may span the gap between records)
				recSet := []*rec{nr}
				lastIndex := i
				for len(records) > (lastIndex+1) && records[lastIndex+1].RecType == RecTypeContinue {
					lastIndex++
					recSet = append(recSet, records[lastIndex])
				}

				b.strings, err = parseSST(recSet)
				if err != nil {
					return err
				}

			case RecTypeContinue:
				// no-op (used above)
			case RecTypeEOF:
				// done

			case RecTypeBOF:
				// The header fields below occupy 16 bytes. A shorter BOF cannot
				// be the supported version, and slicing it anyway would read
				// into whatever follows the record.
				if len(nr.Data) < 16 {
					return errors.New("xls: invalid file version")
				}
				b.h = &header{
					Version:  binary.LittleEndian.Uint16(nr.Data[0:2]),
					DocType:  binary.LittleEndian.Uint16(nr.Data[2:4]),
					RupBuild: binary.LittleEndian.Uint16(nr.Data[4:6]),
					RupYear:  binary.LittleEndian.Uint16(nr.Data[6:8]),
					MiscBits: binary.LittleEndian.Uint64(nr.Data[8:16]),
				}
				if b.h.Version != 0x0600 {
					return errors.New("xls: invalid file version")
				}
				if b.h.RupYear != 0x07CC && b.h.RupYear != 0x07CD {
					return errors.New("xls: unsupported biff version")
				}
				/*
					if b.h.DocType != 0x0005 && b.h.DocType != 0x0010 {
						// we only support the workbook or worksheet substreams
						log.Println("xls: unsupported document type")
						//break
					}
				*/

			case RecTypeCodePage:
				// BIFF8 is entirely UTF-16LE so this is actually ignored
				if len(nr.Data) < 2 {
					return errShort
				}
				b.codepage = binary.LittleEndian.Uint16(nr.Data)

			case RecTypeDate1904:
				if len(nr.Data) < 2 {
					return errShort
				}
				b.dateMode = binary.LittleEndian.Uint16(nr.Data)

			case RecTypeFormat:
				// Format maps a format ID to a code string
				if len(nr.Data) < 2 {
					return errShort
				}
				fmtNo := binary.LittleEndian.Uint16(nr.Data)
				formatStr, _, err := decodeXLUnicodeString(nr.Data[2:])
				if err != nil {
					log.Println("fail2", err)
					return err
				}
				b.nfmt.Add(fmtNo, formatStr)

			case RecTypeXF:
				// XF records merge multiple style and format directives to one ID
				// ignore font id at nr.Data[0:2]
				if len(nr.Data) < 4 {
					return errShort
				}
				fmtNo := binary.LittleEndian.Uint16(nr.Data[2:])
				b.xfs = append(b.xfs, fmtNo)

			case RecTypeBoundSheet8:
				// Identifies the position within the stream, visibility state,
				// and name of a worksheet
				if len(nr.Data) < 6 {
					return errShort
				}
				bs := &boundSheet{}
				bs.Position = binary.LittleEndian.Uint32(nr.Data[:4])
				bs.HiddenState = nr.Data[4]
				bs.SheetType = nr.Data[5]

				bs.Name, _, err = decodeShortXLUnicodeString(nr.Data[6:])
				if err != nil {
					return err
				}
				b.sheets = append(b.sheets, bs)

			default:
				if grate.Debug && ss == 0 {
					log.Println(" Unhandled record type:", nr.RecType, i)
				}
			}
		}
	}

	return nil
}
// recPool recycles rec values between parses. Get yields a *rec, freshly
// allocated when the pool has none to hand out.
var recPool = sync.Pool{
	New: func() interface{} { return new(rec) },
}
// nextRecord decodes the record at the start of raw.
//
// A record is a 4-byte header (a little-endian uint16 record type followed by
// a little-endian uint16 payload size) and then that many payload bytes. The
// returned rec is taken from recPool and its Data aliases raw; no copy is made.
//
// It returns the record and the number of bytes it occupies, header included.
// When fewer than 4 bytes remain it returns io.EOF and a length of 0. When the
// payload is cut short it returns io.ErrUnexpectedEOF and a length of 4.
func (b *WorkBook) nextRecord(raw []byte) (*rec, int, error) {
	const headerLen = 4
	if len(raw) < headerLen {
		return nil, 0, io.EOF
	}

	r := recPool.Get().(*rec)
	r.RecType = recordType(binary.LittleEndian.Uint16(raw[:2]))
	r.RecSize = binary.LittleEndian.Uint16(raw[2:4])

	// Do the length arithmetic in int: RecSize is a uint16, so 4+RecSize
	// would wrap around for sizes of 65532 and above.
	end := headerLen + int(r.RecSize)
	if len(raw) < end {
		recPool.Put(r)
		return nil, headerLen, io.ErrUnexpectedEOF
	}
	r.Data = raw[headerLen:end]
	return r, end, nil
}