// builtInFormats maps the implied (built-in) number format IDs to their
// format codes. IDs that are not listed here have no built-in meaning; a
// lookup for them yields the empty string.
var builtInFormats = map[uint16]string{
	0:  `General`,
	1:  `0`,
	2:  `0.00`,
	3:  `#,##0`,
	4:  `#,##0.00`,
	9:  `0%`,
	10: `0.00%`,

	11: `0.00E+00`,
	12: `# ?/?`,
	13: `# ??/??`,
	14: `mm-dd-yy`,
	15: `d-mmm-yy`,
	16: `d-mmm`,
	17: `mmm-yy`,
	18: `h:mm AM/PM`,
	19: `h:mm:ss AM/PM`,
	20: `h:mm`,
	21: `h:mm:ss`,
	22: `m/d/yy h:mm`,
	37: `#,##0 ;(#,##0)`,
	38: `#,##0 ;[Red](#,##0)`,
	39: `#,##0.00;(#,##0.00)`,
	40: `#,##0.00;[Red](#,##0.00)`,

	41: `_(* #,##0_);_(* \(#,##0\);_(* "-"_);_(@_)`,
	42: `_("$"* #,##0_);_("$"* \(#,##0\);_("$"* "-"_);_(@_)`,
	43: `_(* #,##0.00_);_(* \(#,##0.00\);_(* "-"??_);_(@_)`,
	44: `_("$"* #,##0.00_);_("$"* \(#,##0.00\);_("$"* "-"??_);_(@_)`,

	45: `mm:ss`,
	46: `[h]:mm:ss`,
	47: `mmss.0`,
	48: `##0.0E+0`,
	49: `@`,

	// zh-cn format codes
	27: `yyyy"年"m"月"`,
	28: `m"月"d"日"`,
	29: `m"月"d"日"`,
	30: `m-d-yy`,
	31: `yyyy"年"m"月"d"日"`,
	32: `h"时"mm"分"`,
	33: `h"时"mm"分"ss"秒"`,
	34: `上午/下午 h"时"mm"分"`,
	35: `上午/下午 h"时"mm"分"ss"秒"`,
	36: `yyyy"年"m"月"`,
	50: `yyyy"年"m"月"`,
	51: `m"月"d"日"`,
	52: `yyyy"年"m"月"`,
	53: `m"月"d"日"`,
	54: `m"月"d"日"`,
	55: `上午/下午 h"时"mm"分"`,
	// The closing quote after 秒 was missing, which left the literal
	// unterminated; the code is meant to match format 35.
	56: `上午/下午 h"时"mm"分"ss"秒"`,
	57: `yyyy"年"m"月"`,
	58: `m"月"d"日"`,

	// th-th format codes
	59: `t0`,
	60: `t0.00`,
	61: `t#,##0`,
	62: `t#,##0.00`,
	67: `t0%`,
	68: `t0.00%`,
	69: `t# ?/?`,
	70: `t# ??/??`,
	// th format code, but translated to aid the parser
	71: `d/m/yyyy`,      // `ว/ด/ปปปป`,
	72: `d-mmm-yy`,      // `ว-ดดด-ปป`,
	73: `d-mmm`,         // `ว-ดดด`,
	74: `mmm-yy`,        // `ดดด-ปป`,
	75: `h:mm`,          // `ช:นน`,
	76: `h:mm:ss`,       // `ช:นน:ทท`,
	77: `d/m/yyyy h:mm`, // `ว/ด/ปปปป ช:นน`,
	78: `mm:ss`,         // `นน:ทท`,
	79: `[h]:mm:ss`,     // `[ช]:นน:ทท`,
	80: `mm:ss.0`,       // `นน:ทท.0`,
	81: `d/m/bb`,        // `d/m/bb`,
}
index 0000000..45c2aab --- /dev/null +++ b/xlsx/sheets.go @@ -0,0 +1,269 @@ +package xlsx + +import ( + "encoding/xml" + "errors" + "fmt" + "io" + "log" + "path/filepath" + "strconv" + "strings" + "time" +) + +type Sheet struct { + d *Document + relID string + name string + docname string + + err error + + minRow int + maxRow int + minCol int + maxCol int + rows []*row + empty bool + + iterRow int +} + +type row struct { + // each value must be one of: int, float64, string, or time.Time + cols []interface{} +} + +func (s *Sheet) parseSheet() error { + linkmap := make(map[string]string) + base := filepath.Base(s.docname) + sub := strings.TrimSuffix(s.docname, base) + relsname := filepath.Join(sub, "_rels", base+".rels") + dec, clo, err := s.d.openXML(relsname) + if err == nil { + // rels might not exist for every sheet + tok, err := dec.Token() + for ; err == nil; tok, err = dec.Token() { + if v, ok := tok.(xml.StartElement); ok && v.Name.Local == "Relationship" { + ax := attrMap(v.Attr) + if ax["TargetMode"] == "External" && ax["Type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" { + linkmap[ax["Id"]] = ax["Target"] + } + } + } + clo.Close() + } + + dec, clo, err = s.d.openXML(s.docname) + if err != nil { + return err + } + defer clo.Close() + + currentCellType := BlankCellType + currentCell := "" + numFormat := "" + tok, err := dec.Token() + for ; err == nil; tok, err = dec.Token() { + switch v := tok.(type) { + case xml.CharData: + if currentCell == "" { + continue + } + c, r := refToIndexes(currentCell) + if c >= 0 && r >= 0 { + var val interface{} = string(v) + switch currentCellType { + case BooleanCellType: + if v[0] == '1' { + val = true + } else { + val = false + } + case DateCellType: + log.Println("CELL DATE", val, numFormat) + case NumberCellType: + fval, err := strconv.ParseFloat(string(v), 64) + if err == nil { + val = fval + } + //log.Println("CELL NUMBER", val, numFormat) + case SharedStringCellType: + 
//log.Println("CELL SHSTR", val, currentCellType, numFormat) + si, _ := strconv.ParseInt(string(v), 10, 64) + val = s.d.strings[si] + case BlankCellType: + //log.Println("CELL BLANK") + // don't place any values + continue + case ErrorCellType, FormulaStringCellType, InlineStringCellType: + //log.Println("CELL ERR/FORM/INLINE", val, currentCellType) + default: + log.Println("CELL UNKNOWN", val, currentCellType, numFormat) + } + s.placeValue(r, c, val) + } else { + //log.Println("FAIL row/col: ", currentCell) + } + case xml.StartElement: + ax := attrMap(v.Attr) + switch v.Name.Local { + case "dimension": + if ax["ref"] == "A1" { + // short-circuit empty sheet + s.minCol, s.minRow = 0, 0 + s.maxCol, s.maxRow = 1, 1 + s.empty = true + continue + } + dims := strings.Split(ax["ref"], ":") + s.minCol, s.minRow = refToIndexes(dims[0]) + s.maxCol, s.maxRow = refToIndexes(dims[1]) + //log.Println("DIMENSION:", s.minRow, s.minCol, ">", s.maxRow, s.maxCol) + case "row": + //currentRow = ax["r"] // unsigned int row index + //log.Println("ROW", currentRow) + case "c": + currentCellType = CellType(ax["t"]) + if currentCellType == BlankCellType { + currentCellType = NumberCellType + } + currentCell = ax["r"] // always an A1 style reference + style := ax["s"] + sid, _ := strconv.ParseInt(style, 10, 64) + numFormat = s.d.xfs[sid] // unsigned integer lookup + //log.Println("CELL", currentCell, sid, numFormat, currentCellType) + case "v": + //log.Println("CELL VALUE", ax) + + case "mergeCell": + dims := strings.Split(ax["ref"], ":") + startCol, startRow := refToIndexes(dims[0]) + endCol, endRow := refToIndexes(dims[1]) + for r := startRow; r <= endRow; r++ { + for c := startCol; c <= endCol; c++ { + if r == startRow && c == startCol { + // has data already! 
+ } else if c == startCol { + // first and last column MAY be the same + if r == endRow { + s.placeValue(r, c, endRowMerged) + } else { + s.placeValue(r, c, continueRowMerged) + } + } else if c == endCol { + // first and last column are NOT the same + s.placeValue(r, c, endColumnMerged) + } else { + s.placeValue(r, c, continueColumnMerged) + } + } + } + + case "hyperlink": + col, row := refToIndexes(ax["ref"]) + link := linkmap[ax["id"]] + if len(s.rows) > row && len(s.rows[row].cols) > col { + if sstr, ok := s.rows[row].cols[col].(string); ok { + link = sstr + " <" + link + ">" + } + } + s.placeValue(row, col, link) + + case "mergeCells", "hyperlinks": + // NB don't need these outer containers + case "f": + //log.Println("start: ", v.Name.Local, v.Attr) + default: + //log.Println("start: ", v.Name.Local, v.Attr) + } + case xml.EndElement: + + switch v.Name.Local { + case "c": + currentCell = "" + case "row": + //currentRow = "" + } + //log.Println(" end: ", v.Name.Local) + default: + //log.Printf("%T %+v", tok, tok) + } + } + if err == io.EOF { + err = nil + } + return err +} + +func (s *Sheet) placeValue(rowIndex, colIndex int, val interface{}) { + if colIndex > s.maxCol || rowIndex > s.maxRow { + // invalid + return + } + + // ensure we always have a complete matrix + for len(s.rows) <= rowIndex { + emptyRow := make([]interface{}, s.maxCol+1) + for i := 0; i <= s.maxCol; i++ { + emptyRow[i] = staticBlank + } + s.rows = append(s.rows, &row{emptyRow}) + } + s.empty = false + s.rows[rowIndex].cols[colIndex] = val +} + +// Next advances to the next row of content. +// It MUST be called prior to any Scan(). 
+func (s *Sheet) Next() bool { + s.iterRow++ + return s.iterRow < len(s.rows) +} + +func (s *Sheet) Strings() []string { + currow := s.rows[s.iterRow] + res := make([]string, len(currow.cols)) + for i, col := range currow.cols { + res[i] = fmt.Sprint(col) + } + return res +} + +// Scan extracts values from the row into the provided arguments +// Arguments must be pointers to one of 5 supported types: +// bool, int, float64, string, or time.Time +func (s *Sheet) Scan(args ...interface{}) error { + currow := s.rows[s.iterRow] + + for i, a := range args { + switch v := a.(type) { + case *bool: + *v = currow.cols[i].(bool) + case *int: + *v = currow.cols[i].(int) + case *float64: + *v = currow.cols[i].(float64) + case *string: + *v = currow.cols[i].(string) + case *time.Time: + *v = currow.cols[i].(time.Time) + default: + return ErrInvalidType + } + } + return nil +} + +func (s *Sheet) IsEmpty() bool { + return s.empty +} + +// Err returns the last error that occured. +func (s *Sheet) Err() error { + return s.err +} + +// ErrInvalidType is returned by Scan for invalid arguments. 
+var ErrInvalidType = errors.New("xlsx: Scan only supports *bool, *int, *float64, *string, *time.Time arguments") diff --git a/xlsx/simple_test.go b/xlsx/simple_test.go new file mode 100644 index 0000000..4e0ee22 --- /dev/null +++ b/xlsx/simple_test.go @@ -0,0 +1,32 @@ +package xlsx + +import ( + "log" + "testing" +) + +func noTestOpen(t *testing.T) { + _, err := Open("test.xlsx") + if err != nil { + log.Fatal(err) + } +} + +func TestOpen2(t *testing.T) { + wb, err := Open("test2.xlsx") + if err != nil { + log.Fatal(err) + } + + for _, s := range wb.Sheets() { + //log.Println(s) + sheet, err := wb.Get(s) + if err != nil { + t.Fatal(err) + } + + for sheet.Next() { + sheet.Strings() + } + } +} diff --git a/xlsx/types.go b/xlsx/types.go new file mode 100644 index 0000000..ef11d73 --- /dev/null +++ b/xlsx/types.go @@ -0,0 +1,88 @@ +package xlsx + +import ( + "encoding/xml" + "strconv" + "strings" +) + +type CellType string + +// CellTypes define data type in section 18.18.11 +const ( + BlankCellType CellType = "" + BooleanCellType CellType = "b" + DateCellType CellType = "d" + ErrorCellType CellType = "e" + NumberCellType CellType = "n" + SharedStringCellType CellType = "s" + FormulaStringCellType CellType = "str" + InlineStringCellType CellType = "inlineStr" +) + +type staticCellType rune + +const ( + staticBlank staticCellType = 0 + + // marks a continuation column within a merged cell. + continueColumnMerged staticCellType = '→' + // marks the last column of a merged cell. + endColumnMerged staticCellType = '⇥' + + // marks a continuation row within a merged cell. + continueRowMerged staticCellType = '↓' + // marks the last row of a merged cell. 
// col2int returns the 0-based index of the column string:
// "A"=0, "B"=1, "AA"=26, "BB"=53
func col2int(col string) int {
	idx := 0
	for _, c := range col {
		idx *= 26
		idx += int(c - '@')
	}
	return idx - 1
}

// refToIndexes converts a cell reference into a 0-based column index and the
// row number as written in the reference (so "A1" yields column 0, row 1).
//
// Both A1 style ("BB12") and R1C1 style ("R12C54") references are accepted.
// For a range ("A1:C2") the indexes of its first cell are returned. An
// unusable reference yields (-1, -1).
func refToIndexes(r string) (column, row int) {
	// Only the first cell of a range is addressed. Cutting the range here
	// also keeps a column letter of the second cell ("A1:C2") from being
	// mistaken for the 'C' of an R1C1 reference.
	if i := strings.IndexByte(r, ':'); i >= 0 {
		r = r[:i]
	}
	if len(r) < 2 {
		return -1, -1
	}
	i1 := strings.IndexAny(r, "0123456789")
	if i1 <= 0 {
		return -1, -1
	}
	prefix, rest := r[:i1], r[i1:]

	i2 := strings.IndexByte(rest, 'C')
	if i2 == -1 {
		// A1 Reference mode
		rn, _ := strconv.ParseInt(rest, 10, 64)
		return col2int(prefix), int(rn)
	}

	// R1C1 Reference Mode: "R<row>C<column>", both counted from 1.
	// i2 is relative to rest, so all slicing is done on rest.
	if prefix != "R" {
		return -1, -1
	}
	rn, rerr := strconv.Atoi(rest[:i2])
	cn, cerr := strconv.Atoi(rest[i2+1:])
	if rerr != nil || cerr != nil || rn < 1 || cn < 1 {
		return -1, -1
	}
	return cn - 1, rn
}

// attrMap returns the attributes of an element as a map from local
// attribute name (namespace ignored) to value. When two attributes share a
// local name the later one wins.
func attrMap(attrs []xml.Attr) map[string]string {
	m := make(map[string]string, len(attrs))
	for _, a := range attrs {
		m[a.Name.Local] = a.Value
	}
	return m
}
vals["Target"] + } + } + } + } + if err == io.EOF { + err = nil + } + return err +} + +func (d *Document) parseWorkbook(dec *xml.Decoder) error { + tok, err := dec.Token() + for ; err == nil; tok, err = dec.Token() { + switch v := tok.(type) { + case xml.StartElement: + //log.Println("start: ", v.Name.Local) + + switch v.Name.Local { + case "sheet": + vals := make(map[string]string, 5) + for _, a := range v.Attr { + vals[a.Name.Local] = a.Value + } + sheetID, ok1 := vals["id"] + sheetName, ok2 := vals["name"] + if !ok1 || !ok2 { + return errors.New("xlsx: invalid sheet definition") + } + s := &Sheet{ + d: d, + relID: sheetID, + name: sheetName, + docname: d.rels["http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet"][sheetID], + } + d.sheets = append(d.sheets, s) + } + case xml.EndElement: + //log.Println(" end: ", v.Name.Local) + default: + //log.Printf("%T %+v", tok, tok) + } + } + if err == io.EOF { + err = nil + } + return err +} + +func (d *Document) parseStyles(dec *xml.Decoder) error { + csxfNumFormat := []string{} + d.xfs = d.xfs[:0] + + section := 0 + tok, err := dec.Token() + for ; err == nil; tok, err = dec.Token() { + switch v := tok.(type) { + case xml.StartElement: + attrs := attrMap(v.Attr) + + switch v.Name.Local { + case "cellStyleXfs": + section = 1 + case "cellXfs": + section = 2 + n, _ := strconv.ParseInt(attrs["count"], 10, 64) + d.xfs = make([]string, 0, n) + + case "xf": + if section == 1 { + if _, ok := attrs["applyNumberFormat"]; ok { + csxfNumFormat = append(csxfNumFormat, attrs["numFmtId"]) + } else { + csxfNumFormat = append(csxfNumFormat, "-") + } + } else if section == 2 { + baseID, _ := strconv.ParseInt(attrs["xfId"], 10, 64) + thisXF := csxfNumFormat[baseID] + if _, ok := attrs["applyNumberFormat"]; ok { + thisXF = attrs["numFmtId"] + } else { + thisXF = "=" + } + + nfid, _ := strconv.ParseInt(thisXF, 10, 16) + thisXF = builtInFormats[uint16(nfid)] + d.xfs = append(d.xfs, thisXF) + } else { + panic("wheres is 
this xf??") + } + default: + //log.Println("start: ", v.Name.Local, v.Attr) + } + case xml.EndElement: + switch v.Name.Local { + case "cellStyleXfs": + section = 0 + case "cellXfs": + section = 0 + } + //log.Println(" end: ", v.Name.Local) + default: + //log.Printf("%T %+v", tok, tok) + } + } + if err == io.EOF { + err = nil + } + return err +} + +func (d *Document) parseSharedStrings(dec *xml.Decoder) error { + val := "" + tok, err := dec.Token() + for ; err == nil; tok, err = dec.Token() { + switch v := tok.(type) { + case xml.CharData: + val += string(v) + case xml.StartElement: + switch v.Name.Local { + case "si": + val = "" + default: + //log.Println("start: ", v.Name.Local) + } + case xml.EndElement: + if v.Name.Local == "si" { + d.strings = append(d.strings, val) + continue + } + //log.Println(" end: ", v.Name.Local) + default: + //log.Printf("%T %+v", tok, tok) + } + } + if err == io.EOF { + err = nil + } + return err +} diff --git a/xlsx/xlsx.go b/xlsx/xlsx.go new file mode 100644 index 0000000..3dafcab --- /dev/null +++ b/xlsx/xlsx.go @@ -0,0 +1,157 @@ +package xlsx + +import ( + "archive/zip" + "encoding/xml" + "errors" + "io" + "os" + "path/filepath" + "strings" +) + +// Document contains an Office Open XML document. 
+type Document struct { + filename string + r *zip.Reader + primaryDoc string + + // type => id => filename + rels map[string]map[string]string + sheets []*Sheet + strings []string + xfs []string +} + +func Open(filename string) (*Document, error) { + f, err := os.Open(filename) + if err != nil { + return nil, err + } + info, err := f.Stat() + if err != nil { + return nil, err + } + z, err := zip.NewReader(f, info.Size()) + if err != nil { + return nil, err + } + d := &Document{ + filename: filename, + r: z, + } + + d.rels = make(map[string]map[string]string, 4) + + // parse the primary relationships + dec, c, err := d.openXML("_rels/.rels") + if err != nil { + return nil, err + } + err = d.parseRels(dec, "") + c.Close() + if err != nil { + return nil, err + } + if d.primaryDoc == "" { + return nil, errors.New("xlsx: invalid document") + } + + // parse the secondary relationships to primary doc + base := filepath.Base(d.primaryDoc) + sub := strings.TrimSuffix(d.primaryDoc, base) + relfn := filepath.Join(sub, "_rels", base+".rels") + dec, c, err = d.openXML(relfn) + if err != nil { + return nil, err + } + err = d.parseRels(dec, sub) + c.Close() + if err != nil { + return nil, err + } + + // parse the workbook structure + dec, c, err = d.openXML(d.primaryDoc) + if err != nil { + return nil, err + } + err = d.parseWorkbook(dec) + c.Close() + if err != nil { + return nil, err + } + + styn := d.rels["http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"] + for _, sst := range styn { + //log.Println(styn) + // parse the shared string table + dec, c, err = d.openXML(sst) + if err != nil { + return nil, err + } + err = d.parseStyles(dec) + c.Close() + if err != nil { + return nil, err + } + } + + ssn := d.rels["http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"] + for _, sst := range ssn { + //log.Println(ssn) + // parse the shared string table + dec, c, err = d.openXML(sst) + if err != nil { + return nil, err + } + err 
= d.parseSharedStrings(dec) + c.Close() + if err != nil { + return nil, err + } + } + + for _, s := range d.sheets { + err = s.parseSheet() + if err != nil { + return nil, err + } + } + + return d, nil +} + +func (d *Document) openXML(name string) (*xml.Decoder, io.Closer, error) { + //log.Println(name) + for _, zf := range d.r.File { + if zf.Name == name { + zfr, err := zf.Open() + if err != nil { + return nil, nil, err + } + dec := xml.NewDecoder(zfr) + return dec, zfr, nil + } + } + return nil, nil, io.EOF +} + +func (d *Document) Sheets() []string { + res := make([]string, 0, len(d.sheets)) + for _, s := range d.sheets { + //if (s.HiddenState & 0x03) == 0 { + res = append(res, s.name) + //} + } + return res +} + +func (d *Document) Get(sheetName string) (*Sheet, error) { + for _, s := range d.sheets { + if s.name == sheetName { + return s, nil + } + } + return nil, errors.New("xlsx: sheet not found") +}