1
0
mirror of https://github.com/pbnjay/grate.git synced 2025-07-06 06:37:23 +02:00

refactor xls2tsv and add some features

This commit is contained in:
Jeremy Jay
2021-02-10 01:34:35 -05:00
parent 77213bcf00
commit ee3b4224e0

View File

@ -1,6 +1,7 @@
package main package main
import ( import (
"bufio"
"context" "context"
"flag" "flag"
"fmt" "fmt"
@ -10,41 +11,121 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"regexp" "regexp"
"runtime/pprof"
"strings" "strings"
"time"
"github.com/pbnjay/grate/xls" "github.com/pbnjay/grate/xls"
) )
// Command-line options. Each one is registered on the default flag set during
// package initialization; the pointed-to values are final once main has called
// flag.Parse.
var (
	// Options that name a file.
	infoFile   = flag.String("i", "results.txt", "`filename` to record stats about the process")
	logfile    = flag.String("l", "", "save processing logs to `filename.txt`")
	cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

	// Boolean switches controlling how the output is produced.
	pretend        = flag.Bool("p", false, "pretend to output .tsv")
	removeNewlines = flag.Bool("r", true, "remove embedded tabs, newlines, and condense spaces in cell contents")
	trimSpaces     = flag.Bool("w", true, "trim whitespace from cell contents")
	skipBlanks     = flag.Bool("b", true, "discard blank rows from the output")
)
func main() { func main() {
pretend := flag.Bool("p", false, "pretend to output .tsv") timeFormat := "2006-01-02 15:04:05"
//infoOnly := flag.Bool("i", false, "show info/stats ONLY")
removeNewlines := flag.Bool("r", true, "remove embedded tabs, newlines, and condense spaces in cell contents")
trimSpaces := flag.Bool("w", true, "trim whitespace from cell contents")
skipBlanks := flag.Bool("b", true, "discard blank rows from the output")
flag.Parse() flag.Parse()
sanitize := regexp.MustCompile("[^a-zA-Z0-9]+") if *cpuprofile != "" {
newlines := regexp.MustCompile("[ \n\r\t]+") f, err := os.Create(*cpuprofile)
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
}
if *logfile != "" {
fo, err := os.Create(*logfile)
if err != nil {
log.Fatal(err)
}
defer fo.Close()
log.SetOutput(fo)
}
fstats, err := os.OpenFile(*infoFile, os.O_CREATE|os.O_RDWR, 0644)
if err != nil {
log.Fatal(err)
}
defer fstats.Close()
pos, err := fstats.Seek(0, io.SeekEnd)
if err != nil {
log.Fatal(err)
}
if pos == 0 {
fmt.Fprintf(fstats, "time\tfilename\tsheet\trows\tcolumns\terrors\n")
}
for _, fn := range flag.Args() { for _, fn := range flag.Args() {
nowFmt := time.Now().Format(timeFormat)
results, err := processFile(fn)
if err != nil {
// returned errors are fatal
fmt.Fprintf(fstats, "%s\t%s\t-\t-\t-\t%s\n", nowFmt, fn, err.Error())
continue
}
for _, res := range results {
e := "-"
if res.Err != nil {
e = res.Err.Error()
}
fmt.Fprintf(fstats, "%s\t%s\t%s\t%d\t%d\t%s\n", nowFmt, res.Filename, res.SheetName,
res.NumRows, res.NumCols, e)
}
}
}
// sanitize matches every run of one or more characters that are not ASCII
// letters or digits. processFile replaces each such run in a sheet name with
// "_" before using the name as part of an output file name.
var sanitize = regexp.MustCompile("[^a-zA-Z0-9]+")

// newlines matches every run of one or more spaces, line feeds, carriage
// returns, or tabs.
var newlines = regexp.MustCompile("[ \n\r\t]+")
// stats is the per-sheet result collected by processFile. main writes one
// tab-separated line to the info file for each value.
type stats struct {
	// Filename is the input path exactly as it was passed to processFile;
	// SheetName is the sheet name as reported by the workbook.
	Filename, SheetName string

	// NumRows counts the rows written to the output. NumCols is the largest
	// column index i seen on a non-empty cell.
	// NOTE(review): if i is zero-based this is one less than the column
	// count — confirm against the cell loop.
	NumRows, NumCols int

	// Err is the error hit while opening the sheet, or nil.
	Err error
}
// Flusher is satisfied by any value with a Flush method that returns an error,
// such as *bufio.Writer. processFile type-asserts its output writer to Flusher
// and flushes it once a sheet has been written.
type Flusher interface{ Flush() error }
func processFile(fn string) ([]stats, error) {
log.Printf("Opening file '%s' ...", fn) log.Printf("Opening file '%s' ...", fn)
wb, err := xls.Open(context.Background(), fn) wb, err := xls.Open(context.Background(), fn)
if err != nil { if err != nil {
log.Println(err) return nil, err
continue
} }
results := []stats{}
ext := filepath.Ext(fn) ext := filepath.Ext(fn)
fn2 := filepath.Base(strings.TrimSuffix(fn, ext)) fn2 := filepath.Base(strings.TrimSuffix(fn, ext))
for _, s := range wb.Sheets() { for _, s := range wb.Sheets() {
ps := stats{
Filename: fn,
SheetName: s,
}
log.Printf(" Opening Sheet '%s'...", s) log.Printf(" Opening Sheet '%s'...", s)
sheet, err := wb.Get(s) sheet, err := wb.Get(s)
if err != nil { if err != nil {
log.Println(err) ps.Err = err
results = append(results, ps)
continue continue
} }
if sheet.IsEmpty() { if sheet.IsEmpty() {
log.Println(" Empty sheet. Skipping.") log.Println(" Empty sheet. Skipping.")
results = append(results, ps)
continue continue
} }
s2 := sanitize.ReplaceAllString(s, "_") s2 := sanitize.ReplaceAllString(s, "_")
@ -52,10 +133,10 @@ func main() {
if !*pretend { if !*pretend {
f, err := os.Create(fn2 + "." + s2 + ".tsv") f, err := os.Create(fn2 + "." + s2 + ".tsv")
if err != nil { if err != nil {
log.Fatal(err) return nil, err
} }
defer f.Close() defer f.Close()
w = f w = bufio.NewWriter(f)
} }
for sheet.Next() { for sheet.Next() {
@ -71,15 +152,24 @@ func main() {
} }
if x != "" { if x != "" {
nonblank = true nonblank = true
if ps.NumCols < i {
ps.NumCols = i
}
} }
} }
if nonblank || !*skipBlanks { if nonblank || !*skipBlanks {
fmt.Fprintln(w, strings.Join(row, "\t")) fmt.Fprintln(w, strings.Join(row, "\t"))
ps.NumRows++
} }
} }
results = append(results, ps)
if ff, ok := w.(Flusher); ok {
ff.Flush()
}
if c, ok := w.(io.Closer); ok { if c, ok := w.(io.Closer); ok {
c.Close() c.Close()
} }
} }
} return results, nil
} }