1
0
mirror of https://github.com/pbnjay/grate.git synced 2026-05-16 17:16:40 +02:00

adding a simple-format reader for csv/tsv

This commit is contained in:
Jeremy Jay
2021-02-12 01:00:35 -05:00
parent 749e19458a
commit f25b853fdf
3 changed files with 198 additions and 0 deletions
+55
View File
@@ -0,0 +1,55 @@
package simple
import (
	"encoding/csv"
	"io"
	"os"

	"github.com/pbnjay/grate"
)
// Register OpenCSV with grate under the name "csv" when the package is initialized.
// NOTE(review): the second argument (15) is presumably a priority/ordering value
// relative to other registered openers - confirm against grate.Register.
var _ = grate.Register("csv", 15, OpenCSV)
// OpenCSV defines a Source's instantiation function for comma-separated files.
//
// The whole file is parsed eagerly into memory. It returns grate.ErrNotInFormat
// when the content cannot be parsed as CSV, or when the parsed rows fail the
// column-count heuristic below. Failures to open or read the file are returned
// unchanged. The returned Source is nil whenever the error is non-nil.
func OpenCSV(filename string) (grate.Source, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	t := &simpleFile{
		filename: filename,
		iterRow:  -1, // positioned before the first row until Next is called
	}

	s := csv.NewReader(f)
	s.FieldsPerRecord = -1 // accept ragged rows; the heuristic below judges consistency

	total := 0
	ncols := make(map[int]int) // field count -> number of rows with that many fields
	for {
		rec, rerr := s.Read()
		if rerr == io.EOF {
			break
		}
		if rerr != nil {
			// Previously any read error just ended the loop, so a malformed or
			// unreadable file was accepted as a truncated (possibly empty) table.
			if _, ok := rerr.(*csv.ParseError); ok {
				return nil, grate.ErrNotInFormat
			}
			return nil, rerr
		}
		ncols[len(rec)]++
		total++
		t.rows = append(t.rows, rec)
	}

	// kinda arbitrary metrics for detecting CSV
	// looksGood ends at 2 if some multi-column width covers >80% of rows (and
	// more than 10 rows), at 1 if no width does but some multi-column width
	// occurs on more than 25 rows, and at 0 otherwise (e.g. small files).
	looksGood := 0
	for c, n := range ncols {
		if c <= 1 {
			// single-field rows say nothing about the delimiter
			continue
		}
		if n > 10 && float64(n)/float64(total) > 0.8 {
			// more than 80% of rows have the same number of columns, we're good
			looksGood = 2
		} else if n > 25 && looksGood == 0 {
			looksGood = 1
		}
	}
	// Only the "many rows but no dominant width" case is rejected.
	if looksGood == 1 {
		return nil, grate.ErrNotInFormat
	}

	return t, nil
}
+89
View File
@@ -0,0 +1,89 @@
package simple
import (
"errors"
"fmt"
"strconv"
"strings"
"time"
"github.com/pbnjay/grate"
)
// simpleFile is an in-memory table of string records loaded from a delimited
// text file. Its methods cover both listing/fetching the single table it holds
// and iterating over that table's rows.
type simpleFile struct {
// filename is the path given to the opener; List reports it as the table name.
filename string
// rows holds every record read from the file, one slice of fields per record.
rows [][]string
// iterRow is the index into rows of the current record; the openers
// initialize it to -1 and Next increments it.
iterRow int
}
// List reports the names of the data tables within this source.
// A simpleFile holds a single table, named after the file it was read from.
func (t *simpleFile) List() ([]string, error) {
	names := make([]string, 0, 1)
	names = append(names, t.filename)
	return names, nil
}
// Get returns the Collection for the requested table.
// The name argument is not consulted: the receiver itself is always returned,
// so the Collection shares its iteration state with the source.
func (t *simpleFile) Get(name string) (grate.Collection, error) {
	var only grate.Collection = t
	return only, nil
}
// Next moves the cursor forward by one record and reports whether a record
// is available at the new position.
// It MUST be called prior to any Scan().
func (t *simpleFile) Next() bool {
	pos := t.iterRow + 1
	t.iterRow = pos
	return pos < len(t.rows)
}
// Strings extracts values from the current record into a list of strings.
// The returned slice is the stored row itself, not a copy.
// It returns nil when there is no current record, i.e. before the first call
// to Next or after Next has returned false, instead of panicking on an
// out-of-range index.
func (t *simpleFile) Strings() []string {
	if t.iterRow < 0 || t.iterRow >= len(t.rows) {
		return nil
	}
	return t.rows[t.iterRow]
}
// Scan extracts values from the current record into the provided arguments.
// Arguments must be pointers to one of the supported types:
// bool, int, float64, or string. A *time.Time destination is recognized but
// rejected with an error; parse the string value manually instead.
//
// Exactly one destination must be given per field of the current record.
// An error is returned when there is no current record (Next has not been
// called, or has returned false), when the destination count does not match,
// when a destination has an unsupported type, or when a field cannot be
// parsed as the requested numeric type. Destinations preceding a failing
// field have already been written when the error is returned.
//
// Booleans are true for "1", "t", "true", "y" and "yes" (case-insensitive)
// and false for every other value.
func (t *simpleFile) Scan(args ...interface{}) error {
	// Guard the index so misuse yields an error rather than a panic.
	if t.iterRow < 0 || t.iterRow >= len(t.rows) {
		return errors.New("grate/simple: no current record, Next must be called before Scan")
	}

	row := t.rows[t.iterRow]
	if len(row) != len(args) {
		return fmt.Errorf("grate/simple: expected %d Scan destinations, got %d", len(row), len(args))
	}

	for i, a := range args {
		var err error
		switch v := a.(type) {
		case *bool:
			switch strings.ToLower(row[i]) {
			case "1", "t", "true", "y", "yes":
				*v = true
			default:
				*v = false
			}
		case *int:
			// Parse at the platform's int width so values that do not fit in
			// an int are reported as range errors instead of being silently
			// truncated by the conversion on 32-bit platforms.
			var n int64
			n, err = strconv.ParseInt(row[i], 10, strconv.IntSize)
			*v = int(n)
		case *float64:
			*v, err = strconv.ParseFloat(row[i], 64)
		case *string:
			*v = row[i]
		case *time.Time:
			return errors.New("grate/simple: time.Time not supported, you must parse string manually")
		default:
			return errors.New("grate/simple: scan destination must be one of: *bool, *int, *float64, *string, or *time.Time")
		}
		if err != nil {
			return err
		}
	}
	return nil
}
// IsEmpty reports whether the file yielded no records at all.
func (t *simpleFile) IsEmpty() bool {
	rowCount := len(t.rows)
	return rowCount == 0
}
// Err returns the last error that occurred.
// This implementation records no errors, so the result is always nil.
func (t *simpleFile) Err() error {
	var lastErr error // never assigned; stays nil
	return lastErr
}
+54
View File
@@ -0,0 +1,54 @@
package simple
import (
"bufio"
"os"
"strings"
"github.com/pbnjay/grate"
)
// Register OpenTSV with grate under the name "tsv" when the package is initialized.
// NOTE(review): the second argument (10) is presumably a priority/ordering value
// relative to other registered openers - confirm against grate.Register.
var _ = grate.Register("tsv", 10, OpenTSV)
// OpenTSV defines a Source's instantiation function for tab-separated files.
//
// The whole file is read eagerly into memory, one record per line, with
// fields split on tab characters. It returns grate.ErrNotInFormat when the
// rows fail the column-count heuristic below. Failures to open or read the
// file (including a line longer than bufio.Scanner's token limit) are
// returned unchanged. The returned Source is nil whenever the error is non-nil.
func OpenTSV(filename string) (grate.Source, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	t := &simpleFile{
		filename: filename,
		iterRow:  -1, // positioned before the first row until Next is called
	}

	s := bufio.NewScanner(f)
	total := 0
	ncols := make(map[int]int) // field count -> number of rows with that many fields
	for s.Scan() {
		r := strings.Split(s.Text(), "\t")
		ncols[len(r)]++
		total++
		t.rows = append(t.rows, r)
	}
	// Scan returns false on both EOF and failure; without this check a read
	// error or an over-long line would silently truncate the data.
	if err := s.Err(); err != nil {
		return nil, err
	}

	// kinda arbitrary metrics for detecting TSV
	// looksGood ends at 2 if some multi-column width covers >80% of rows (and
	// more than 10 rows), at 1 if no width does but some multi-column width
	// occurs on more than 25 rows, and at 0 otherwise (e.g. small files).
	looksGood := 0
	for c, n := range ncols {
		if c <= 1 {
			// lines without any tab say nothing about the delimiter
			continue
		}
		if n > 10 && float64(n)/float64(total) > 0.8 {
			// more than 80% of rows have the same number of columns, we're good
			looksGood = 2
		} else if n > 25 && looksGood == 0 {
			looksGood = 1
		}
	}
	// Only the "many rows but no dominant width" case is rejected.
	if looksGood == 1 {
		return nil, grate.ErrNotInFormat
	}

	return t, nil
}