mirror of
https://github.com/pbnjay/grate.git
synced 2026-05-16 17:16:40 +02:00
adding a simple-format reader for csv/tsv
This commit is contained in:
@@ -0,0 +1,55 @@
|
||||
package simple
|
||||
|
||||
import (
	"encoding/csv"
	"io"
	"os"

	"github.com/pbnjay/grate"
)
|
||||
|
||||
// Register OpenCSV with grate under the name "csv" at package initialization.
// NOTE(review): the second argument (15) is presumably a priority/ordering
// value relative to other registered openers — confirm against grate.Register.
var _ = grate.Register("csv", 15, OpenCSV)
|
||||
|
||||
// OpenCSV defines a Source's instantiation function.
|
||||
// It should return ErrNotInFormat immediately if filename is not of the correct file type.
|
||||
func OpenCSV(filename string) (grate.Source, error) {
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
t := &simpleFile{
|
||||
filename: filename,
|
||||
iterRow: -1,
|
||||
}
|
||||
|
||||
s := csv.NewReader(f)
|
||||
s.FieldsPerRecord = -1
|
||||
|
||||
total := 0
|
||||
ncols := make(map[int]int)
|
||||
rec, err := s.Read()
|
||||
for ; err == nil; rec, err = s.Read() {
|
||||
ncols[len(rec)]++
|
||||
total++
|
||||
t.rows = append(t.rows, rec)
|
||||
}
|
||||
|
||||
// kinda arbitrary metrics for detecting CSV
|
||||
looksGood := 0
|
||||
for c, n := range ncols {
|
||||
if c <= 1 {
|
||||
continue
|
||||
}
|
||||
if n > 10 && float64(n)/float64(total) > 0.8 {
|
||||
// more than 80% of rows have the same number of columns, we're good
|
||||
looksGood = 2
|
||||
} else if n > 25 && looksGood == 0 {
|
||||
looksGood = 1
|
||||
}
|
||||
}
|
||||
if looksGood == 1 {
|
||||
return t, grate.ErrNotInFormat
|
||||
}
|
||||
|
||||
return t, nil
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
package simple
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/pbnjay/grate"
|
||||
)
|
||||
|
||||
// simpleFile is an in-memory table of string records loaded from one
// delimited text file, plus a cursor used to iterate over those records.
type simpleFile struct {
	// filename is the path the records were read from; it is also the only
	// name reported by List.
	filename string

	// rows holds every record, one slice of field strings per record.
	rows [][]string

	// iterRow is the index in rows of the current record. The openers set it
	// to -1, i.e. before the first record, and Next advances it.
	iterRow int
}
|
||||
|
||||
// List the individual data tables within this source.
|
||||
func (t *simpleFile) List() ([]string, error) {
|
||||
return []string{t.filename}, nil
|
||||
}
|
||||
|
||||
// Get a Collection from the source by name.
|
||||
func (t *simpleFile) Get(name string) (grate.Collection, error) {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// Next advances to the next record of content.
|
||||
// It MUST be called prior to any Scan().
|
||||
func (t *simpleFile) Next() bool {
|
||||
t.iterRow++
|
||||
return t.iterRow < len(t.rows)
|
||||
}
|
||||
|
||||
// Strings extracts values from the current record into a list of strings.
|
||||
func (t *simpleFile) Strings() []string {
|
||||
return t.rows[t.iterRow]
|
||||
}
|
||||
|
||||
// Scan extracts values from the current record into the provided arguments
|
||||
// Arguments must be pointers to one of 5 supported types:
|
||||
// bool, int, float64, string, or time.Time
|
||||
func (t *simpleFile) Scan(args ...interface{}) error {
|
||||
var err error
|
||||
row := t.rows[t.iterRow]
|
||||
if len(row) != len(args) {
|
||||
return fmt.Errorf("grate/simple: expected %d Scan destinations, got %d", len(row), len(args))
|
||||
}
|
||||
|
||||
for i, a := range args {
|
||||
switch v := a.(type) {
|
||||
case *bool:
|
||||
switch strings.ToLower(row[i]) {
|
||||
case "1", "t", "true", "y", "yes":
|
||||
*v = true
|
||||
default:
|
||||
*v = false
|
||||
}
|
||||
case *int:
|
||||
var n int64
|
||||
n, err = strconv.ParseInt(row[i], 10, 64)
|
||||
*v = int(n)
|
||||
case *float64:
|
||||
*v, err = strconv.ParseFloat(row[i], 64)
|
||||
case *string:
|
||||
*v = row[i]
|
||||
case *time.Time:
|
||||
return errors.New("grate/simple: time.Time not supported, you must parse string manually")
|
||||
default:
|
||||
return errors.New("grate/simple: scan destination must be one of: *bool, *int, *float64, *string, or *time.Time")
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsEmpty returns true if there are no data values.
|
||||
func (t *simpleFile) IsEmpty() bool {
|
||||
return len(t.rows) == 0
|
||||
}
|
||||
|
||||
// Err returns the last error that occured.
|
||||
func (t *simpleFile) Err() error {
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
package simple
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/pbnjay/grate"
|
||||
)
|
||||
|
||||
// Register OpenTSV with grate under the name "tsv" at package initialization.
// NOTE(review): the second argument (10) is presumably a priority/ordering
// value relative to other registered openers — confirm against grate.Register.
var _ = grate.Register("tsv", 10, OpenTSV)
|
||||
|
||||
// OpenTSV defines a Source's instantiation function.
|
||||
// It should return ErrNotInFormat immediately if filename is not of the correct file type.
|
||||
func OpenTSV(filename string) (grate.Source, error) {
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
t := &simpleFile{
|
||||
filename: filename,
|
||||
iterRow: -1,
|
||||
}
|
||||
|
||||
s := bufio.NewScanner(f)
|
||||
total := 0
|
||||
ncols := make(map[int]int)
|
||||
for s.Scan() {
|
||||
r := strings.Split(s.Text(), "\t")
|
||||
ncols[len(r)]++
|
||||
total++
|
||||
t.rows = append(t.rows, r)
|
||||
}
|
||||
|
||||
// kinda arbitrary metrics for detecting TSV
|
||||
looksGood := 0
|
||||
for c, n := range ncols {
|
||||
if c <= 1 {
|
||||
continue
|
||||
}
|
||||
if n > 10 && float64(n)/float64(total) > 0.8 {
|
||||
// more than 80% of rows have the same number of columns, we're good
|
||||
looksGood = 2
|
||||
} else if n > 25 && looksGood == 0 {
|
||||
looksGood = 1
|
||||
}
|
||||
}
|
||||
if looksGood == 1 {
|
||||
return t, grate.ErrNotInFormat
|
||||
}
|
||||
|
||||
return t, nil
|
||||
}
|
||||
Reference in New Issue
Block a user