1
0
mirror of https://github.com/MontFerret/ferret.git synced 2024-12-04 10:35:08 +02:00

Updated README

This commit is contained in:
Tim Voronov 2018-09-18 21:41:16 -04:00
parent e02e861240
commit b700d17ac5
9 changed files with 474 additions and 27 deletions

1
Gopkg.lock generated
View File

@ -195,6 +195,7 @@
"github.com/pkg/errors",
"github.com/sethgrid/pester",
"github.com/smartystreets/goconvey/convey",
"golang.org/x/net/html",
"golang.org/x/sync/errgroup",
]
solver-name = "gps-cdcl"

257
README.md
View File

@ -1,32 +1,49 @@
# Ferret
> Web scraping query language
## Installation
### Prerequisites
* Go >=1.6
* GoDep
* Chrome (optional)
* GNU Make
```sh
make build
```
## Features
* Declarative language
* Support of JS-rendered pages
* Embeddable
* Extensible
## Motivation
Nowadays data is everything and who owns data - owns the world.
I have worked on multiple data-driven projects where data was an essential part of the system where I realized how cumbersome writing tons of scrapers is.
I was looking for some kind of tool that would let me to not write a code, but just express what data I need.
Unfortunately, I didn't find anything, and therefore decided to create one.
```Ferret``` project is an ambitious initiative to bring universal platform for writing scrapers without any hassle.
I have worked on multiple data-driven projects where data was an essential part of a system and I realized how cumbersome writing tons of scrapers is.
After some time looking for a tool that would let me not write code, but just express what data I need, I decided to come up with my own solution.
```Ferret``` project is an ambitious initiative trying to bring a universal platform for writing scrapers without any hassle.
## Inspiration
FQL (Ferret Query Language) is heavily inspired by [AQL](https://www.arangodb.com/) (ArangoDB Query Language).
But due to domain specifics, there are some differences in how things work.
But due to the domain specifics, there are some differences in how things work.
## WIP
Be aware that the project is under heavy development. There is no documentation and some things may change in the final release.
For query syntax, you may go to [ArrangoDB web site](https://docs.arangodb.com/3.3/AQL/index.html) and use AQL docs as a docs for FQL - since they are identical.
For query syntax, you may go to the [ArangoDB website](https://docs.arangodb.com/3.3/AQL/index.html) and use AQL docs as docs for FQL - since they are identical.
## Quick stark
## Quick start
### Browserless mode
If you want to play with ```fql``` and check its syntax, run CLI with the following commands:
If you want to play with ```fql``` and check its syntax, you can run CLI with the following commands:
```
go run ./cmd/cli/main.go
```
```ferret``` will run REPL.
```ferret``` will run in REPL mode.
```shell
Welcome to Ferret REPL
@ -39,7 +56,7 @@ Please use `Ctrl-D` to exit this program.
**Note:** backslash is used for multiline queries.
If you want to execute a query store in a file, just type a file name
If you want to execute a query stored in a file, just pass a file name:
```
go run ./cmd/cli/main.go ./docs/examples/hackernews.fql
@ -48,9 +65,19 @@ go run ./cmd/cli/main.go ./docs/examples/hackernews.fql
### Browser mode
By default, ``ferret`` loads HTML pages via http protocol since it's faster.
By default, ``ferret`` loads HTML pages via the HTTP protocol, because it's faster.
But nowadays, there are more and more websites rendered with JavaScript, and therefore, this 'old school' approach does not really work.
For this case, you may fetch documents using Chrome or Chromium via Chrome DevTools protocol (aka CDP).
For such cases, you may fetch documents using Chrome or Chromium via Chrome DevTools protocol (aka CDP).
First, you need to make sure that you launched Chrome with ```remote-debugging-port=9222``` flag.
Second, you need to pass the address to ```ferret``` CLI.
```
./bin/ferret --cdp http://127.0.0.1:9222
```
**NOTE:** By default, ```ferret``` uses this local address, so you only need to pass the parameter explicitly when Chrome listens on a different port or on a remote address.
Alternatively, you can tell CLI to launch Chrome for you.
```shell
go run ./cmd/cli/main.go --cdp-launch
@ -58,13 +85,7 @@ go run ./cmd/cli/main.go --cdp-launch
**Note:** Launch command is currently broken on MacOS.
Alternatively, you may open Chrome manually with ```remote-debugging-port=9222``` arguments and bass the address to ``ferret``:
```
./bin/ferret --cdp http://127.0.0.1:9222
```
In this case, you can use function ```DOCUMENT(url, isJsRendered)``` with ```true``` for loading JS rendered pages:
Once ```ferret``` knows how to communicate with Chrome, you can use a function ```DOCUMENT(url, isJsRendered)``` with ```true``` boolean value for loading JS rendered pages:
```shell
Welcome to Ferret REPL
@ -80,4 +101,198 @@ Please use `exit` or `Ctrl-D` to exit this program.
> artist: username.innerText,
> track: title.innerText
> }
```
```
### Embedded mode
```ferret``` is a very modular system and therefore can easily be embedded into your Go application.
```go
package main
import (
"context"
"encoding/json"
"fmt"
"github.com/MontFerret/ferret/pkg/compiler"
"os"
)
// Topic is one entry produced by the query in getTopTenTrendingTopics; the
// JSON tags match the keys of the object the query returns.
type Topic struct {
	Name        string `json:"name"`
	Description string `json:"description"`
	Url         string `json:"url"`
}
// main fetches the trending topics and prints one "name: description url"
// line per topic. On failure it prints the error and exits with status 1.
func main() {
	topics, err := getTopTenTrendingTopics()

	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	for _, topic := range topics {
		// Printf formats and writes in one step, no intermediate string needed.
		fmt.Printf("%s: %s %s\n", topic.Name, topic.Description, topic.Url)
	}
}
// getTopTenTrendingTopics compiles and runs an FQL query against
// https://github.com/topics (limited to 10 elements) and decodes the JSON
// output into a slice of Topic. Any compile, run or decode error is returned
// unchanged together with a nil slice.
func getTopTenTrendingTopics() ([]*Topic, error) {
	query := `
LET doc = DOCUMENT("https://github.com/topics")
FOR el IN ELEMENTS(doc, ".py-4.border-bottom")
LIMIT 10
LET url = ELEMENT(el, "a")
LET name = ELEMENT(el, ".f3")
LET desc = ELEMENT(el, ".f5")
RETURN {
name: TRIM(name.innerText),
description: TRIM(desc.innerText),
url: "https://github.com" + url.attributes.href
}
`

	program, err := compiler.New().Compile(query)
	if err != nil {
		return nil, err
	}

	raw, err := program.Run(context.Background())
	if err != nil {
		return nil, err
	}

	// The query is limited to 10 results, hence the initial capacity.
	topics := make([]*Topic, 0, 10)

	if err := json.Unmarshal(raw, &topics); err != nil {
		return nil, err
	}

	return topics, nil
}
```
## Extensibility
That said, ```ferret``` is a very modular system which allows you not only to embed it, but also to extend its standard library.
```go
package main
import (
"context"
"encoding/json"
"fmt"
"github.com/MontFerret/ferret/pkg/compiler"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/values"
"os"
)
// main prints each string produced by getStrings on its own line, or prints
// the error and exits with status 1 when getStrings fails.
func main() {
	results, err := getStrings()
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	for i := range results {
		fmt.Println(results[i])
	}
}
// getStrings registers a custom TRANSFORM function with the compiler, runs a
// query that applies it to three string literals and decodes the JSON output
// into a slice of strings. Any registration, compile, run or decode error is
// returned unchanged together with a nil slice.
func getStrings() ([]string, error) {
	// transform has the function signature ferret uses for runtime functions
	transform := func(ctx context.Context, args ...core.Value) (core.Value, error) {
		// helper that validates the number of passed args
		err := core.ValidateArgs(args, 1)

		if err != nil {
			// it's recommended to return the built-in None value instead of nil
			return values.None, err
		}

		// another helper, this one validates the type of the argument
		err = core.ValidateType(args[0], core.StringType)

		if err != nil {
			return values.None, err
		}

		// cast to built-in string type
		str := args[0].(values.String)

		return str.Concat(values.NewString("_ferret")).ToUpper(), nil
	}

	query := `
FOR el IN ["foo", "bar", "qaz"]
// conventionally all functions are registered in upper case
RETURN TRANSFORM(el)
`

	comp := compiler.New()

	// RegisterFunction returns an error (e.g. when the name already exists);
	// do not silently discard it.
	if err := comp.RegisterFunction("transform", transform); err != nil {
		return nil, err
	}

	program, err := comp.Compile(query)

	if err != nil {
		return nil, err
	}

	out, err := program.Run(context.Background())

	if err != nil {
		return nil, err
	}

	res := make([]string, 0, 3)

	err = json.Unmarshal(out, &res)

	if err != nil {
		return nil, err
	}

	return res, nil
}
```
On top of that, you can completely turn off the standard library by passing the following option:
```go
comp := compiler.New(compiler.WithoutStdlib())
```
And after that, you can easily provide your own implementation of functions from standard library.
If you don't need a particular set of functions from the standard library, you can turn off the entire ```stdlib``` and register separate packages from it:
```go
package main
import (
"github.com/MontFerret/ferret/pkg/compiler"
"github.com/MontFerret/ferret/pkg/stdlib/strings"
)
// main builds a compiler without the standard library and registers only the
// functions returned by strings.NewLib().
func main() {
	comp := compiler.New(compiler.WithoutStdlib())

	// RegisterFunctions returns an error; do not silently discard it.
	if err := comp.RegisterFunctions(strings.NewLib()); err != nil {
		panic(err)
	}
}
```

View File

@ -25,7 +25,7 @@ var (
conn = flag.String(
"cdp",
"",
"http://127.0.0.1:9222",
"Chrome DevTools Protocol address",
)

70
docs/examples/embedded.go Normal file
View File

@ -0,0 +1,70 @@
package main
import (
"context"
"encoding/json"
"fmt"
"github.com/MontFerret/ferret/pkg/compiler"
"os"
)
// Topic is one entry produced by the query in getTopTenTrendingTopics; the
// JSON tags match the keys of the object the query returns.
type Topic struct {
	Name        string `json:"name"`
	Description string `json:"description"`
	Url         string `json:"url"`
}
// main fetches the trending topics and prints one "name: description url"
// line per topic. On failure it prints the error and exits with status 1.
func main() {
	topics, err := getTopTenTrendingTopics()

	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	for _, topic := range topics {
		// Printf formats and writes in one step, no intermediate string needed.
		fmt.Printf("%s: %s %s\n", topic.Name, topic.Description, topic.Url)
	}
}
// getTopTenTrendingTopics compiles and runs an FQL query against
// https://github.com/topics (limited to 10 elements) and decodes the JSON
// output into a slice of Topic. Any compile, run or decode error is returned
// unchanged together with a nil slice.
func getTopTenTrendingTopics() ([]*Topic, error) {
	query := `
LET doc = DOCUMENT("https://github.com/topics")
FOR el IN ELEMENTS(doc, ".py-4.border-bottom")
LIMIT 10
LET url = ELEMENT(el, "a")
LET name = ELEMENT(el, ".f3")
LET desc = ELEMENT(el, ".f5")
RETURN {
name: TRIM(name.innerText),
description: TRIM(desc.innerText),
url: "https://github.com" + url.attributes.href
}
`

	program, err := compiler.New().Compile(query)
	if err != nil {
		return nil, err
	}

	raw, err := program.Run(context.Background())
	if err != nil {
		return nil, err
	}

	// The query is limited to 10 results, hence the initial capacity.
	topics := make([]*Topic, 0, 10)

	if err := json.Unmarshal(raw, &topics); err != nil {
		return nil, err
	}

	return topics, nil
}

View File

@ -0,0 +1,80 @@
package main
import (
"context"
"encoding/json"
"fmt"
"github.com/MontFerret/ferret/pkg/compiler"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/values"
"os"
)
// main prints each string produced by getStrings on its own line, or prints
// the error and exits with status 1 when getStrings fails.
func main() {
	results, err := getStrings()
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	for i := range results {
		fmt.Println(results[i])
	}
}
// getStrings registers a custom TRANSFORM function with the compiler, runs a
// query that applies it to three string literals and decodes the JSON output
// into a slice of strings. Any registration, compile, run or decode error is
// returned unchanged together with a nil slice.
func getStrings() ([]string, error) {
	// transform has the function signature ferret uses for runtime functions
	transform := func(ctx context.Context, args ...core.Value) (core.Value, error) {
		// helper that validates the number of passed args
		err := core.ValidateArgs(args, 1)

		if err != nil {
			// it's recommended to return the built-in None value instead of nil
			return values.None, err
		}

		// another helper, this one validates the type of the argument
		err = core.ValidateType(args[0], core.StringType)

		if err != nil {
			return values.None, err
		}

		// cast to built-in string type
		str := args[0].(values.String)

		return str.Concat(values.NewString("_ferret")).ToUpper(), nil
	}

	query := `
FOR el IN ["foo", "bar", "qaz"]
// conventionally all functions are registered in upper case
RETURN TRANSFORM(el)
`

	comp := compiler.New()

	// RegisterFunction returns an error (e.g. when the name already exists);
	// do not silently discard it.
	if err := comp.RegisterFunction("transform", transform); err != nil {
		return nil, err
	}

	program, err := comp.Compile(query)

	if err != nil {
		return nil, err
	}

	out, err := program.Run(context.Background())

	if err != nil {
		return nil, err
	}

	res := make([]string, 0, 3)

	err = json.Unmarshal(out, &res)

	if err != nil {
		return nil, err
	}

	return res, nil
}

View File

@ -6,13 +6,27 @@ import (
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/stdlib"
"github.com/pkg/errors"
"strings"
)
type FqlCompiler struct {
funcs map[string]core.Function
}
func New() *FqlCompiler {
func New(setters ...Option) *FqlCompiler {
c := &FqlCompiler{}
opts := &Options{}
for _, setter := range setters {
setter(opts)
}
if !opts.noStdlib {
c.funcs = stdlib.NewLib()
} else {
c.funcs = make(map[string]core.Function)
}
return &FqlCompiler{
stdlib.NewLib(),
}
@ -25,7 +39,17 @@ func (c *FqlCompiler) RegisterFunction(name string, fun core.Function) error {
return errors.Errorf("function already exists: %s", name)
}
c.funcs[name] = fun
c.funcs[strings.ToUpper(name)] = fun
return nil
}
// RegisterFunctions registers every entry of funcs through RegisterFunction.
// It stops at the first failure and returns that error; nothing is rolled
// back, so entries registered before the failure stay registered.
func (c *FqlCompiler) RegisterFunctions(funcs map[string]core.Function) error {
	for fnName, fn := range funcs {
		err := c.RegisterFunction(fnName, fn)

		if err != nil {
			return err
		}
	}

	return nil
}

14
pkg/compiler/options.go Normal file
View File

@ -0,0 +1,14 @@
package compiler
// Option is a function that mutates an Options value.
type Option func(opts *Options)

// Options holds the settings collected from Option functions.
type Options struct {
	// noStdlib is set to true by WithoutStdlib.
	noStdlib bool
}
// WithoutStdlib returns an Option that sets Options.noStdlib to true.
func WithoutStdlib() Option {
	disable := func(opts *Options) {
		opts.noStdlib = true
	}

	return disable
}

View File

@ -7,9 +7,9 @@ import (
type Function = func(ctx context.Context, args ...Value) (Value, error)
func ValidateArgs(inputs []Value, required int) error {
if len(inputs) != required {
return Error(ErrMissedArgument, fmt.Sprintf("expected %d, but got %d arguments", required, len(inputs)))
func ValidateArgs(args []Value, required int) error {
if len(args) != required {
return Error(ErrMissedArgument, fmt.Sprintf("expected %d, but got %d arguments", required, len(args)))
}
return nil

View File

@ -112,3 +112,46 @@ func (t String) IndexOf(other String) Int {
// Concat returns a new String made of t followed by other.String().
func (t String) Concat(other core.Value) String {
	joined := string(t) + other.String()

	return String(joined)
}
// Replace returns a copy of t in which up to times occurrences of old are
// replaced by new. It delegates to strings.Replace, so a negative times
// puts no limit on the number of replacements.
func (t String) Replace(old, new String, times Int) String {
	replaced := strings.Replace(string(t), string(old), string(new), int(times))

	return NewString(replaced)
}
// Remove returns the slice t[startIndex : t.Length()-1].
//
// NOTE(review): the upper bound of Length()-1 also drops the last element,
// and the method returns the tail of t rather than what precedes startIndex.
// This looks unintended; confirm the expected semantics (t[:startIndex] or
// t[startIndex:]?) before relying on it. Out-of-range bounds make the slice
// expression panic.
func (t String) Remove(startIndex Int) String {
	end := t.Length() - 1

	return t[startIndex:end]
}
// EndsWith reports whether t ends with other, as decided by strings.HasSuffix.
func (t String) EndsWith(other String) Boolean {
	hasSuffix := strings.HasSuffix(string(t), string(other))

	return Boolean(hasSuffix)
}
// StartsWith reports whether t begins with other, as decided by
// strings.HasPrefix.
func (t String) StartsWith(other String) Boolean {
	hasPrefix := strings.HasPrefix(string(t), string(other))

	return Boolean(hasPrefix)
}
// Split cuts t around each occurrence of separator using strings.Split and
// pushes every piece, wrapped with NewString, onto a new Array.
func (t String) Split(separator String) *Array {
	parts := strings.Split(string(t), string(separator))
	result := NewArray(len(parts))

	for i := 0; i < len(parts); i++ {
		result.Push(NewString(parts[i]))
	}

	return result
}
// Substring returns the slice t[start:end]: start is inclusive, end is
// exclusive. Out-of-range bounds make the slice expression panic.
// NOTE(review): the bounds are presumably byte offsets, not rune positions
// (String appears to be string-based) - confirm against the type definition.
func (t String) Substring(start, end Int) String {
	sub := t[start:end]

	return sub
}
// Trim returns t with leading and trailing white space removed, as defined
// by strings.TrimSpace.
func (t String) Trim() String {
	trimmed := strings.TrimSpace(string(t))

	return NewString(trimmed)
}
// ToUpper returns t converted to upper case via strings.ToUpper.
func (t String) ToUpper() String {
	upper := strings.ToUpper(string(t))

	return NewString(upper)
}
// ToLower returns t converted to lower case via strings.ToLower.
func (t String) ToLower() String {
	lower := strings.ToLower(string(t))

	return NewString(lower)
}