diff --git a/Gopkg.lock b/Gopkg.lock index 5732f601..90db2139 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -195,6 +195,7 @@ "github.com/pkg/errors", "github.com/sethgrid/pester", "github.com/smartystreets/goconvey/convey", + "golang.org/x/net/html", "golang.org/x/sync/errgroup", ] solver-name = "gps-cdcl" diff --git a/README.md b/README.md index dc723c09..3d55723e 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,49 @@ # Ferret > Web scraping query language +## Installation + +### Prerequisites +* Go >=1.6 +* dep (Go dependency manager) +* Chrome (optional) +* GNU Make + +```sh +make build +``` + +## Features + +* Declarative language +* Support for JS-rendered pages +* Embeddable +* Extensible + ## Motivation Nowadays data is everything and who owns data - owns the world. -I have worked on multiple data-driven projects where data was an essential part of the system where I realized how cumbersome writing tons of scrapers is. -I was looking for some kind of tool that would let me to not write a code, but just express what data I need. -Unfortunately, I didn't find anything, and therefore decided to create one. -```Ferret``` project is an ambitious initiative to bring universal platform for writing scrapers without any hassle. +I have worked on multiple data-driven projects where data was an essential part of a system and I realized how cumbersome writing tons of scrapers is. +After some time looking for a tool that would let me simply express what data I need instead of writing code, I decided to come up with my own solution. +The ```Ferret``` project is an ambitious initiative that aims to provide a universal platform for writing scrapers without any hassle. ## Inspiration FQL (Ferret Query Language) is heavily inspired by [AQL](https://www.arangodb.com/) (ArangoDB Query Language). -But due to domain specifics, there are some differences in how things work. +But due to the domain specifics, there are some differences in how things work. ## WIP Be aware, the the project is under heavy development. 
There is no documentation and some things may change in the final release. -For query syntax, you may go to [ArrangoDB web site](https://docs.arangodb.com/3.3/AQL/index.html) and use AQL docs as a docs for FQL - since they are identical. +For query syntax, you may go to the [ArangoDB web site](https://docs.arangodb.com/3.3/AQL/index.html) and use AQL docs as docs for FQL - since they are identical. -## Quick stark +## Quick start ### Browserless mode -If you want to play with ```fql``` and check its syntax, run CLI with the following commands: +If you want to play with ```fql``` and check its syntax, you can run the CLI with the following commands: ``` go run ./cmd/cli/main.go - ``` -```ferret``` will run REPL. +```ferret``` will run in REPL mode. ```shell Welcome to Ferret REPL @@ -39,7 +56,7 @@ Please use `Ctrl-D` to exit this program. **Note:** blackslash is used for multiline queries. -If you want to execute a query store in a file, just type a file name +If you want to execute a query stored in a file, just pass a file name: ``` go run ./cmd/cli/main.go ./docs/examples/hackernews.fql @@ -48,9 +65,19 @@ go run ./cmd/cli/main.go ./docs/examples/hackernews.fql ### Browser mode -By default, ``ferret`` loads HTML pages via http protocol since it's faster. +By default, ``ferret`` loads HTML pages via the HTTP protocol, because it's faster. But nowadays, there are more and more websites rendered with JavaScript, and therefore, this 'old school' approach does not really work. -For this case, you may fetch documents using Chrome or Chromium via Chrome DevTools protocol (aka CDP). +For such cases, you may fetch documents using Chrome or Chromium via the Chrome DevTools protocol (aka CDP). +First, you need to make sure that you launched Chrome with the ```remote-debugging-port=9222``` flag. +Second, you need to pass the address to the ```ferret``` CLI. 
+ +``` +./bin/ferret --cdp http://127.0.0.1:9222 +``` + +**NOTE:** ```ferret``` uses this local address by default, so it only makes sense to pass the parameter explicitly when Chrome listens on a different port or at a remote address. + +Alternatively, you can tell the CLI to launch Chrome for you. ```shell go run ./cmd/cli/main.go --cdp-launch @@ -58,13 +85,7 @@ go run ./cmd/cli/main.go --cdp-launch **Note:** Launch command is currently broken on MacOS. -Alternatively, you may open Chrome manually with ```remote-debugging-port=9222``` arguments and bass the address to ``ferret``: - -``` -./bin/ferret --cdp http://127.0.0.1:9222 -``` - -In this case, you can use function ```DOCUMENT(url, isJsRendered)``` with ```true``` for loading JS rendered pages: +Once ```ferret``` knows how to communicate with Chrome, you can use the ```DOCUMENT(url, isJsRendered)``` function with ```true``` as the second argument to load JS-rendered pages: ```shell Welcome to Ferret REPL @@ -80,4 +101,198 @@ Please use `exit` or `Ctrl-D` to exit this program. > artist: username.innerText, > track: title.innerText > } -``` \ No newline at end of file +``` + +### Embedded mode + +```ferret``` is a very modular system and can therefore be easily embedded into your Go application. 
+ +```go + +package main + +import ( + "context" + "encoding/json" + "fmt" + "github.com/MontFerret/ferret/pkg/compiler" + "os" +) + +type Topic struct { + Name string `json:"name"` + Description string `json:"description"` + Url string `json:"url"` +} + +func main() { + topics, err := getTopTenTrendingTopics() + + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + for _, topic := range topics { + fmt.Println(fmt.Sprintf("%s: %s %s", topic.Name, topic.Description, topic.Url)) + } +} + +func getTopTenTrendingTopics() ([]*Topic, error) { + query := ` + LET doc = DOCUMENT("https://github.com/topics") + + FOR el IN ELEMENTS(doc, ".py-4.border-bottom") + LIMIT 10 + LET url = ELEMENT(el, "a") + LET name = ELEMENT(el, ".f3") + LET desc = ELEMENT(el, ".f5") + + RETURN { + name: TRIM(name.innerText), + description: TRIM(desc.innerText), + url: "https://github.com" + url.attributes.href + } + ` + + comp := compiler.New() + + program, err := comp.Compile(query) + + if err != nil { + return nil, err + } + + out, err := program.Run(context.Background()) + + if err != nil { + return nil, err + } + + res := make([]*Topic, 0, 10) + + err = json.Unmarshal(out, &res) + + if err != nil { + return nil, err + } + + return res, nil +} + +``` + +## Extensibility + +That said, ```ferret``` is a very modular system that allows you not only to embed it, but also to extend its standard library. 
+ +```go +package main + +import ( + "context" + "encoding/json" + "fmt" + "github.com/MontFerret/ferret/pkg/compiler" + "github.com/MontFerret/ferret/pkg/runtime/core" + "github.com/MontFerret/ferret/pkg/runtime/values" + "os" +) + +func main() { + strs, err := getStrings() + + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + for _, str := range strs { + fmt.Println(str) + } +} + +func getStrings() ([]string, error) { + // transform has the function type that ferret supports as a runtime function + transform := func(ctx context.Context, args ...core.Value) (core.Value, error) { + // a helper function that validates the number of passed args + err := core.ValidateArgs(args, 1) + + if err != nil { + // it's recommended to return the built-in None type instead of nil + return values.None, err + } + + // another helper function that performs type validation + err = core.ValidateType(args[0], core.StringType) + + if err != nil { + return values.None, err + } + + // cast to the built-in string type + str := args[0].(values.String) + + return str.Concat(values.NewString("_ferret")).ToUpper(), nil + } + + query := ` + FOR el IN ["foo", "bar", "qaz"] + // conventionally all functions are registered in upper case + RETURN TRANSFORM(el) + ` + + comp := compiler.New() + comp.RegisterFunction("transform", transform) + + program, err := comp.Compile(query) + + if err != nil { + return nil, err + } + + out, err := program.Run(context.Background()) + + if err != nil { + return nil, err + } + + res := make([]string, 0, 3) + + err = json.Unmarshal(out, &res) + + if err != nil { + return nil, err + } + + return res, nil +} +``` + +On top of that, you can completely turn off the standard library by passing the following option: + +```go + +comp := compiler.New(compiler.WithoutStdlib()) + +``` + +After that, you can easily provide your own implementation of functions from the standard library. 
+ +If you don't need a particular set of functions from standard library, you can turn off the entire ```stdlib``` and register separate packages from that: + +```go +package main + +import ( + "github.com/MontFerret/ferret/pkg/compiler" + "github.com/MontFerret/ferret/pkg/stdlib/strings" +) + + +func main() { + comp := compiler.New(compiler.WithoutStdlib()) + + comp.RegisterFunctions(strings.NewLib()) +} +``` diff --git a/cmd/cli/main.go b/cmd/cli/main.go index c0d57de0..54c0b5e0 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -25,7 +25,7 @@ var ( conn = flag.String( "cdp", - "", + "http://127.0.0.1:9222", "Chrome DevTools Protocol address", ) diff --git a/docs/examples/embedded.go b/docs/examples/embedded.go new file mode 100644 index 00000000..71d551f3 --- /dev/null +++ b/docs/examples/embedded.go @@ -0,0 +1,70 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "github.com/MontFerret/ferret/pkg/compiler" + "os" +) + +type Topic struct { + Name string `json:"name"` + Description string `json:"description"` + Url string `json:"url"` +} + +func main() { + topics, err := getTopTenTrendingTopics() + + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + for _, topic := range topics { + fmt.Println(fmt.Sprintf("%s: %s %s", topic.Name, topic.Description, topic.Url)) + } +} + +func getTopTenTrendingTopics() ([]*Topic, error) { + query := ` + LET doc = DOCUMENT("https://github.com/topics") + + FOR el IN ELEMENTS(doc, ".py-4.border-bottom") + LIMIT 10 + LET url = ELEMENT(el, "a") + LET name = ELEMENT(el, ".f3") + LET desc = ELEMENT(el, ".f5") + + RETURN { + name: TRIM(name.innerText), + description: TRIM(desc.innerText), + url: "https://github.com" + url.attributes.href + } + ` + + comp := compiler.New() + + program, err := comp.Compile(query) + + if err != nil { + return nil, err + } + + out, err := program.Run(context.Background()) + + if err != nil { + return nil, err + } + + res := make([]*Topic, 0, 10) + + err = json.Unmarshal(out, &res) + 
+ if err != nil { + return nil, err + } + + return res, nil +} diff --git a/docs/examples/extensible.go b/docs/examples/extensible.go new file mode 100644 index 00000000..c5b5b186 --- /dev/null +++ b/docs/examples/extensible.go @@ -0,0 +1,80 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "github.com/MontFerret/ferret/pkg/compiler" + "github.com/MontFerret/ferret/pkg/runtime/core" + "github.com/MontFerret/ferret/pkg/runtime/values" + "os" +) + +func main() { + strs, err := getStrings() + + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + for _, str := range strs { + fmt.Println(str) + } +} + +func getStrings() ([]string, error) { + // function implements is a type of a function that ferret supports as a runtime function + transform := func(ctx context.Context, args ...core.Value) (core.Value, error) { + // it's just a helper function which helps to validate a number of passed args + err := core.ValidateArgs(args, 1) + + if err != nil { + // it's recommended to return built-in None type, instead of nil + return values.None, err + } + + // this is another helper functions allowing to do type validation + err = core.ValidateType(args[0], core.StringType) + + if err != nil { + return values.None, err + } + + // cast to built-in string type + str := args[0].(values.String) + + return str.Concat(values.NewString("_ferret")).ToUpper(), nil + } + + query := ` + FOR el IN ["foo", "bar", "qaz"] + // conventionally all functions are registered in upper case + RETURN TRANSFORM(el) + ` + + comp := compiler.New() + comp.RegisterFunction("transform", transform) + + program, err := comp.Compile(query) + + if err != nil { + return nil, err + } + + out, err := program.Run(context.Background()) + + if err != nil { + return nil, err + } + + res := make([]string, 0, 3) + + err = json.Unmarshal(out, &res) + + if err != nil { + return nil, err + } + + return res, nil +} diff --git a/pkg/compiler/compiler.go b/pkg/compiler/compiler.go index b9503e31..d5f75970 
100644 --- a/pkg/compiler/compiler.go +++ b/pkg/compiler/compiler.go @@ -6,13 +6,26 @@ import ( "github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/stdlib" "github.com/pkg/errors" + "strings" ) type FqlCompiler struct { funcs map[string]core.Function } -func New() *FqlCompiler { +// New creates a compiler configured by the given options; the standard library is pre-registered unless WithoutStdlib is passed. +func New(setters ...Option) *FqlCompiler { + c := &FqlCompiler{} + opts := &Options{} + + for _, setter := range setters { + setter(opts) + } + + if !opts.noStdlib { + c.funcs = stdlib.NewLib() + } else { + c.funcs = make(map[string]core.Function) + } + - return &FqlCompiler{ - stdlib.NewLib(), - } + return c @@ -25,7 +38,18 @@ func (c *FqlCompiler) RegisterFunction(name string, fun core.Function) error { return errors.Errorf("function already exists: %s", name) } - c.funcs[name] = fun + c.funcs[strings.ToUpper(name)] = fun + + return nil +} + +// RegisterFunctions registers every function in funcs and stops at the first registration error. +func (c *FqlCompiler) RegisterFunctions(funcs map[string]core.Function) error { + for name, fun := range funcs { + if err := c.RegisterFunction(name, fun); err != nil { + return err + } + } return nil } diff --git a/pkg/compiler/options.go b/pkg/compiler/options.go new file mode 100644 index 00000000..0181ab55 --- /dev/null +++ b/pkg/compiler/options.go @@ -0,0 +1,15 @@ +package compiler + +type ( + Option func(opts *Options) + Options struct { + noStdlib bool + } +) + +// WithoutStdlib returns an option that tells New not to pre-register the standard library. +func WithoutStdlib() Option { + return func(opts *Options) { + opts.noStdlib = true + } +} diff --git a/pkg/runtime/core/function.go b/pkg/runtime/core/function.go index d3d69446..2f1c6c2c 100644 --- a/pkg/runtime/core/function.go +++ b/pkg/runtime/core/function.go @@ -7,9 +7,9 @@ import ( type Function = func(ctx context.Context, args ...Value) (Value, error) -func ValidateArgs(inputs []Value, required int) error { - if len(inputs) != required { - return Error(ErrMissedArgument, fmt.Sprintf("expected %d, but got %d arguments", required, len(inputs))) +func ValidateArgs(args []Value, required int) error { + if len(args) != required { + return Error(ErrMissedArgument, 
fmt.Sprintf("expected %d, but got %d arguments", required, len(args))) } return nil diff --git a/pkg/runtime/values/string.go b/pkg/runtime/values/string.go index 57136a6c..b8596924 100644 --- a/pkg/runtime/values/string.go +++ b/pkg/runtime/values/string.go @@ -112,3 +112,56 @@ func (t String) IndexOf(other String) Int { func (t String) Concat(other core.Value) String { return String(string(t) + other.String()) } + +// Replace returns a copy of the string in which the first times occurrences of old are replaced by new; a negative times replaces all of them. +func (t String) Replace(old, new String, times Int) String { + return NewString(strings.Replace(string(t), string(old), string(new), int(times))) +} + +// Remove returns the part of the string between startIndex and t.Length()-1. +// NOTE(review): the upper bound leaves out the final element and is presumably negative for an empty string - confirm whether t[startIndex:] or t[:startIndex] was intended. +func (t String) Remove(startIndex Int) String { + return t[startIndex : t.Length()-1] +} + +// EndsWith reports whether the string ends with other. +func (t String) EndsWith(other String) Boolean { + return Boolean(strings.HasSuffix(string(t), string(other))) +} + +// StartsWith reports whether the string begins with other. +func (t String) StartsWith(other String) Boolean { + return Boolean(strings.HasPrefix(string(t), string(other))) +} + +// Split slices the string around each occurrence of separator and returns the pieces as an array. +func (t String) Split(separator String) *Array { + out := strings.Split(string(t), string(separator)) + arr := NewArray(len(out)) + + for _, str := range out { + arr.Push(NewString(str)) + } + + return arr +} + +// Substring returns the part of the string between the offsets start (inclusive) and end (exclusive). +func (t String) Substring(start, end Int) String { + return t[start:end] +} + +// Trim returns the string without leading and trailing white space. +func (t String) Trim() String { + return NewString(strings.TrimSpace(string(t))) +} + +// ToUpper returns the string converted to upper case. +func (t String) ToUpper() String { + return NewString(strings.ToUpper(string(t))) +} + +// ToLower returns the string converted to lower case. +func (t String) ToLower() String { + return NewString(strings.ToLower(string(t))) +}