1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-01-18 03:22:02 +02:00

Merge remote-tracking branch 'upstream/master'

This commit is contained in:
David Landry 2018-10-05 18:42:33 -04:00
commit 737016dcd2
42 changed files with 569 additions and 163 deletions

9
Gopkg.lock generated
View File

@ -137,6 +137,14 @@
revision = "a96e63847dc3c67d17befa69c303767e2f84e54f" revision = "a96e63847dc3c67d17befa69c303767e2f84e54f"
version = "v2.1" version = "v2.1"
[[projects]]
branch = "master"
digest = "1:f7aa53146bf79462509d4ce136826ebbd64907e4679e1b04e62758da6b68e589"
name = "github.com/orcaman/concurrent-map"
packages = ["."]
pruneopts = "UT"
revision = "b28018939af9022337862b94a463abb18abb3e0e"
[[projects]] [[projects]]
digest = "1:40e195917a951a8bf867cd05de2a46aaf1806c50cf92eebf4c16f78cd196f747" digest = "1:40e195917a951a8bf867cd05de2a46aaf1806c50cf92eebf4c16f78cd196f747"
name = "github.com/pkg/errors" name = "github.com/pkg/errors"
@ -228,6 +236,7 @@
"github.com/mafredri/cdp/rpcc", "github.com/mafredri/cdp/rpcc",
"github.com/mafredri/cdp/session", "github.com/mafredri/cdp/session",
"github.com/natefinch/lumberjack", "github.com/natefinch/lumberjack",
"github.com/orcaman/concurrent-map",
"github.com/pkg/errors", "github.com/pkg/errors",
"github.com/rs/zerolog", "github.com/rs/zerolog",
"github.com/sethgrid/pester", "github.com/sethgrid/pester",

View File

@ -5,7 +5,7 @@ export GOPATH
VERSION ?= $(shell git describe --tags --always --dirty) VERSION ?= $(shell git describe --tags --always --dirty)
DIR_BIN = ./bin DIR_BIN = ./bin
DIR_PKG = ./pkg DIR_PKG = ./pkg
DIR_CMD = ./cmd DIR_CLI = ./cli
default: build default: build
@ -14,7 +14,7 @@ build: install vet generate test compile
compile: compile:
go build -v -o ${DIR_BIN}/ferret \ go build -v -o ${DIR_BIN}/ferret \
-ldflags "-X main.Version=${VERSION}" \ -ldflags "-X main.Version=${VERSION}" \
${DIR_CMD}/main.go ./main.go
install: install:
dep ensure dep ensure
@ -30,14 +30,14 @@ doc:
# http://golang.org/cmd/go/#hdr-Run_gofmt_on_package_sources # http://golang.org/cmd/go/#hdr-Run_gofmt_on_package_sources
fmt: fmt:
go fmt ${DIR_CMD}/... ${DIR_PKG}/... go fmt ${DIR_CLI}/... ${DIR_PKG}/...
# https://github.com/golang/lint # https://github.com/golang/lint
# go get github.com/golang/lint/golint # go get github.com/golang/lint/golint
lint: lint:
golint ${DIR_CMD}/... ${DIR_PKG}/... golint ${DIR_CLI}/... ${DIR_PKG}/...
# http://godoc.org/code.google.com/p/go.tools/cmd/vet # http://godoc.org/code.google.com/p/go.tools/cmd/vet
# go get code.google.com/p/go.tools/cmd/vet # go get code.google.com/p/go.tools/cmd/vet
vet: vet:
go vet ${DIR_CMD}/... ${DIR_PKG}/... go vet ${DIR_CLI}/... ${DIR_PKG}/...

View File

@ -3,15 +3,15 @@
![ferret](https://raw.githubusercontent.com/MontFerret/ferret/master/assets/intro.jpg) ![ferret](https://raw.githubusercontent.com/MontFerret/ferret/master/assets/intro.jpg)
## What is it? ## What is it?
```ferret``` is a web scraping system aiming to simplify data extraction from the web for such things like ui testing, machine learning and analytics. ```ferret``` is a web scraping system aiming to simplify data extraction from the web for such things like UI testing, machine learning and analytics.
Having it's own declarative language, ```ferret``` abstracts away technical details and complexity of the underlying technologies, helping to focus on the data itself. Having its own declarative language, ```ferret``` abstracts away technical details and complexity of the underlying technologies, helping to focus on the data itself.
It's extremely portable, extensible and fast. It's extremely portable, extensible and fast.
## Show me some code ## Show me some code
The following example demonstrates the use of dynamic pages. The following example demonstrates the use of dynamic pages.
First of all, we load the main Google Search page, type search criteria into an input box and then click a search button. First of all, we load the main Google Search page, type search criteria into an input box and then click a search button.
The click action triggers a redirect, so we wait till its end. The click action triggers a redirect, so we wait till its end.
Once the page gets loaded, we iterate over all elements in search results and assign output to a variable. Once the page gets loaded, we iterate over all elements in search results and assign the output to a variable.
The final for loop filters out empty elements that might be because of inaccurate use of selectors. The final for loop filters out empty elements that might be because of inaccurate use of selectors.
```aql ```aql
@ -49,27 +49,32 @@ RETURN (
Nowadays data is everything and who owns data - owns the world. Nowadays data is everything and who owns data - owns the world.
I have worked on multiple data-driven projects where data was an essential part of a system and I realized how cumbersome writing tons of scrapers is. I have worked on multiple data-driven projects where data was an essential part of a system and I realized how cumbersome writing tons of scrapers is.
After some time looking for a tool that would let me not write code, but just express what data I need, I decided to come up with my own solution. After some time looking for a tool that would let me not write code, but just express what data I need, I decided to come up with my own solution.
```ferret``` project is an ambitious initiative trying to bring universal platform for writing scrapers without any hassle. ```ferret``` project is an ambitious initiative trying to bring a universal platform for writing scrapers without any hassle.
## Inspiration ## Inspiration
FQL (Ferret Query Language) is heavily inspired by [AQL](https://www.arangodb.com/) (ArangoDB Query Language). FQL (Ferret Query Language) is heavily inspired by [AQL](https://www.arangodb.com/) (ArangoDB Query Language).
But due to the domain specifics, there are some differences in how things work. But due to the domain specifics, there are some differences in how things work.
## WIP ## WIP
Be aware, the the project is under heavy development. There is no documentation and some things may change in the final release. Be aware, that the project is under heavy development. There is no documentation and some things may change in the final release.
For query syntax, you may go to [ArangoDB web site](https://docs.arangodb.com/3.3/AQL/index.html) and use AQL docs as docs for FQL - since they are identical. For query syntax, you may go to [ArangoDB web site](https://docs.arangodb.com/3.3/AQL/index.html) and use AQL docs as docs for FQL - since they are identical.
## Installation ## Installation
### Prerequisites ### Prerequisites
* Go >=1.6 #### Production
* Go >=1.9
* Chrome or Docker
#### Development
* GoDep * GoDep
* GNU Make * GNU Make
* Chrome or Docker (optional) * ANTLR4 >=4.7.1
```sh ```sh
make install && make compile go get github.com/MontFerret/ferret
``` ```
You can use your local copy of Google Chrome / Chromium, but for ease of use it's recommended to run it inside a Docker container: You can use your local copy of Google Chrome / Chromium, but for ease of use it's recommended to run it inside a Docker container:
@ -91,7 +96,7 @@ chrome.exe --remote-debugging-port=9222
If you want to play with ```fql``` and check its syntax, you can run CLI with the following commands: If you want to play with ```fql``` and check its syntax, you can run CLI with the following commands:
``` ```
go run ./cmd/main.go ferret
``` ```
```ferret``` will run in REPL mode. ```ferret``` will run in REPL mode.
@ -107,33 +112,33 @@ Please use `Ctrl-D` to exit this program.
``` ```
**Note:** symbol ```%``` is used to start and end multi line queries. You also can use heredoc format. **Note:** symbol ```%``` is used to start and end multi-line queries. You also can use the heredoc format.
If you want to execute a query stored in a file, just pass a file name: If you want to execute a query stored in a file, just pass a file name:
``` ```
go run ./cmd/main.go ./docs/examples/hackernews.fql ferret ./docs/examples/static-page.fql
``` ```
``` ```
cat ./docs/examples/hackernews.fql | go run ./cmd/main.go cat ./docs/examples/static-page.fql | ferret
``` ```
``` ```
go run ./cmd/main.go < ./docs/examples/hackernews.fql ferret < ./docs/examples/static-page.fql
``` ```
### Browser mode ### Browser mode
By default, ``ferret`` loads HTML pages via http protocol, because it's faster. By default, ``ferret`` loads HTML pages via HTTP protocol, because it's faster.
But nowadays, there are more and more websites rendered with JavaScript, and therefore, this 'old school' approach does not really work. But nowadays, there are more and more websites rendered with JavaScript, and therefore, this 'old school' approach does not really work.
For such cases, you may fetch documents using Chrome or Chromium via Chrome DevTools protocol (aka CDP). For such cases, you may fetch documents using Chrome or Chromium via Chrome DevTools protocol (aka CDP).
First, you need to make sure that you launched Chrome with ```remote-debugging-port=9222``` flag. First, you need to make sure that you launched Chrome with ```remote-debugging-port=9222``` flag.
Second, you need to pass the address to ```ferret``` CLI. Second, you need to pass the address to ```ferret``` CLI.
``` ```
go run ./cmd/main.go --cdp http://127.0.0.1:9222 ferret --cdp http://127.0.0.1:9222
``` ```
**NOTE:** By default, ```ferret``` will try to use this local address as a default one, so it makes sense to explicitly pass the parameter only in case of either different port number or remote address. **NOTE:** By default, ```ferret``` will try to use this local address as a default one, so it makes sense to explicitly pass the parameter only in case of either different port number or remote address.
@ -141,7 +146,7 @@ go run ./cmd/main.go --cdp http://127.0.0.1:9222
Alternatively, you can tell CLI to launch Chrome for you. Alternatively, you can tell CLI to launch Chrome for you.
```shell ```shell
go run ./cmd/main.go --cdp-launch ferret --cdp-launch
``` ```
**NOTE:** Launch command is currently broken on MacOS. **NOTE:** Launch command is currently broken on MacOS.
@ -345,7 +350,7 @@ func getStrings() ([]string, error) {
} }
``` ```
On top of that, you can completely turn off standard library, by passing the following option: On top of that, you can completely turn off the standard library by passing the following option:
```go ```go
comp := compiler.New(compiler.WithoutStdlib()) comp := compiler.New(compiler.WithoutStdlib())

View File

@ -0,0 +1,3 @@
LET doc = DOCUMENT("about:blank", true)
NAVIGATE(doc, "https://www.google.com/")
RETURN doc.url

View File

@ -1,13 +1,12 @@
LET doc = DOCUMENT("https://github.com/", true) LET doc = DOCUMENT("https://github.com/", true)
LET main = ELEMENT(doc, '.application-main') LET main = ELEMENT(doc, '.application-main')
LOG('innerText:start')
LET mainTxt = main.innerText LET mainTxt = main.innerText
LOG('innerText:end')
NAVIGATE(doc, "https://github.com/features") NAVIGATE(doc, "https://github.com/features")
LET features = ELEMENT(doc, '.application-main') LET features = ELEMENT(doc, '.application-main')
LET featuresTxt = features.innerText LET featuresTxt = features.innerText
LOG("featuresTxt:", featuresTxt)
RETURN mainTxt == featuresTxt RETURN mainTxt == featuresTxt

View File

@ -5,8 +5,8 @@ import (
"encoding/json" "encoding/json"
"flag" "flag"
"fmt" "fmt"
"github.com/MontFerret/ferret/cmd/cli" "github.com/MontFerret/ferret/cli"
"github.com/MontFerret/ferret/pkg/browser" "github.com/MontFerret/ferret/cli/browser"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"io/ioutil" "io/ioutil"
"os" "os"

View File

@ -1774,24 +1774,29 @@ func TestParam(t *testing.T) {
}) })
} }
func TestHtml(t *testing.T) { //func TestHtml(t *testing.T) {
// Convey("Should load a document", t, func() { // Convey("Should load a document", t, func() {
// c := compiler.New() // c := compiler.New()
// //
// out, err := c.MustCompile(` // out, err := c.MustCompile(`
//LET doc = DOCUMENT("https://github.com/", true) //LET doc = DOCUMENT("https://github.com/", true)
//LET btn = ELEMENT(doc, ".HeaderMenu a") //LET main = ELEMENT(doc, '.application-main')
// //LET mainTxt = main.innerText
//CLICK(btn) //
//WAIT_NAVIGATION(doc) //NAVIGATE(doc, "https://github.com/features")
//WAIT_ELEMENT(doc, '.IconNav') //
// //LET features = ELEMENT(doc, '.application-main')
//RETURN INNER_HTML_ALL(doc, '.IconNav a') //LET featuresTxt = features.innerText
// //
// `).Run(context.Background()) //LOG("featuresTxt:", featuresTxt)
// //
// So(err, ShouldBeNil) //RETURN mainTxt == featuresTxt
// //
// So(string(out), ShouldEqual, `"int"`) //
// }) // `).Run(context.Background())
} //
// So(err, ShouldBeNil)
//
// So(string(out), ShouldEqual, `"int"`)
// })
//}

View File

@ -0,0 +1,52 @@
package core_test
import (
"testing"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/pkg/errors"
. "github.com/smartystreets/goconvey/convey"
)
// TestSourceError verifies that core.SourceError produces a non-nil error
// whose message is the cause's message followed by the source location.
func TestSourceError(t *testing.T) {
	Convey("Should match", t, func() {
		rootErr := errors.New("cause")
		srcMap := core.NewSourceMap("test", 1, 1)

		// Expected text has the form "<cause>: <location>".
		expected := errors.Errorf("%s: %s", rootErr.Error(), "test at 1:1")

		actual := core.SourceError(srcMap, rootErr)

		So(actual, ShouldNotBeNil)
		So(actual.Error(), ShouldEqual, expected.Error())
	})
}
// TestTypeError verifies that core.TypeError returns a non-nil error for
// zero, one and several trailing types, and pins the exact message produced
// for a None value checked against four types.
func TestTypeError(t *testing.T) {
	Convey("Should match", t, func() {
		// Any number of trailing types must still yield an error value.
		So(core.TypeError(core.BooleanType), ShouldNotBeNil)
		So(core.TypeError(core.BooleanType, core.BooleanType), ShouldNotBeNil)
		So(
			core.TypeError(core.BooleanType, core.BooleanType, core.IntType, core.FloatType),
			ShouldNotBeNil,
		)

		// Judging by the expected text, the first argument is reported after
		// "but got" and the remaining ones are joined with "or".
		want := "invalid type: expected none or boolean or int or float, but got none"
		got := core.TypeError(core.NoneType, core.NoneType, core.BooleanType, core.IntType, core.FloatType)

		So(got.Error(), ShouldEqual, want)
	})
}
// TestError verifies that core.Error returns a non-nil error whose message
// is the cause's message followed by the supplied text.
func TestError(t *testing.T) {
	Convey("Should match", t, func() {
		const text = "test message"

		rootErr := errors.New("cause")

		// Expected text has the form "<cause>: <message>".
		expected := errors.Errorf("%s: %s", rootErr.Error(), text)

		actual := core.Error(rootErr, text)

		So(actual, ShouldNotBeNil)
		So(actual.Error(), ShouldEqual, expected.Error())
	})
}

View File

@ -0,0 +1,21 @@
package core_test
import (
"testing"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/values"
. "github.com/smartystreets/goconvey/convey"
)
// TestValidateArgs verifies that core.ValidateArgs accepts a two-element
// argument list with bounds (1, 2) and rejects it with bounds (3, 4).
func TestValidateArgs(t *testing.T) {
	Convey("Should match", t, func() {
		args := []core.Value{values.NewInt(1), values.NewInt(2)}

		// Two arguments: expected to pass with (1, 2) ...
		So(core.ValidateArgs(args, 1, 2), ShouldBeNil)

		// ... and to fail with (3, 4).
		So(core.ValidateArgs(args, 3, 4), ShouldNotBeNil)
	})
}

View File

@ -0,0 +1,69 @@
package core_test
import (
"context"
"testing"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/values"
. "github.com/smartystreets/goconvey/convey"
)
// TestParamsWith verifies that core.ParamsWith returns a context from which
// the parameter map can be read back under the "params" key.
func TestParamsWith(t *testing.T) {
	Convey("Should match", t, func() {
		params := map[string]core.Value{
			"val1": values.NewInt(1),
			"val2": values.NewString("test"),
		}

		ctx := core.ParamsWith(context.Background(), params)

		So(ctx, ShouldNotBeNil)
		So(ctx.Value("params"), ShouldEqual, params)
	})
}
// TestParamsFrom verifies that core.ParamsFrom fails for a context without
// parameters or with parameters under an unrelated key, and returns the
// stored map when it sits under the "params" key.
func TestParamsFrom(t *testing.T) {
	Convey("Should match", t, func() {
		params := map[string]core.Value{
			"val1": values.NewInt(1),
			"val2": values.NewString("test"),
		}

		// Empty context: nothing to read.
		_, errEmpty := core.ParamsFrom(context.Background())
		So(errEmpty, ShouldNotBeNil)

		// Map stored under a key the lookup is not expected to use.
		wrongKeyCtx := context.WithValue(context.Background(), "fail", params)
		_, errWrongKey := core.ParamsFrom(wrongKeyCtx)
		So(errWrongKey, ShouldNotBeNil)

		// Map stored under the expected key.
		goodCtx := context.WithValue(context.Background(), "params", params)
		got, err := core.ParamsFrom(goodCtx)
		So(err, ShouldBeNil)
		So(got, ShouldEqual, params)
	})
}
// TestParamFrom verifies that core.ParamFrom fails when the context carries
// no parameters or carries them under an unrelated key, and returns the
// named value when the map sits under the "params" key.
func TestParamFrom(t *testing.T) {
	Convey("Should match", t, func() {
		params := map[string]core.Value{
			"val1": values.NewInt(1),
			"val2": values.NewString("test"),
		}

		// Empty context and empty parameter name.
		_, errEmpty := core.ParamFrom(context.Background(), "")
		So(errEmpty, ShouldNotBeNil)

		// Map stored under a key the lookup is not expected to use.
		wrongKeyCtx := context.WithValue(context.Background(), "fail", params)
		_, errWrongKey := core.ParamFrom(wrongKeyCtx, "val1")
		So(errWrongKey, ShouldNotBeNil)

		// Map stored under the expected key.
		goodCtx := context.WithValue(context.Background(), "params", params)
		got, err := core.ParamFrom(goodCtx, "val1")
		So(err, ShouldBeNil)
		So(got, ShouldEqual, values.NewInt(1))
	})
}

View File

@ -45,7 +45,7 @@ type Value interface {
String() string String() string
Compare(other Value) int Compare(other Value) int
Unwrap() interface{} Unwrap() interface{}
Hash() int Hash() uint64
Clone() Value Clone() Value
} }

View File

@ -70,10 +70,10 @@ func (e *ForExpression) Exec(ctx context.Context, scope *core.Scope) (core.Value
} }
// Hash map for a check for uniqueness // Hash map for a check for uniqueness
var hashes map[int]bool var hashes map[uint64]bool
if e.distinct { if e.distinct {
hashes = make(map[int]bool) hashes = make(map[uint64]bool)
} }
res := values.NewArray(10) res := values.NewArray(10)

View File

@ -1,10 +1,11 @@
package values package values
import ( import (
"crypto/sha512" "encoding/binary"
"encoding/json" "encoding/json"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/pkg/errors" "github.com/pkg/errors"
"hash/fnv"
) )
type ( type (
@ -77,22 +78,29 @@ func (t *Array) Unwrap() interface{} {
return arr return arr
} }
func (t *Array) Hash() int { func (t *Array) Hash() uint64 {
bytes, err := t.MarshalJSON() h := fnv.New64a()
if err != nil { h.Write([]byte(t.Type().String()))
return 0 h.Write([]byte(":"))
h.Write([]byte("["))
endIndex := len(t.value) - 1
for i, el := range t.value {
bytes := make([]byte, 8)
binary.LittleEndian.PutUint64(bytes, el.Hash())
h.Write(bytes)
if i != endIndex {
h.Write([]byte(","))
}
} }
h := sha512.New() h.Write([]byte("]"))
out, err := h.Write(bytes) return h.Sum64()
if err != nil {
return 0
}
return out
} }
func (t *Array) Clone() core.Value { func (t *Array) Clone() core.Value {

View File

@ -119,6 +119,45 @@ func TestArray(t *testing.T) {
}) })
}) })
Convey(".Hash", t, func() {
Convey("It should calculate hash of non-empty array", func() {
arr := values.NewArrayWith(
values.NewInt(1),
values.NewInt(2),
values.NewInt(3),
)
h := arr.Hash()
So(h, ShouldBeGreaterThan, 0)
})
Convey("It should calculate hash of empty array", func() {
arr := values.NewArrayWith()
h := arr.Hash()
So(h, ShouldBeGreaterThan, 0)
})
Convey("Hash sum should be consistent", func() {
arr := values.NewArrayWith(
values.True,
values.NewInt(1),
values.NewFloat(1.1),
values.NewString("foobar"),
values.NewCurrentDateTime(),
values.NewArrayWith(values.NewInt(1), values.True),
values.NewObjectWith(values.NewObjectProperty("foo", values.NewString("bar"))),
)
h1 := arr.Hash()
h2 := arr.Hash()
So(h1, ShouldEqual, h2)
})
})
Convey(".Length", t, func() { Convey(".Length", t, func() {
Convey("Should return 0 when empty", func() { Convey("Should return 0 when empty", func() {
arr := values.NewArray(1) arr := values.NewArray(1)

View File

@ -1,9 +1,9 @@
package values package values
import ( import (
"crypto/sha512"
"encoding/json" "encoding/json"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"hash/fnv"
"io" "io"
"io/ioutil" "io/ioutil"
) )
@ -62,16 +62,14 @@ func (b *Binary) Unwrap() interface{} {
return b.values return b.values
} }
func (b *Binary) Hash() int { func (b *Binary) Hash() uint64 {
h := sha512.New() h := fnv.New64a()
out, err := h.Write(b.values) h.Write([]byte(b.Type().String()))
h.Write([]byte(":"))
h.Write(b.values)
if err != nil { return h.Sum64()
return 0
}
return out
} }
func (b *Binary) Clone() core.Value { func (b *Binary) Clone() core.Value {

View File

@ -1,10 +1,10 @@
package values package values
import ( import (
"crypto/sha512"
"encoding/json" "encoding/json"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/pkg/errors" "github.com/pkg/errors"
"hash/fnv"
"strings" "strings"
) )
@ -98,16 +98,14 @@ func (t Boolean) Unwrap() interface{} {
return bool(t) return bool(t)
} }
func (t Boolean) Hash() int { func (t Boolean) Hash() uint64 {
h := sha512.New() h := fnv.New64a()
out, err := h.Write([]byte(t.String())) h.Write([]byte(t.Type().String()))
h.Write([]byte(":"))
h.Write([]byte(t.String()))
if err != nil { return h.Sum64()
return 0
}
return out
} }
func (t Boolean) Clone() core.Value { func (t Boolean) Clone() core.Value {

View File

@ -70,4 +70,16 @@ func TestBoolean(t *testing.T) {
So(values.False.Compare(values.True), ShouldEqual, -1) So(values.False.Compare(values.True), ShouldEqual, -1)
}) })
}) })
Convey(".Hash", t, func() {
Convey("It should calculate hash", func() {
So(values.True.Hash(), ShouldBeGreaterThan, 0)
So(values.False.Hash(), ShouldBeGreaterThan, 0)
})
Convey("Hash sum should be consistent", func() {
So(values.True.Hash(), ShouldEqual, values.True.Hash())
So(values.False.Hash(), ShouldEqual, values.False.Hash())
})
})
} }

View File

@ -1,8 +1,8 @@
package values package values
import ( import (
"crypto/sha512"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"hash/fnv"
"time" "time"
) )
@ -12,6 +12,10 @@ type DateTime struct {
time.Time time.Time
} }
// NewCurrentDateTime returns a DateTime wrapping the value of time.Now()
// taken at the moment of the call.
func NewCurrentDateTime() DateTime {
	now := time.Now()

	return DateTime{Time: now}
}
func NewDateTime(time time.Time) DateTime { func NewDateTime(time time.Time) DateTime {
return DateTime{time} return DateTime{time}
} }
@ -84,18 +88,21 @@ func (t DateTime) Unwrap() interface{} {
return t.Time return t.Time
} }
func (t DateTime) Hash() int { func (t DateTime) Hash() uint64 {
h := sha512.New() h := fnv.New64a()
t.Time.MarshalJSON() h.Write([]byte(t.Type().String()))
h.Write([]byte(":"))
out, err := h.Write([]byte(t.Time.String())) bytes, err := t.Time.GobEncode()
if err != nil { if err != nil {
return 0 return 0
} }
return out h.Write(bytes)
return h.Sum64()
} }
func (t DateTime) Clone() core.Value { func (t DateTime) Clone() core.Value {

View File

@ -0,0 +1,25 @@
package values_test
import (
"github.com/MontFerret/ferret/pkg/runtime/values"
. "github.com/smartystreets/goconvey/convey"
"testing"
)
// TestDateTime covers DateTime.Hash: the hash of the current time must be
// positive, and hashing the same value twice must give the same result.
func TestDateTime(t *testing.T) {
	Convey(".Hash", t, func() {
		Convey("It should calculate hash", func() {
			now := values.NewCurrentDateTime()

			So(now.Hash(), ShouldBeGreaterThan, 0)
		})

		Convey("Hash sum should be consistent", func() {
			now := values.NewCurrentDateTime()

			first := now.Hash()
			second := now.Hash()

			So(first, ShouldEqual, second)
		})
	})
}

View File

@ -1,11 +1,13 @@
package values package values
import ( import (
"crypto/sha512" "encoding/binary"
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/pkg/errors" "github.com/pkg/errors"
"hash/fnv"
"math"
"strconv" "strconv"
) )
@ -112,22 +114,17 @@ func (t Float) Unwrap() interface{} {
return float64(t) return float64(t)
} }
func (t Float) Hash() int { func (t Float) Hash() uint64 {
bytes, err := t.MarshalJSON() h := fnv.New64a()
if err != nil { h.Write([]byte(t.Type().String()))
return 0 h.Write([]byte(":"))
}
h := sha512.New() bytes := make([]byte, 8)
binary.LittleEndian.PutUint64(bytes, math.Float64bits(float64(t)))
h.Write(bytes)
out, err := h.Write(bytes) return h.Sum64()
if err != nil {
return 0
}
return out
} }
func (t Float) Clone() core.Value { func (t Float) Clone() core.Value {

View File

@ -0,0 +1,29 @@
package values_test
import (
"github.com/MontFerret/ferret/pkg/runtime/values"
. "github.com/smartystreets/goconvey/convey"
"testing"
)
// TestFloat covers Float.Hash: the hash must be positive, must differ
// between 1.1 and 1.2, and must be repeatable for the same value.
func TestFloat(t *testing.T) {
	Convey(".Hash", t, func() {
		Convey("It should calculate hash", func() {
			sum := values.NewFloat(1.1).Hash()
			So(sum, ShouldBeGreaterThan, 0)

			// A different value is expected to hash differently.
			other := values.NewFloat(1.2)
			So(sum, ShouldNotEqual, other.Hash())
		})

		Convey("Hash sum should be consistent", func() {
			val := values.NewFloat(1.1)

			first := val.Hash()
			second := val.Hash()

			So(first, ShouldEqual, second)
		})
	})
}

View File

@ -1,9 +1,11 @@
package values package values
import ( import (
"encoding/binary"
"encoding/json" "encoding/json"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/pkg/errors" "github.com/pkg/errors"
"hash/fnv"
"strconv" "strconv"
) )
@ -110,8 +112,17 @@ func (t Int) Unwrap() interface{} {
return int(t) return int(t)
} }
func (t Int) Hash() int { func (t Int) Hash() uint64 {
return int(t) h := fnv.New64a()
h.Write([]byte(t.Type().String()))
h.Write([]byte(":"))
bytes := make([]byte, 8)
binary.LittleEndian.PutUint64(bytes, uint64(t))
h.Write(bytes)
return h.Sum64()
} }
func (t Int) Clone() core.Value { func (t Int) Clone() core.Value {

View File

@ -0,0 +1,29 @@
package values_test
import (
"github.com/MontFerret/ferret/pkg/runtime/values"
. "github.com/smartystreets/goconvey/convey"
"testing"
)
// TestInt covers Int.Hash: the hash must be positive, must differ between
// 1 and 2, and must be repeatable for the same value.
func TestInt(t *testing.T) {
	Convey(".Hash", t, func() {
		Convey("It should calculate hash", func() {
			sum := values.NewInt(1).Hash()
			So(sum, ShouldBeGreaterThan, 0)

			// A different value is expected to hash differently.
			other := values.NewInt(2)
			So(sum, ShouldNotEqual, other.Hash())
		})

		Convey("Hash sum should be consistent", func() {
			val := values.NewInt(1)

			first := val.Hash()
			second := val.Hash()

			So(first, ShouldEqual, second)
		})
	})
}

View File

@ -33,7 +33,7 @@ func (t *none) Unwrap() interface{} {
return nil return nil
} }
func (t *none) Hash() int { func (t *none) Hash() uint64 {
return 0 return 0
} }

View File

@ -1,15 +1,17 @@
package values package values
import ( import (
"crypto/sha512" "encoding/binary"
"encoding/json" "encoding/json"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"hash/fnv"
"sort"
) )
type ( type (
ObjectPredicate = func(value core.Value, key string) bool ObjectPredicate = func(value core.Value, key string) bool
ObjectProperty struct { ObjectProperty struct {
name string key string
value core.Value value core.Value
} }
Object struct { Object struct {
@ -29,7 +31,7 @@ func NewObjectWith(props ...*ObjectProperty) *Object {
obj := NewObject() obj := NewObject()
for _, prop := range props { for _, prop := range props {
obj.value[prop.name] = prop.value obj.value[prop.key] = prop.value
} }
return obj return obj
@ -88,22 +90,43 @@ func (t *Object) Unwrap() interface{} {
return obj return obj
} }
func (t *Object) Hash() int { func (t *Object) Hash() uint64 {
bytes, err := t.MarshalJSON() h := fnv.New64a()
if err != nil { h.Write([]byte(t.Type().String()))
return 0 h.Write([]byte(":"))
h.Write([]byte("{"))
keys := make([]string, 0, len(t.value))
for key := range t.value {
keys = append(keys, key)
} }
h := sha512.New() // order does not really matter
// but it will give us a consistent hash sum
sort.Strings(keys)
endIndex := len(keys) - 1
out, err := h.Write(bytes) for idx, key := range keys {
h.Write([]byte(key))
h.Write([]byte(":"))
if err != nil { el := t.value[key]
return 0
bytes := make([]byte, 8)
binary.LittleEndian.PutUint64(bytes, el.Hash())
h.Write(bytes)
if idx != endIndex {
h.Write([]byte(","))
}
} }
return out h.Write([]byte("}"))
return h.Sum64()
} }
func (t *Object) Clone() core.Value { func (t *Object) Clone() core.Value {

View File

@ -142,6 +142,45 @@ func TestObject(t *testing.T) {
}) })
}) })
Convey(".Hash", t, func() {
Convey("It should calculate hash of non-empty object", func() {
v := values.NewObjectWith(
values.NewObjectProperty("foo", values.NewString("bar")),
values.NewObjectProperty("faz", values.NewInt(1)),
values.NewObjectProperty("qaz", values.True),
)
h := v.Hash()
So(h, ShouldBeGreaterThan, 0)
})
Convey("It should calculate hash of empty object", func() {
v := values.NewObject()
h := v.Hash()
So(h, ShouldBeGreaterThan, 0)
})
Convey("Hash sum should be consistent", func() {
v := values.NewObjectWith(
values.NewObjectProperty("boolean", values.True),
values.NewObjectProperty("int", values.NewInt(1)),
values.NewObjectProperty("float", values.NewFloat(1.1)),
values.NewObjectProperty("string", values.NewString("foobar")),
values.NewObjectProperty("datetime", values.NewCurrentDateTime()),
values.NewObjectProperty("array", values.NewArrayWith(values.NewInt(1), values.True)),
values.NewObjectProperty("object", values.NewObjectWith(values.NewObjectProperty("foo", values.NewString("bar")))),
)
h1 := v.Hash()
h2 := v.Hash()
So(h1, ShouldEqual, h2)
})
})
Convey(".Length", t, func() { Convey(".Length", t, func() {
Convey("Should return 0 when empty", func() { Convey("Should return 0 when empty", func() {
obj := values.NewObject() obj := values.NewObject()

View File

@ -1,11 +1,11 @@
package values package values
import ( import (
"crypto/sha512"
"encoding/json" "encoding/json"
"fmt" "fmt"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/pkg/errors" "github.com/pkg/errors"
"hash/fnv"
"strings" "strings"
) )
@ -93,16 +93,14 @@ func (t String) Unwrap() interface{} {
return string(t) return string(t)
} }
func (t String) Hash() int { func (t String) Hash() uint64 {
h := sha512.New() h := fnv.New64a()
out, err := h.Write([]byte(t)) h.Write([]byte(t.Type().String()))
h.Write([]byte(":"))
h.Write([]byte(t))
if err != nil { return h.Sum64()
return 0
}
return out
} }
func (t String) Clone() core.Value { func (t String) Clone() core.Value {

View File

@ -0,0 +1,29 @@
package values_test
import (
"github.com/MontFerret/ferret/pkg/runtime/values"
. "github.com/smartystreets/goconvey/convey"
"testing"
)
// TestString covers String.Hash: the hash must be positive, must differ
// between "a" and "b", and must be repeatable for the same value.
func TestString(t *testing.T) {
	Convey(".Hash", t, func() {
		Convey("It should calculate hash", func() {
			sum := values.NewString("a").Hash()
			So(sum, ShouldBeGreaterThan, 0)

			// A different value is expected to hash differently.
			other := values.NewString("b")
			So(sum, ShouldNotEqual, other.Hash())
		})

		Convey("Hash sum should be consistent", func() {
			val := values.NewString("foobar")

			first := val.Hash()
			second := val.Hash()

			So(first, ShouldEqual, second)
		})
	})
}

View File

@ -2,7 +2,6 @@ package dynamic
import ( import (
"context" "context"
"crypto/sha512"
"fmt" "fmt"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/logging" "github.com/MontFerret/ferret/pkg/runtime/logging"
@ -17,10 +16,13 @@ import (
"github.com/mafredri/cdp/rpcc" "github.com/mafredri/cdp/rpcc"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/rs/zerolog" "github.com/rs/zerolog"
"hash/fnv"
"sync" "sync"
"time" "time"
) )
const BlankPageUrl = "about:blank"
type HtmlDocument struct { type HtmlDocument struct {
sync.Mutex sync.Mutex
logger *zerolog.Logger logger *zerolog.Logger
@ -78,10 +80,12 @@ func LoadHtmlDocument(
return nil, err return nil, err
} }
err = waitForLoadEvent(ctx, client) if url != BlankPageUrl {
err = waitForLoadEvent(ctx, client)
if err != nil { if err != nil {
return nil, err return nil, err
}
} }
root, innerHtml, err := getRootElement(client) root, innerHtml, err := getRootElement(client)
@ -201,19 +205,17 @@ func (doc *HtmlDocument) Unwrap() interface{} {
return doc.element return doc.element
} }
func (doc *HtmlDocument) Hash() int { func (doc *HtmlDocument) Hash() uint64 {
doc.Lock() doc.Lock()
defer doc.Unlock() defer doc.Unlock()
h := sha512.New() h := fnv.New64a()
out, err := h.Write([]byte(doc.url)) h.Write([]byte(doc.Type().String()))
h.Write([]byte(":"))
h.Write([]byte(doc.url))
if err != nil { return h.Sum64()
return 0
}
return out
} }
func (doc *HtmlDocument) Clone() core.Value { func (doc *HtmlDocument) Clone() core.Value {
@ -637,6 +639,10 @@ func (doc *HtmlDocument) WaitForNavigation(timeout values.Int) error {
} }
func (doc *HtmlDocument) Navigate(url values.String) error { func (doc *HtmlDocument) Navigate(url values.String) error {
if url == "" {
url = BlankPageUrl
}
ctx := context.Background() ctx := context.Background()
repl, err := doc.client.Page.Navigate(ctx, page.NewNavigateArgs(url.String())) repl, err := doc.client.Page.Navigate(ctx, page.NewNavigateArgs(url.String()))
@ -648,5 +654,5 @@ func (doc *HtmlDocument) Navigate(url values.String) error {
return errors.New(*repl.ErrorText) return errors.New(*repl.ErrorText)
} }
return waitForLoadEvent(ctx, doc.client) return doc.WaitForNavigation(5000)
} }

View File

@ -38,6 +38,10 @@ func (drv *Driver) GetDocument(ctx context.Context, url string) (values.HtmlNode
ctx, cancel := context.WithTimeout(ctx, DefaultTimeout) ctx, cancel := context.WithTimeout(ctx, DefaultTimeout)
defer cancel() defer cancel()
if url == "" {
url = BlankPageUrl
}
// Create a new target belonging to the browser context, similar // Create a new target belonging to the browser context, similar
// to opening a new tab in an incognito window. // to opening a new tab in an incognito window.
createTargetArgs := target.NewCreateTargetArgs(url).SetBrowserContextID(drv.contextID) createTargetArgs := target.NewCreateTargetArgs(url).SetBrowserContextID(drv.contextID)

View File

@ -3,7 +3,6 @@ package dynamic
import ( import (
"bytes" "bytes"
"context" "context"
"crypto/sha512"
"encoding/json" "encoding/json"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/runtime/values"
@ -14,6 +13,7 @@ import (
"github.com/mafredri/cdp" "github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/dom" "github.com/mafredri/cdp/protocol/dom"
"github.com/rs/zerolog" "github.com/rs/zerolog"
"hash/fnv"
"strconv" "strconv"
"sync" "sync"
"time" "time"
@ -188,24 +188,17 @@ func (el *HtmlElement) Unwrap() interface{} {
return el return el
} }
func (el *HtmlElement) Hash() int { func (el *HtmlElement) Hash() uint64 {
el.Lock() el.Lock()
defer el.Unlock() defer el.Unlock()
h := sha512.New() h := fnv.New64a()
out, err := h.Write([]byte(el.innerHtml)) h.Write([]byte(el.Type().String()))
h.Write([]byte(":"))
h.Write([]byte(el.innerHtml))
if err != nil { return h.Sum64()
el.logger.Error().
Timestamp().
Err(err).
Msg("failed to calculate hash value")
return 0
}
return out
} }
func (el *HtmlElement) Value() core.Value { func (el *HtmlElement) Value() core.Value {
@ -297,7 +290,7 @@ func (el *HtmlElement) GetChildNode(idx values.Int) core.Value {
func (el *HtmlElement) QuerySelector(selector values.String) core.Value { func (el *HtmlElement) QuerySelector(selector values.String) core.Value {
if !el.IsConnected() { if !el.IsConnected() {
return values.NewArray(0) return values.None
} }
ctx := context.Background() ctx := context.Background()

View File

@ -1,12 +1,12 @@
package static package static
import ( import (
"crypto/sha512"
"encoding/json" "encoding/json"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/common" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/common"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"hash/fnv"
) )
type HtmlElement struct { type HtmlElement struct {
@ -53,22 +53,20 @@ func (el *HtmlElement) Unwrap() interface{} {
return el.selection return el.selection
} }
func (el *HtmlElement) Hash() int { func (el *HtmlElement) Hash() uint64 {
h := sha512.New()
str, err := el.selection.Html() str, err := el.selection.Html()
if err != nil { if err != nil {
return 0 return 0
} }
out, err := h.Write([]byte(str)) h := fnv.New64a()
if err != nil { h.Write([]byte(el.Type().String()))
return 0 h.Write([]byte(":"))
} h.Write([]byte(str))
return out return h.Sum64()
} }
func (el *HtmlElement) Clone() core.Value { func (el *HtmlElement) Clone() core.Value {