From 957490efecd99a3db81ed3d9aad615b621950115 Mon Sep 17 00:00:00 2001 From: Tim Voronov Date: Sun, 7 Oct 2018 21:23:36 -0400 Subject: [PATCH] Feature/#19 proxy (#72) * #19 Some work on proxy * Fixed linter issue --- cli/exec.go | 1 + cli/options.go | 1 + cli/repl.go | 1 + examples/crawler.fql | 2 +- examples/dynamic-page.fql | 4 +- main.go | 7 + pkg/runtime/env/env.go | 28 +++ pkg/runtime/options.go | 6 +- pkg/runtime/program.go | 3 +- pkg/stdlib/html/driver/driver.go | 26 ++- pkg/stdlib/html/driver/dynamic/document.go | 209 +++++++----------- .../driver/dynamic/{dynamic.go => driver.go} | 43 +++- pkg/stdlib/html/driver/dynamic/helpers.go | 20 ++ pkg/stdlib/html/driver/dynamic/options.go | 15 ++ .../driver/static/{static.go => driver.go} | 65 ++++-- pkg/stdlib/html/driver/static/options.go | 38 +++- 16 files changed, 303 insertions(+), 166 deletions(-) create mode 100644 pkg/runtime/env/env.go rename pkg/stdlib/html/driver/dynamic/{dynamic.go => driver.go} (73%) create mode 100644 pkg/stdlib/html/driver/dynamic/options.go rename pkg/stdlib/html/driver/static/{static.go => driver.go} (56%) diff --git a/cli/exec.go b/cli/exec.go index 0a1a505b..142c310d 100644 --- a/cli/exec.go +++ b/cli/exec.go @@ -56,6 +56,7 @@ func Exec(query string, opts Options) { runtime.WithLog(l), runtime.WithLogLevel(logging.DebugLevel), runtime.WithParams(opts.Params), + runtime.WithProxy(opts.Proxy), ) if err != nil { diff --git a/cli/options.go b/cli/options.go index 86466d79..58bc3a9d 100644 --- a/cli/options.go +++ b/cli/options.go @@ -3,4 +3,5 @@ package cli type Options struct { Cdp string Params map[string]interface{} + Proxy string } diff --git a/cli/repl.go b/cli/repl.go index f310aece..e97e7918 100644 --- a/cli/repl.go +++ b/cli/repl.go @@ -98,6 +98,7 @@ func Repl(version string, opts Options) { runtime.WithLog(l), runtime.WithLogLevel(logging.DebugLevel), runtime.WithParams(opts.Params), + runtime.WithProxy(opts.Proxy), ) timer.Stop() diff --git a/examples/crawler.fql 
b/examples/crawler.fql index da0092a0..959e7049 100644 --- a/examples/crawler.fql +++ b/examples/crawler.fql @@ -7,7 +7,7 @@ LET links = ( ) FOR link IN links // The Verge has pretty heavy pages, so let's increase the navigation wait time - NAVIGATE(doc, link, 10000) + NAVIGATE(doc, link, 20000) WAIT_ELEMENT(doc, '.c-entry-content', 5000) LET texter = ELEMENT(doc, '.c-entry-content') RETURN texter.innerText \ No newline at end of file diff --git a/examples/dynamic-page.fql b/examples/dynamic-page.fql index fe163d29..46cdaf2a 100644 --- a/examples/dynamic-page.fql +++ b/examples/dynamic-page.fql @@ -9,6 +9,6 @@ FOR track IN tracks LET title = ELEMENT(track, '.chartTrack__title') RETURN { - artist: username.innerText, - track: title.innerText + artist: TRIM(username.innerText), + track: TRIM(title.innerText) } diff --git a/main.go b/main.go index 2eac695e..0a5d3965 100644 --- a/main.go +++ b/main.go @@ -76,6 +76,12 @@ var ( false, "launch Chrome", ) + + proxyAddress = flag.String( + "proxy", + "", + "address of proxy server to use (only applicable for static pages)", + ) ) func main() { @@ -137,6 +143,7 @@ func main() { opts := cli.Options{ Cdp: cdpConn, Params: p, + Proxy: *proxyAddress, } stat, _ := os.Stdin.Stat() diff --git a/pkg/runtime/env/env.go b/pkg/runtime/env/env.go new file mode 100644 index 00000000..c60f8af4 --- /dev/null +++ b/pkg/runtime/env/env.go @@ -0,0 +1,28 @@ +package env + +import "context" + +type ( + ctxKey struct{} + + Environment struct { + CDPAddress string + ProxyAddress string + } +) + +func WithContext(ctx context.Context, e Environment) context.Context { + return context.WithValue(ctx, ctxKey{}, e) +} + +func FromContext(ctx context.Context) Environment { + res := ctx.Value(ctxKey{}) + + val, ok := res.(Environment) + + if !ok { + return Environment{} + } + + return val +} diff --git a/pkg/runtime/options.go b/pkg/runtime/options.go index ecbb92e3..492fd6f4 100644 --- a/pkg/runtime/options.go +++ b/pkg/runtime/options.go @@ -3,6 +3,7 
@@ package runtime import ( "context" "github.com/MontFerret/ferret/pkg/runtime/core" + "github.com/MontFerret/ferret/pkg/runtime/env" "github.com/MontFerret/ferret/pkg/runtime/logging" "github.com/MontFerret/ferret/pkg/runtime/values" "io" @@ -53,7 +54,6 @@ func WithBrowser(address string) Option { func WithProxy(address string) Option { return func(options *Options) { - // TODO: add implementation options.proxy = address } } @@ -73,6 +73,10 @@ func WithLogLevel(lvl logging.Level) Option { func (opts *Options) withContext(parent context.Context) context.Context { ctx := core.ParamsWith(parent, opts.params) ctx = logging.WithContext(ctx, opts.logging) + ctx = env.WithContext(ctx, env.Environment{ + CDPAddress: opts.cdp, + ProxyAddress: opts.proxy, + }) return ctx } diff --git a/pkg/runtime/program.go b/pkg/runtime/program.go index 0c04326f..d4e5e914 100644 --- a/pkg/runtime/program.go +++ b/pkg/runtime/program.go @@ -40,7 +40,8 @@ func (p *Program) Run(ctx context.Context, setters ...Option) ([]byte, error) { } ctx = opts.withContext(ctx) - ctx = driver.WithDynamicDriver(ctx, opts.cdp) + // TODO: Decouple from STDLIB + ctx = driver.WithDynamicDriver(ctx) ctx = driver.WithStaticDriver(ctx) out, err := p.body.Exec(ctx, scope) diff --git a/pkg/stdlib/html/driver/driver.go b/pkg/stdlib/html/driver/driver.go index 9725027f..cf8862c3 100644 --- a/pkg/stdlib/html/driver/driver.go +++ b/pkg/stdlib/html/driver/driver.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "github.com/MontFerret/ferret/pkg/runtime/core" + "github.com/MontFerret/ferret/pkg/runtime/env" "github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/static" @@ -37,10 +38,27 @@ func FromContext(ctx context.Context, name Name) (Driver, error) { return nil, core.Error(core.ErrNotFound, fmt.Sprintf("%s driver", name)) } -func WithDynamicDriver(ctx context.Context, addr string) context.Context { - return 
context.WithValue(ctx, Dynamic, dynamic.NewDriver(addr)) +func WithDynamicDriver(ctx context.Context) context.Context { + e := env.FromContext(ctx) + + return context.WithValue( + ctx, + Dynamic, + dynamic.NewDriver( + e.CDPAddress, + dynamic.WithProxy(e.ProxyAddress), + ), + ) } -func WithStaticDriver(ctx context.Context, opts ...static.Option) context.Context { - return context.WithValue(ctx, Static, static.NewDriver(opts...)) +func WithStaticDriver(ctx context.Context) context.Context { + e := env.FromContext(ctx) + + return context.WithValue( + ctx, + Static, + static.NewDriver( + static.WithProxy(e.ProxyAddress), + ), + ) } diff --git a/pkg/stdlib/html/driver/dynamic/document.go b/pkg/stdlib/html/driver/dynamic/document.go index e69ebc04..ee048f04 100644 --- a/pkg/stdlib/html/driver/dynamic/document.go +++ b/pkg/stdlib/html/driver/dynamic/document.go @@ -6,16 +6,14 @@ import ( "hash/fnv" "sync" "time" - + "github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/logging" "github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/eval" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/events" - "github.com/corpix/uarand" "github.com/mafredri/cdp" "github.com/mafredri/cdp/protocol/dom" - "github.com/mafredri/cdp/protocol/emulation" "github.com/mafredri/cdp/protocol/page" "github.com/mafredri/cdp/rpcc" "github.com/pkg/errors" @@ -24,19 +22,43 @@ import ( const BlankPageURL = "about:blank" -type HTMLDocument struct { - sync.Mutex - logger *zerolog.Logger - conn *rpcc.Conn - client *cdp.Client - events *events.EventBroker - url values.String - element *HTMLElement +type ( + ScreenshotFormat string + ScreenshotArgs struct { + X float64 + Y float64 + Width float64 + Height float64 + Format ScreenshotFormat + Quality int + } + + HTMLDocument struct { + sync.Mutex + logger *zerolog.Logger + conn *rpcc.Conn + client *cdp.Client + events *events.EventBroker + url values.String 
+ element *HTMLElement + } +) + +const ( + ScreenshotFormatPNG ScreenshotFormat = "png" + ScreenshotFormatJPEG ScreenshotFormat = "jpeg" +) + +func IsScreenshotFormatValid(format string) bool { + value := ScreenshotFormat(format) + + return value == ScreenshotFormatPNG || value == ScreenshotFormatJPEG } func LoadHTMLDocument( ctx context.Context, conn *rpcc.Conn, + client *cdp.Client, url string, ) (*HTMLDocument, error) { if conn == nil { @@ -47,39 +69,7 @@ func LoadHTMLDocument( return nil, core.Error(core.ErrMissedArgument, "url") } - client := cdp.NewClient(conn) - - err := runBatch( - func() error { - return client.Page.Enable(ctx) - }, - - func() error { - return client.Page.SetLifecycleEventsEnabled( - ctx, - page.NewSetLifecycleEventsEnabledArgs(true), - ) - }, - - func() error { - return client.DOM.Enable(ctx) - }, - - func() error { - return client.Runtime.Enable(ctx) - }, - - func() error { - return client.Emulation.SetUserAgentOverride( - ctx, - emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()), - ) - }, - ) - - if err != nil { - return nil, err - } + var err error if url != BlankPageURL { err = waitForLoadEvent(ctx, client) @@ -111,26 +101,6 @@ func LoadHTMLDocument( ), nil } -func getRootElement(client *cdp.Client) (dom.Node, values.String, error) { - args := dom.NewGetDocumentArgs() - args.Depth = pointerInt(1) // lets load the entire document - ctx := context.Background() - - d, err := client.DOM.GetDocument(ctx, args) - - if err != nil { - return dom.Node{}, values.EmptyString, err - } - - innerHTML, err := client.DOM.GetOuterHTML(ctx, dom.NewGetOuterHTMLArgs().SetNodeID(d.Root.NodeID)) - - if err != nil { - return dom.Node{}, values.EmptyString, err - } - - return d.Root, values.NewString(innerHTML.OuterHTML), nil -} - func NewHTMLDocument( logger *zerolog.Logger, conn *rpcc.Conn, @@ -731,6 +701,54 @@ func (doc *HTMLDocument) Navigate(url values.String, timeout values.Int) error { return doc.WaitForNavigation(timeout) } +func (doc 
*HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) { + ctx := context.Background() + metrics, err := doc.client.Page.GetLayoutMetrics(ctx) + + if params.Format == ScreenshotFormatJPEG && params.Quality < 0 && params.Quality > 100 { + params.Quality = 100 + } + + if params.X < 0 { + params.X = 0 + } + + if params.Y < 0 { + params.Y = 0 + } + + if params.Width <= 0 { + params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X + } + + if params.Height <= 0 { + params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y + } + + clip := page.Viewport{ + X: params.X, + Y: params.Y, + Width: params.Width, + Height: params.Height, + Scale: 1.0, + } + + format := string(params.Format) + screenshotArgs := page.CaptureScreenshotArgs{ + Format: &format, + Quality: &params.Quality, + Clip: &clip, + } + + reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs) + + if err != nil { + return values.None, err + } + + return values.NewBinary(reply.Data), nil +} + func (doc *HTMLDocument) onLoad(_ interface{}) { doc.Lock() defer doc.Unlock() @@ -777,66 +795,3 @@ func (doc *HTMLDocument) onError(val interface{}) { Err(err). 
Msg("unexpected error") } - -type ScreenshotFormat string - -const ( - ScreenshotFormatPNG ScreenshotFormat = "png" - ScreenshotFormatJPEG ScreenshotFormat = "jpeg" -) - -func IsScreenshotFormatValid(format string) bool { - value := ScreenshotFormat(format) - return value == ScreenshotFormatPNG || value == ScreenshotFormatJPEG -} - -type ScreenshotArgs struct { - X float64 - Y float64 - Width float64 - Height float64 - Format ScreenshotFormat - Quality int -} - -func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) { - ctx := context.Background() - metrics, err := doc.client.Page.GetLayoutMetrics(ctx) - - if params.Format == ScreenshotFormatJPEG && params.Quality < 0 && params.Quality > 100 { - params.Quality = 100 - } - if params.X < 0 { - params.X = 0 - } - if params.Y < 0 { - params.Y = 0 - } - if params.Width <= 0 { - params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X - } - if params.Height <= 0 { - params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y - } - clip := page.Viewport{ - X: params.X, - Y: params.Y, - Width: params.Width, - Height: params.Height, - Scale: 1.0, - } - - format := string(params.Format) - screenshotArgs := page.CaptureScreenshotArgs{ - Format: &format, - Quality: &params.Quality, - Clip: &clip, - } - - reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs) - if err != nil { - return values.None, err - } - - return values.NewBinary(reply.Data), nil -} diff --git a/pkg/stdlib/html/driver/dynamic/dynamic.go b/pkg/stdlib/html/driver/dynamic/driver.go similarity index 73% rename from pkg/stdlib/html/driver/dynamic/dynamic.go rename to pkg/stdlib/html/driver/dynamic/driver.go index 6d731c0e..758c254e 100644 --- a/pkg/stdlib/html/driver/dynamic/dynamic.go +++ b/pkg/stdlib/html/driver/dynamic/driver.go @@ -3,8 +3,11 @@ package dynamic import ( "context" "github.com/MontFerret/ferret/pkg/runtime/values" + "github.com/corpix/uarand" "github.com/mafredri/cdp" 
"github.com/mafredri/cdp/devtool" + "github.com/mafredri/cdp/protocol/emulation" + "github.com/mafredri/cdp/protocol/page" "github.com/mafredri/cdp/protocol/target" "github.com/mafredri/cdp/rpcc" "github.com/mafredri/cdp/session" @@ -19,11 +22,17 @@ type Driver struct { client *cdp.Client session *session.Manager contextID target.BrowserContextID + opts *Options } -func NewDriver(address string) *Driver { +func NewDriver(address string, opts ...Option) *Driver { drv := new(Driver) drv.dev = devtool.New(address) + drv.opts = new(Options) + + for _, opt := range opts { + opt(drv.opts) + } return drv } @@ -60,7 +69,37 @@ func (drv *Driver) GetDocument(ctx context.Context, targetURL values.String) (va return nil, err } - return LoadHTMLDocument(ctx, conn, url) + client := cdp.NewClient(conn) + + err = runBatch( + func() error { + return client.Page.Enable(ctx) + }, + + func() error { + return client.Page.SetLifecycleEventsEnabled( + ctx, + page.NewSetLifecycleEventsEnabledArgs(true), + ) + }, + + func() error { + return client.DOM.Enable(ctx) + }, + + func() error { + return client.Runtime.Enable(ctx) + }, + + func() error { + return client.Emulation.SetUserAgentOverride( + ctx, + emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()), + ) + }, + ) + + return LoadHTMLDocument(ctx, conn, client, url) } func (drv *Driver) Close() error { diff --git a/pkg/stdlib/html/driver/dynamic/helpers.go b/pkg/stdlib/html/driver/dynamic/helpers.go index b6796010..9731415e 100644 --- a/pkg/stdlib/html/driver/dynamic/helpers.go +++ b/pkg/stdlib/html/driver/dynamic/helpers.go @@ -29,6 +29,26 @@ func runBatch(funcs ...batchFunc) error { return eg.Wait() } +func getRootElement(client *cdp.Client) (dom.Node, values.String, error) { + args := dom.NewGetDocumentArgs() + args.Depth = pointerInt(1) // lets load the entire document + ctx := context.Background() + + d, err := client.DOM.GetDocument(ctx, args) + + if err != nil { + return dom.Node{}, values.EmptyString, err + } + + innerHTML, 
err := client.DOM.GetOuterHTML(ctx, dom.NewGetOuterHTMLArgs().SetNodeID(d.Root.NodeID)) + + if err != nil { + return dom.Node{}, values.EmptyString, err + } + + return d.Root, values.NewString(innerHTML.OuterHTML), nil +} + func parseAttrs(attrs []string) *values.Object { var attr values.String diff --git a/pkg/stdlib/html/driver/dynamic/options.go b/pkg/stdlib/html/driver/dynamic/options.go new file mode 100644 index 00000000..67b61945 --- /dev/null +++ b/pkg/stdlib/html/driver/dynamic/options.go @@ -0,0 +1,15 @@ +package dynamic + +type ( + Options struct { + proxy string + } + + Option func(opts *Options) +) + +func WithProxy(address string) Option { + return func(opts *Options) { + opts.proxy = address + } +} diff --git a/pkg/stdlib/html/driver/static/static.go b/pkg/stdlib/html/driver/static/driver.go similarity index 56% rename from pkg/stdlib/html/driver/static/static.go rename to pkg/stdlib/html/driver/static/driver.go index df81d1e9..ecf0fe12 100644 --- a/pkg/stdlib/html/driver/static/static.go +++ b/pkg/stdlib/html/driver/static/driver.go @@ -8,29 +8,62 @@ import ( "github.com/corpix/uarand" "github.com/pkg/errors" "github.com/sethgrid/pester" - httpx "net/http" + "net/http" + "net/url" ) type Driver struct { - client *pester.Client + client *pester.Client + options *Options } -func NewDriver(setters ...Option) *Driver { - client := pester.New() - client.Concurrency = 3 - client.MaxRetries = 5 - client.Backoff = pester.ExponentialBackoff - - for _, setter := range setters { - setter(client) +func NewDriver(opts ...Option) *Driver { + drv := new(Driver) + drv.options = &Options{ + concurrency: 3, + maxRetries: 5, + backoff: pester.ExponentialBackoff, } - return &Driver{client} + for _, opt := range opts { + opt(drv.options) + } + + if drv.options.proxy == "" { + drv.client = pester.New() + } else { + client, err := newClientWithProxy(drv.options) + + if err != nil { + drv.client = pester.New() + } else { + drv.client = pester.NewExtendedClient(client) + } 
+ } + + drv.client.Concurrency = drv.options.concurrency + drv.client.MaxRetries = drv.options.maxRetries + drv.client.Backoff = drv.options.backoff + + return drv +} + +func newClientWithProxy(options *Options) (*http.Client, error) { + proxyURL, err := url.Parse(options.proxy) + + if err != nil { + return nil, err + } + + proxy := http.ProxyURL(proxyURL) + tr := &http.Transport{Proxy: proxy} + + return &http.Client{Transport: tr}, nil } func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values.HTMLNode, error) { - url := targetURL.String() - req, err := httpx.NewRequest(httpx.MethodGet, url, nil) + u := targetURL.String() + req, err := http.NewRequest(http.MethodGet, u, nil) if err != nil { return nil, err @@ -45,7 +78,7 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values resp, err := d.client.Do(req) if err != nil { - return nil, errors.Wrapf(err, "failed to retrieve a document %s", url) + return nil, errors.Wrapf(err, "failed to retrieve a document %s", u) } defer resp.Body.Close() @@ -53,10 +86,10 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { - return nil, errors.Wrapf(err, "failed to parse a document %s", url) + return nil, errors.Wrapf(err, "failed to parse a document %s", u) } - return NewHTMLDocument(url, doc) + return NewHTMLDocument(u, doc) } func (d *Driver) ParseDocument(_ context.Context, str values.String) (values.HTMLNode, error) { diff --git a/pkg/stdlib/html/driver/static/options.go b/pkg/stdlib/html/driver/static/options.go index 7a97e3d3..7423367f 100644 --- a/pkg/stdlib/html/driver/static/options.go +++ b/pkg/stdlib/html/driver/static/options.go @@ -1,37 +1,51 @@ package static -import "github.com/sethgrid/pester" +import ( + "github.com/sethgrid/pester" +) type ( - Option func(opts *pester.Client) + Option func(opts *Options) + Options struct { + backoff pester.BackoffStrategy + 
maxRetries int + concurrency int + proxy string + } ) func WithDefaultBackoff() Option { - return func(opts *pester.Client) { - opts.Backoff = pester.DefaultBackoff + return func(opts *Options) { + opts.backoff = pester.DefaultBackoff } } func WithLinearBackoff() Option { - return func(opts *pester.Client) { - opts.Backoff = pester.LinearBackoff + return func(opts *Options) { + opts.backoff = pester.LinearBackoff } } func WithExponentialBackoff() Option { - return func(opts *pester.Client) { - opts.Backoff = pester.ExponentialBackoff + return func(opts *Options) { + opts.backoff = pester.ExponentialBackoff } } func WithMaxRetries(value int) Option { - return func(opts *pester.Client) { - opts.MaxRetries = value + return func(opts *Options) { + opts.maxRetries = value } } func WithConcurrency(value int) Option { - return func(opts *pester.Client) { - opts.Concurrency = value + return func(opts *Options) { + opts.concurrency = value + } +} + +func WithProxy(address string) Option { + return func(opts *Options) { + opts.proxy = address } }