1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-01-26 03:51:57 +02:00

Feature/#19 proxy (#72)

* #19 Some work on proxy

* Fixed linter issue
This commit is contained in:
Tim Voronov 2018-10-07 21:23:36 -04:00 committed by GitHub
parent 8f7edaedee
commit 957490efec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 303 additions and 166 deletions

View File

@ -56,6 +56,7 @@ func Exec(query string, opts Options) {
runtime.WithLog(l), runtime.WithLog(l),
runtime.WithLogLevel(logging.DebugLevel), runtime.WithLogLevel(logging.DebugLevel),
runtime.WithParams(opts.Params), runtime.WithParams(opts.Params),
runtime.WithProxy(opts.Proxy),
) )
if err != nil { if err != nil {

View File

@ -3,4 +3,5 @@ package cli
type Options struct { type Options struct {
Cdp string Cdp string
Params map[string]interface{} Params map[string]interface{}
Proxy string
} }

View File

@ -98,6 +98,7 @@ func Repl(version string, opts Options) {
runtime.WithLog(l), runtime.WithLog(l),
runtime.WithLogLevel(logging.DebugLevel), runtime.WithLogLevel(logging.DebugLevel),
runtime.WithParams(opts.Params), runtime.WithParams(opts.Params),
runtime.WithProxy(opts.Proxy),
) )
timer.Stop() timer.Stop()

View File

@ -7,7 +7,7 @@ LET links = (
) )
FOR link IN links FOR link IN links
// The Verge has pretty heavy pages, so let's increase the navigation wait time // The Verge has pretty heavy pages, so let's increase the navigation wait time
NAVIGATE(doc, link, 10000) NAVIGATE(doc, link, 20000)
WAIT_ELEMENT(doc, '.c-entry-content', 5000) WAIT_ELEMENT(doc, '.c-entry-content', 5000)
LET texter = ELEMENT(doc, '.c-entry-content') LET texter = ELEMENT(doc, '.c-entry-content')
RETURN texter.innerText RETURN texter.innerText

View File

@ -9,6 +9,6 @@ FOR track IN tracks
LET title = ELEMENT(track, '.chartTrack__title') LET title = ELEMENT(track, '.chartTrack__title')
RETURN { RETURN {
artist: username.innerText, artist: TRIM(username.innerText),
track: title.innerText track: TRIM(title.innerText)
} }

View File

@ -76,6 +76,12 @@ var (
false, false,
"launch Chrome", "launch Chrome",
) )
proxyAddress = flag.String(
"proxy",
"",
"address of proxy server to use (only applicable for static pages)",
)
) )
func main() { func main() {
@ -137,6 +143,7 @@ func main() {
opts := cli.Options{ opts := cli.Options{
Cdp: cdpConn, Cdp: cdpConn,
Params: p, Params: p,
Proxy: *proxyAddress,
} }
stat, _ := os.Stdin.Stat() stat, _ := os.Stdin.Stat()

28
pkg/runtime/env/env.go vendored Normal file
View File

@ -0,0 +1,28 @@
package env
import "context"
// ctxKey is the unexported key type under which an Environment is stored in a
// context.Context. Being a distinct type, it cannot collide with keys defined
// by other packages.
type ctxKey struct{}

// Environment carries the runtime addresses that are propagated through a
// context.Context via WithContext and read back via FromContext.
type Environment struct {
	// CDPAddress is the address handed to the dynamic driver.
	CDPAddress string

	// ProxyAddress is the proxy server address handed to the drivers.
	ProxyAddress string
}
// WithContext returns a copy of ctx that carries e. The value is stored under
// the package-private ctxKey and can be retrieved with FromContext.
func WithContext(ctx context.Context, e Environment) context.Context {
	var key ctxKey

	return context.WithValue(ctx, key, e)
}
// FromContext returns the Environment stored in ctx by WithContext. When ctx
// holds no Environment under ctxKey, the zero Environment is returned.
func FromContext(ctx context.Context) Environment {
	if stored, ok := ctx.Value(ctxKey{}).(Environment); ok {
		return stored
	}

	return Environment{}
}

View File

@ -3,6 +3,7 @@ package runtime
import ( import (
"context" "context"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/env"
"github.com/MontFerret/ferret/pkg/runtime/logging" "github.com/MontFerret/ferret/pkg/runtime/logging"
"github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/runtime/values"
"io" "io"
@ -53,7 +54,6 @@ func WithBrowser(address string) Option {
func WithProxy(address string) Option { func WithProxy(address string) Option {
return func(options *Options) { return func(options *Options) {
// TODO: add implementation
options.proxy = address options.proxy = address
} }
} }
@ -73,6 +73,10 @@ func WithLogLevel(lvl logging.Level) Option {
func (opts *Options) withContext(parent context.Context) context.Context { func (opts *Options) withContext(parent context.Context) context.Context {
ctx := core.ParamsWith(parent, opts.params) ctx := core.ParamsWith(parent, opts.params)
ctx = logging.WithContext(ctx, opts.logging) ctx = logging.WithContext(ctx, opts.logging)
ctx = env.WithContext(ctx, env.Environment{
CDPAddress: opts.cdp,
ProxyAddress: opts.proxy,
})
return ctx return ctx
} }

View File

@ -40,7 +40,8 @@ func (p *Program) Run(ctx context.Context, setters ...Option) ([]byte, error) {
} }
ctx = opts.withContext(ctx) ctx = opts.withContext(ctx)
ctx = driver.WithDynamicDriver(ctx, opts.cdp) // TODO: Decouple from STDLIB
ctx = driver.WithDynamicDriver(ctx)
ctx = driver.WithStaticDriver(ctx) ctx = driver.WithStaticDriver(ctx)
out, err := p.body.Exec(ctx, scope) out, err := p.body.Exec(ctx, scope)

View File

@ -4,6 +4,7 @@ import (
"context" "context"
"fmt" "fmt"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/env"
"github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/static" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/static"
@ -37,10 +38,27 @@ func FromContext(ctx context.Context, name Name) (Driver, error) {
return nil, core.Error(core.ErrNotFound, fmt.Sprintf("%s driver", name)) return nil, core.Error(core.ErrNotFound, fmt.Sprintf("%s driver", name))
} }
func WithDynamicDriver(ctx context.Context, addr string) context.Context { func WithDynamicDriver(ctx context.Context) context.Context {
return context.WithValue(ctx, Dynamic, dynamic.NewDriver(addr)) e := env.FromContext(ctx)
return context.WithValue(
ctx,
Dynamic,
dynamic.NewDriver(
e.CDPAddress,
dynamic.WithProxy(e.ProxyAddress),
),
)
} }
func WithStaticDriver(ctx context.Context, opts ...static.Option) context.Context { func WithStaticDriver(ctx context.Context) context.Context {
return context.WithValue(ctx, Static, static.NewDriver(opts...)) e := env.FromContext(ctx)
return context.WithValue(
ctx,
Static,
static.NewDriver(
static.WithProxy(e.ProxyAddress),
),
)
} }

View File

@ -6,16 +6,14 @@ import (
"hash/fnv" "hash/fnv"
"sync" "sync"
"time" "time"
"github.com/MontFerret/ferret/pkg/runtime/core" "github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/logging" "github.com/MontFerret/ferret/pkg/runtime/logging"
"github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/eval" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/eval"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/events" "github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/events"
"github.com/corpix/uarand"
"github.com/mafredri/cdp" "github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/dom" "github.com/mafredri/cdp/protocol/dom"
"github.com/mafredri/cdp/protocol/emulation"
"github.com/mafredri/cdp/protocol/page" "github.com/mafredri/cdp/protocol/page"
"github.com/mafredri/cdp/rpcc" "github.com/mafredri/cdp/rpcc"
"github.com/pkg/errors" "github.com/pkg/errors"
@ -24,19 +22,43 @@ import (
const BlankPageURL = "about:blank" const BlankPageURL = "about:blank"
type HTMLDocument struct { type (
sync.Mutex ScreenshotFormat string
logger *zerolog.Logger ScreenshotArgs struct {
conn *rpcc.Conn X float64
client *cdp.Client Y float64
events *events.EventBroker Width float64
url values.String Height float64
element *HTMLElement Format ScreenshotFormat
Quality int
}
HTMLDocument struct {
sync.Mutex
logger *zerolog.Logger
conn *rpcc.Conn
client *cdp.Client
events *events.EventBroker
url values.String
element *HTMLElement
}
)
// Screenshot formats accepted by IsScreenshotFormatValid.
const (
// ScreenshotFormatPNG is the "png" screenshot format.
ScreenshotFormatPNG ScreenshotFormat = "png"
// ScreenshotFormatJPEG is the "jpeg" screenshot format.
ScreenshotFormatJPEG ScreenshotFormat = "jpeg"
)
func IsScreenshotFormatValid(format string) bool {
value := ScreenshotFormat(format)
return value == ScreenshotFormatPNG || value == ScreenshotFormatJPEG
} }
func LoadHTMLDocument( func LoadHTMLDocument(
ctx context.Context, ctx context.Context,
conn *rpcc.Conn, conn *rpcc.Conn,
client *cdp.Client,
url string, url string,
) (*HTMLDocument, error) { ) (*HTMLDocument, error) {
if conn == nil { if conn == nil {
@ -47,39 +69,7 @@ func LoadHTMLDocument(
return nil, core.Error(core.ErrMissedArgument, "url") return nil, core.Error(core.ErrMissedArgument, "url")
} }
client := cdp.NewClient(conn) var err error
err := runBatch(
func() error {
return client.Page.Enable(ctx)
},
func() error {
return client.Page.SetLifecycleEventsEnabled(
ctx,
page.NewSetLifecycleEventsEnabledArgs(true),
)
},
func() error {
return client.DOM.Enable(ctx)
},
func() error {
return client.Runtime.Enable(ctx)
},
func() error {
return client.Emulation.SetUserAgentOverride(
ctx,
emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()),
)
},
)
if err != nil {
return nil, err
}
if url != BlankPageURL { if url != BlankPageURL {
err = waitForLoadEvent(ctx, client) err = waitForLoadEvent(ctx, client)
@ -111,26 +101,6 @@ func LoadHTMLDocument(
), nil ), nil
} }
// getRootElement fetches the document root node through the DOM domain and
// returns it together with the root's outer HTML. On any failure it returns
// an empty node, an empty string and the error.
func getRootElement(client *cdp.Client) (dom.Node, values.String, error) {
	ctx := context.Background()

	// NOTE(review): this was originally described as loading the entire
	// document, but the requested depth is 1 — confirm which is intended.
	docArgs := dom.NewGetDocumentArgs()
	docArgs.Depth = pointerInt(1)

	docReply, err := client.DOM.GetDocument(ctx, docArgs)

	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	htmlArgs := dom.NewGetOuterHTMLArgs().SetNodeID(docReply.Root.NodeID)
	htmlReply, err := client.DOM.GetOuterHTML(ctx, htmlArgs)

	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	return docReply.Root, values.NewString(htmlReply.OuterHTML), nil
}
func NewHTMLDocument( func NewHTMLDocument(
logger *zerolog.Logger, logger *zerolog.Logger,
conn *rpcc.Conn, conn *rpcc.Conn,
@ -731,6 +701,54 @@ func (doc *HTMLDocument) Navigate(url values.String, timeout values.Int) error {
return doc.WaitForNavigation(timeout) return doc.WaitForNavigation(timeout)
} }
// CaptureScreenshot captures the region described by params and returns the
// image data as a binary value.
//
// params is normalized in place before the capture:
//   - for JPEG, a Quality outside [0, 100] is reset to 100;
//   - negative X or Y are clamped to 0;
//   - a non-positive Width or Height is replaced by the layout viewport's
//     client size minus the corresponding offset.
//
// It returns values.None and an error when params is nil, when the layout
// metrics cannot be retrieved, or when the capture itself fails.
func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) {
	if params == nil {
		return values.None, core.Error(core.ErrMissedArgument, "params")
	}

	ctx := context.Background()

	// The metrics are dereferenced below, so a failed call must not be ignored.
	metrics, err := doc.client.Page.GetLayoutMetrics(ctx)

	if err != nil {
		return values.None, err
	}

	// Out-of-range means below 0 OR above 100; both can never hold at once.
	if params.Format == ScreenshotFormatJPEG && (params.Quality < 0 || params.Quality > 100) {
		params.Quality = 100
	}

	if params.X < 0 {
		params.X = 0
	}

	if params.Y < 0 {
		params.Y = 0
	}

	if params.Width <= 0 {
		params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X
	}

	if params.Height <= 0 {
		params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y
	}

	clip := page.Viewport{
		X:      params.X,
		Y:      params.Y,
		Width:  params.Width,
		Height: params.Height,
		Scale:  1.0,
	}

	format := string(params.Format)

	screenshotArgs := page.CaptureScreenshotArgs{
		Format:  &format,
		Quality: &params.Quality,
		Clip:    &clip,
	}

	reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs)

	if err != nil {
		return values.None, err
	}

	return values.NewBinary(reply.Data), nil
}
func (doc *HTMLDocument) onLoad(_ interface{}) { func (doc *HTMLDocument) onLoad(_ interface{}) {
doc.Lock() doc.Lock()
defer doc.Unlock() defer doc.Unlock()
@ -777,66 +795,3 @@ func (doc *HTMLDocument) onError(val interface{}) {
Err(err). Err(err).
Msg("unexpected error") Msg("unexpected error")
} }
// ScreenshotFormat names an image format used when capturing screenshots.
type ScreenshotFormat string

// Supported screenshot formats.
const (
	// ScreenshotFormatPNG is the "png" screenshot format.
	ScreenshotFormatPNG ScreenshotFormat = "png"
	// ScreenshotFormatJPEG is the "jpeg" screenshot format.
	ScreenshotFormatJPEG ScreenshotFormat = "jpeg"
)

// IsScreenshotFormatValid reports whether format names one of the supported
// screenshot formats. The comparison is exact (case-sensitive).
func IsScreenshotFormatValid(format string) bool {
	switch ScreenshotFormat(format) {
	case ScreenshotFormatPNG, ScreenshotFormatJPEG:
		return true
	default:
		return false
	}
}
// ScreenshotArgs describes the region and encoding of a screenshot taken by
// HTMLDocument.CaptureScreenshot, which normalizes these fields in place.
type ScreenshotArgs struct {
// X is the horizontal offset of the clip; negative values are clamped to 0.
X float64
// Y is the vertical offset of the clip; negative values are clamped to 0.
Y float64
// Width is the clip width; a non-positive value is replaced by the layout
// viewport's client width minus X.
Width float64
// Height is the clip height; a non-positive value is replaced by the layout
// viewport's client height minus Y.
Height float64
// Format is the image format passed to the capture call.
Format ScreenshotFormat
// Quality is the image quality passed to the capture call.
Quality int
}
// CaptureScreenshot captures the region described by params and returns the
// image data as a binary value.
//
// params is normalized in place before the capture:
//   - for JPEG, a Quality outside [0, 100] is reset to 100;
//   - negative X or Y are clamped to 0;
//   - a non-positive Width or Height is replaced by the layout viewport's
//     client size minus the corresponding offset.
//
// It returns values.None and an error when params is nil, when the layout
// metrics cannot be retrieved, or when the capture itself fails.
func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) {
	if params == nil {
		return values.None, core.Error(core.ErrMissedArgument, "params")
	}

	ctx := context.Background()

	// The metrics are dereferenced below, so a failed call must not be ignored.
	metrics, err := doc.client.Page.GetLayoutMetrics(ctx)

	if err != nil {
		return values.None, err
	}

	// Out-of-range means below 0 OR above 100; both can never hold at once.
	if params.Format == ScreenshotFormatJPEG && (params.Quality < 0 || params.Quality > 100) {
		params.Quality = 100
	}

	if params.X < 0 {
		params.X = 0
	}

	if params.Y < 0 {
		params.Y = 0
	}

	if params.Width <= 0 {
		params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X
	}

	if params.Height <= 0 {
		params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y
	}

	clip := page.Viewport{
		X:      params.X,
		Y:      params.Y,
		Width:  params.Width,
		Height: params.Height,
		Scale:  1.0,
	}

	format := string(params.Format)

	screenshotArgs := page.CaptureScreenshotArgs{
		Format:  &format,
		Quality: &params.Quality,
		Clip:    &clip,
	}

	reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs)

	if err != nil {
		return values.None, err
	}

	return values.NewBinary(reply.Data), nil
}

View File

@ -3,8 +3,11 @@ package dynamic
import ( import (
"context" "context"
"github.com/MontFerret/ferret/pkg/runtime/values" "github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/corpix/uarand"
"github.com/mafredri/cdp" "github.com/mafredri/cdp"
"github.com/mafredri/cdp/devtool" "github.com/mafredri/cdp/devtool"
"github.com/mafredri/cdp/protocol/emulation"
"github.com/mafredri/cdp/protocol/page"
"github.com/mafredri/cdp/protocol/target" "github.com/mafredri/cdp/protocol/target"
"github.com/mafredri/cdp/rpcc" "github.com/mafredri/cdp/rpcc"
"github.com/mafredri/cdp/session" "github.com/mafredri/cdp/session"
@ -19,11 +22,17 @@ type Driver struct {
client *cdp.Client client *cdp.Client
session *session.Manager session *session.Manager
contextID target.BrowserContextID contextID target.BrowserContextID
opts *Options
} }
func NewDriver(address string) *Driver { func NewDriver(address string, opts ...Option) *Driver {
drv := new(Driver) drv := new(Driver)
drv.dev = devtool.New(address) drv.dev = devtool.New(address)
drv.opts = new(Options)
for _, opt := range opts {
opt(drv.opts)
}
return drv return drv
} }
@ -60,7 +69,37 @@ func (drv *Driver) GetDocument(ctx context.Context, targetURL values.String) (va
return nil, err return nil, err
} }
return LoadHTMLDocument(ctx, conn, url) client := cdp.NewClient(conn)
err = runBatch(
func() error {
return client.Page.Enable(ctx)
},
func() error {
return client.Page.SetLifecycleEventsEnabled(
ctx,
page.NewSetLifecycleEventsEnabledArgs(true),
)
},
func() error {
return client.DOM.Enable(ctx)
},
func() error {
return client.Runtime.Enable(ctx)
},
func() error {
return client.Emulation.SetUserAgentOverride(
ctx,
emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()),
)
},
)
return LoadHTMLDocument(ctx, conn, client, url)
} }
func (drv *Driver) Close() error { func (drv *Driver) Close() error {

View File

@ -29,6 +29,26 @@ func runBatch(funcs ...batchFunc) error {
return eg.Wait() return eg.Wait()
} }
// getRootElement fetches the document root node through the DOM domain and
// returns it together with the root's outer HTML. On any failure it returns
// an empty node, an empty string and the error.
func getRootElement(client *cdp.Client) (dom.Node, values.String, error) {
	ctx := context.Background()

	// NOTE(review): this was originally described as loading the entire
	// document, but the requested depth is 1 — confirm which is intended.
	docArgs := dom.NewGetDocumentArgs()
	docArgs.Depth = pointerInt(1)

	docReply, err := client.DOM.GetDocument(ctx, docArgs)

	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	htmlArgs := dom.NewGetOuterHTMLArgs().SetNodeID(docReply.Root.NodeID)
	htmlReply, err := client.DOM.GetOuterHTML(ctx, htmlArgs)

	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	return docReply.Root, values.NewString(htmlReply.OuterHTML), nil
}
func parseAttrs(attrs []string) *values.Object { func parseAttrs(attrs []string) *values.Object {
var attr values.String var attr values.String

View File

@ -0,0 +1,15 @@
package dynamic
// Options holds the configuration of the dynamic driver.
type Options struct {
	// proxy is the proxy server address recorded by WithProxy.
	// NOTE(review): no code visible here reads this field yet — confirm
	// where it is consumed.
	proxy string
}

// Option is a functional option that mutates an Options value.
type Option func(opts *Options)
// WithProxy returns an Option that stores address as the proxy of the
// Options it is applied to.
func WithProxy(address string) Option {
	setProxy := func(o *Options) {
		o.proxy = address
	}

	return setProxy
}

View File

@ -8,29 +8,62 @@ import (
"github.com/corpix/uarand" "github.com/corpix/uarand"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/sethgrid/pester" "github.com/sethgrid/pester"
httpx "net/http" "net/http"
"net/url"
) )
type Driver struct { type Driver struct {
client *pester.Client client *pester.Client
options *Options
} }
func NewDriver(setters ...Option) *Driver { func NewDriver(opts ...Option) *Driver {
client := pester.New() drv := new(Driver)
client.Concurrency = 3 drv.options = &Options{
client.MaxRetries = 5 concurrency: 3,
client.Backoff = pester.ExponentialBackoff maxRetries: 5,
backoff: pester.ExponentialBackoff,
for _, setter := range setters {
setter(client)
} }
return &Driver{client} for _, opt := range opts {
opt(drv.options)
}
if drv.options.proxy == "" {
drv.client = pester.New()
} else {
client, err := newClientWithProxy(drv.options)
if err != nil {
drv.client = pester.New()
} else {
drv.client = pester.NewExtendedClient(client)
}
}
drv.client.Concurrency = drv.options.concurrency
drv.client.MaxRetries = drv.options.maxRetries
drv.client.Backoff = drv.options.backoff
return drv
}
func newClientWithProxy(options *Options) (*http.Client, error) {
proxyURL, err := url.Parse(options.proxy)
if err != nil {
return nil, err
}
proxy := http.ProxyURL(proxyURL)
tr := &http.Transport{Proxy: proxy}
return &http.Client{Transport: tr}, nil
} }
func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values.HTMLNode, error) { func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values.HTMLNode, error) {
url := targetURL.String() u := targetURL.String()
req, err := httpx.NewRequest(httpx.MethodGet, url, nil) req, err := http.NewRequest(http.MethodGet, u, nil)
if err != nil { if err != nil {
return nil, err return nil, err
@ -45,7 +78,7 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values
resp, err := d.client.Do(req) resp, err := d.client.Do(req)
if err != nil { if err != nil {
return nil, errors.Wrapf(err, "failed to retrieve a document %s", url) return nil, errors.Wrapf(err, "failed to retrieve a document %s", u)
} }
defer resp.Body.Close() defer resp.Body.Close()
@ -53,10 +86,10 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values
doc, err := goquery.NewDocumentFromReader(resp.Body) doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil { if err != nil {
return nil, errors.Wrapf(err, "failed to parse a document %s", url) return nil, errors.Wrapf(err, "failed to parse a document %s", u)
} }
return NewHTMLDocument(url, doc) return NewHTMLDocument(u, doc)
} }
func (d *Driver) ParseDocument(_ context.Context, str values.String) (values.HTMLNode, error) { func (d *Driver) ParseDocument(_ context.Context, str values.String) (values.HTMLNode, error) {

View File

@ -1,37 +1,51 @@
package static package static
import "github.com/sethgrid/pester" import (
"github.com/sethgrid/pester"
)
type ( type (
Option func(opts *pester.Client) Option func(opts *Options)
Options struct {
backoff pester.BackoffStrategy
maxRetries int
concurrency int
proxy string
}
) )
func WithDefaultBackoff() Option { func WithDefaultBackoff() Option {
return func(opts *pester.Client) { return func(opts *Options) {
opts.Backoff = pester.DefaultBackoff opts.backoff = pester.DefaultBackoff
} }
} }
func WithLinearBackoff() Option { func WithLinearBackoff() Option {
return func(opts *pester.Client) { return func(opts *Options) {
opts.Backoff = pester.LinearBackoff opts.backoff = pester.LinearBackoff
} }
} }
func WithExponentialBackoff() Option { func WithExponentialBackoff() Option {
return func(opts *pester.Client) { return func(opts *Options) {
opts.Backoff = pester.ExponentialBackoff opts.backoff = pester.ExponentialBackoff
} }
} }
func WithMaxRetries(value int) Option { func WithMaxRetries(value int) Option {
return func(opts *pester.Client) { return func(opts *Options) {
opts.MaxRetries = value opts.maxRetries = value
} }
} }
func WithConcurrency(value int) Option { func WithConcurrency(value int) Option {
return func(opts *pester.Client) { return func(opts *Options) {
opts.Concurrency = value opts.concurrency = value
}
}
func WithProxy(address string) Option {
return func(opts *Options) {
opts.proxy = address
} }
} }