1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-01-12 03:19:45 +02:00

Feature/#19 proxy (#72)

* #19 Some work on proxy

* Fixed linter issue
This commit is contained in:
Tim Voronov 2018-10-07 21:23:36 -04:00 committed by GitHub
parent 8f7edaedee
commit 957490efec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 303 additions and 166 deletions

View File

@ -56,6 +56,7 @@ func Exec(query string, opts Options) {
runtime.WithLog(l),
runtime.WithLogLevel(logging.DebugLevel),
runtime.WithParams(opts.Params),
runtime.WithProxy(opts.Proxy),
)
if err != nil {

View File

@ -3,4 +3,5 @@ package cli
// Options carries the command-line settings that the CLI entry points
// (Exec and Repl) receive from main.
type Options struct {
// Cdp is filled from cdpConn in main.
// NOTE(review): presumably the Chrome DevTools Protocol address — confirm against main.
Cdp string
// Params is forwarded to the runtime through runtime.WithParams.
Params map[string]interface{}
// Proxy is the value of the "proxy" flag and is forwarded to the runtime
// through runtime.WithProxy. An empty string is the flag's default.
Proxy string
}

View File

@ -98,6 +98,7 @@ func Repl(version string, opts Options) {
runtime.WithLog(l),
runtime.WithLogLevel(logging.DebugLevel),
runtime.WithParams(opts.Params),
runtime.WithProxy(opts.Proxy),
)
timer.Stop()

View File

@ -7,7 +7,7 @@ LET links = (
)
FOR link IN links
// The Verge has pretty heavy pages, so let's increase the navigation wait time
NAVIGATE(doc, link, 10000)
NAVIGATE(doc, link, 20000)
WAIT_ELEMENT(doc, '.c-entry-content', 5000)
LET texter = ELEMENT(doc, '.c-entry-content')
RETURN texter.innerText

View File

@ -9,6 +9,6 @@ FOR track IN tracks
LET title = ELEMENT(track, '.chartTrack__title')
RETURN {
artist: username.innerText,
track: title.innerText
artist: TRIM(username.innerText),
track: TRIM(title.innerText)
}

View File

@ -76,6 +76,12 @@ var (
false,
"launch Chrome",
)
proxyAddress = flag.String(
"proxy",
"",
"address of proxy server to use (only applicable for static pages)",
)
)
func main() {
@ -137,6 +143,7 @@ func main() {
opts := cli.Options{
Cdp: cdpConn,
Params: p,
Proxy: *proxyAddress,
}
stat, _ := os.Stdin.Stat()

28
pkg/runtime/env/env.go vendored Normal file
View File

@ -0,0 +1,28 @@
package env
import "context"
// ctxKey is the private key type under which an Environment is stored in a
// context.Context, so that no other package can collide with it.
type ctxKey struct{}

// Environment groups the runtime addresses that are carried through a context.
type Environment struct {
	// CDPAddress is the address stored for the CDP endpoint.
	CDPAddress string
	// ProxyAddress is the address stored for the proxy server.
	ProxyAddress string
}
// WithContext returns a context derived from ctx that carries e under the
// package-private ctxKey.
func WithContext(ctx context.Context, e Environment) context.Context {
	var key ctxKey

	return context.WithValue(ctx, key, e)
}
// FromContext returns the Environment stored in ctx by WithContext.
// When ctx holds no Environment, the zero Environment is returned.
func FromContext(ctx context.Context) Environment {
	if stored, found := ctx.Value(ctxKey{}).(Environment); found {
		return stored
	}

	return Environment{}
}

View File

@ -3,6 +3,7 @@ package runtime
import (
"context"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/env"
"github.com/MontFerret/ferret/pkg/runtime/logging"
"github.com/MontFerret/ferret/pkg/runtime/values"
"io"
@ -53,7 +54,6 @@ func WithBrowser(address string) Option {
// WithProxy returns an Option that stores address in the proxy field of the
// runtime Options it is applied to.
func WithProxy(address string) Option {
return func(options *Options) {
// TODO: add implementation
options.proxy = address
}
}
@ -73,6 +73,10 @@ func WithLogLevel(lvl logging.Level) Option {
// withContext derives a context from parent that carries, in this order, the
// query parameters, the logging options and an env.Environment built from the
// configured CDP and proxy addresses.
func (opts *Options) withContext(parent context.Context) context.Context {
	withParams := core.ParamsWith(parent, opts.params)
	withLogging := logging.WithContext(withParams, opts.logging)

	environment := env.Environment{
		CDPAddress:   opts.cdp,
		ProxyAddress: opts.proxy,
	}

	return env.WithContext(withLogging, environment)
}

View File

@ -40,7 +40,8 @@ func (p *Program) Run(ctx context.Context, setters ...Option) ([]byte, error) {
}
ctx = opts.withContext(ctx)
ctx = driver.WithDynamicDriver(ctx, opts.cdp)
// TODO: Decouple from STDLIB
ctx = driver.WithDynamicDriver(ctx)
ctx = driver.WithStaticDriver(ctx)
out, err := p.body.Exec(ctx, scope)

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/env"
"github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/static"
@ -37,10 +38,27 @@ func FromContext(ctx context.Context, name Name) (Driver, error) {
return nil, core.Error(core.ErrNotFound, fmt.Sprintf("%s driver", name))
}
func WithDynamicDriver(ctx context.Context, addr string) context.Context {
return context.WithValue(ctx, Dynamic, dynamic.NewDriver(addr))
// WithDynamicDriver returns a context derived from ctx that carries a dynamic
// driver under the Dynamic key. The driver is configured from the
// env.Environment found in ctx: its CDP address becomes the driver address and
// its proxy address is handed over through dynamic.WithProxy.
func WithDynamicDriver(ctx context.Context) context.Context {
	settings := env.FromContext(ctx)
	proxy := dynamic.WithProxy(settings.ProxyAddress)
	drv := dynamic.NewDriver(settings.CDPAddress, proxy)

	return context.WithValue(ctx, Dynamic, drv)
}
func WithStaticDriver(ctx context.Context, opts ...static.Option) context.Context {
return context.WithValue(ctx, Static, static.NewDriver(opts...))
// WithStaticDriver returns a context derived from ctx that carries a static
// driver under the Static key. The proxy address of the env.Environment found
// in ctx is handed to the driver through static.WithProxy.
func WithStaticDriver(ctx context.Context) context.Context {
	settings := env.FromContext(ctx)
	drv := static.NewDriver(static.WithProxy(settings.ProxyAddress))

	return context.WithValue(ctx, Static, drv)
}

View File

@ -6,16 +6,14 @@ import (
"hash/fnv"
"sync"
"time"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/logging"
"github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/eval"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/events"
"github.com/corpix/uarand"
"github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/dom"
"github.com/mafredri/cdp/protocol/emulation"
"github.com/mafredri/cdp/protocol/page"
"github.com/mafredri/cdp/rpcc"
"github.com/pkg/errors"
@ -24,19 +22,43 @@ import (
const BlankPageURL = "about:blank"
type HTMLDocument struct {
sync.Mutex
logger *zerolog.Logger
conn *rpcc.Conn
client *cdp.Client
events *events.EventBroker
url values.String
element *HTMLElement
// ScreenshotFormat names the image encoding requested from CaptureScreenshot.
type ScreenshotFormat string

// ScreenshotArgs describes the region and encoding of a screenshot.
// CaptureScreenshot normalises the values in place: negative X/Y become 0 and
// a non-positive Width/Height falls back to the layout viewport's client size
// minus the matching offset.
type ScreenshotArgs struct {
	X       float64
	Y       float64
	Width   float64
	Height  float64
	Format  ScreenshotFormat
	Quality int
}

// HTMLDocument is a document driven through a CDP connection.
// It embeds a sync.Mutex, which event handlers such as onLoad take before
// touching the document.
type HTMLDocument struct {
	sync.Mutex
	logger  *zerolog.Logger
	conn    *rpcc.Conn
	client  *cdp.Client
	events  *events.EventBroker
	url     values.String
	element *HTMLElement
}
// Screenshot formats accepted by IsScreenshotFormatValid.
const (
// ScreenshotFormatPNG selects PNG output.
ScreenshotFormatPNG ScreenshotFormat = "png"
// ScreenshotFormatJPEG selects JPEG output; it is the only format for which
// CaptureScreenshot inspects ScreenshotArgs.Quality.
ScreenshotFormatJPEG ScreenshotFormat = "jpeg"
)
// IsScreenshotFormatValid reports whether format is exactly one of the
// supported screenshot formats ("png" or "jpeg"). The comparison is
// case-sensitive.
func IsScreenshotFormatValid(format string) bool {
	switch ScreenshotFormat(format) {
	case ScreenshotFormatPNG, ScreenshotFormatJPEG:
		return true
	default:
		return false
	}
}
func LoadHTMLDocument(
ctx context.Context,
conn *rpcc.Conn,
client *cdp.Client,
url string,
) (*HTMLDocument, error) {
if conn == nil {
@ -47,39 +69,7 @@ func LoadHTMLDocument(
return nil, core.Error(core.ErrMissedArgument, "url")
}
client := cdp.NewClient(conn)
err := runBatch(
func() error {
return client.Page.Enable(ctx)
},
func() error {
return client.Page.SetLifecycleEventsEnabled(
ctx,
page.NewSetLifecycleEventsEnabledArgs(true),
)
},
func() error {
return client.DOM.Enable(ctx)
},
func() error {
return client.Runtime.Enable(ctx)
},
func() error {
return client.Emulation.SetUserAgentOverride(
ctx,
emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()),
)
},
)
if err != nil {
return nil, err
}
var err error
if url != BlankPageURL {
err = waitForLoadEvent(ctx, client)
@ -111,26 +101,6 @@ func LoadHTMLDocument(
), nil
}
// getRootElement fetches the root DOM node of the current document together
// with that node's outer HTML. Both protocol calls run on a fresh background
// context. If either call fails, an empty node and an empty string are
// returned with the error.
func getRootElement(client *cdp.Client) (dom.Node, values.String, error) {
	ctx := context.Background()

	// NOTE(review): the previous comment said this loads the entire document,
	// but the requested depth is 1 — confirm against the DOM.getDocument docs.
	docArgs := dom.NewGetDocumentArgs()
	docArgs.Depth = pointerInt(1)

	reply, err := client.DOM.GetDocument(ctx, docArgs)
	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	root := reply.Root
	htmlArgs := dom.NewGetOuterHTMLArgs().SetNodeID(root.NodeID)

	outer, err := client.DOM.GetOuterHTML(ctx, htmlArgs)
	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	return root, values.NewString(outer.OuterHTML), nil
}
func NewHTMLDocument(
logger *zerolog.Logger,
conn *rpcc.Conn,
@ -731,6 +701,54 @@ func (doc *HTMLDocument) Navigate(url values.String, timeout values.Int) error {
return doc.WaitForNavigation(timeout)
}
// CaptureScreenshot captures a region of the page and returns the image data
// as a binary value.
//
// params is normalised in place before the request is sent:
//   - for JPEG output, a Quality outside 0..100 is reset to 100;
//   - negative X and Y offsets are clamped to 0;
//   - a non-positive Width or Height falls back to the layout viewport's
//     client size minus the matching offset.
//
// An error is returned when params is nil, when the layout metrics cannot be
// read, or when the capture request fails.
func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) {
	if params == nil {
		return values.None, core.Error(core.ErrMissedArgument, "params")
	}

	ctx := context.Background()

	metrics, err := doc.client.Page.GetLayoutMetrics(ctx)
	if err != nil {
		// The metrics are read below for the width/height fallback, so they
		// must not be used when the call failed.
		return values.None, err
	}

	// A single value can never be both below 0 and above 100, so the range
	// check has to use "or".
	if params.Format == ScreenshotFormatJPEG && (params.Quality < 0 || params.Quality > 100) {
		params.Quality = 100
	}

	if params.X < 0 {
		params.X = 0
	}

	if params.Y < 0 {
		params.Y = 0
	}

	if params.Width <= 0 {
		params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X
	}

	if params.Height <= 0 {
		params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y
	}

	clip := page.Viewport{
		X:      params.X,
		Y:      params.Y,
		Width:  params.Width,
		Height: params.Height,
		Scale:  1.0,
	}

	format := string(params.Format)
	screenshotArgs := page.CaptureScreenshotArgs{
		Format:  &format,
		Quality: &params.Quality,
		Clip:    &clip,
	}

	reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs)
	if err != nil {
		return values.None, err
	}

	return values.NewBinary(reply.Data), nil
}
func (doc *HTMLDocument) onLoad(_ interface{}) {
doc.Lock()
defer doc.Unlock()
@ -777,66 +795,3 @@ func (doc *HTMLDocument) onError(val interface{}) {
Err(err).
Msg("unexpected error")
}
// ScreenshotFormat names the image encoding requested from CaptureScreenshot.
type ScreenshotFormat string
// Screenshot formats accepted by IsScreenshotFormatValid.
const (
// ScreenshotFormatPNG selects PNG output.
ScreenshotFormatPNG ScreenshotFormat = "png"
// ScreenshotFormatJPEG selects JPEG output.
ScreenshotFormatJPEG ScreenshotFormat = "jpeg"
)
// IsScreenshotFormatValid reports whether format matches one of the supported
// screenshot formats, "png" or "jpeg". Matching is exact and case-sensitive.
func IsScreenshotFormatValid(format string) bool {
	candidate := ScreenshotFormat(format)
	if candidate == ScreenshotFormatPNG {
		return true
	}

	return candidate == ScreenshotFormatJPEG
}
// ScreenshotArgs describes the region and encoding of a screenshot taken by
// CaptureScreenshot, which normalises the values in place before use.
type ScreenshotArgs struct {
// X and Y are the clip offsets; negative values are clamped to 0.
X float64
Y float64
// Width and Height are the clip size; a non-positive value falls back to
// the layout viewport's client size minus the matching offset.
Width float64
Height float64
// Format is the requested image format.
Format ScreenshotFormat
// Quality is passed along with the capture request; it is only checked
// when Format is ScreenshotFormatJPEG.
Quality int
}
// CaptureScreenshot captures a region of the page and returns the image data
// as a binary value.
//
// params is normalised in place before the request is sent: for JPEG output a
// Quality outside 0..100 is reset to 100, negative X/Y offsets are clamped to
// 0, and a non-positive Width/Height falls back to the layout viewport's
// client size minus the matching offset.
//
// An error is returned when params is nil, when the layout metrics cannot be
// read, or when the capture request fails.
func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) {
	if params == nil {
		return values.None, core.Error(core.ErrMissedArgument, "params")
	}

	ctx := context.Background()

	metrics, err := doc.client.Page.GetLayoutMetrics(ctx)
	if err != nil {
		// The width/height fallback below reads the metrics, so stop here
		// instead of using the result of a failed call.
		return values.None, err
	}

	// The range check must use "or": no value is both negative and above 100.
	if params.Format == ScreenshotFormatJPEG && (params.Quality < 0 || params.Quality > 100) {
		params.Quality = 100
	}

	if params.X < 0 {
		params.X = 0
	}

	if params.Y < 0 {
		params.Y = 0
	}

	if params.Width <= 0 {
		params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X
	}

	if params.Height <= 0 {
		params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y
	}

	clip := page.Viewport{
		X:      params.X,
		Y:      params.Y,
		Width:  params.Width,
		Height: params.Height,
		Scale:  1.0,
	}

	format := string(params.Format)
	screenshotArgs := page.CaptureScreenshotArgs{
		Format:  &format,
		Quality: &params.Quality,
		Clip:    &clip,
	}

	reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs)
	if err != nil {
		return values.None, err
	}

	return values.NewBinary(reply.Data), nil
}

View File

@ -3,8 +3,11 @@ package dynamic
import (
"context"
"github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/corpix/uarand"
"github.com/mafredri/cdp"
"github.com/mafredri/cdp/devtool"
"github.com/mafredri/cdp/protocol/emulation"
"github.com/mafredri/cdp/protocol/page"
"github.com/mafredri/cdp/protocol/target"
"github.com/mafredri/cdp/rpcc"
"github.com/mafredri/cdp/session"
@ -19,11 +22,17 @@ type Driver struct {
client *cdp.Client
session *session.Manager
contextID target.BrowserContextID
opts *Options
}
func NewDriver(address string) *Driver {
// NewDriver creates a dynamic driver whose devtool client targets address.
// Every given Option is applied, in order, to the driver's fresh Options value.
func NewDriver(address string, opts ...Option) *Driver {
	settings := new(Options)
	drv := &Driver{
		dev:  devtool.New(address),
		opts: settings,
	}

	for i := range opts {
		opts[i](settings)
	}

	return drv
}
@ -60,7 +69,37 @@ func (drv *Driver) GetDocument(ctx context.Context, targetURL values.String) (va
return nil, err
}
return LoadHTMLDocument(ctx, conn, url)
client := cdp.NewClient(conn)
err = runBatch(
func() error {
return client.Page.Enable(ctx)
},
func() error {
return client.Page.SetLifecycleEventsEnabled(
ctx,
page.NewSetLifecycleEventsEnabledArgs(true),
)
},
func() error {
return client.DOM.Enable(ctx)
},
func() error {
return client.Runtime.Enable(ctx)
},
func() error {
return client.Emulation.SetUserAgentOverride(
ctx,
emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()),
)
},
)
return LoadHTMLDocument(ctx, conn, client, url)
}
func (drv *Driver) Close() error {

View File

@ -29,6 +29,26 @@ func runBatch(funcs ...batchFunc) error {
return eg.Wait()
}
// getRootElement returns the root DOM node of the current document and the
// node's outer HTML as a string value. It uses its own background context for
// the two protocol calls and, on failure of either, returns an empty node and
// an empty string together with the error.
func getRootElement(client *cdp.Client) (dom.Node, values.String, error) {
	ctx := context.Background()

	// NOTE(review): the earlier comment claimed the entire document is loaded,
	// yet the depth requested here is 1 — verify against DOM.getDocument.
	request := dom.NewGetDocumentArgs()
	request.Depth = pointerInt(1)

	document, err := client.DOM.GetDocument(ctx, request)
	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	markup, err := client.DOM.GetOuterHTML(
		ctx,
		dom.NewGetOuterHTMLArgs().SetNodeID(document.Root.NodeID),
	)
	if err != nil {
		return dom.Node{}, values.EmptyString, err
	}

	return document.Root, values.NewString(markup.OuterHTML), nil
}
func parseAttrs(attrs []string) *values.Object {
var attr values.String

View File

@ -0,0 +1,15 @@
package dynamic
// Options holds the settings of a dynamic driver.
type Options struct {
	// proxy is the proxy address recorded by WithProxy.
	proxy string
}

// Option mutates an Options value; NewDriver applies each one in order.
type Option func(opts *Options)
// WithProxy returns an Option that records address as the proxy of the
// Options it is applied to.
func WithProxy(address string) Option {
	setProxy := func(opts *Options) {
		opts.proxy = address
	}

	return setProxy
}

View File

@ -8,29 +8,62 @@ import (
"github.com/corpix/uarand"
"github.com/pkg/errors"
"github.com/sethgrid/pester"
httpx "net/http"
"net/http"
"net/url"
)
type Driver struct {
client *pester.Client
client *pester.Client
options *Options
}
func NewDriver(setters ...Option) *Driver {
client := pester.New()
client.Concurrency = 3
client.MaxRetries = 5
client.Backoff = pester.ExponentialBackoff
for _, setter := range setters {
setter(client)
func NewDriver(opts ...Option) *Driver {
drv := new(Driver)
drv.options = &Options{
concurrency: 3,
maxRetries: 5,
backoff: pester.ExponentialBackoff,
}
return &Driver{client}
for _, opt := range opts {
opt(drv.options)
}
if drv.options.proxy == "" {
drv.client = pester.New()
} else {
client, err := newClientWithProxy(drv.options)
if err != nil {
drv.client = pester.New()
} else {
drv.client = pester.NewExtendedClient(client)
}
}
drv.client.Concurrency = drv.options.concurrency
drv.client.MaxRetries = drv.options.maxRetries
drv.client.Backoff = drv.options.backoff
return drv
}
// newClientWithProxy builds an *http.Client whose transport routes every
// request through the proxy address stored in options.
// It returns the parse error, and no client, when that address is not a
// parsable URL.
func newClientWithProxy(options *Options) (*http.Client, error) {
	target, err := url.Parse(options.proxy)
	if err != nil {
		return nil, err
	}

	client := new(http.Client)
	client.Transport = &http.Transport{
		Proxy: http.ProxyURL(target),
	}

	return client, nil
}
func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values.HTMLNode, error) {
url := targetURL.String()
req, err := httpx.NewRequest(httpx.MethodGet, url, nil)
u := targetURL.String()
req, err := http.NewRequest(http.MethodGet, u, nil)
if err != nil {
return nil, err
@ -45,7 +78,7 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values
resp, err := d.client.Do(req)
if err != nil {
return nil, errors.Wrapf(err, "failed to retrieve a document %s", url)
return nil, errors.Wrapf(err, "failed to retrieve a document %s", u)
}
defer resp.Body.Close()
@ -53,10 +86,10 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, errors.Wrapf(err, "failed to parse a document %s", url)
return nil, errors.Wrapf(err, "failed to parse a document %s", u)
}
return NewHTMLDocument(url, doc)
return NewHTMLDocument(u, doc)
}
func (d *Driver) ParseDocument(_ context.Context, str values.String) (values.HTMLNode, error) {

View File

@ -1,37 +1,51 @@
package static
import "github.com/sethgrid/pester"
import (
"github.com/sethgrid/pester"
)
type (
Option func(opts *pester.Client)
Option func(opts *Options)
Options struct {
backoff pester.BackoffStrategy
maxRetries int
concurrency int
proxy string
}
)
func WithDefaultBackoff() Option {
return func(opts *pester.Client) {
opts.Backoff = pester.DefaultBackoff
return func(opts *Options) {
opts.backoff = pester.DefaultBackoff
}
}
func WithLinearBackoff() Option {
return func(opts *pester.Client) {
opts.Backoff = pester.LinearBackoff
return func(opts *Options) {
opts.backoff = pester.LinearBackoff
}
}
func WithExponentialBackoff() Option {
return func(opts *pester.Client) {
opts.Backoff = pester.ExponentialBackoff
return func(opts *Options) {
opts.backoff = pester.ExponentialBackoff
}
}
func WithMaxRetries(value int) Option {
return func(opts *pester.Client) {
opts.MaxRetries = value
return func(opts *Options) {
opts.maxRetries = value
}
}
func WithConcurrency(value int) Option {
return func(opts *pester.Client) {
opts.Concurrency = value
return func(opts *Options) {
opts.concurrency = value
}
}
// WithProxy returns an Option that stores address in the proxy field of the
// static driver Options it is applied to.
func WithProxy(address string) Option {
	return Option(func(target *Options) {
		target.proxy = address
	})
}