mirror of
https://github.com/MontFerret/ferret.git
synced 2025-01-26 03:51:57 +02:00
parent
8f7edaedee
commit
957490efec
@ -56,6 +56,7 @@ func Exec(query string, opts Options) {
|
|||||||
runtime.WithLog(l),
|
runtime.WithLog(l),
|
||||||
runtime.WithLogLevel(logging.DebugLevel),
|
runtime.WithLogLevel(logging.DebugLevel),
|
||||||
runtime.WithParams(opts.Params),
|
runtime.WithParams(opts.Params),
|
||||||
|
runtime.WithProxy(opts.Proxy),
|
||||||
)
|
)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -3,4 +3,5 @@ package cli
|
|||||||
type Options struct {
|
type Options struct {
|
||||||
Cdp string
|
Cdp string
|
||||||
Params map[string]interface{}
|
Params map[string]interface{}
|
||||||
|
Proxy string
|
||||||
}
|
}
|
||||||
|
@ -98,6 +98,7 @@ func Repl(version string, opts Options) {
|
|||||||
runtime.WithLog(l),
|
runtime.WithLog(l),
|
||||||
runtime.WithLogLevel(logging.DebugLevel),
|
runtime.WithLogLevel(logging.DebugLevel),
|
||||||
runtime.WithParams(opts.Params),
|
runtime.WithParams(opts.Params),
|
||||||
|
runtime.WithProxy(opts.Proxy),
|
||||||
)
|
)
|
||||||
|
|
||||||
timer.Stop()
|
timer.Stop()
|
||||||
|
@ -7,7 +7,7 @@ LET links = (
|
|||||||
)
|
)
|
||||||
FOR link IN links
|
FOR link IN links
|
||||||
// The Verge has pretty heavy pages, so let's increase the navigation wait time
|
// The Verge has pretty heavy pages, so let's increase the navigation wait time
|
||||||
NAVIGATE(doc, link, 10000)
|
NAVIGATE(doc, link, 20000)
|
||||||
WAIT_ELEMENT(doc, '.c-entry-content', 5000)
|
WAIT_ELEMENT(doc, '.c-entry-content', 5000)
|
||||||
LET texter = ELEMENT(doc, '.c-entry-content')
|
LET texter = ELEMENT(doc, '.c-entry-content')
|
||||||
RETURN texter.innerText
|
RETURN texter.innerText
|
@ -9,6 +9,6 @@ FOR track IN tracks
|
|||||||
LET title = ELEMENT(track, '.chartTrack__title')
|
LET title = ELEMENT(track, '.chartTrack__title')
|
||||||
|
|
||||||
RETURN {
|
RETURN {
|
||||||
artist: username.innerText,
|
artist: TRIM(username.innerText),
|
||||||
track: title.innerText
|
track: TRIM(title.innerText)
|
||||||
}
|
}
|
||||||
|
7
main.go
7
main.go
@ -76,6 +76,12 @@ var (
|
|||||||
false,
|
false,
|
||||||
"launch Chrome",
|
"launch Chrome",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
proxyAddress = flag.String(
|
||||||
|
"proxy",
|
||||||
|
"",
|
||||||
|
"address of proxy server to use (only applicable for static pages)",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@ -137,6 +143,7 @@ func main() {
|
|||||||
opts := cli.Options{
|
opts := cli.Options{
|
||||||
Cdp: cdpConn,
|
Cdp: cdpConn,
|
||||||
Params: p,
|
Params: p,
|
||||||
|
Proxy: *proxyAddress,
|
||||||
}
|
}
|
||||||
|
|
||||||
stat, _ := os.Stdin.Stat()
|
stat, _ := os.Stdin.Stat()
|
||||||
|
28
pkg/runtime/env/env.go
vendored
Normal file
28
pkg/runtime/env/env.go
vendored
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
package env
|
||||||
|
|
||||||
|
import "context"
|
||||||
|
|
||||||
|
// ctxKey is the unexported type used as the context key under which an
// Environment value is stored.
type ctxKey struct{}

// Environment holds the address settings that the runtime places into the
// execution context.
type Environment struct {
	// CDPAddress is the address of the CDP (Chrome DevTools Protocol) endpoint.
	CDPAddress string
	// ProxyAddress is the address of the proxy server to use; empty means none.
	ProxyAddress string
}
|
||||||
|
|
||||||
|
func WithContext(ctx context.Context, e Environment) context.Context {
|
||||||
|
return context.WithValue(ctx, ctxKey{}, e)
|
||||||
|
}
|
||||||
|
|
||||||
|
func FromContext(ctx context.Context) Environment {
|
||||||
|
res := ctx.Value(ctxKey{})
|
||||||
|
|
||||||
|
val, ok := res.(Environment)
|
||||||
|
|
||||||
|
if !ok {
|
||||||
|
return Environment{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return val
|
||||||
|
}
|
@ -3,6 +3,7 @@ package runtime
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/core"
|
"github.com/MontFerret/ferret/pkg/runtime/core"
|
||||||
|
"github.com/MontFerret/ferret/pkg/runtime/env"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/values"
|
"github.com/MontFerret/ferret/pkg/runtime/values"
|
||||||
"io"
|
"io"
|
||||||
@ -53,7 +54,6 @@ func WithBrowser(address string) Option {
|
|||||||
|
|
||||||
func WithProxy(address string) Option {
|
func WithProxy(address string) Option {
|
||||||
return func(options *Options) {
|
return func(options *Options) {
|
||||||
// TODO: add implementation
|
|
||||||
options.proxy = address
|
options.proxy = address
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -73,6 +73,10 @@ func WithLogLevel(lvl logging.Level) Option {
|
|||||||
func (opts *Options) withContext(parent context.Context) context.Context {
|
func (opts *Options) withContext(parent context.Context) context.Context {
|
||||||
ctx := core.ParamsWith(parent, opts.params)
|
ctx := core.ParamsWith(parent, opts.params)
|
||||||
ctx = logging.WithContext(ctx, opts.logging)
|
ctx = logging.WithContext(ctx, opts.logging)
|
||||||
|
ctx = env.WithContext(ctx, env.Environment{
|
||||||
|
CDPAddress: opts.cdp,
|
||||||
|
ProxyAddress: opts.proxy,
|
||||||
|
})
|
||||||
|
|
||||||
return ctx
|
return ctx
|
||||||
}
|
}
|
||||||
|
@ -40,7 +40,8 @@ func (p *Program) Run(ctx context.Context, setters ...Option) ([]byte, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ctx = opts.withContext(ctx)
|
ctx = opts.withContext(ctx)
|
||||||
ctx = driver.WithDynamicDriver(ctx, opts.cdp)
|
// TODO: Decouple from STDLIB
|
||||||
|
ctx = driver.WithDynamicDriver(ctx)
|
||||||
ctx = driver.WithStaticDriver(ctx)
|
ctx = driver.WithStaticDriver(ctx)
|
||||||
|
|
||||||
out, err := p.body.Exec(ctx, scope)
|
out, err := p.body.Exec(ctx, scope)
|
||||||
|
@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/core"
|
"github.com/MontFerret/ferret/pkg/runtime/core"
|
||||||
|
"github.com/MontFerret/ferret/pkg/runtime/env"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/values"
|
"github.com/MontFerret/ferret/pkg/runtime/values"
|
||||||
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic"
|
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic"
|
||||||
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/static"
|
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/static"
|
||||||
@ -37,10 +38,27 @@ func FromContext(ctx context.Context, name Name) (Driver, error) {
|
|||||||
return nil, core.Error(core.ErrNotFound, fmt.Sprintf("%s driver", name))
|
return nil, core.Error(core.ErrNotFound, fmt.Sprintf("%s driver", name))
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithDynamicDriver(ctx context.Context, addr string) context.Context {
|
func WithDynamicDriver(ctx context.Context) context.Context {
|
||||||
return context.WithValue(ctx, Dynamic, dynamic.NewDriver(addr))
|
e := env.FromContext(ctx)
|
||||||
|
|
||||||
|
return context.WithValue(
|
||||||
|
ctx,
|
||||||
|
Dynamic,
|
||||||
|
dynamic.NewDriver(
|
||||||
|
e.CDPAddress,
|
||||||
|
dynamic.WithProxy(e.ProxyAddress),
|
||||||
|
),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithStaticDriver(ctx context.Context, opts ...static.Option) context.Context {
|
func WithStaticDriver(ctx context.Context) context.Context {
|
||||||
return context.WithValue(ctx, Static, static.NewDriver(opts...))
|
e := env.FromContext(ctx)
|
||||||
|
|
||||||
|
return context.WithValue(
|
||||||
|
ctx,
|
||||||
|
Static,
|
||||||
|
static.NewDriver(
|
||||||
|
static.WithProxy(e.ProxyAddress),
|
||||||
|
),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -6,16 +6,14 @@ import (
|
|||||||
"hash/fnv"
|
"hash/fnv"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/core"
|
"github.com/MontFerret/ferret/pkg/runtime/core"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/values"
|
"github.com/MontFerret/ferret/pkg/runtime/values"
|
||||||
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/eval"
|
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/eval"
|
||||||
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/events"
|
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/events"
|
||||||
"github.com/corpix/uarand"
|
|
||||||
"github.com/mafredri/cdp"
|
"github.com/mafredri/cdp"
|
||||||
"github.com/mafredri/cdp/protocol/dom"
|
"github.com/mafredri/cdp/protocol/dom"
|
||||||
"github.com/mafredri/cdp/protocol/emulation"
|
|
||||||
"github.com/mafredri/cdp/protocol/page"
|
"github.com/mafredri/cdp/protocol/page"
|
||||||
"github.com/mafredri/cdp/rpcc"
|
"github.com/mafredri/cdp/rpcc"
|
||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
@ -24,19 +22,43 @@ import (
|
|||||||
|
|
||||||
const BlankPageURL = "about:blank"
|
const BlankPageURL = "about:blank"
|
||||||
|
|
||||||
type HTMLDocument struct {
|
type (
|
||||||
sync.Mutex
|
ScreenshotFormat string
|
||||||
logger *zerolog.Logger
|
ScreenshotArgs struct {
|
||||||
conn *rpcc.Conn
|
X float64
|
||||||
client *cdp.Client
|
Y float64
|
||||||
events *events.EventBroker
|
Width float64
|
||||||
url values.String
|
Height float64
|
||||||
element *HTMLElement
|
Format ScreenshotFormat
|
||||||
|
Quality int
|
||||||
|
}
|
||||||
|
|
||||||
|
HTMLDocument struct {
|
||||||
|
sync.Mutex
|
||||||
|
logger *zerolog.Logger
|
||||||
|
conn *rpcc.Conn
|
||||||
|
client *cdp.Client
|
||||||
|
events *events.EventBroker
|
||||||
|
url values.String
|
||||||
|
element *HTMLElement
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
ScreenshotFormatPNG ScreenshotFormat = "png"
|
||||||
|
ScreenshotFormatJPEG ScreenshotFormat = "jpeg"
|
||||||
|
)
|
||||||
|
|
||||||
|
func IsScreenshotFormatValid(format string) bool {
|
||||||
|
value := ScreenshotFormat(format)
|
||||||
|
|
||||||
|
return value == ScreenshotFormatPNG || value == ScreenshotFormatJPEG
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadHTMLDocument(
|
func LoadHTMLDocument(
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
conn *rpcc.Conn,
|
conn *rpcc.Conn,
|
||||||
|
client *cdp.Client,
|
||||||
url string,
|
url string,
|
||||||
) (*HTMLDocument, error) {
|
) (*HTMLDocument, error) {
|
||||||
if conn == nil {
|
if conn == nil {
|
||||||
@ -47,39 +69,7 @@ func LoadHTMLDocument(
|
|||||||
return nil, core.Error(core.ErrMissedArgument, "url")
|
return nil, core.Error(core.ErrMissedArgument, "url")
|
||||||
}
|
}
|
||||||
|
|
||||||
client := cdp.NewClient(conn)
|
var err error
|
||||||
|
|
||||||
err := runBatch(
|
|
||||||
func() error {
|
|
||||||
return client.Page.Enable(ctx)
|
|
||||||
},
|
|
||||||
|
|
||||||
func() error {
|
|
||||||
return client.Page.SetLifecycleEventsEnabled(
|
|
||||||
ctx,
|
|
||||||
page.NewSetLifecycleEventsEnabledArgs(true),
|
|
||||||
)
|
|
||||||
},
|
|
||||||
|
|
||||||
func() error {
|
|
||||||
return client.DOM.Enable(ctx)
|
|
||||||
},
|
|
||||||
|
|
||||||
func() error {
|
|
||||||
return client.Runtime.Enable(ctx)
|
|
||||||
},
|
|
||||||
|
|
||||||
func() error {
|
|
||||||
return client.Emulation.SetUserAgentOverride(
|
|
||||||
ctx,
|
|
||||||
emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()),
|
|
||||||
)
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if url != BlankPageURL {
|
if url != BlankPageURL {
|
||||||
err = waitForLoadEvent(ctx, client)
|
err = waitForLoadEvent(ctx, client)
|
||||||
@ -111,26 +101,6 @@ func LoadHTMLDocument(
|
|||||||
), nil
|
), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getRootElement(client *cdp.Client) (dom.Node, values.String, error) {
|
|
||||||
args := dom.NewGetDocumentArgs()
|
|
||||||
args.Depth = pointerInt(1) // lets load the entire document
|
|
||||||
ctx := context.Background()
|
|
||||||
|
|
||||||
d, err := client.DOM.GetDocument(ctx, args)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return dom.Node{}, values.EmptyString, err
|
|
||||||
}
|
|
||||||
|
|
||||||
innerHTML, err := client.DOM.GetOuterHTML(ctx, dom.NewGetOuterHTMLArgs().SetNodeID(d.Root.NodeID))
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return dom.Node{}, values.EmptyString, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return d.Root, values.NewString(innerHTML.OuterHTML), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewHTMLDocument(
|
func NewHTMLDocument(
|
||||||
logger *zerolog.Logger,
|
logger *zerolog.Logger,
|
||||||
conn *rpcc.Conn,
|
conn *rpcc.Conn,
|
||||||
@ -731,6 +701,54 @@ func (doc *HTMLDocument) Navigate(url values.String, timeout values.Int) error {
|
|||||||
return doc.WaitForNavigation(timeout)
|
return doc.WaitForNavigation(timeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) {
|
||||||
|
ctx := context.Background()
|
||||||
|
metrics, err := doc.client.Page.GetLayoutMetrics(ctx)
|
||||||
|
|
||||||
|
if params.Format == ScreenshotFormatJPEG && params.Quality < 0 && params.Quality > 100 {
|
||||||
|
params.Quality = 100
|
||||||
|
}
|
||||||
|
|
||||||
|
if params.X < 0 {
|
||||||
|
params.X = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if params.Y < 0 {
|
||||||
|
params.Y = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if params.Width <= 0 {
|
||||||
|
params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X
|
||||||
|
}
|
||||||
|
|
||||||
|
if params.Height <= 0 {
|
||||||
|
params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y
|
||||||
|
}
|
||||||
|
|
||||||
|
clip := page.Viewport{
|
||||||
|
X: params.X,
|
||||||
|
Y: params.Y,
|
||||||
|
Width: params.Width,
|
||||||
|
Height: params.Height,
|
||||||
|
Scale: 1.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
format := string(params.Format)
|
||||||
|
screenshotArgs := page.CaptureScreenshotArgs{
|
||||||
|
Format: &format,
|
||||||
|
Quality: ¶ms.Quality,
|
||||||
|
Clip: &clip,
|
||||||
|
}
|
||||||
|
|
||||||
|
reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return values.None, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return values.NewBinary(reply.Data), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (doc *HTMLDocument) onLoad(_ interface{}) {
|
func (doc *HTMLDocument) onLoad(_ interface{}) {
|
||||||
doc.Lock()
|
doc.Lock()
|
||||||
defer doc.Unlock()
|
defer doc.Unlock()
|
||||||
@ -777,66 +795,3 @@ func (doc *HTMLDocument) onError(val interface{}) {
|
|||||||
Err(err).
|
Err(err).
|
||||||
Msg("unexpected error")
|
Msg("unexpected error")
|
||||||
}
|
}
|
||||||
|
|
||||||
type ScreenshotFormat string
|
|
||||||
|
|
||||||
const (
|
|
||||||
ScreenshotFormatPNG ScreenshotFormat = "png"
|
|
||||||
ScreenshotFormatJPEG ScreenshotFormat = "jpeg"
|
|
||||||
)
|
|
||||||
|
|
||||||
func IsScreenshotFormatValid(format string) bool {
|
|
||||||
value := ScreenshotFormat(format)
|
|
||||||
return value == ScreenshotFormatPNG || value == ScreenshotFormatJPEG
|
|
||||||
}
|
|
||||||
|
|
||||||
type ScreenshotArgs struct {
|
|
||||||
X float64
|
|
||||||
Y float64
|
|
||||||
Width float64
|
|
||||||
Height float64
|
|
||||||
Format ScreenshotFormat
|
|
||||||
Quality int
|
|
||||||
}
|
|
||||||
|
|
||||||
func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) {
|
|
||||||
ctx := context.Background()
|
|
||||||
metrics, err := doc.client.Page.GetLayoutMetrics(ctx)
|
|
||||||
|
|
||||||
if params.Format == ScreenshotFormatJPEG && params.Quality < 0 && params.Quality > 100 {
|
|
||||||
params.Quality = 100
|
|
||||||
}
|
|
||||||
if params.X < 0 {
|
|
||||||
params.X = 0
|
|
||||||
}
|
|
||||||
if params.Y < 0 {
|
|
||||||
params.Y = 0
|
|
||||||
}
|
|
||||||
if params.Width <= 0 {
|
|
||||||
params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X
|
|
||||||
}
|
|
||||||
if params.Height <= 0 {
|
|
||||||
params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y
|
|
||||||
}
|
|
||||||
clip := page.Viewport{
|
|
||||||
X: params.X,
|
|
||||||
Y: params.Y,
|
|
||||||
Width: params.Width,
|
|
||||||
Height: params.Height,
|
|
||||||
Scale: 1.0,
|
|
||||||
}
|
|
||||||
|
|
||||||
format := string(params.Format)
|
|
||||||
screenshotArgs := page.CaptureScreenshotArgs{
|
|
||||||
Format: &format,
|
|
||||||
Quality: &params.Quality,
|
|
||||||
Clip: &clip,
|
|
||||||
}
|
|
||||||
|
|
||||||
reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs)
|
|
||||||
if err != nil {
|
|
||||||
return values.None, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return values.NewBinary(reply.Data), nil
|
|
||||||
}
|
|
||||||
|
@ -3,8 +3,11 @@ package dynamic
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"github.com/MontFerret/ferret/pkg/runtime/values"
|
"github.com/MontFerret/ferret/pkg/runtime/values"
|
||||||
|
"github.com/corpix/uarand"
|
||||||
"github.com/mafredri/cdp"
|
"github.com/mafredri/cdp"
|
||||||
"github.com/mafredri/cdp/devtool"
|
"github.com/mafredri/cdp/devtool"
|
||||||
|
"github.com/mafredri/cdp/protocol/emulation"
|
||||||
|
"github.com/mafredri/cdp/protocol/page"
|
||||||
"github.com/mafredri/cdp/protocol/target"
|
"github.com/mafredri/cdp/protocol/target"
|
||||||
"github.com/mafredri/cdp/rpcc"
|
"github.com/mafredri/cdp/rpcc"
|
||||||
"github.com/mafredri/cdp/session"
|
"github.com/mafredri/cdp/session"
|
||||||
@ -19,11 +22,17 @@ type Driver struct {
|
|||||||
client *cdp.Client
|
client *cdp.Client
|
||||||
session *session.Manager
|
session *session.Manager
|
||||||
contextID target.BrowserContextID
|
contextID target.BrowserContextID
|
||||||
|
opts *Options
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewDriver(address string) *Driver {
|
func NewDriver(address string, opts ...Option) *Driver {
|
||||||
drv := new(Driver)
|
drv := new(Driver)
|
||||||
drv.dev = devtool.New(address)
|
drv.dev = devtool.New(address)
|
||||||
|
drv.opts = new(Options)
|
||||||
|
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(drv.opts)
|
||||||
|
}
|
||||||
|
|
||||||
return drv
|
return drv
|
||||||
}
|
}
|
||||||
@ -60,7 +69,37 @@ func (drv *Driver) GetDocument(ctx context.Context, targetURL values.String) (va
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return LoadHTMLDocument(ctx, conn, url)
|
client := cdp.NewClient(conn)
|
||||||
|
|
||||||
|
err = runBatch(
|
||||||
|
func() error {
|
||||||
|
return client.Page.Enable(ctx)
|
||||||
|
},
|
||||||
|
|
||||||
|
func() error {
|
||||||
|
return client.Page.SetLifecycleEventsEnabled(
|
||||||
|
ctx,
|
||||||
|
page.NewSetLifecycleEventsEnabledArgs(true),
|
||||||
|
)
|
||||||
|
},
|
||||||
|
|
||||||
|
func() error {
|
||||||
|
return client.DOM.Enable(ctx)
|
||||||
|
},
|
||||||
|
|
||||||
|
func() error {
|
||||||
|
return client.Runtime.Enable(ctx)
|
||||||
|
},
|
||||||
|
|
||||||
|
func() error {
|
||||||
|
return client.Emulation.SetUserAgentOverride(
|
||||||
|
ctx,
|
||||||
|
emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()),
|
||||||
|
)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
return LoadHTMLDocument(ctx, conn, client, url)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (drv *Driver) Close() error {
|
func (drv *Driver) Close() error {
|
@ -29,6 +29,26 @@ func runBatch(funcs ...batchFunc) error {
|
|||||||
return eg.Wait()
|
return eg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getRootElement(client *cdp.Client) (dom.Node, values.String, error) {
|
||||||
|
args := dom.NewGetDocumentArgs()
|
||||||
|
args.Depth = pointerInt(1) // lets load the entire document
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
d, err := client.DOM.GetDocument(ctx, args)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return dom.Node{}, values.EmptyString, err
|
||||||
|
}
|
||||||
|
|
||||||
|
innerHTML, err := client.DOM.GetOuterHTML(ctx, dom.NewGetOuterHTMLArgs().SetNodeID(d.Root.NodeID))
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return dom.Node{}, values.EmptyString, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return d.Root, values.NewString(innerHTML.OuterHTML), nil
|
||||||
|
}
|
||||||
|
|
||||||
func parseAttrs(attrs []string) *values.Object {
|
func parseAttrs(attrs []string) *values.Object {
|
||||||
var attr values.String
|
var attr values.String
|
||||||
|
|
||||||
|
15
pkg/stdlib/html/driver/dynamic/options.go
Normal file
15
pkg/stdlib/html/driver/dynamic/options.go
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
package dynamic
|
||||||
|
|
||||||
|
// Options collects the optional settings of the dynamic driver.
type Options struct {
	// proxy is the proxy address recorded by WithProxy.
	proxy string
}

// Option is a functional option that mutates an Options value.
type Option func(opts *Options)
|
||||||
|
|
||||||
|
func WithProxy(address string) Option {
|
||||||
|
return func(opts *Options) {
|
||||||
|
opts.proxy = address
|
||||||
|
}
|
||||||
|
}
|
@ -8,29 +8,62 @@ import (
|
|||||||
"github.com/corpix/uarand"
|
"github.com/corpix/uarand"
|
||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
"github.com/sethgrid/pester"
|
"github.com/sethgrid/pester"
|
||||||
httpx "net/http"
|
"net/http"
|
||||||
|
"net/url"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Driver struct {
|
type Driver struct {
|
||||||
client *pester.Client
|
client *pester.Client
|
||||||
|
options *Options
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewDriver(setters ...Option) *Driver {
|
func NewDriver(opts ...Option) *Driver {
|
||||||
client := pester.New()
|
drv := new(Driver)
|
||||||
client.Concurrency = 3
|
drv.options = &Options{
|
||||||
client.MaxRetries = 5
|
concurrency: 3,
|
||||||
client.Backoff = pester.ExponentialBackoff
|
maxRetries: 5,
|
||||||
|
backoff: pester.ExponentialBackoff,
|
||||||
for _, setter := range setters {
|
|
||||||
setter(client)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return &Driver{client}
|
for _, opt := range opts {
|
||||||
|
opt(drv.options)
|
||||||
|
}
|
||||||
|
|
||||||
|
if drv.options.proxy == "" {
|
||||||
|
drv.client = pester.New()
|
||||||
|
} else {
|
||||||
|
client, err := newClientWithProxy(drv.options)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
drv.client = pester.New()
|
||||||
|
} else {
|
||||||
|
drv.client = pester.NewExtendedClient(client)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drv.client.Concurrency = drv.options.concurrency
|
||||||
|
drv.client.MaxRetries = drv.options.maxRetries
|
||||||
|
drv.client.Backoff = drv.options.backoff
|
||||||
|
|
||||||
|
return drv
|
||||||
|
}
|
||||||
|
|
||||||
|
func newClientWithProxy(options *Options) (*http.Client, error) {
|
||||||
|
proxyURL, err := url.Parse(options.proxy)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
proxy := http.ProxyURL(proxyURL)
|
||||||
|
tr := &http.Transport{Proxy: proxy}
|
||||||
|
|
||||||
|
return &http.Client{Transport: tr}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values.HTMLNode, error) {
|
func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values.HTMLNode, error) {
|
||||||
url := targetURL.String()
|
u := targetURL.String()
|
||||||
req, err := httpx.NewRequest(httpx.MethodGet, url, nil)
|
req, err := http.NewRequest(http.MethodGet, u, nil)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -45,7 +78,7 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values
|
|||||||
resp, err := d.client.Do(req)
|
resp, err := d.client.Do(req)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.Wrapf(err, "failed to retrieve a document %s", url)
|
return nil, errors.Wrapf(err, "failed to retrieve a document %s", u)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
@ -53,10 +86,10 @@ func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values
|
|||||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.Wrapf(err, "failed to parse a document %s", url)
|
return nil, errors.Wrapf(err, "failed to parse a document %s", u)
|
||||||
}
|
}
|
||||||
|
|
||||||
return NewHTMLDocument(url, doc)
|
return NewHTMLDocument(u, doc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Driver) ParseDocument(_ context.Context, str values.String) (values.HTMLNode, error) {
|
func (d *Driver) ParseDocument(_ context.Context, str values.String) (values.HTMLNode, error) {
|
@ -1,37 +1,51 @@
|
|||||||
package static
|
package static
|
||||||
|
|
||||||
import "github.com/sethgrid/pester"
|
import (
|
||||||
|
"github.com/sethgrid/pester"
|
||||||
|
)
|
||||||
|
|
||||||
type (
|
type (
|
||||||
Option func(opts *pester.Client)
|
Option func(opts *Options)
|
||||||
|
Options struct {
|
||||||
|
backoff pester.BackoffStrategy
|
||||||
|
maxRetries int
|
||||||
|
concurrency int
|
||||||
|
proxy string
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
func WithDefaultBackoff() Option {
|
func WithDefaultBackoff() Option {
|
||||||
return func(opts *pester.Client) {
|
return func(opts *Options) {
|
||||||
opts.Backoff = pester.DefaultBackoff
|
opts.backoff = pester.DefaultBackoff
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithLinearBackoff() Option {
|
func WithLinearBackoff() Option {
|
||||||
return func(opts *pester.Client) {
|
return func(opts *Options) {
|
||||||
opts.Backoff = pester.LinearBackoff
|
opts.backoff = pester.LinearBackoff
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithExponentialBackoff() Option {
|
func WithExponentialBackoff() Option {
|
||||||
return func(opts *pester.Client) {
|
return func(opts *Options) {
|
||||||
opts.Backoff = pester.ExponentialBackoff
|
opts.backoff = pester.ExponentialBackoff
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithMaxRetries(value int) Option {
|
func WithMaxRetries(value int) Option {
|
||||||
return func(opts *pester.Client) {
|
return func(opts *Options) {
|
||||||
opts.MaxRetries = value
|
opts.maxRetries = value
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithConcurrency(value int) Option {
|
func WithConcurrency(value int) Option {
|
||||||
return func(opts *pester.Client) {
|
return func(opts *Options) {
|
||||||
opts.Concurrency = value
|
opts.concurrency = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func WithProxy(address string) Option {
|
||||||
|
return func(opts *Options) {
|
||||||
|
opts.proxy = address
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user