mirror of
https://github.com/MontFerret/ferret.git
synced 2025-04-07 07:19:58 +02:00
Added possibility to load pages from memory (#434)
* Added possibility to load pages from memory * Fixed indent
This commit is contained in:
parent
145a16f97d
commit
4af0e0f15f
18
e2e/tests/dynamic/parse.fql
Normal file
18
e2e/tests/dynamic/parse.fql
Normal file
@ -0,0 +1,18 @@
|
||||
// Loads an inline HTML document through PARSE with the "cdp" driver selected
// explicitly, then checks that the <title> element's innerText is "Offline".
LET page = PARSE(`
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Offline</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Hello world</h1>
|
||||
</body>
|
||||
</html>
|
||||
`, {
|
||||
driver: "cdp"
|
||||
})
|
||||
|
||||
LET title = ELEMENT(page, "title")
|
||||
|
||||
RETURN EXPECT(title.innerText, "Offline")
|
16
e2e/tests/static/parse.fql
Normal file
16
e2e/tests/static/parse.fql
Normal file
@ -0,0 +1,16 @@
|
||||
// Loads an inline HTML document through PARSE without any options (no driver
// is named), then checks that the <title> element's innerText is "Offline".
LET page = PARSE(`
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Offline</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Hello world</h1>
|
||||
</body>
|
||||
</html>
|
||||
`)
|
||||
|
||||
LET title = ELEMENT(page, "title")
|
||||
|
||||
RETURN EXPECT(title.innerText, "Offline")
|
@ -2,6 +2,8 @@ package cdp
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
|
||||
"github.com/mafredri/cdp"
|
||||
"github.com/mafredri/cdp/devtool"
|
||||
"github.com/mafredri/cdp/protocol/browser"
|
||||
@ -9,7 +11,6 @@ import (
|
||||
"github.com/mafredri/cdp/rpcc"
|
||||
"github.com/mafredri/cdp/session"
|
||||
"github.com/pkg/errors"
|
||||
"sync"
|
||||
|
||||
"github.com/MontFerret/ferret/pkg/drivers"
|
||||
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
||||
@ -48,7 +49,7 @@ func (drv *Driver) Name() string {
|
||||
func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTMLPage, error) {
|
||||
logger := logging.FromContext(ctx)
|
||||
|
||||
err := drv.init(ctx)
|
||||
conn, err := drv.createConnection(ctx, params.KeepCookies)
|
||||
|
||||
if err != nil {
|
||||
logger.
|
||||
@ -56,15 +57,64 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
|
||||
Timestamp().
|
||||
Err(err).
|
||||
Str("driver", drv.options.Name).
|
||||
Msg("failed to initialize the driver")
|
||||
Msg("failed to create a new connection")
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return LoadHTMLPage(ctx, conn, drv.setDefaultParams(params))
|
||||
}
|
||||
|
||||
func (drv *Driver) Parse(ctx context.Context, params drivers.ParseParams) (drivers.HTMLPage, error) {
|
||||
logger := logging.FromContext(ctx)
|
||||
|
||||
conn, err := drv.createConnection(ctx, true)
|
||||
|
||||
if err != nil {
|
||||
logger.
|
||||
Error().
|
||||
Timestamp().
|
||||
Err(err).
|
||||
Str("driver", drv.options.Name).
|
||||
Msg("failed to create a new connection")
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return LoadHTMLPageWithContent(ctx, conn, drv.setDefaultParams(drivers.Params{
|
||||
URL: BlankPageURL,
|
||||
UserAgent: "",
|
||||
KeepCookies: params.KeepCookies,
|
||||
Cookies: params.Cookies,
|
||||
Headers: params.Headers,
|
||||
Viewport: params.Viewport,
|
||||
}), params.Content)
|
||||
}
|
||||
|
||||
func (drv *Driver) Close() error {
|
||||
drv.mu.Lock()
|
||||
defer drv.mu.Unlock()
|
||||
|
||||
if drv.session != nil {
|
||||
drv.session.Close()
|
||||
|
||||
return drv.conn.Close()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (drv *Driver) createConnection(ctx context.Context, keepCookies bool) (*rpcc.Conn, error) {
|
||||
err := drv.init(ctx)
|
||||
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "initialize driver")
|
||||
}
|
||||
|
||||
// Args for a new target belonging to the browser context
|
||||
createTargetArgs := target.NewCreateTargetArgs(BlankPageURL)
|
||||
|
||||
if !drv.options.KeepCookies && !params.KeepCookies {
|
||||
if !drv.options.KeepCookies && !keepCookies {
|
||||
// Set it to an incognito mode
|
||||
createTargetArgs.SetBrowserContextID(drv.contextID)
|
||||
}
|
||||
@ -73,30 +123,20 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
|
||||
createTarget, err := drv.client.Target.CreateTarget(ctx, createTargetArgs)
|
||||
|
||||
if err != nil {
|
||||
logger.
|
||||
Error().
|
||||
Timestamp().
|
||||
Err(err).
|
||||
Str("driver", drv.options.Name).
|
||||
Msg("failed to create a browser target")
|
||||
|
||||
return nil, err
|
||||
return nil, errors.Wrap(err, "create a browser target")
|
||||
}
|
||||
|
||||
// Connect to target using the existing websocket connection.
|
||||
conn, err := drv.session.Dial(ctx, createTarget.TargetID)
|
||||
|
||||
if err != nil {
|
||||
logger.
|
||||
Error().
|
||||
Timestamp().
|
||||
Err(err).
|
||||
Str("driver", drv.options.Name).
|
||||
Msg("failed to establish a connection")
|
||||
|
||||
return nil, err
|
||||
return nil, errors.Wrap(err, "establish a new connection")
|
||||
}
|
||||
|
||||
return conn, nil
|
||||
}
|
||||
|
||||
func (drv *Driver) setDefaultParams(params drivers.Params) drivers.Params {
|
||||
if params.UserAgent == "" {
|
||||
params.UserAgent = drv.options.UserAgent
|
||||
}
|
||||
@ -133,20 +173,7 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
|
||||
}
|
||||
}
|
||||
|
||||
return LoadHTMLPage(ctx, conn, params)
|
||||
}
|
||||
|
||||
func (drv *Driver) Close() error {
|
||||
drv.mu.Lock()
|
||||
defer drv.mu.Unlock()
|
||||
|
||||
if drv.session != nil {
|
||||
drv.session.Close()
|
||||
|
||||
return drv.conn.Close()
|
||||
}
|
||||
|
||||
return nil
|
||||
return params
|
||||
}
|
||||
|
||||
func (drv *Driver) init(ctx context.Context) error {
|
||||
|
@ -2,8 +2,6 @@ package cdp
|
||||
|
||||
import (
|
||||
"context"
|
||||
"github.com/MontFerret/ferret/pkg/drivers/cdp/dom"
|
||||
"github.com/pkg/errors"
|
||||
"hash/fnv"
|
||||
"io"
|
||||
"regexp"
|
||||
@ -12,9 +10,11 @@ import (
|
||||
"github.com/mafredri/cdp"
|
||||
"github.com/mafredri/cdp/protocol/page"
|
||||
"github.com/mafredri/cdp/rpcc"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/rs/zerolog"
|
||||
|
||||
"github.com/MontFerret/ferret/pkg/drivers"
|
||||
"github.com/MontFerret/ferret/pkg/drivers/cdp/dom"
|
||||
"github.com/MontFerret/ferret/pkg/drivers/cdp/events"
|
||||
"github.com/MontFerret/ferret/pkg/drivers/cdp/input"
|
||||
net "github.com/MontFerret/ferret/pkg/drivers/cdp/network"
|
||||
@ -126,6 +126,51 @@ func LoadHTMLPage(
|
||||
return p, nil
|
||||
}
|
||||
|
||||
func LoadHTMLPageWithContent(
|
||||
ctx context.Context,
|
||||
conn *rpcc.Conn,
|
||||
params drivers.Params,
|
||||
content []byte,
|
||||
) (p *HTMLPage, err error) {
|
||||
logger := logging.FromContext(ctx)
|
||||
p, err = LoadHTMLPage(ctx, conn, params)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
if e := p.Close(); e != nil {
|
||||
logger.Error().Err(e).Msg("failed to close page")
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
frameID := p.getCurrentDocument().Frame().Frame.ID
|
||||
err = p.client.Page.SetDocumentContent(ctx, page.NewSetDocumentContentArgs(frameID, string(content)))
|
||||
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "set document content")
|
||||
}
|
||||
|
||||
// Remove prev frames (from a blank page)
|
||||
prev := p.dom.GetMainFrame()
|
||||
err = p.dom.RemoveFrameRecursively(prev.Frame().Frame.ID)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
err = p.loadMainFrame(ctx)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return p, nil
|
||||
}
|
||||
|
||||
func NewHTMLPage(
|
||||
logger *zerolog.Logger,
|
||||
conn *rpcc.Conn,
|
||||
|
@ -2,8 +2,9 @@ package drivers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"github.com/MontFerret/ferret/pkg/runtime/core"
|
||||
"io"
|
||||
|
||||
"github.com/MontFerret/ferret/pkg/runtime/core"
|
||||
)
|
||||
|
||||
type (
|
||||
@ -18,6 +19,7 @@ type (
|
||||
io.Closer
|
||||
Name() string
|
||||
Open(ctx context.Context, params Params) (HTMLPage, error)
|
||||
Parse(ctx context.Context, params ParseParams) (HTMLPage, error)
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -6,13 +6,13 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
|
||||
"github.com/MontFerret/ferret/pkg/drivers"
|
||||
"github.com/MontFerret/ferret/pkg/drivers/common"
|
||||
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
||||
"github.com/MontFerret/ferret/pkg/runtime/values"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/sethgrid/pester"
|
||||
|
||||
"github.com/MontFerret/ferret/pkg/drivers"
|
||||
"github.com/MontFerret/ferret/pkg/drivers/common"
|
||||
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
||||
)
|
||||
|
||||
const DriverName = "http"
|
||||
@ -177,13 +177,8 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
|
||||
return NewHTMLPage(doc, params.URL, &r, cookies)
|
||||
}
|
||||
|
||||
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
|
||||
_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode]
|
||||
return exists
|
||||
}
|
||||
|
||||
func (drv *Driver) Parse(_ context.Context, str values.String) (drivers.HTMLPage, error) {
|
||||
buf := bytes.NewBuffer([]byte(str))
|
||||
func (drv *Driver) Parse(_ context.Context, params drivers.ParseParams) (drivers.HTMLPage, error) {
|
||||
buf := bytes.NewBuffer(params.Content)
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(buf)
|
||||
|
||||
@ -199,3 +194,8 @@ func (drv *Driver) Close() error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
|
||||
_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode]
|
||||
return exists
|
||||
}
|
||||
|
@ -17,4 +17,12 @@ type (
|
||||
Headers HTTPHeaders
|
||||
Viewport *Viewport
|
||||
}
|
||||
|
||||
// ParseParams holds the input for Driver.Parse: the document to load plus
// the page settings that accompany it.
ParseParams struct {
|
||||
// Content is the raw document markup handed to the driver.
Content []byte
|
||||
// KeepCookies is forwarded by the cdp driver into the page params it
// builds for the loaded page.
KeepCookies bool
|
||||
// Cookies is the set of HTTP cookies supplied by the caller.
Cookies HTTPCookies
|
||||
// Headers is the set of HTTP headers supplied by the caller.
Headers HTTPHeaders
|
||||
// Viewport holds optional viewport settings; nil when none were given.
Viewport *Viewport
|
||||
}
|
||||
)
|
||||
|
@ -220,6 +220,12 @@ func (t *Object) ForEach(predicate ObjectPredicate) {
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Object) Has(key String) Boolean {
|
||||
_, exists := t.value[string(key)]
|
||||
|
||||
return NewBoolean(exists)
|
||||
}
|
||||
|
||||
func (t *Object) MustGet(key String) core.Value {
|
||||
val, _ := t.Get(key)
|
||||
|
||||
|
@ -43,6 +43,7 @@ func RegisterLib(ns core.Namespace) error {
|
||||
"NAVIGATE_BACK": NavigateBack,
|
||||
"NAVIGATE_FORWARD": NavigateForward,
|
||||
"PAGINATION": Pagination,
|
||||
"PARSE": Parse,
|
||||
"PDF": PDF,
|
||||
"SCREENSHOT": Screenshot,
|
||||
"SCROLL": ScrollXY,
|
||||
|
155
pkg/stdlib/html/parse.go
Normal file
155
pkg/stdlib/html/parse.go
Normal file
@ -0,0 +1,155 @@
|
||||
package html
|
||||
|
||||
import (
|
||||
"context"
|
||||
"github.com/MontFerret/ferret/pkg/drivers"
|
||||
"github.com/MontFerret/ferret/pkg/runtime/values/types"
|
||||
"github.com/pkg/errors"
|
||||
|
||||
"github.com/MontFerret/ferret/pkg/runtime/core"
|
||||
"github.com/MontFerret/ferret/pkg/runtime/values"
|
||||
)
|
||||
|
||||
// ParseParams is the argument set of the PARSE function: the driver-level
// parse parameters plus the name of the driver that should handle them.
type ParseParams struct {
|
||||
// ParseParams is passed unchanged to the selected driver's Parse method.
drivers.ParseParams
|
||||
// Driver is the name given to drivers.FromContext; it is left empty when
// the caller does not specify one (presumably selecting the default
// driver — confirm against drivers.FromContext).
Driver string
|
||||
}
|
||||
|
||||
// PARSE loads an HTML page from a given string or byte array
|
||||
// @param params (Object) - Optional, an object containing the following properties :
|
||||
// driver (String) - Optional, driver name.
|
||||
// keepCookies (Boolean) - Optional, boolean value indicating whether to use cookies from previous sessions.
|
||||
// i.e. not to open a page in the Incognito mode.
|
||||
// cookies (HTTPCookies) - Optional, set of HTTP cookies.
|
||||
// headers (HTTPHeaders) - Optional, HTTP headers.
|
||||
// viewport (Viewport) - Optional, viewport params.
|
||||
// @returns (HTMLPage) - Returns parsed and loaded HTML page.
|
||||
func Parse(ctx context.Context, args ...core.Value) (core.Value, error) {
|
||||
if err := core.ValidateArgs(args, 1, 2); err != nil {
|
||||
return values.None, err
|
||||
}
|
||||
|
||||
arg1 := args[0]
|
||||
|
||||
if err := core.ValidateType(arg1, types.String, types.Binary); err != nil {
|
||||
return values.None, err
|
||||
}
|
||||
|
||||
var content []byte
|
||||
|
||||
if arg1.Type() == types.String {
|
||||
content = []byte(arg1.(values.String))
|
||||
} else {
|
||||
content = []byte(arg1.(values.Binary))
|
||||
}
|
||||
|
||||
var params ParseParams
|
||||
|
||||
if len(args) > 1 {
|
||||
if err := core.ValidateType(args[1], types.Object); err != nil {
|
||||
return values.None, err
|
||||
}
|
||||
|
||||
p, err := parseParseParams(content, args[1].(*values.Object))
|
||||
|
||||
if err != nil {
|
||||
return values.None, err
|
||||
}
|
||||
|
||||
params = p
|
||||
} else {
|
||||
params = defaultParseParams(content)
|
||||
}
|
||||
|
||||
drv, err := drivers.FromContext(ctx, params.Driver)
|
||||
|
||||
if err != nil {
|
||||
return values.None, err
|
||||
}
|
||||
|
||||
return drv.Parse(ctx, params.ParseParams)
|
||||
}
|
||||
|
||||
func defaultParseParams(content []byte) ParseParams {
|
||||
return ParseParams{
|
||||
ParseParams: drivers.ParseParams{
|
||||
Content: content,
|
||||
},
|
||||
Driver: "",
|
||||
}
|
||||
}
|
||||
|
||||
func parseParseParams(content []byte, arg *values.Object) (ParseParams, error) {
|
||||
res := defaultParseParams(content)
|
||||
|
||||
if arg.Has("driver") {
|
||||
driverName := arg.MustGet("driver")
|
||||
|
||||
if err := core.ValidateType(driverName, types.String); err != nil {
|
||||
return ParseParams{}, errors.Wrap(err, ".driver")
|
||||
}
|
||||
|
||||
res.Driver = driverName.String()
|
||||
}
|
||||
|
||||
if arg.Has("keepCookies") {
|
||||
keepCookies := arg.MustGet("keepCookies")
|
||||
|
||||
if err := core.ValidateType(keepCookies, types.Boolean); err != nil {
|
||||
return ParseParams{}, errors.Wrap(err, ".keepCookies")
|
||||
}
|
||||
|
||||
res.KeepCookies = bool(keepCookies.(values.Boolean))
|
||||
}
|
||||
|
||||
if arg.Has("cookies") {
|
||||
cookies := arg.MustGet("cookies")
|
||||
|
||||
if err := core.ValidateType(cookies, types.Array, types.Object); err != nil {
|
||||
return res, err
|
||||
}
|
||||
|
||||
switch c := cookies.(type) {
|
||||
case *values.Array:
|
||||
cookies, err := parseCookieArray(c)
|
||||
|
||||
if err != nil {
|
||||
return ParseParams{}, errors.Wrap(err, ".cookies")
|
||||
}
|
||||
|
||||
res.Cookies = cookies
|
||||
case *values.Object:
|
||||
cookies, err := parseCookieObject(c)
|
||||
|
||||
if err != nil {
|
||||
return ParseParams{}, errors.Wrap(err, ".cookies")
|
||||
}
|
||||
|
||||
res.Cookies = cookies
|
||||
default:
|
||||
res.Cookies = make(drivers.HTTPCookies)
|
||||
}
|
||||
}
|
||||
|
||||
if arg.Has("headers") {
|
||||
headers := arg.MustGet("headers")
|
||||
|
||||
if err := core.ValidateType(headers, types.Object); err != nil {
|
||||
return ParseParams{}, errors.Wrap(err, ".headers")
|
||||
}
|
||||
|
||||
res.Headers = parseHeader(headers.(*values.Object))
|
||||
}
|
||||
|
||||
if arg.Has("viewport") {
|
||||
viewport, err := parseViewport(arg.MustGet("viewport"))
|
||||
|
||||
if err != nil {
|
||||
return ParseParams{}, errors.Wrap(err, ".viewport")
|
||||
}
|
||||
|
||||
res.Viewport = viewport
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user