1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-04-07 07:19:58 +02:00

Added possibility to load pages from memory (#434)

* Added possibility to load pages from memory

* Fixed indent
This commit is contained in:
Tim Voronov 2020-01-04 12:57:41 -05:00 committed by GitHub
parent 145a16f97d
commit 4af0e0f15f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 326 additions and 48 deletions

View File

@ -0,0 +1,18 @@
// Loads an HTML document from an in-memory string through the "cdp" driver
// and expects the text of its <title> element to be "Offline".
LET page = PARSE(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Offline</title>
</head>
<body>
<h1>Hello world</h1>
</body>
</html>
`, {
driver: "cdp"
})
// Look up the <title> element of the parsed page.
LET title = ELEMENT(page, "title")
RETURN EXPECT(title.innerText, "Offline")

View File

@ -0,0 +1,16 @@
// Loads an HTML document from an in-memory string without naming a driver
// and expects the text of its <title> element to be "Offline".
LET page = PARSE(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Offline</title>
</head>
<body>
<h1>Hello world</h1>
</body>
</html>
`)
// Look up the <title> element of the parsed page.
LET title = ELEMENT(page, "title")
RETURN EXPECT(title.innerText, "Offline")

View File

@ -2,6 +2,8 @@ package cdp
import (
"context"
"sync"
"github.com/mafredri/cdp"
"github.com/mafredri/cdp/devtool"
"github.com/mafredri/cdp/protocol/browser"
@ -9,7 +11,6 @@ import (
"github.com/mafredri/cdp/rpcc"
"github.com/mafredri/cdp/session"
"github.com/pkg/errors"
"sync"
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/MontFerret/ferret/pkg/runtime/logging"
@ -48,7 +49,7 @@ func (drv *Driver) Name() string {
func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTMLPage, error) {
logger := logging.FromContext(ctx)
err := drv.init(ctx)
conn, err := drv.createConnection(ctx, params.KeepCookies)
if err != nil {
logger.
@ -56,15 +57,64 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
Timestamp().
Err(err).
Str("driver", drv.options.Name).
Msg("failed to initialize the driver")
Msg("failed to create a new connection")
return nil, err
}
return LoadHTMLPage(ctx, conn, drv.setDefaultParams(params))
}
// Parse opens a new browser target on a blank page and replaces its document
// with the in-memory HTML held in params.Content.
//
// The cookie, header and viewport settings from params are forwarded to the
// page loader after the driver defaults have been applied. params.KeepCookies
// is also passed to createConnection so that, exactly as in Open, it takes part
// in deciding whether the target is created in the incognito browser context.
//
// It returns the loaded page, or a nil page and the error when either the
// connection or the page could not be created.
func (drv *Driver) Parse(ctx context.Context, params drivers.ParseParams) (drivers.HTMLPage, error) {
	logger := logging.FromContext(ctx)

	// Honor the caller's choice instead of always keeping cookies; a hard-coded
	// "true" made the keepCookies option ineffective for parsed pages.
	conn, err := drv.createConnection(ctx, params.KeepCookies)

	if err != nil {
		logger.
			Error().
			Timestamp().
			Err(err).
			Str("driver", drv.options.Name).
			Msg("failed to create a new connection")

		return nil, err
	}

	loaded, err := LoadHTMLPageWithContent(ctx, conn, drv.setDefaultParams(drivers.Params{
		URL:         BlankPageURL,
		UserAgent:   "",
		KeepCookies: params.KeepCookies,
		Cookies:     params.Cookies,
		Headers:     params.Headers,
		Viewport:    params.Viewport,
	}), params.Content)

	if err != nil {
		// Return an untyped nil: handing back the nil *HTMLPage directly would
		// produce a non-nil drivers.HTMLPage interface value.
		return nil, err
	}

	return loaded, nil
}
// Close closes the driver's session and then its connection while holding the
// driver mutex. When no session has been created it does nothing and returns
// nil. Only the error from closing the connection is returned.
func (drv *Driver) Close() error {
	drv.mu.Lock()
	defer drv.mu.Unlock()

	// Nothing was initialized, so there is nothing to release.
	if drv.session == nil {
		return nil
	}

	drv.session.Close()

	return drv.conn.Close()
}
func (drv *Driver) createConnection(ctx context.Context, keepCookies bool) (*rpcc.Conn, error) {
err := drv.init(ctx)
if err != nil {
return nil, errors.Wrap(err, "initialize driver")
}
// Args for a new target belonging to the browser context
createTargetArgs := target.NewCreateTargetArgs(BlankPageURL)
if !drv.options.KeepCookies && !params.KeepCookies {
if !drv.options.KeepCookies && !keepCookies {
// Set it to an incognito mode
createTargetArgs.SetBrowserContextID(drv.contextID)
}
@ -73,30 +123,20 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
createTarget, err := drv.client.Target.CreateTarget(ctx, createTargetArgs)
if err != nil {
logger.
Error().
Timestamp().
Err(err).
Str("driver", drv.options.Name).
Msg("failed to create a browser target")
return nil, err
return nil, errors.Wrap(err, "create a browser target")
}
// Connect to target using the existing websocket connection.
conn, err := drv.session.Dial(ctx, createTarget.TargetID)
if err != nil {
logger.
Error().
Timestamp().
Err(err).
Str("driver", drv.options.Name).
Msg("failed to establish a connection")
return nil, err
return nil, errors.Wrap(err, "establish a new connection")
}
return conn, nil
}
func (drv *Driver) setDefaultParams(params drivers.Params) drivers.Params {
if params.UserAgent == "" {
params.UserAgent = drv.options.UserAgent
}
@ -133,20 +173,7 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
}
}
return LoadHTMLPage(ctx, conn, params)
}
func (drv *Driver) Close() error {
drv.mu.Lock()
defer drv.mu.Unlock()
if drv.session != nil {
drv.session.Close()
return drv.conn.Close()
}
return nil
return params
}
func (drv *Driver) init(ctx context.Context) error {

View File

@ -2,8 +2,6 @@ package cdp
import (
"context"
"github.com/MontFerret/ferret/pkg/drivers/cdp/dom"
"github.com/pkg/errors"
"hash/fnv"
"io"
"regexp"
@ -12,9 +10,11 @@ import (
"github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/page"
"github.com/mafredri/cdp/rpcc"
"github.com/pkg/errors"
"github.com/rs/zerolog"
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/MontFerret/ferret/pkg/drivers/cdp/dom"
"github.com/MontFerret/ferret/pkg/drivers/cdp/events"
"github.com/MontFerret/ferret/pkg/drivers/cdp/input"
net "github.com/MontFerret/ferret/pkg/drivers/cdp/network"
@ -126,6 +126,51 @@ func LoadHTMLPage(
return p, nil
}
// LoadHTMLPageWithContent loads a page over conn using params and then
// replaces its document with the given HTML content.
//
// After the page is loaded via LoadHTMLPage, the content is written into the
// current document's frame with Page.SetDocumentContent, the frames recorded
// for the previous document are removed, and the main frame is loaded again so
// that it reflects the new content.
//
// If any step after the initial load fails, the loaded page is closed (a close
// failure is only logged) and a nil page is returned together with the error.
func LoadHTMLPageWithContent(
	ctx context.Context,
	conn *rpcc.Conn,
	params drivers.Params,
	content []byte,
) (p *HTMLPage, err error) {
	logger := logging.FromContext(ctx)

	// Keep the page in a local variable: the error paths below use
	// "return nil, err", which sets the named result p to nil before deferred
	// functions run, so the cleanup must not go through p.
	loaded, err := LoadHTMLPage(ctx, conn, params)

	if err != nil {
		return nil, err
	}

	defer func() {
		if err != nil {
			if e := loaded.Close(); e != nil {
				logger.Error().Err(e).Msg("failed to close page")
			}
		}
	}()

	frameID := loaded.getCurrentDocument().Frame().Frame.ID
	err = loaded.client.Page.SetDocumentContent(ctx, page.NewSetDocumentContentArgs(frameID, string(content)))

	if err != nil {
		return nil, errors.Wrap(err, "set document content")
	}

	// Remove prev frames (from a blank page)
	prev := loaded.dom.GetMainFrame()
	err = loaded.dom.RemoveFrameRecursively(prev.Frame().Frame.ID)

	if err != nil {
		return nil, err
	}

	err = loaded.loadMainFrame(ctx)

	if err != nil {
		return nil, err
	}

	return loaded, nil
}
func NewHTMLPage(
logger *zerolog.Logger,
conn *rpcc.Conn,

View File

@ -2,8 +2,9 @@ package drivers
import (
"context"
"github.com/MontFerret/ferret/pkg/runtime/core"
"io"
"github.com/MontFerret/ferret/pkg/runtime/core"
)
type (
@ -18,6 +19,7 @@ type (
io.Closer
Name() string
Open(ctx context.Context, params Params) (HTMLPage, error)
Parse(ctx context.Context, params ParseParams) (HTMLPage, error)
}
)

View File

@ -6,13 +6,13 @@ import (
"net/http"
"net/url"
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/MontFerret/ferret/pkg/drivers/common"
"github.com/MontFerret/ferret/pkg/runtime/logging"
"github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/PuerkitoBio/goquery"
"github.com/pkg/errors"
"github.com/sethgrid/pester"
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/MontFerret/ferret/pkg/drivers/common"
"github.com/MontFerret/ferret/pkg/runtime/logging"
)
const DriverName = "http"
@ -177,13 +177,8 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
return NewHTMLPage(doc, params.URL, &r, cookies)
}
// responseCodeAllowed reports whether the status code of resp is present as a
// key in the driver's AllowedHTTPCodes option.
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode]
return exists
}
func (drv *Driver) Parse(_ context.Context, str values.String) (drivers.HTMLPage, error) {
buf := bytes.NewBuffer([]byte(str))
func (drv *Driver) Parse(_ context.Context, params drivers.ParseParams) (drivers.HTMLPage, error) {
buf := bytes.NewBuffer(params.Content)
doc, err := goquery.NewDocumentFromReader(buf)
@ -199,3 +194,8 @@ func (drv *Driver) Close() error {
return nil
}
// responseCodeAllowed reports whether the status code of resp is present as a
// key in the driver's AllowedHTTPCodes option.
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
	if _, allowed := drv.options.AllowedHTTPCodes[resp.StatusCode]; allowed {
		return true
	}

	return false
}

View File

@ -17,4 +17,12 @@ type (
Headers HTTPHeaders
Viewport *Viewport
}
// ParseParams describes an in-memory HTML document together with the page
// settings that are passed to a driver's Parse method.
ParseParams struct {
// Content is the raw HTML to load.
Content []byte
// KeepCookies indicates whether cookies from previous sessions should be used.
KeepCookies bool
// Cookies is the set of HTTP cookies supplied by the caller.
Cookies HTTPCookies
// Headers is the set of HTTP headers supplied by the caller.
Headers HTTPHeaders
// Viewport holds optional viewport settings; nil when none were supplied.
Viewport *Viewport
}
)

View File

@ -220,6 +220,12 @@ func (t *Object) ForEach(predicate ObjectPredicate) {
}
}
// Has reports, as a Boolean value, whether the object contains an entry
// stored under the given key.
func (t *Object) Has(key String) Boolean {
	name := string(key)

	if _, found := t.value[name]; found {
		return NewBoolean(true)
	}

	return NewBoolean(false)
}
func (t *Object) MustGet(key String) core.Value {
val, _ := t.Get(key)

View File

@ -43,6 +43,7 @@ func RegisterLib(ns core.Namespace) error {
"NAVIGATE_BACK": NavigateBack,
"NAVIGATE_FORWARD": NavigateForward,
"PAGINATION": Pagination,
"PARSE": Parse,
"PDF": PDF,
"SCREENSHOT": Screenshot,
"SCROLL": ScrollXY,

155
pkg/stdlib/html/parse.go Normal file
View File

@ -0,0 +1,155 @@
package html
import (
"context"
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/MontFerret/ferret/pkg/runtime/values/types"
"github.com/pkg/errors"
"github.com/MontFerret/ferret/pkg/runtime/core"
"github.com/MontFerret/ferret/pkg/runtime/values"
)
// ParseParams holds the options accepted by PARSE: the driver-level parse
// parameters plus the name of the driver that should handle them.
type ParseParams struct {
drivers.ParseParams
// Driver is the driver name handed to drivers.FromContext.
// NOTE(review): an empty name presumably selects the default driver - confirm.
Driver string
}
// PARSE loads an HTML page from a given string or byte array.
// @param content (String | Binary) - HTML content of the page.
// @param params (Object) - Optional, an object containing the following properties :
// driver (String) - Optional, driver name.
// keepCookies (Boolean) - Optional, boolean value indicating whether to use cookies from previous sessions.
// i.e. not to open a page in the Incognito mode.
// cookies (HTTPCookies) - Optional, set of HTTP cookies.
// headers (HTTPHeaders) - Optional, HTTP headers.
// viewport (Viewport) - Optional, viewport params.
// @returns (HTMLPage) - Returns parsed and loaded HTML page.
func Parse(ctx context.Context, args ...core.Value) (core.Value, error) {
	if err := core.ValidateArgs(args, 1, 2); err != nil {
		return values.None, err
	}

	source := args[0]

	if err := core.ValidateType(source, types.String, types.Binary); err != nil {
		return values.None, err
	}

	// Both accepted input types are converted to raw bytes for the driver.
	var content []byte

	switch source.Type() {
	case types.String:
		content = []byte(source.(values.String))
	default:
		content = []byte(source.(values.Binary))
	}

	// Start from the defaults and override them only when options are given.
	params := defaultParseParams(content)

	if len(args) > 1 {
		opts := args[1]

		if err := core.ValidateType(opts, types.Object); err != nil {
			return values.None, err
		}

		custom, err := parseParseParams(content, opts.(*values.Object))

		if err != nil {
			return values.None, err
		}

		params = custom
	}

	drv, err := drivers.FromContext(ctx, params.Driver)

	if err != nil {
		return values.None, err
	}

	return drv.Parse(ctx, params.ParseParams)
}
// defaultParseParams returns parse parameters that carry only the given
// content; the driver name and every other option keep their zero values.
func defaultParseParams(content []byte) ParseParams {
	var res ParseParams

	res.Content = content

	return res
}
// parseParseParams builds ParseParams for the given content from a
// user-supplied options object.
//
// It starts from defaultParseParams(content) and overrides a field for each of
// the keys "driver", "keepCookies", "cookies", "headers" and "viewport" that is
// present in arg. Keys that are absent leave the default untouched.
//
// On any invalid option it returns an empty ParseParams and the underlying
// error wrapped with the offending key name (e.g. ".cookies").
func parseParseParams(content []byte, arg *values.Object) (ParseParams, error) {
	res := defaultParseParams(content)

	if arg.Has("driver") {
		driverName := arg.MustGet("driver")

		if err := core.ValidateType(driverName, types.String); err != nil {
			return ParseParams{}, errors.Wrap(err, ".driver")
		}

		res.Driver = driverName.String()
	}

	if arg.Has("keepCookies") {
		keepCookies := arg.MustGet("keepCookies")

		if err := core.ValidateType(keepCookies, types.Boolean); err != nil {
			return ParseParams{}, errors.Wrap(err, ".keepCookies")
		}

		res.KeepCookies = bool(keepCookies.(values.Boolean))
	}

	if arg.Has("cookies") {
		cookies := arg.MustGet("cookies")

		// Report a wrong type the same way as every other option: no partial
		// result, and the error tagged with the key name.
		if err := core.ValidateType(cookies, types.Array, types.Object); err != nil {
			return ParseParams{}, errors.Wrap(err, ".cookies")
		}

		switch c := cookies.(type) {
		case *values.Array:
			parsed, err := parseCookieArray(c)

			if err != nil {
				return ParseParams{}, errors.Wrap(err, ".cookies")
			}

			res.Cookies = parsed
		case *values.Object:
			parsed, err := parseCookieObject(c)

			if err != nil {
				return ParseParams{}, errors.Wrap(err, ".cookies")
			}

			res.Cookies = parsed
		default:
			res.Cookies = make(drivers.HTTPCookies)
		}
	}

	if arg.Has("headers") {
		headers := arg.MustGet("headers")

		if err := core.ValidateType(headers, types.Object); err != nil {
			return ParseParams{}, errors.Wrap(err, ".headers")
		}

		res.Headers = parseHeader(headers.(*values.Object))
	}

	if arg.Has("viewport") {
		viewport, err := parseViewport(arg.MustGet("viewport"))

		if err != nil {
			return ParseParams{}, errors.Wrap(err, ".viewport")
		}

		res.Viewport = viewport
	}

	return res, nil
}