1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-02-09 13:38:35 +02:00

Added support of ignoring page resources (#592)

* Added support of ignoring page resources

* Updated pipeline scripts

* Updated comments
This commit is contained in:
Tim Voronov 2021-02-19 11:40:30 -05:00 committed by GitHub
parent c6a459e35c
commit 79566c3b76
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 256 additions and 12 deletions

View File

@ -23,7 +23,7 @@ jobs:
sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
mkdir $HOME/antlr-bin
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-4.9-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
chmod +x $HOME/antlr-bin/*
export PATH=$PATH:$HOME/antlr-bin

View File

@ -24,7 +24,7 @@ jobs:
sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
mkdir $HOME/antlr-bin
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-4.9-complete.jar "$@"' > $HOME/antlr-bin/antlr
echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
chmod +x $HOME/antlr-bin/*
sudo ln -s $HOME/antlr-bin/antlr /usr/local/bin/antlr

View File

@ -0,0 +1,13 @@
// Loads a page through the CDP driver while blocking page resources.
// The single rule below matches any URL ("*") for resources of type "image".
LET p = DOCUMENT("https://www.gettyimages.com/", {
driver: "cdp",
disable: {
resources: [
{
url: "*",
type: "image"
}
]
}
})
// The loaded document is not inspected; the script only returns NONE.
RETURN NONE

View File

@ -62,7 +62,6 @@ func (loop *Loop) RemoveListener(eventID ID, listenerID ListenerID) {
// run starts running an event loop.
// It constantly iterates over each event source.
// Additionally to that, on each iteration it checks the command channel in order to perform add/remove listener/source operations.
func (loop *Loop) run(ctx context.Context) {
sources := loop.sources
size := sources.Size()

View File

@ -5,4 +5,5 @@ import "github.com/MontFerret/ferret/pkg/drivers/cdp/events"
// Event identifiers created with events.New and used to register sources and
// listeners on an event loop.
var (
// eventFrameLoad is the ID of the "frame_load" event.
// NOTE(review): presumably emitted when a page frame finishes loading — its
// source registration is not visible here; confirm.
eventFrameLoad = events.New("frame_load")
// responseReceived is the ID of the event handled by the onResponse listener.
responseReceived = events.New("response_received")
// requestPaused is the ID of the event fed by the Fetch.RequestPaused stream
// and handled by the onRequestPaused listener.
requestPaused = events.New("request_paused")
)

View File

@ -8,6 +8,7 @@ import (
"sync"
"github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/fetch"
"github.com/mafredri/cdp/protocol/network"
"github.com/mafredri/cdp/protocol/page"
"github.com/mafredri/cdp/rpcc"
@ -37,6 +38,7 @@ type (
eventLoop *events.Loop
cancel context.CancelFunc
responseListenerID events.ListenerID
filterListenerID events.ListenerID
response *sync.Map
}
)
@ -44,6 +46,7 @@ type (
func New(
logger *zerolog.Logger,
client *cdp.Client,
options Options,
) (*Manager, error) {
ctx, cancel := context.WithCancel(context.Background())
@ -55,6 +58,20 @@ func New(
m.cancel = cancel
m.response = new(sync.Map)
if len(options.Cookies) > 0 {
for url, cookies := range options.Cookies {
if err := m.setCookiesInternal(ctx, url, cookies); err != nil {
return nil, err
}
}
}
if len(options.Headers) > 0 {
if err := m.setHeadersInternal(ctx, options.Headers); err != nil {
return nil, err
}
}
var err error
closers := make([]io.Closer, 0, 10)
@ -87,6 +104,32 @@ func New(
m.responseListenerID = m.eventLoop.AddListener(responseReceived, m.onResponse)
if len(options.Filter.Patterns) > 0 {
el2 := events.NewLoop()
err = m.client.Fetch.Enable(ctx, toFetchArgs(options.Filter.Patterns))
if err != nil {
return nil, err
}
requestPausedStream, err := m.client.Fetch.RequestPaused(ctx)
if err != nil {
return nil, err
}
el2.AddSource(events.NewSource(requestPaused, requestPausedStream, func(stream rpcc.Stream) (interface{}, error) {
return stream.(fetch.RequestPausedClient).Recv()
}))
m.filterListenerID = el2.AddListener(requestPaused, m.onRequestPaused)
// run in a separate loop in order to get higher priority
// TODO: Consider adding support of event priorities to EventLoop
el2.Run(ctx)
}
m.eventLoop.Run(ctx)
return m, nil
@ -128,6 +171,10 @@ func (m *Manager) SetCookies(ctx context.Context, url string, cookies drivers.HT
m.mu.Lock()
defer m.mu.Unlock()
return m.setCookiesInternal(ctx, url, cookies)
}
func (m *Manager) setCookiesInternal(ctx context.Context, url string, cookies drivers.HTTPCookies) error {
if len(cookies) == 0 {
return nil
}
@ -176,6 +223,10 @@ func (m *Manager) SetHeaders(ctx context.Context, headers drivers.HTTPHeaders) e
m.mu.Lock()
defer m.mu.Unlock()
return m.setHeadersInternal(ctx, headers)
}
func (m *Manager) setHeadersInternal(ctx context.Context, headers drivers.HTTPHeaders) error {
if len(headers) == 0 {
return nil
}
@ -431,3 +482,27 @@ func (m *Manager) onResponse(_ context.Context, message interface{}) (out bool)
return
}
// onRequestPaused handles a request that the Fetch domain paused and fails
// it with the "blocked by client" error reason.
//
// Messages that are not *fetch.RequestPausedReply are ignored. A failure to
// terminate the request is logged, not propagated. The listener always
// returns true.
// NOTE(review): true presumably keeps the listener registered — confirm
// against the event loop contract.
func (m *Manager) onRequestPaused(ctx context.Context, message interface{}) (out bool) {
	paused, isPaused := message.(*fetch.RequestPausedReply)
	if !isPaused {
		return true
	}

	failArgs := &fetch.FailRequestArgs{
		RequestID:   paused.RequestID,
		ErrorReason: network.ErrorReasonBlockedByClient,
	}

	if err := m.client.Fetch.FailRequest(ctx, failArgs); err != nil {
		m.logger.
			Err(err).
			Str("resourceType", paused.ResourceType.String()).
			Str("url", paused.Request.URL).
			Msg("failed to terminate a request")
	}

	return true
}

View File

@ -0,0 +1,37 @@
package network
import (
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/mafredri/cdp/protocol/fetch"
)
type (
// Cookies groups HTTP cookies by the URL they are set for.
Cookies map[string]drivers.HTTPCookies
// Filter describes which requests must be blocked.
Filter struct {
// Patterns lists the resource filters; when non-empty, New enables
// request interception for them.
Patterns []drivers.ResourceFilter
}
// Options is the initial configuration passed to New.
Options struct {
// Cookies are applied per URL while the manager is being created.
Cookies Cookies
// Headers are applied while the manager is being created.
Headers drivers.HTTPHeaders
// Filter holds the request blocking rules.
Filter Filter
}
)
// toFetchArgs converts resource filters into the arguments of the CDP
// Fetch.enable command, producing one request pattern per filter.
//
// A filter with an empty URL produces a pattern without a URL pattern, which
// the protocol treats as "*" (match every URL).
func toFetchArgs(filterPatterns []drivers.ResourceFilter) *fetch.EnableArgs {
	patterns := make([]fetch.RequestPattern, 0, len(filterPatterns))

	for i := range filterPatterns {
		// Both pointers below must refer to per-iteration variables. Taking
		// the address of a field of the range variable would make every
		// pattern share the same (last) URL on Go versions before 1.22.
		rt := toResourceType(filterPatterns[i].Type)
		reqPattern := fetch.RequestPattern{ResourceType: &rt}

		if urlPattern := filterPatterns[i].URL; urlPattern != "" {
			reqPattern.URLPattern = &urlPattern
		}

		patterns = append(patterns, reqPattern)
	}

	return &fetch.EnableArgs{
		Patterns: patterns,
	}
}

View File

@ -0,0 +1,37 @@
package network
import "github.com/mafredri/cdp/protocol/network"
var (
// resourceTypeMapping maps the resource type aliases accepted in filter
// rules to CDP network resource types. Some types have more than one alias
// ("stylesheet"/"css", "script"/"js", "xhr"/"ajax"). Lookups are exact, so
// keys are case-sensitive; note that "cspViolationReport" is the only
// mixed-case key.
resourceTypeMapping = map[string]network.ResourceType{
"document": network.ResourceTypeDocument,
"stylesheet": network.ResourceTypeStylesheet,
"css": network.ResourceTypeStylesheet,
"image": network.ResourceTypeImage,
"media": network.ResourceTypeMedia,
"font": network.ResourceTypeFont,
"script": network.ResourceTypeScript,
"js": network.ResourceTypeScript,
"texttrack": network.ResourceTypeTextTrack,
"xhr": network.ResourceTypeXHR,
"ajax": network.ResourceTypeXHR,
"fetch": network.ResourceTypeFetch,
"eventsource": network.ResourceTypeEventSource,
"websocket": network.ResourceTypeWebSocket,
"manifest": network.ResourceTypeManifest,
"sxg": network.ResourceTypeSignedExchange,
"ping": network.ResourceTypePing,
"cspViolationReport": network.ResourceTypeCSPViolationReport,
"other": network.ResourceTypeOther,
}
)
// toResourceType resolves a resource type alias to its CDP resource type.
// Aliases that are not present in resourceTypeMapping resolve to
// network.ResourceTypeNotSet.
func toResourceType(alias string) network.ResourceType {
	if resolved, known := resourceTypeMapping[alias]; known {
		return resolved
	}

	return network.ResourceTypeNotSet
}

View File

@ -69,19 +69,22 @@ func LoadHTMLPage(
}
}()
netManager, err := net.New(logger, client)
if err != nil {
return nil, err
netOpts := net.Options{
Headers: params.Headers,
}
err = netManager.SetCookies(ctx, params.URL, params.Cookies)
if err != nil {
return nil, err
if len(params.Cookies) > 0 {
netOpts.Cookies = make(map[string]drivers.HTTPCookies)
netOpts.Cookies[params.URL] = params.Cookies
}
err = netManager.SetHeaders(ctx, params.Headers)
if params.Disable != nil {
if len(params.Disable.Resources) > 0 {
netOpts.Filter.Patterns = params.Disable.Resources
}
}
netManager, err := net.New(logger, client, netOpts)
if err != nil {
return nil, err

View File

@ -1,6 +1,15 @@
package drivers
type (
ResourceFilter struct {
URL string
Type string
}
Disable struct {
Resources []ResourceFilter
}
Viewport struct {
Height int
Width int
@ -16,6 +25,7 @@ type (
Cookies HTTPCookies
Headers HTTPHeaders
Viewport *Viewport
Disable *Disable
}
ParseParams struct {

View File

@ -27,6 +27,10 @@ type PageLoadParams struct {
// @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
// @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
// @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
// @param {Object} [params.disable] - Set of parameters to disable some page functionality or behavior.
// @param {Object[]} [params.disable.resources] - Collection of rules to disable resources during page load and navigation.
// @param {String} [params.disable.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
// @param {String} [params.disable.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
// @param {Object} [params.viewport] - Viewport params.
// @param {Int} [params.viewport.height] - Viewport height.
// @param {Int} [params.viewport.width] - Viewport width.
@ -186,6 +190,18 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
res.Viewport = viewport
}
disable, exists := obj.Get(values.NewString("disable"))
if exists {
disable, err := parseDisable(disable)
if err != nil {
return res, err
}
res.Disable = disable
}
case types.String:
res.Driver = arg.(values.String).String()
case types.Boolean:
@ -391,3 +407,56 @@ func parseViewport(value core.Value) (*drivers.Viewport, error) {
return res, nil
}
// parseDisable converts the "disable" page load option into drivers.Disable.
//
// The value must be an object. Its optional "resources" property must be an
// array of objects, each of which may carry "url" and/or "type". Elements
// with neither property are skipped. A property that is absent leaves the
// corresponding ResourceFilter field empty.
//
// It returns an error when the value, the "resources" property or any of its
// elements has an unexpected type.
func parseDisable(value core.Value) (*drivers.Disable, error) {
	if err := core.ValidateType(value, types.Object); err != nil {
		return nil, err
	}

	res := &drivers.Disable{}
	disable := value.(*values.Object)

	resources, exists := disable.Get("resources")
	if !exists {
		return res, nil
	}

	if err := core.ValidateType(resources, types.Array); err != nil {
		return nil, err
	}

	rules := resources.(*values.Array)
	res.Resources = make([]drivers.ResourceFilter, 0, rules.Length())

	// ForEach cannot return an error, so the first failure is captured here
	// and iteration is stopped by returning false.
	var iterErr error

	rules.ForEach(func(el core.Value, _ int) bool {
		if iterErr = core.ValidateType(el, types.Object); iterErr != nil {
			return false
		}

		rule := el.(*values.Object)
		url, urlExists := rule.Get("url")
		resType, resTypeExists := rule.Get("type")

		// A rule without any criteria is ignored.
		if !urlExists && !resTypeExists {
			return true
		}

		// Only read the properties that are actually present, instead of
		// relying on the string form of a missing-key placeholder.
		filter := drivers.ResourceFilter{}

		if urlExists {
			filter.URL = url.String()
		}

		if resTypeExists {
			filter.Type = resType.String()
		}

		res.Resources = append(res.Resources, filter)

		return true
	})

	if iterErr != nil {
		return nil, iterErr
	}

	return res, nil
}