mirror of
https://github.com/MontFerret/ferret.git
synced 2025-02-09 13:38:35 +02:00
Added support of ignoring page resources (#592)
* Added support of ignoring page resources * Updated pipeline scripts * Updated comments
This commit is contained in:
parent
c6a459e35c
commit
79566c3b76
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@ -23,7 +23,7 @@ jobs:
|
||||
sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
|
||||
export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
|
||||
mkdir $HOME/antlr-bin
|
||||
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
|
||||
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-4.9-complete.jar "$@"' > $HOME/antlr-bin/antlr
|
||||
echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
|
||||
chmod +x $HOME/antlr-bin/*
|
||||
export PATH=$PATH:$HOME/antlr-bin
|
||||
|
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@ -24,7 +24,7 @@ jobs:
|
||||
sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
|
||||
export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
|
||||
mkdir $HOME/antlr-bin
|
||||
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
|
||||
echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-4.9-complete.jar "$@"' > $HOME/antlr-bin/antlr
|
||||
echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
|
||||
chmod +x $HOME/antlr-bin/*
|
||||
sudo ln -s $HOME/antlr-bin/antlr /usr/local/bin/antlr
|
||||
|
13
examples/disable-images.fql
Normal file
13
examples/disable-images.fql
Normal file
@ -0,0 +1,13 @@
|
||||
// Opens the page with the CDP driver while blocking image requests.
LET p = DOCUMENT("https://www.gettyimages.com/", {
    driver: "cdp",
    disable: {
        resources: [
            // "*" matches any URL; the rule is limited to the "image" resource type.
            { url: "*", type: "image" }
        ]
    }
})

RETURN NONE
|
@ -62,7 +62,6 @@ func (loop *Loop) RemoveListener(eventID ID, listenerID ListenerID) {
|
||||
|
||||
// run starts running an event loop.
|
||||
// It constantly iterates over each event source.
|
||||
// In addition, on each iteration it checks the command channel in order to perform add/remove listener/source operations.
|
||||
func (loop *Loop) run(ctx context.Context) {
|
||||
sources := loop.sources
|
||||
size := sources.Size()
|
||||
|
@ -5,4 +5,5 @@ import "github.com/MontFerret/ferret/pkg/drivers/cdp/events"
|
||||
var (
|
||||
eventFrameLoad = events.New("frame_load")
|
||||
responseReceived = events.New("response_received")
|
||||
requestPaused = events.New("request_paused")
|
||||
)
|
||||
|
@ -8,6 +8,7 @@ import (
|
||||
"sync"
|
||||
|
||||
"github.com/mafredri/cdp"
|
||||
"github.com/mafredri/cdp/protocol/fetch"
|
||||
"github.com/mafredri/cdp/protocol/network"
|
||||
"github.com/mafredri/cdp/protocol/page"
|
||||
"github.com/mafredri/cdp/rpcc"
|
||||
@ -37,6 +38,7 @@ type (
|
||||
eventLoop *events.Loop
|
||||
cancel context.CancelFunc
|
||||
responseListenerID events.ListenerID
|
||||
filterListenerID events.ListenerID
|
||||
response *sync.Map
|
||||
}
|
||||
)
|
||||
@ -44,6 +46,7 @@ type (
|
||||
func New(
|
||||
logger *zerolog.Logger,
|
||||
client *cdp.Client,
|
||||
options Options,
|
||||
) (*Manager, error) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
@ -55,6 +58,20 @@ func New(
|
||||
m.cancel = cancel
|
||||
m.response = new(sync.Map)
|
||||
|
||||
if len(options.Cookies) > 0 {
|
||||
for url, cookies := range options.Cookies {
|
||||
if err := m.setCookiesInternal(ctx, url, cookies); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(options.Headers) > 0 {
|
||||
if err := m.setHeadersInternal(ctx, options.Headers); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var err error
|
||||
|
||||
closers := make([]io.Closer, 0, 10)
|
||||
@ -87,6 +104,32 @@ func New(
|
||||
|
||||
m.responseListenerID = m.eventLoop.AddListener(responseReceived, m.onResponse)
|
||||
|
||||
if len(options.Filter.Patterns) > 0 {
|
||||
el2 := events.NewLoop()
|
||||
|
||||
err = m.client.Fetch.Enable(ctx, toFetchArgs(options.Filter.Patterns))
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
requestPausedStream, err := m.client.Fetch.RequestPaused(ctx)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
el2.AddSource(events.NewSource(requestPaused, requestPausedStream, func(stream rpcc.Stream) (interface{}, error) {
|
||||
return stream.(fetch.RequestPausedClient).Recv()
|
||||
}))
|
||||
|
||||
m.filterListenerID = el2.AddListener(requestPaused, m.onRequestPaused)
|
||||
|
||||
// run in a separate loop in order to get higher priority
|
||||
// TODO: Consider adding support of event priorities to EventLoop
|
||||
el2.Run(ctx)
|
||||
}
|
||||
|
||||
m.eventLoop.Run(ctx)
|
||||
|
||||
return m, nil
|
||||
@ -128,6 +171,10 @@ func (m *Manager) SetCookies(ctx context.Context, url string, cookies drivers.HT
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
return m.setCookiesInternal(ctx, url, cookies)
|
||||
}
|
||||
|
||||
func (m *Manager) setCookiesInternal(ctx context.Context, url string, cookies drivers.HTTPCookies) error {
|
||||
if len(cookies) == 0 {
|
||||
return nil
|
||||
}
|
||||
@ -176,6 +223,10 @@ func (m *Manager) SetHeaders(ctx context.Context, headers drivers.HTTPHeaders) e
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
return m.setHeadersInternal(ctx, headers)
|
||||
}
|
||||
|
||||
func (m *Manager) setHeadersInternal(ctx context.Context, headers drivers.HTTPHeaders) error {
|
||||
if len(headers) == 0 {
|
||||
return nil
|
||||
}
|
||||
@ -431,3 +482,27 @@ func (m *Manager) onResponse(_ context.Context, message interface{}) (out bool)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (m *Manager) onRequestPaused(ctx context.Context, message interface{}) (out bool) {
|
||||
out = true
|
||||
msg, ok := message.(*fetch.RequestPausedReply)
|
||||
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
err := m.client.Fetch.FailRequest(ctx, &fetch.FailRequestArgs{
|
||||
RequestID: msg.RequestID,
|
||||
ErrorReason: network.ErrorReasonBlockedByClient,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
m.logger.
|
||||
Err(err).
|
||||
Str("resourceType", msg.ResourceType.String()).
|
||||
Str("url", msg.Request.URL).
|
||||
Msg("failed to terminate a request")
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
37
pkg/drivers/cdp/network/options.go
Normal file
37
pkg/drivers/cdp/network/options.go
Normal file
@ -0,0 +1,37 @@
|
||||
package network
|
||||
|
||||
import (
|
||||
"github.com/MontFerret/ferret/pkg/drivers"
|
||||
"github.com/mafredri/cdp/protocol/fetch"
|
||||
)
|
||||
|
||||
type (
|
||||
Cookies map[string]drivers.HTTPCookies
|
||||
|
||||
Filter struct {
|
||||
Patterns []drivers.ResourceFilter
|
||||
}
|
||||
|
||||
Options struct {
|
||||
Cookies Cookies
|
||||
Headers drivers.HTTPHeaders
|
||||
Filter Filter
|
||||
}
|
||||
)
|
||||
|
||||
func toFetchArgs(filterPatterns []drivers.ResourceFilter) *fetch.EnableArgs {
|
||||
patterns := make([]fetch.RequestPattern, 0, len(filterPatterns))
|
||||
|
||||
for _, pattern := range filterPatterns {
|
||||
rt := toResourceType(pattern.Type)
|
||||
|
||||
patterns = append(patterns, fetch.RequestPattern{
|
||||
URLPattern: &pattern.URL,
|
||||
ResourceType: &rt,
|
||||
})
|
||||
}
|
||||
|
||||
return &fetch.EnableArgs{
|
||||
Patterns: patterns,
|
||||
}
|
||||
}
|
37
pkg/drivers/cdp/network/resources.go
Normal file
37
pkg/drivers/cdp/network/resources.go
Normal file
@ -0,0 +1,37 @@
|
||||
package network
|
||||
|
||||
import "github.com/mafredri/cdp/protocol/network"
|
||||
|
||||
var (
|
||||
resourceTypeMapping = map[string]network.ResourceType{
|
||||
"document": network.ResourceTypeDocument,
|
||||
"stylesheet": network.ResourceTypeStylesheet,
|
||||
"css": network.ResourceTypeStylesheet,
|
||||
"image": network.ResourceTypeImage,
|
||||
"media": network.ResourceTypeMedia,
|
||||
"font": network.ResourceTypeFont,
|
||||
"script": network.ResourceTypeScript,
|
||||
"js": network.ResourceTypeScript,
|
||||
"texttrack": network.ResourceTypeTextTrack,
|
||||
"xhr": network.ResourceTypeXHR,
|
||||
"ajax": network.ResourceTypeXHR,
|
||||
"fetch": network.ResourceTypeFetch,
|
||||
"eventsource": network.ResourceTypeEventSource,
|
||||
"websocket": network.ResourceTypeWebSocket,
|
||||
"manifest": network.ResourceTypeManifest,
|
||||
"sxg": network.ResourceTypeSignedExchange,
|
||||
"ping": network.ResourceTypePing,
|
||||
"cspViolationReport": network.ResourceTypeCSPViolationReport,
|
||||
"other": network.ResourceTypeOther,
|
||||
}
|
||||
)
|
||||
|
||||
func toResourceType(alias string) network.ResourceType {
|
||||
rt, found := resourceTypeMapping[alias]
|
||||
|
||||
if found {
|
||||
return rt
|
||||
}
|
||||
|
||||
return network.ResourceTypeNotSet
|
||||
}
|
@ -69,19 +69,22 @@ func LoadHTMLPage(
|
||||
}
|
||||
}()
|
||||
|
||||
netManager, err := net.New(logger, client)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
netOpts := net.Options{
|
||||
Headers: params.Headers,
|
||||
}
|
||||
|
||||
err = netManager.SetCookies(ctx, params.URL, params.Cookies)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if len(params.Cookies) > 0 {
|
||||
netOpts.Cookies = make(map[string]drivers.HTTPCookies)
|
||||
netOpts.Cookies[params.URL] = params.Cookies
|
||||
}
|
||||
|
||||
err = netManager.SetHeaders(ctx, params.Headers)
|
||||
if params.Disable != nil {
|
||||
if len(params.Disable.Resources) > 0 {
|
||||
netOpts.Filter.Patterns = params.Disable.Resources
|
||||
}
|
||||
}
|
||||
|
||||
netManager, err := net.New(logger, client, netOpts)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -1,6 +1,15 @@
|
||||
package drivers
|
||||
|
||||
type (
|
||||
ResourceFilter struct {
|
||||
URL string
|
||||
Type string
|
||||
}
|
||||
|
||||
Disable struct {
|
||||
Resources []ResourceFilter
|
||||
}
|
||||
|
||||
Viewport struct {
|
||||
Height int
|
||||
Width int
|
||||
@ -16,6 +25,7 @@ type (
|
||||
Cookies HTTPCookies
|
||||
Headers HTTPHeaders
|
||||
Viewport *Viewport
|
||||
Disable *Disable
|
||||
}
|
||||
|
||||
ParseParams struct {
|
||||
|
@ -27,6 +27,10 @@ type PageLoadParams struct {
|
||||
// @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
|
||||
// @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
|
||||
// @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
|
||||
// @param {Object} [params.disable] - Set of parameters to disable some page functionality or behavior.
|
||||
// @param {Object[]} [params.disable.resources] - Collection of rules to disable resources during page load and navigation.
|
||||
// @param {String} [params.disable.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
|
||||
// @param {String} [params.disable.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
|
||||
// @param {Object} [params.viewport] - Viewport params.
|
||||
// @param {Int} [params.viewport.height] - Viewport height.
|
||||
// @param {Int} [params.viewport.width] - Viewport width.
|
||||
@ -186,6 +190,18 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
|
||||
|
||||
res.Viewport = viewport
|
||||
}
|
||||
|
||||
disable, exists := obj.Get(values.NewString("disable"))
|
||||
|
||||
if exists {
|
||||
disable, err := parseDisable(disable)
|
||||
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
|
||||
res.Disable = disable
|
||||
}
|
||||
case types.String:
|
||||
res.Driver = arg.(values.String).String()
|
||||
case types.Boolean:
|
||||
@ -391,3 +407,56 @@ func parseViewport(value core.Value) (*drivers.Viewport, error) {
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func parseDisable(value core.Value) (*drivers.Disable, error) {
|
||||
if err := core.ValidateType(value, types.Object); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
res := &drivers.Disable{}
|
||||
|
||||
disable := value.(*values.Object)
|
||||
|
||||
resources, exists := disable.Get("resources")
|
||||
|
||||
if exists {
|
||||
if err := core.ValidateType(resources, types.Array); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resources := resources.(*values.Array)
|
||||
|
||||
res.Resources = make([]drivers.ResourceFilter, 0, resources.Length())
|
||||
|
||||
var e error
|
||||
|
||||
resources.ForEach(func(el core.Value, idx int) bool {
|
||||
if e = core.ValidateType(el, types.Object); e != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
pattern := el.(*values.Object)
|
||||
|
||||
url, urlExists := pattern.Get("url")
|
||||
resType, resTypeExists := pattern.Get("type")
|
||||
|
||||
// ignore element
|
||||
if !urlExists && !resTypeExists {
|
||||
return true
|
||||
}
|
||||
|
||||
res.Resources = append(res.Resources, drivers.ResourceFilter{
|
||||
URL: url.String(),
|
||||
Type: resType.String(),
|
||||
})
|
||||
|
||||
return true
|
||||
})
|
||||
|
||||
if e != nil {
|
||||
return nil, e
|
||||
}
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user