diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1a9af8c3..15f4ca2a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -23,7 +23,7 @@ jobs:
           sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
           export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
           mkdir $HOME/antlr-bin
-          echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
+          printf '#!/bin/bash\njava -jar /usr/local/lib/antlr-%s-complete.jar "$@"\n' "${ANTLR_VERSION}" > $HOME/antlr-bin/antlr
           echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
           chmod +x $HOME/antlr-bin/*
           export PATH=$PATH:$HOME/antlr-bin
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index b4adef01..004fe671 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -24,7 +24,7 @@ jobs:
           sudo curl -o /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
           export CLASSPATH=".:/usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar:$CLASSPATH"
           mkdir $HOME/antlr-bin
-          echo -e '#!/bin/bash\njava -jar /usr/local/lib/antlr-${ANTLR_VERSION}-complete.jar "$@"' > $HOME/antlr-bin/antlr
+          printf '#!/bin/bash\njava -jar /usr/local/lib/antlr-%s-complete.jar "$@"\n' "${ANTLR_VERSION}" > $HOME/antlr-bin/antlr
           echo -e '#!/bin/bash\njava org.antlr.v4.gui.TestRig "$@"' > $HOME/antlr-bin/grun
           chmod +x $HOME/antlr-bin/*
           sudo ln -s $HOME/antlr-bin/antlr /usr/local/bin/antlr
diff --git a/examples/disable-images.fql b/examples/disable-images.fql
new file mode 100644
index 00000000..5f06bfa7
--- /dev/null
+++ b/examples/disable-images.fql
@@ -0,0 +1,13 @@
+LET p = DOCUMENT("https://www.gettyimages.com/", {
+    driver: "cdp",
+    disable: {
+        resources: [
+            {
+                url: "*",
+                type: "image"
+            }
+        ]
+    }
+})
+
+RETURN NONE
diff --git a/pkg/drivers/cdp/events/loop.go b/pkg/drivers/cdp/events/loop.go
index e94b4dea..16ed2c18 100644
--- a/pkg/drivers/cdp/events/loop.go
+++ b/pkg/drivers/cdp/events/loop.go
@@ -62,7 +62,6 @@ func (loop *Loop) RemoveListener(eventID ID, listenerID ListenerID) {
 
 // run starts running an event loop.
 // It constantly iterates over each event source.
-// Additionally to that, on each iteration it checks the command channel in order to perform add/remove listener/source operations.
 func (loop *Loop) run(ctx context.Context) {
 	sources := loop.sources
 	size := sources.Size()
diff --git a/pkg/drivers/cdp/network/events.go b/pkg/drivers/cdp/network/events.go
index 55e72585..7363a9de 100644
--- a/pkg/drivers/cdp/network/events.go
+++ b/pkg/drivers/cdp/network/events.go
@@ -5,4 +5,5 @@ import "github.com/MontFerret/ferret/pkg/drivers/cdp/events"
 var (
 	eventFrameLoad   = events.New("frame_load")
 	responseReceived = events.New("response_received")
+	requestPaused    = events.New("request_paused")
 )
diff --git a/pkg/drivers/cdp/network/manager.go b/pkg/drivers/cdp/network/manager.go
index 97e426af..edb222fb 100644
--- a/pkg/drivers/cdp/network/manager.go
+++ b/pkg/drivers/cdp/network/manager.go
@@ -8,6 +8,7 @@ import (
 	"sync"
 
 	"github.com/mafredri/cdp"
+	"github.com/mafredri/cdp/protocol/fetch"
 	"github.com/mafredri/cdp/protocol/network"
 	"github.com/mafredri/cdp/protocol/page"
 	"github.com/mafredri/cdp/rpcc"
@@ -37,6 +38,7 @@ type (
 		eventLoop          *events.Loop
 		cancel             context.CancelFunc
 		responseListenerID events.ListenerID
+		filterListenerID   events.ListenerID
 		response           *sync.Map
 	}
 )
@@ -44,6 +46,7 @@ type (
 func New(
 	logger *zerolog.Logger,
 	client *cdp.Client,
+	options Options,
 ) (*Manager, error) {
 	ctx, cancel := context.WithCancel(context.Background())
 
@@ -55,6 +58,20 @@ func New(
 	m.cancel = cancel
 	m.response = new(sync.Map)
 
+	if len(options.Cookies) > 0 {
+		for url, cookies := range options.Cookies {
+			if err := m.setCookiesInternal(ctx, url, cookies); err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	if len(options.Headers) > 0 {
+		if err := m.setHeadersInternal(ctx, options.Headers); err != nil {
+			return nil, err
+		}
+	}
+
 	var err error
 	closers := make([]io.Closer, 0, 10)
 
@@ -87,6 +104,36 @@ func New(
 
 	m.responseListenerID = m.eventLoop.AddListener(responseReceived, m.onResponse)
 
+	if len(options.Filter.Patterns) > 0 {
+		el2 := events.NewLoop()
+
+		err = m.client.Fetch.Enable(ctx, toFetchArgs(options.Filter.Patterns))
+
+		if err != nil {
+			return nil, err
+		}
+
+		// Assign to the function-level err instead of shadowing it with ":=" inside this block,
+		// so that anything observing err after an early return (e.g. deferred cleanup, if any) sees the failure.
+		var requestPausedStream fetch.RequestPausedClient
+
+		requestPausedStream, err = m.client.Fetch.RequestPaused(ctx)
+
+		if err != nil {
+			return nil, err
+		}
+
+		el2.AddSource(events.NewSource(requestPaused, requestPausedStream, func(stream rpcc.Stream) (interface{}, error) {
+			return stream.(fetch.RequestPausedClient).Recv()
+		}))
+
+		m.filterListenerID = el2.AddListener(requestPaused, m.onRequestPaused)
+
+		// run in a separate loop in order to get higher priority
+		// TODO: Consider adding support of event priorities to EventLoop
+		el2.Run(ctx)
+	}
+
 	m.eventLoop.Run(ctx)
 
 	return m, nil
@@ -128,6 +175,11 @@ func (m *Manager) SetCookies(ctx context.Context, url string, cookies drivers.HT
 	m.mu.Lock()
 	defer m.mu.Unlock()
 
+	return m.setCookiesInternal(ctx, url, cookies)
+}
+
+// setCookiesInternal is the part of SetCookies that runs without taking the manager's lock; New calls it directly.
+func (m *Manager) setCookiesInternal(ctx context.Context, url string, cookies drivers.HTTPCookies) error {
 	if len(cookies) == 0 {
 		return nil
 	}
@@ -176,6 +228,11 @@ func (m *Manager) SetHeaders(ctx context.Context, headers drivers.HTTPHeaders) e
 	m.mu.Lock()
 	defer m.mu.Unlock()
 
+	return m.setHeadersInternal(ctx, headers)
+}
+
+// setHeadersInternal is the part of SetHeaders that runs without taking the manager's lock; New calls it directly.
+func (m *Manager) setHeadersInternal(ctx context.Context, headers drivers.HTTPHeaders) error {
 	if len(headers) == 0 {
 		return nil
 	}
@@ -431,3 +488,29 @@ func (m *Manager) onResponse(_ context.Context, message interface{}) (out bool)
 
 	return
 }
+
+// onRequestPaused fails a request paused by the Fetch domain with network.ErrorReasonBlockedByClient.
+// Messages of an unexpected type are skipped; a failure to terminate the request is only logged.
+func (m *Manager) onRequestPaused(ctx context.Context, message interface{}) (out bool) {
+	out = true
+	msg, ok := message.(*fetch.RequestPausedReply)
+
+	if !ok {
+		return
+	}
+
+	err := m.client.Fetch.FailRequest(ctx, &fetch.FailRequestArgs{
+		RequestID:   msg.RequestID,
+		ErrorReason: network.ErrorReasonBlockedByClient,
+	})
+
+	if err != nil {
+		m.logger.
+			Err(err).
+			Str("resourceType", msg.ResourceType.String()).
+			Str("url", msg.Request.URL).
+			Msg("failed to terminate a request")
+	}
+
+	return
+}
diff --git a/pkg/drivers/cdp/network/options.go b/pkg/drivers/cdp/network/options.go
new file mode 100644
index 00000000..b013b119
--- /dev/null
+++ b/pkg/drivers/cdp/network/options.go
@@ -0,0 +1,54 @@
+package network
+
+import (
+	"github.com/MontFerret/ferret/pkg/drivers"
+	"github.com/mafredri/cdp/protocol/fetch"
+)
+
+type (
+	// Cookies maps a URL to the cookies that must be set for it.
+	Cookies map[string]drivers.HTTPCookies
+
+	// Filter holds the patterns of requests that must be blocked.
+	Filter struct {
+		Patterns []drivers.ResourceFilter
+	}
+
+	// Options is a set of initial settings of a network manager.
+	Options struct {
+		Cookies Cookies
+		Headers drivers.HTTPHeaders
+		Filter  Filter
+	}
+)
+
+// toFetchArgs converts resource filters into arguments of the CDP Fetch.enable command.
+// An empty URL or type is left out of the resulting pattern,
+// which the protocol treats as "match any" for that criterion.
+func toFetchArgs(filterPatterns []drivers.ResourceFilter) *fetch.EnableArgs {
+	patterns := make([]fetch.RequestPattern, 0, len(filterPatterns))
+
+	for _, filter := range filterPatterns {
+		pattern := fetch.RequestPattern{}
+
+		if filter.URL != "" {
+			// Point at a per-iteration copy: before Go 1.22 the range variable is shared
+			// by all iterations, so its address would make every pattern use the last URL.
+			url := filter.URL
+			pattern.URLPattern = &url
+		}
+
+		if filter.Type != "" {
+			// An unknown alias is still sent (as ResourceTypeNotSet) rather than dropped,
+			// because dropping it would silently widen the rule to every resource type.
+			rt := toResourceType(filter.Type)
+			pattern.ResourceType = &rt
+		}
+
+		patterns = append(patterns, pattern)
+	}
+
+	return &fetch.EnableArgs{
+		Patterns: patterns,
+	}
+}
diff --git a/pkg/drivers/cdp/network/resources.go b/pkg/drivers/cdp/network/resources.go
new file mode 100644
index 00000000..9e9d65d7
--- /dev/null
+++ b/pkg/drivers/cdp/network/resources.go
@@ -0,0 +1,44 @@
+package network
+
+import (
+	"strings"
+
+	"github.com/mafredri/cdp/protocol/network"
+)
+
+var (
+	// resourceTypeMapping maps lower-cased resource type aliases to CDP resource types.
+	resourceTypeMapping = map[string]network.ResourceType{
+		"document":           network.ResourceTypeDocument,
+		"stylesheet":         network.ResourceTypeStylesheet,
+		"css":                network.ResourceTypeStylesheet,
+		"image":              network.ResourceTypeImage,
+		"media":              network.ResourceTypeMedia,
+		"font":               network.ResourceTypeFont,
+		"script":             network.ResourceTypeScript,
+		"js":                 network.ResourceTypeScript,
+		"texttrack":          network.ResourceTypeTextTrack,
+		"xhr":                network.ResourceTypeXHR,
+		"ajax":               network.ResourceTypeXHR,
+		"fetch":              network.ResourceTypeFetch,
+		"eventsource":        network.ResourceTypeEventSource,
+		"websocket":          network.ResourceTypeWebSocket,
+		"manifest":           network.ResourceTypeManifest,
+		"sxg":                network.ResourceTypeSignedExchange,
+		"ping":               network.ResourceTypePing,
+		"cspviolationreport": network.ResourceTypeCSPViolationReport,
+		"other":              network.ResourceTypeOther,
+	}
+)
+
+// toResourceType returns the CDP resource type for the given alias.
+// The lookup is case-insensitive. Unknown aliases yield network.ResourceTypeNotSet.
+func toResourceType(alias string) network.ResourceType {
+	rt, found := resourceTypeMapping[strings.ToLower(alias)]
+
+	if found {
+		return rt
+	}
+
+	return network.ResourceTypeNotSet
+}
diff --git a/pkg/drivers/cdp/page.go b/pkg/drivers/cdp/page.go
index 5dc607fa..df7aa008 100644
--- a/pkg/drivers/cdp/page.go
+++ b/pkg/drivers/cdp/page.go
@@ -69,19 +69,22 @@ func LoadHTMLPage(
 		}
 	}()
 
-	netManager, err := net.New(logger, client)
-
-	if err != nil {
-		return nil, err
+	netOpts := net.Options{
+		Headers: params.Headers,
 	}
 
-	err = netManager.SetCookies(ctx, params.URL, params.Cookies)
-
-	if err != nil {
-		return nil, err
+	if len(params.Cookies) > 0 {
+		netOpts.Cookies = make(map[string]drivers.HTTPCookies)
+		netOpts.Cookies[params.URL] = params.Cookies
 	}
 
-	err = netManager.SetHeaders(ctx, params.Headers)
+	if params.Disable != nil {
+		if len(params.Disable.Resources) > 0 {
+			netOpts.Filter.Patterns = params.Disable.Resources
+		}
+	}
+
+	netManager, err := net.New(logger, client, netOpts)
 
 	if err != nil {
 		return nil, err
diff --git a/pkg/drivers/params.go b/pkg/drivers/params.go
index 8190a774..5549a3f6 100644
--- a/pkg/drivers/params.go
+++ b/pkg/drivers/params.go
@@ -1,6 +1,18 @@
 package drivers
 
 type (
+	// ResourceFilter describes resources whose loading must be blocked.
+	// URL is a url pattern and Type is a resource type alias; both are optional.
+	ResourceFilter struct {
+		URL  string
+		Type string
+	}
+
+	// Disable lists page functionality that must be turned off.
+	Disable struct {
+		Resources []ResourceFilter
+	}
+
 	Viewport struct {
 		Height int
 		Width  int
@@ -16,6 +28,7 @@ type (
 		Cookies     HTTPCookies
 		Headers     HTTPHeaders
 		Viewport    *Viewport
+		Disable     *Disable
 	}
 
 	ParseParams struct {
diff --git a/pkg/stdlib/html/document.go b/pkg/stdlib/html/document.go
index bcec602b..d8858b6e 100644
--- a/pkg/stdlib/html/document.go
+++ b/pkg/stdlib/html/document.go
@@ -27,6 +27,10 @@ type PageLoadParams struct {
 // @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
 // @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
 // @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
+// @param {Object} [params.disable] - Set of parameters to disable some page functionality or behavior.
+// @param {Object[]} [params.disable.resources] - Collection of rules to block resources during page load and navigation. Rules that set neither url nor type are ignored.
+// @param {String} [params.disable.resources.*.url] - Resource URL pattern. If set, only requests for matching URLs will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
+// @param {String} [params.disable.resources.*.type] - Resource type alias (e.g. "image", "script", "css", "xhr"). If set, only requests for matching resource types will be blocked. Omitting matches any type.
 // @param {Object} [params.viewport] - Viewport params.
 // @param {Int} [params.viewport.height] - Viewport height.
 // @param {Int} [params.viewport.width] - Viewport width.
@@ -186,6 +190,18 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
 
 			res.Viewport = viewport
 		}
+
+		disableArg, exists := obj.Get(values.NewString("disable"))
+
+		if exists {
+			disable, err := parseDisable(disableArg)
+
+			if err != nil {
+				return res, err
+			}
+
+			res.Disable = disable
+		}
 	case types.String:
 		res.Driver = arg.(values.String).String()
 	case types.Boolean:
@@ -391,3 +407,68 @@ func parseViewport(value core.Value) (*drivers.Viewport, error) {
 
 	return res, nil
 }
+
+// parseDisable converts the "disable" page load parameter into drivers.Disable.
+// The value must be an object; its optional "resources" property must be an array of objects
+// with optional "url" and "type" properties. Elements that have neither property are ignored.
+func parseDisable(value core.Value) (*drivers.Disable, error) {
+	if err := core.ValidateType(value, types.Object); err != nil {
+		return nil, err
+	}
+
+	res := &drivers.Disable{}
+
+	resources, exists := value.(*values.Object).Get("resources")
+
+	if !exists {
+		return res, nil
+	}
+
+	if err := core.ValidateType(resources, types.Array); err != nil {
+		return nil, err
+	}
+
+	rules := resources.(*values.Array)
+
+	res.Resources = make([]drivers.ResourceFilter, 0, rules.Length())
+
+	var err error
+
+	rules.ForEach(func(el core.Value, _ int) bool {
+		if err = core.ValidateType(el, types.Object); err != nil {
+			return false
+		}
+
+		rule := el.(*values.Object)
+
+		url, urlExists := rule.Get("url")
+		resType, resTypeExists := rule.Get("type")
+
+		// ignore a rule that has no criteria at all
+		if !urlExists && !resTypeExists {
+			return true
+		}
+
+		filter := drivers.ResourceFilter{}
+
+		// Read only the properties that are present, so that an omitted one
+		// stays empty no matter how a missing value is rendered as a string.
+		if urlExists {
+			filter.URL = url.String()
+		}
+
+		if resTypeExists {
+			filter.Type = resType.String()
+		}
+
+		res.Resources = append(res.Resources, filter)
+
+		return true
+	})
+
+	if err != nil {
+		return nil, err
+	}
+
+	return res, nil
+}