From 629591980642bdec3a8762fcd214c6f45b5cdbb9 Mon Sep 17 00:00:00 2001
From: Tim Voronov
Date: Fri, 5 Mar 2021 08:08:28 -0500
Subject: [PATCH] #593 handling non-200 status codes

---
 examples/non-200.fql         | 11 +++++++
 pkg/drivers/cdp/page.go      |  6 ++--
 pkg/drivers/http/driver.go   | 56 ++++++++++++++++++++++++++++---
 pkg/drivers/http/options.go  | 36 +++++++++++++--------
 pkg/drivers/params.go        | 12 +++++--
 pkg/runtime/values/object.go | 11 ++++++
 pkg/stdlib/html/document.go  | 68 ++++++++++++++++++++++++++++++------
 7 files changed, 166 insertions(+), 34 deletions(-)
 create mode 100644 examples/non-200.fql

diff --git a/examples/non-200.fql b/examples/non-200.fql
new file mode 100644
index 00000000..504b21dd
--- /dev/null
+++ b/examples/non-200.fql
@@ -0,0 +1,11 @@
+LET p = DOCUMENT('https://www.g2.com/categories', {
+    ignore: {
+        statusCodes: [
+            {
+                code: 403
+            }
+        ]
+    }
+})
+
+RETURN p.response.statusCode
\ No newline at end of file
diff --git a/pkg/drivers/cdp/page.go b/pkg/drivers/cdp/page.go
index df7aa008..064d41e7 100644
--- a/pkg/drivers/cdp/page.go
+++ b/pkg/drivers/cdp/page.go
@@ -78,9 +78,9 @@ func LoadHTMLPage(
 		netOpts.Cookies[params.URL] = params.Cookies
 	}
 
-	if params.Disable != nil {
-		if len(params.Disable.Resources) > 0 {
-			netOpts.Filter.Patterns = params.Disable.Resources
+	if params.Ignore != nil {
+		if len(params.Ignore.Resources) > 0 {
+			netOpts.Filter.Patterns = params.Ignore.Resources
 		}
 	}
 
diff --git a/pkg/drivers/http/driver.go b/pkg/drivers/http/driver.go
index f36087c4..1b4929b5 100644
--- a/pkg/drivers/http/driver.go
+++ b/pkg/drivers/http/driver.go
@@ -3,6 +3,7 @@ package http
 import (
 	"bytes"
 	"context"
+	"github.com/gobwas/glob"
 	"net/http"
 	"net/url"
 
@@ -171,7 +172,13 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
 
 	defer resp.Body.Close()
 
-	if !drv.responseCodeAllowed(resp) {
+	var queryFilters []drivers.StatusCodeFilter
+
+	if params.Ignore != nil {
+		queryFilters = params.Ignore.StatusCodes
+	}
+
+	if !drv.responseCodeAllowed(resp, queryFilters) {
 		return nil, errors.New(resp.Status)
 	}
 
@@ -214,7 +221,48 @@ func (drv *Driver) Close() error {
 	return nil
 }
 
-func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
-	_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode]
-	return exists
+// responseCodeAllowed reports whether the response status is acceptable:
+// any 2xx code, or a code matched by a per-query filter or a driver-level one.
+// A filter that carries a URL pattern applies only to matching request URLs.
+func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers.StatusCodeFilter) bool {
+	var allowed bool
+	reqURL := resp.Request.URL.String()
+
+	// OK is by default
+	if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
+		return true
+	}
+
+	// Try to use those that are passed within a query
+	for _, filter := range additional {
+		allowed = filter.Code == resp.StatusCode
+
+		// check url; the pattern comes from the query, so an invalid
+		// one must be treated as "no match" instead of panicking
+		if allowed && filter.URL != "" {
+			pattern, err := glob.Compile(filter.URL)
+			allowed = err == nil && pattern.Match(reqURL)
+		}
+
+		if allowed {
+			break
+		}
+	}
+
+	// if still not allowed, try the default ones
+	if !allowed {
+		for _, filter := range drv.options.HTTPCodesFilter {
+			allowed = filter.Code == resp.StatusCode
+
+			if allowed && filter.URL != nil {
+				allowed = filter.URL.Match(reqURL)
+			}
+
+			if allowed {
+				break
+			}
+		}
+	}
+
+	return allowed
 }
diff --git a/pkg/drivers/http/options.go b/pkg/drivers/http/options.go
index 10a3e352..5f9c53c6 100644
--- a/pkg/drivers/http/options.go
+++ b/pkg/drivers/http/options.go
@@ -1,6 +1,7 @@
 package http
 
 import (
+	"github.com/gobwas/glob"
 	stdhttp "net/http"
 
 	"github.com/MontFerret/ferret/pkg/drivers"
@@ -10,17 +11,22 @@ import (
 type (
 	Option func(opts *Options)
 
+	compiledStatusCodeFilter struct {
+		URL  glob.Glob
+		Code int
+	}
+
 	Options struct {
-		Name             string
-		Backoff          pester.BackoffStrategy
-		MaxRetries       int
-		Concurrency      int
-		Proxy            string
-		UserAgent        string
-		Headers          drivers.HTTPHeaders
-		Cookies          drivers.HTTPCookies
-		AllowedHTTPCodes map[int]struct{}
-		HTTPTransport    *stdhttp.Transport
+		Name            string
+		Backoff         pester.BackoffStrategy
+		MaxRetries      int
+		Concurrency     int
+		Proxy           string
+		UserAgent       string
+		Headers         drivers.HTTPHeaders
+		Cookies         drivers.HTTPCookies
+		HTTPCodesFilter []compiledStatusCodeFilter
+		HTTPTransport   *stdhttp.Transport
 	}
 )
 
@@ -30,7 +36,7 @@ func newOptions(setters []Option) *Options {
 	opts.Backoff = pester.ExponentialBackoff
 	opts.Concurrency = 3
 	opts.MaxRetries = 5
-	opts.AllowedHTTPCodes = map[int]struct{}{stdhttp.StatusOK: struct{}{}}
+	opts.HTTPCodesFilter = make([]compiledStatusCodeFilter, 0, 5)
 
 	for _, setter := range setters {
 		setter(opts)
@@ -133,14 +139,18 @@ func WithCookies(cookies []drivers.HTTPCookie) Option {
 
 func WithAllowedHTTPCode(httpCode int) Option {
 	return func(opts *Options) {
-		opts.AllowedHTTPCodes[httpCode] = struct{}{}
+		opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
+			Code: httpCode,
+		})
 	}
 }
 
 func WithAllowedHTTPCodes(httpCodes []int) Option {
 	return func(opts *Options) {
 		for _, code := range httpCodes {
-			opts.AllowedHTTPCodes[code] = struct{}{}
+			opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
+				Code: code,
+			})
 		}
 	}
 }
diff --git a/pkg/drivers/params.go b/pkg/drivers/params.go
index 5549a3f6..e39c707f 100644
--- a/pkg/drivers/params.go
+++ b/pkg/drivers/params.go
@@ -6,8 +6,14 @@ type (
 		Type string
 	}
 
-	Disable struct {
-		Resources []ResourceFilter
+	StatusCodeFilter struct {
+		URL  string
+		Code int
+	}
+
+	Ignore struct {
+		Resources   []ResourceFilter
+		StatusCodes []StatusCodeFilter
 	}
 
 	Viewport struct {
@@ -25,7 +31,7 @@ type (
 		Cookies     HTTPCookies
 		Headers     HTTPHeaders
 		Viewport    *Viewport
-		Disable     *Disable
+		Ignore      *Ignore
 	}
 
 	ParseParams struct {
diff --git a/pkg/runtime/values/object.go b/pkg/runtime/values/object.go
index 718ba93f..fd5d49ed 100644
--- a/pkg/runtime/values/object.go
+++ b/pkg/runtime/values/object.go
@@ -243,6 +243,17 @@ func (t *Object) MustGet(key String) core.Value {
 	return val
 }
 
+// MustGetOr returns the value stored under key, or defaultValue when the key is absent.
+func (t *Object) MustGetOr(key String, defaultValue core.Value) core.Value {
+	val, found := t.value[string(key)]
+
+	if found {
+		return val
+	}
+
+	return defaultValue
+}
+
 func (t *Object) Get(key String) (core.Value, Boolean) {
 	val, found := t.value[string(key)]
 
diff --git a/pkg/stdlib/html/document.go b/pkg/stdlib/html/document.go
index d8858b6e..25edcf9f 100644
--- a/pkg/stdlib/html/document.go
+++ b/pkg/stdlib/html/document.go
@@ -2,6 +2,7 @@ package html
 
 import (
 	"context"
+	"github.com/pkg/errors"
 	"strings"
 	"time"
 
@@ -27,10 +28,13 @@ type PageLoadParams struct {
 // @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
 // @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
 // @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
-// @param {Object} [params.disable] - Set of parameters to disable some page functionality or behavior.
-// @param {Object[]} [params.disable.resources] - Collection of rules to disable resources during page load and navigation.
-// @param {String} [params.disable.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
-// @param {String} [params.disable.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
+// @param {Object} [params.ignore] - Set of parameters to ignore some page functionality or behavior.
+// @param {Object[]} [params.ignore.resources] - Collection of rules to ignore resources during page load and navigation.
+// @param {String} [params.ignore.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
+// @param {String} [params.ignore.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
+// @param {Object[]} [params.ignore.statusCodes] - Collection of rules that accept non-2xx HTTP status codes instead of failing the page load.
+// @param {String} [params.ignore.statusCodes.*.url] - Request url pattern (glob). If set, the status code is accepted only for matching urls. Omitting matches any url.
+// @param {Int} params.ignore.statusCodes.*.code - HTTP status code to accept. Required for every rule.
 // @param {Object} [params.viewport] - Viewport params.
 // @param {Int} [params.viewport.height] - Viewport height.
 // @param {Int} [params.viewport.width] - Viewport width.
@@ -191,16 +195,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
 			res.Viewport = viewport
 		}
 
-		disable, exists := obj.Get(values.NewString("disable"))
+		ignore, exists := obj.Get(values.NewString("ignore"))
 
 		if exists {
-			disable, err := parseDisable(disable)
+			ignore, err := parseIgnore(ignore)
 
 			if err != nil {
 				return res, err
 			}
 
-			res.Disable = disable
+			res.Ignore = ignore
 		}
 	case types.String:
 		res.Driver = arg.(values.String).String()
@@ -408,16 +412,16 @@ func parseViewport(value core.Value) (*drivers.Viewport, error) {
 	return res, nil
 }
 
-func parseDisable(value core.Value) (*drivers.Disable, error) {
+func parseIgnore(value core.Value) (*drivers.Ignore, error) {
 	if err := core.ValidateType(value, types.Object); err != nil {
 		return nil, err
 	}
 
-	res := &drivers.Disable{}
+	res := &drivers.Ignore{}
 
-	disable := value.(*values.Object)
+	ignore := value.(*values.Object)
 
-	resources, exists := disable.Get("resources")
+	resources, exists := ignore.Get("resources")
 
 	if exists {
 		if err := core.ValidateType(resources, types.Array); err != nil {
@@ -458,5 +462,47 @@ func parseDisable(value core.Value) (*drivers.Disable, error) {
 		}
 	}
 
+	statusCodes, exists := ignore.Get("statusCodes")
+
+	if exists {
+		if err := core.ValidateType(statusCodes, types.Array); err != nil {
+			return nil, err
+		}
+
+		statusCodes := statusCodes.(*values.Array)
+
+		res.StatusCodes = make([]drivers.StatusCodeFilter, 0, statusCodes.Length())
+
+		var e error
+
+		statusCodes.ForEach(func(el core.Value, idx int) bool {
+			if e = core.ValidateType(el, types.Object); e != nil {
+				return false
+			}
+
+			pattern := el.(*values.Object)
+
+			url := pattern.MustGetOr("url", values.NewString(""))
+			code, codeExists := pattern.Get("code")
+
+			// the code is mandatory: stop iterating and report the error
+			if !codeExists {
+				e = errors.New("http code is required")
+				return false
+			}
+
+			res.StatusCodes = append(res.StatusCodes, drivers.StatusCodeFilter{
+				URL:  url.String(),
+				Code: int(values.ToInt(code)),
+			})
+
+			return true
+		})
+
+		if e != nil {
+			return nil, e
+		}
+	}
+
 	return res, nil
 }