1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-07-17 01:32:22 +02:00

#593 handling non-200 status codes

This commit is contained in:
Tim Voronov
2021-03-05 08:08:28 -05:00
committed by GitHub
parent 79566c3b76
commit 6295919806
7 changed files with 153 additions and 34 deletions

11
examples/non-200.fql Normal file
View File

@ -0,0 +1,11 @@
// Example: loading a page whose server replies with a non-2xx status code.
// The ignore.statusCodes list names the status codes (403 here) that should
// not be treated as a load failure; the query then returns the status code
// reported on the page's response.
LET p = DOCUMENT('https://www.g2.com/categories', {
ignore: {
statusCodes: [
{
code: 403
}
]
}
})
RETURN p.response.statusCode

View File

@ -78,9 +78,9 @@ func LoadHTMLPage(
netOpts.Cookies[params.URL] = params.Cookies netOpts.Cookies[params.URL] = params.Cookies
} }
if params.Disable != nil { if params.Ignore != nil {
if len(params.Disable.Resources) > 0 { if len(params.Ignore.Resources) > 0 {
netOpts.Filter.Patterns = params.Disable.Resources netOpts.Filter.Patterns = params.Ignore.Resources
} }
} }

View File

@ -3,6 +3,7 @@ package http
import ( import (
"bytes" "bytes"
"context" "context"
"github.com/gobwas/glob"
"net/http" "net/http"
"net/url" "net/url"
@ -171,7 +172,13 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
defer resp.Body.Close() defer resp.Body.Close()
if !drv.responseCodeAllowed(resp) { var queryFilters []drivers.StatusCodeFilter
if params.Ignore != nil {
queryFilters = params.Ignore.StatusCodes
}
if !drv.responseCodeAllowed(resp, queryFilters) {
return nil, errors.New(resp.Status) return nil, errors.New(resp.Status)
} }
@ -214,7 +221,43 @@ func (drv *Driver) Close() error {
return nil return nil
} }
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool { func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers.StatusCodeFilter) bool {
_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode] var allowed bool
return exists reqURL := resp.Request.URL.String()
// Any 2xx response is always accepted.
if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
	return true
}

// First consult the filters that were passed with the query itself.
for _, filter := range additional {
	if filter.Code != resp.StatusCode {
		continue
	}

	// An empty URL pattern means the code is allowed for any URL.
	if filter.URL == "" {
		allowed = true
		break
	}

	// The pattern originates from query input, so compile it without
	// panicking; a malformed pattern simply never matches.
	pattern, err := glob.Compile(filter.URL)
	if err != nil {
		continue
	}

	if pattern.Match(reqURL) {
		allowed = true
		break
	}
}

// If the query filters did not allow the response, fall back to the
// filters configured on the driver.
if !allowed {
	for _, filter := range drv.options.HTTPCodesFilter {
		allowed = filter.Code == resp.StatusCode

		// A nil URL glob means the code is allowed for any URL.
		if allowed && filter.URL != nil {
			allowed = filter.URL.Match(reqURL)
		}

		if allowed {
			break
		}
	}
}

return allowed
} }

View File

@ -1,6 +1,7 @@
package http package http
import ( import (
"github.com/gobwas/glob"
stdhttp "net/http" stdhttp "net/http"
"github.com/MontFerret/ferret/pkg/drivers" "github.com/MontFerret/ferret/pkg/drivers"
@ -10,17 +11,22 @@ import (
type ( type (
Option func(opts *Options) Option func(opts *Options)
// compiledStatusCodeFilter is a driver-level rule that allows a response
// with a non-2xx status code.
compiledStatusCodeFilter struct {
// URL, when non-nil, must match the request URL for the rule to apply;
// a nil URL applies the rule to every URL.
URL glob.Glob
// Code is the HTTP status code this rule allows.
Code int
}
Options struct { Options struct {
Name string Name string
Backoff pester.BackoffStrategy Backoff pester.BackoffStrategy
MaxRetries int MaxRetries int
Concurrency int Concurrency int
Proxy string Proxy string
UserAgent string UserAgent string
Headers drivers.HTTPHeaders Headers drivers.HTTPHeaders
Cookies drivers.HTTPCookies Cookies drivers.HTTPCookies
AllowedHTTPCodes map[int]struct{} HTTPCodesFilter []compiledStatusCodeFilter
HTTPTransport *stdhttp.Transport HTTPTransport *stdhttp.Transport
} }
) )
@ -30,7 +36,7 @@ func newOptions(setters []Option) *Options {
opts.Backoff = pester.ExponentialBackoff opts.Backoff = pester.ExponentialBackoff
opts.Concurrency = 3 opts.Concurrency = 3
opts.MaxRetries = 5 opts.MaxRetries = 5
opts.AllowedHTTPCodes = map[int]struct{}{stdhttp.StatusOK: struct{}{}} opts.HTTPCodesFilter = make([]compiledStatusCodeFilter, 0, 5)
for _, setter := range setters { for _, setter := range setters {
setter(opts) setter(opts)
@ -133,14 +139,18 @@ func WithCookies(cookies []drivers.HTTPCookie) Option {
func WithAllowedHTTPCode(httpCode int) Option { func WithAllowedHTTPCode(httpCode int) Option {
return func(opts *Options) { return func(opts *Options) {
opts.AllowedHTTPCodes[httpCode] = struct{}{} opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
Code: httpCode,
})
} }
} }
func WithAllowedHTTPCodes(httpCodes []int) Option { func WithAllowedHTTPCodes(httpCodes []int) Option {
return func(opts *Options) { return func(opts *Options) {
for _, code := range httpCodes { for _, code := range httpCodes {
opts.AllowedHTTPCodes[code] = struct{}{} opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
Code: code,
})
} }
} }
} }

View File

@ -6,8 +6,14 @@ type (
Type string Type string
} }
Disable struct { StatusCodeFilter struct {
Resources []ResourceFilter URL string
Code int
}
// Ignore groups the things a page load is asked to ignore.
Ignore struct {
// Resources lists filters for resources that should be skipped during
// page load and navigation.
Resources []ResourceFilter
// StatusCodes lists HTTP status codes (optionally restricted by URL)
// that should not cause the page load to fail.
StatusCodes []StatusCodeFilter
}
Viewport struct { Viewport struct {
@ -25,7 +31,7 @@ type (
Cookies HTTPCookies Cookies HTTPCookies
Headers HTTPHeaders Headers HTTPHeaders
Viewport *Viewport Viewport *Viewport
Disable *Disable Ignore *Ignore
} }
ParseParams struct { ParseParams struct {

View File

@ -243,6 +243,16 @@ func (t *Object) MustGet(key String) core.Value {
return val return val
} }
// MustGetOr returns the value stored under key, or defaultValue when the
// object has no such key.
func (t *Object) MustGetOr(key String, defaultValue core.Value) core.Value {
	if stored, ok := t.value[string(key)]; ok {
		return stored
	}

	return defaultValue
}
func (t *Object) Get(key String) (core.Value, Boolean) { func (t *Object) Get(key String) (core.Value, Boolean) {
val, found := t.value[string(key)] val, found := t.value[string(key)]

View File

@ -2,6 +2,7 @@ package html
import ( import (
"context" "context"
"github.com/pkg/errors"
"strings" "strings"
"time" "time"
@ -27,10 +28,10 @@ type PageLoadParams struct {
// @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode. // @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
// @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading. // @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
// @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading. // @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
// @param {Object} [params.disable] - Set of parameters to disable some page functionality or behavior. // @param {Object} [params.ignore] - Set of parameters to ignore some page functionality or behavior.
// @param {Object[]} [params.disable.resources] - Collection of rules to disable resources during page load and navigation. // @param {Object[]} [params.ignore.resources] - Collection of rules to ignore resources during page load and navigation.
// @param {String} [params.disable.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*". // @param {String} [params.ignore.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
// @param {String} [params.disable.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked. // @param {String} [params.ignore.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
// @param {Object} [params.viewport] - Viewport params. // @param {Object} [params.viewport] - Viewport params.
// @param {Int} [params.viewport.height] - Viewport height. // @param {Int} [params.viewport.height] - Viewport height.
// @param {Int} [params.viewport.width] - Viewport width. // @param {Int} [params.viewport.width] - Viewport width.
@ -191,16 +192,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
res.Viewport = viewport res.Viewport = viewport
} }
disable, exists := obj.Get(values.NewString("disable")) ignore, exists := obj.Get(values.NewString("ignore"))
if exists { if exists {
disable, err := parseDisable(disable) ignore, err := parseIgnore(ignore)
if err != nil { if err != nil {
return res, err return res, err
} }
res.Disable = disable res.Ignore = ignore
} }
case types.String: case types.String:
res.Driver = arg.(values.String).String() res.Driver = arg.(values.String).String()
@ -408,16 +409,16 @@ func parseViewport(value core.Value) (*drivers.Viewport, error) {
return res, nil return res, nil
} }
func parseDisable(value core.Value) (*drivers.Disable, error) { func parseIgnore(value core.Value) (*drivers.Ignore, error) {
if err := core.ValidateType(value, types.Object); err != nil { if err := core.ValidateType(value, types.Object); err != nil {
return nil, err return nil, err
} }
res := &drivers.Disable{} res := &drivers.Ignore{}
disable := value.(*values.Object) ignore := value.(*values.Object)
resources, exists := disable.Get("resources") resources, exists := ignore.Get("resources")
if exists { if exists {
if err := core.ValidateType(resources, types.Array); err != nil { if err := core.ValidateType(resources, types.Array); err != nil {
@ -458,5 +459,43 @@ func parseDisable(value core.Value) (*drivers.Disable, error) {
} }
} }
statusCodes, exists := ignore.Get("statusCodes")

if exists {
	if err := core.ValidateType(statusCodes, types.Array); err != nil {
		return nil, err
	}

	statusCodes := statusCodes.(*values.Array)
	res.StatusCodes = make([]drivers.StatusCodeFilter, 0, statusCodes.Length())

	// The ForEach callback can only return a bool, so the first failure is
	// captured here and reported once iteration is over.
	var e error

	statusCodes.ForEach(func(el core.Value, idx int) bool {
		if e = core.ValidateType(el, types.Object); e != nil {
			return false
		}

		pattern := el.(*values.Object)

		// The URL pattern is optional; an empty string means "any URL".
		url := pattern.MustGetOr("url", values.NewString(""))
		code, codeExists := pattern.Get("code")

		// A filter without a status code is invalid.
		if !codeExists {
			e = errors.New("http code is required")

			return false
		}

		res.StatusCodes = append(res.StatusCodes, drivers.StatusCodeFilter{
			URL:  url.String(),
			Code: int(values.ToInt(code)),
		})

		return true
	})

	// Report the validation failure instead of silently returning a
	// partially populated result.
	if e != nil {
		return nil, e
	}
}
return res, nil return res, nil
} }