1
0
mirror of https://github.com/MontFerret/ferret.git synced 2024-12-12 11:15:14 +02:00

#593 handling non-200 status codes

This commit is contained in:
Tim Voronov 2021-03-05 08:08:28 -05:00 committed by GitHub
parent 79566c3b76
commit 6295919806
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 153 additions and 34 deletions

11
examples/non-200.fql Normal file
View File

@ -0,0 +1,11 @@
// Load a page while tolerating a 403 response.
// The ignore.statusCodes rule below lists 403 as an acceptable status code;
// no "url" is given in the rule, so it is not restricted to a URL pattern.
LET p = DOCUMENT('https://www.g2.com/categories', {
ignore: {
statusCodes: [
{
code: 403
}
]
}
})
// Return the status code the server actually responded with.
RETURN p.response.statusCode

View File

@ -78,9 +78,9 @@ func LoadHTMLPage(
netOpts.Cookies[params.URL] = params.Cookies
}
if params.Disable != nil {
if len(params.Disable.Resources) > 0 {
netOpts.Filter.Patterns = params.Disable.Resources
if params.Ignore != nil {
if len(params.Ignore.Resources) > 0 {
netOpts.Filter.Patterns = params.Ignore.Resources
}
}

View File

@ -3,6 +3,7 @@ package http
import (
"bytes"
"context"
"github.com/gobwas/glob"
"net/http"
"net/url"
@ -171,7 +172,13 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
defer resp.Body.Close()
if !drv.responseCodeAllowed(resp) {
var queryFilters []drivers.StatusCodeFilter
if params.Ignore != nil {
queryFilters = params.Ignore.StatusCodes
}
if !drv.responseCodeAllowed(resp, queryFilters) {
return nil, errors.New(resp.Status)
}
@ -214,7 +221,43 @@ func (drv *Driver) Close() error {
return nil
}
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode]
return exists
// responseCodeAllowed reports whether the status code of resp is acceptable.
//
// Any 2xx status is always accepted. Otherwise the filters passed within the
// query (additional) are consulted first, followed by the filters configured
// on the driver options. A filter matches when its code equals the response
// status code and, if it carries a URL pattern, the pattern matches the
// request URL.
func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers.StatusCodeFilter) bool {
	// OK is by default
	if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
		return true
	}

	reqURL := resp.Request.URL.String()

	// Try to use those that are passed within a query
	for _, filter := range additional {
		if filter.Code != resp.StatusCode {
			continue
		}

		// A filter without a URL pattern applies to every URL.
		if filter.URL == "" {
			return true
		}

		// The pattern originates from a user query, so it may be malformed.
		// A pattern that does not compile is treated as non-matching rather
		// than being allowed to panic inside the driver.
		pattern, err := glob.Compile(filter.URL)
		if err != nil {
			continue
		}

		if pattern.Match(reqURL) {
			return true
		}
	}

	// if still not allowed, try the default ones
	for _, filter := range drv.options.HTTPCodesFilter {
		if filter.Code != resp.StatusCode {
			continue
		}

		// A nil pattern means the code is allowed for every URL.
		if filter.URL == nil || filter.URL.Match(reqURL) {
			return true
		}
	}

	return false
}

View File

@ -1,6 +1,7 @@
package http
import (
"github.com/gobwas/glob"
stdhttp "net/http"
"github.com/MontFerret/ferret/pkg/drivers"
@ -10,17 +11,22 @@ import (
type (
Option func(opts *Options)
// compiledStatusCodeFilter is a driver-level rule describing an HTTP status
// code that should be accepted, optionally limited to matching URLs.
compiledStatusCodeFilter struct {
// URL is a compiled glob matched against the request URL.
// A nil value means the rule is not restricted by URL.
URL glob.Glob
// Code is the HTTP status code the rule accepts.
Code int
}
// Options holds the settings of the HTTP driver.
// Values are filled in by Option setters.
Options struct {
Name string
Backoff pester.BackoffStrategy
MaxRetries int
Concurrency int
Proxy string
UserAgent string
Headers drivers.HTTPHeaders
Cookies drivers.HTTPCookies
AllowedHTTPCodes map[int]struct{}
HTTPTransport *stdhttp.Transport
Name string
Backoff pester.BackoffStrategy
MaxRetries int
Concurrency int
Proxy string
UserAgent string
Headers drivers.HTTPHeaders
Cookies drivers.HTTPCookies
// HTTPCodesFilter lists the status-code rules consulted by
// responseCodeAllowed for responses outside the 2xx range.
HTTPCodesFilter []compiledStatusCodeFilter
HTTPTransport *stdhttp.Transport
}
)
@ -30,7 +36,7 @@ func newOptions(setters []Option) *Options {
opts.Backoff = pester.ExponentialBackoff
opts.Concurrency = 3
opts.MaxRetries = 5
opts.AllowedHTTPCodes = map[int]struct{}{stdhttp.StatusOK: struct{}{}}
opts.HTTPCodesFilter = make([]compiledStatusCodeFilter, 0, 5)
for _, setter := range setters {
setter(opts)
@ -133,14 +139,18 @@ func WithCookies(cookies []drivers.HTTPCookie) Option {
// WithAllowedHTTPCode returns an Option that registers httpCode as an
// accepted response status code. The registered rule carries no URL pattern,
// so it is not restricted to particular URLs.
func WithAllowedHTTPCode(httpCode int) Option {
return func(opts *Options) {
opts.AllowedHTTPCodes[httpCode] = struct{}{}
opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
Code: httpCode,
})
}
}
// WithAllowedHTTPCodes returns an Option that registers every code in
// httpCodes as an accepted response status code. Each registered rule carries
// no URL pattern, so it is not restricted to particular URLs.
func WithAllowedHTTPCodes(httpCodes []int) Option {
return func(opts *Options) {
for _, code := range httpCodes {
opts.AllowedHTTPCodes[code] = struct{}{}
opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
Code: code,
})
}
}
}

View File

@ -6,8 +6,14 @@ type (
Type string
}
Disable struct {
Resources []ResourceFilter
// StatusCodeFilter describes an HTTP status code that should be tolerated
// when loading a page, optionally limited to URLs matching a pattern.
StatusCodeFilter struct {
// URL is an optional URL pattern; the HTTP driver compiles it as a glob
// and treats an empty string as "no URL restriction".
URL string
// Code is the HTTP status code to tolerate.
Code int
}
// Ignore groups the things a driver is asked to ignore while loading a page.
Ignore struct {
// Resources holds the resource filter rules.
Resources []ResourceFilter
// StatusCodes holds the HTTP status code rules.
StatusCodes []StatusCodeFilter
}
Viewport struct {
@ -25,7 +31,7 @@ type (
Cookies HTTPCookies
Headers HTTPHeaders
Viewport *Viewport
Disable *Disable
Ignore *Ignore
}
ParseParams struct {

View File

@ -243,6 +243,16 @@ func (t *Object) MustGet(key String) core.Value {
return val
}
// MustGetOr returns the value stored under key.
// When the object has no such key, defaultValue is returned instead.
func (t *Object) MustGetOr(key String, defaultValue core.Value) core.Value {
	if stored, ok := t.value[string(key)]; ok {
		return stored
	}

	return defaultValue
}
func (t *Object) Get(key String) (core.Value, Boolean) {
val, found := t.value[string(key)]

View File

@ -2,6 +2,7 @@ package html
import (
"context"
"github.com/pkg/errors"
"strings"
"time"
@ -27,10 +28,10 @@ type PageLoadParams struct {
// @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
// @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
// @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
// @param {Object} [params.disable] - Set of parameters to disable some page functionality or behavior.
// @param {Object[]} [params.disable.resources] - Collection of rules to disable resources during page load and navigation.
// @param {String} [params.disable.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
// @param {String} [params.disable.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
// @param {Object} [params.ignore] - Set of parameters to ignore some page functionality or behavior.
// @param {Object[]} [params.ignore.resources] - Collection of rules to ignore resources during page load and navigation.
// @param {String} [params.ignore.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
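// @param {String} [params.ignore.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
// @param {Object[]} [params.ignore.statusCodes] - Collection of rules to ignore certain HTTP status codes that would otherwise cause page loading to fail.
// @param {String} [params.ignore.statusCodes.*.url] - Url pattern. If set, the rule is applied only to matching urls; otherwise it is applied to any url.
// @param {Int} params.ignore.statusCodes.*.code - HTTP status code to ignore. Required for every rule.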
// @param {Object} [params.viewport] - Viewport params.
// @param {Int} [params.viewport.height] - Viewport height.
// @param {Int} [params.viewport.width] - Viewport width.
@ -191,16 +192,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
res.Viewport = viewport
}
disable, exists := obj.Get(values.NewString("disable"))
ignore, exists := obj.Get(values.NewString("ignore"))
if exists {
disable, err := parseDisable(disable)
ignore, err := parseIgnore(ignore)
if err != nil {
return res, err
}
res.Disable = disable
res.Ignore = ignore
}
case types.String:
res.Driver = arg.(values.String).String()
@ -408,16 +409,16 @@ func parseViewport(value core.Value) (*drivers.Viewport, error) {
return res, nil
}
func parseDisable(value core.Value) (*drivers.Disable, error) {
func parseIgnore(value core.Value) (*drivers.Ignore, error) {
if err := core.ValidateType(value, types.Object); err != nil {
return nil, err
}
res := &drivers.Disable{}
res := &drivers.Ignore{}
disable := value.(*values.Object)
ignore := value.(*values.Object)
resources, exists := disable.Get("resources")
resources, exists := ignore.Get("resources")
if exists {
if err := core.ValidateType(resources, types.Array); err != nil {
@ -458,5 +459,43 @@ func parseDisable(value core.Value) (*drivers.Disable, error) {
}
}
statusCodes, exists := ignore.Get("statusCodes")

if exists {
	if err := core.ValidateType(statusCodes, types.Array); err != nil {
		return nil, err
	}

	statusCodes := statusCodes.(*values.Array)

	res.StatusCodes = make([]drivers.StatusCodeFilter, 0, statusCodes.Length())

	// The ForEach callback can only signal "stop" via its boolean result,
	// so the first failure is recorded here and reported after the loop.
	var e error

	statusCodes.ForEach(func(el core.Value, idx int) bool {
		if e = core.ValidateType(el, types.Object); e != nil {
			return false
		}

		pattern := el.(*values.Object)

		// url is optional and defaults to an empty pattern.
		url := pattern.MustGetOr("url", values.NewString(""))
		code, codeExists := pattern.Get("code")

		// a rule without a code is invalid: stop and report it
		if !codeExists {
			e = errors.New("http code is required")

			return false
		}

		res.StatusCodes = append(res.StatusCodes, drivers.StatusCodeFilter{
			URL:  url.String(),
			Code: int(values.ToInt(code)),
		})

		return true
	})

	// Without this check a malformed rule would be silently dropped and the
	// caller would receive a nil error.
	if e != nil {
		return nil, e
	}
}
return res, nil
}