mirror of
https://github.com/MontFerret/ferret.git
synced 2025-07-17 01:32:22 +02:00
#593 handling non-200 status codes
This commit is contained in:
11
examples/non-200.fql
Normal file
11
examples/non-200.fql
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
// Example: load a page whose server answers with a non-2xx status.
// ignore.statusCodes lists the status codes (here 403) that must not be
// treated as a load failure; the query then returns the status code received.
LET p = DOCUMENT('https://www.g2.com/categories', {
|
||||||
|
ignore: {
|
||||||
|
statusCodes: [
|
||||||
|
{
|
||||||
|
code: 403
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
RETURN p.response.statusCode
|
@ -78,9 +78,9 @@ func LoadHTMLPage(
|
|||||||
netOpts.Cookies[params.URL] = params.Cookies
|
netOpts.Cookies[params.URL] = params.Cookies
|
||||||
}
|
}
|
||||||
|
|
||||||
if params.Disable != nil {
|
if params.Ignore != nil {
|
||||||
if len(params.Disable.Resources) > 0 {
|
if len(params.Ignore.Resources) > 0 {
|
||||||
netOpts.Filter.Patterns = params.Disable.Resources
|
netOpts.Filter.Patterns = params.Ignore.Resources
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@ package http
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
|
"github.com/gobwas/glob"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
|
||||||
@ -171,7 +172,13 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
|
|||||||
|
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
if !drv.responseCodeAllowed(resp) {
|
var queryFilters []drivers.StatusCodeFilter
|
||||||
|
|
||||||
|
if params.Ignore != nil {
|
||||||
|
queryFilters = params.Ignore.StatusCodes
|
||||||
|
}
|
||||||
|
|
||||||
|
if !drv.responseCodeAllowed(resp, queryFilters) {
|
||||||
return nil, errors.New(resp.Status)
|
return nil, errors.New(resp.Status)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -214,7 +221,43 @@ func (drv *Driver) Close() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
|
func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers.StatusCodeFilter) bool {
|
||||||
_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode]
|
var allowed bool
|
||||||
return exists
|
reqURL := resp.Request.URL.String()
|
||||||
|
|
||||||
|
// OK is by default
|
||||||
|
if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to use those that are passed within a query
|
||||||
|
for _, filter := range additional {
|
||||||
|
allowed = filter.Code == resp.StatusCode
|
||||||
|
|
||||||
|
// check url
|
||||||
|
if allowed && filter.URL != "" {
|
||||||
|
allowed = glob.MustCompile(filter.URL).Match(reqURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
if allowed {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if still not allowed, try the default ones
|
||||||
|
if !allowed {
|
||||||
|
for _, filter := range drv.options.HTTPCodesFilter {
|
||||||
|
allowed = filter.Code == resp.StatusCode
|
||||||
|
|
||||||
|
if allowed && filter.URL != nil {
|
||||||
|
allowed = filter.URL.Match(reqURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
if allowed {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return allowed
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package http
|
package http
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"github.com/gobwas/glob"
|
||||||
stdhttp "net/http"
|
stdhttp "net/http"
|
||||||
|
|
||||||
"github.com/MontFerret/ferret/pkg/drivers"
|
"github.com/MontFerret/ferret/pkg/drivers"
|
||||||
@ -10,17 +11,22 @@ import (
|
|||||||
type (
|
type (
|
||||||
// Option is a setter function that mutates the driver Options it is given.
Option func(opts *Options)
|
Option func(opts *Options)
|
||||||
|
|
||||||
|
// compiledStatusCodeFilter is a driver-level rule that allows a non-2xx
// response status code, optionally only for request URLs matching a glob.
compiledStatusCodeFilter struct {
|
||||||
|
// URL is the compiled pattern matched against the request URL; a nil URL
// makes the rule apply to every URL.
URL glob.Glob
|
||||||
|
// Code is the HTTP status code this rule allows.
Code int
|
||||||
|
}
|
||||||
|
|
||||||
// Options holds the driver configuration that Option setters fill in.
Options struct {
|
Options struct {
|
||||||
Name string
|
Name string
|
||||||
Backoff pester.BackoffStrategy
|
Backoff pester.BackoffStrategy
|
||||||
MaxRetries int
|
MaxRetries int
|
||||||
Concurrency int
|
Concurrency int
|
||||||
Proxy string
|
Proxy string
|
||||||
UserAgent string
|
UserAgent string
|
||||||
Headers drivers.HTTPHeaders
|
Headers drivers.HTTPHeaders
|
||||||
Cookies drivers.HTTPCookies
|
Cookies drivers.HTTPCookies
|
||||||
AllowedHTTPCodes map[int]struct{}
|
// HTTPCodesFilter lists the driver-wide rules for non-2xx status codes that
// are still accepted; they are consulted after any filters passed with the
// query.
HTTPCodesFilter []compiledStatusCodeFilter
|
||||||
HTTPTransport *stdhttp.Transport
|
HTTPTransport *stdhttp.Transport
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -30,7 +36,7 @@ func newOptions(setters []Option) *Options {
|
|||||||
opts.Backoff = pester.ExponentialBackoff
|
opts.Backoff = pester.ExponentialBackoff
|
||||||
opts.Concurrency = 3
|
opts.Concurrency = 3
|
||||||
opts.MaxRetries = 5
|
opts.MaxRetries = 5
|
||||||
opts.AllowedHTTPCodes = map[int]struct{}{stdhttp.StatusOK: struct{}{}}
|
opts.HTTPCodesFilter = make([]compiledStatusCodeFilter, 0, 5)
|
||||||
|
|
||||||
for _, setter := range setters {
|
for _, setter := range setters {
|
||||||
setter(opts)
|
setter(opts)
|
||||||
@ -133,14 +139,18 @@ func WithCookies(cookies []drivers.HTTPCookie) Option {
|
|||||||
|
|
||||||
func WithAllowedHTTPCode(httpCode int) Option {
|
func WithAllowedHTTPCode(httpCode int) Option {
|
||||||
return func(opts *Options) {
|
return func(opts *Options) {
|
||||||
opts.AllowedHTTPCodes[httpCode] = struct{}{}
|
opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
|
||||||
|
Code: httpCode,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithAllowedHTTPCodes(httpCodes []int) Option {
|
func WithAllowedHTTPCodes(httpCodes []int) Option {
|
||||||
return func(opts *Options) {
|
return func(opts *Options) {
|
||||||
for _, code := range httpCodes {
|
for _, code := range httpCodes {
|
||||||
opts.AllowedHTTPCodes[code] = struct{}{}
|
opts.HTTPCodesFilter = append(opts.HTTPCodesFilter, compiledStatusCodeFilter{
|
||||||
|
Code: code,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,8 +6,14 @@ type (
|
|||||||
Type string
|
Type string
|
||||||
}
|
}
|
||||||
|
|
||||||
Disable struct {
|
// StatusCodeFilter describes a response status code that should not fail a
// page load. URL is an optional glob pattern for the request URL (empty
// means any URL); Code is the HTTP status code to accept.
StatusCodeFilter struct {
|
||||||
Resources []ResourceFilter
|
URL string
|
||||||
|
Code int
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ignore groups the per-request rules for what to ignore while loading a
// page: resource filters and status-code filters.
Ignore struct {
|
||||||
|
Resources []ResourceFilter
|
||||||
|
StatusCodes []StatusCodeFilter
|
||||||
}
|
}
|
||||||
|
|
||||||
Viewport struct {
|
Viewport struct {
|
||||||
@ -25,7 +31,7 @@ type (
|
|||||||
Cookies HTTPCookies
|
Cookies HTTPCookies
|
||||||
Headers HTTPHeaders
|
Headers HTTPHeaders
|
||||||
Viewport *Viewport
|
Viewport *Viewport
|
||||||
Disable *Disable
|
Ignore *Ignore
|
||||||
}
|
}
|
||||||
|
|
||||||
ParseParams struct {
|
ParseParams struct {
|
||||||
|
@ -243,6 +243,16 @@ func (t *Object) MustGet(key String) core.Value {
|
|||||||
return val
|
return val
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *Object) MustGetOr(key String, defaultValue core.Value) core.Value {
|
||||||
|
val, found := t.value[string(key)]
|
||||||
|
|
||||||
|
if found {
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
return defaultValue
|
||||||
|
}
|
||||||
|
|
||||||
func (t *Object) Get(key String) (core.Value, Boolean) {
|
func (t *Object) Get(key String) (core.Value, Boolean) {
|
||||||
val, found := t.value[string(key)]
|
val, found := t.value[string(key)]
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@ package html
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"github.com/pkg/errors"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -27,10 +28,10 @@ type PageLoadParams struct {
|
|||||||
// @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
|
// @param {Boolean} [params.keepCookies=False] - Boolean value indicating whether to use cookies from previous sessions i.e. not to open a page in the Incognito mode.
|
||||||
// @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
|
// @param {HTTPCookies} [params.cookies] - Set of HTTP cookies to use during page loading.
|
||||||
// @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
|
// @param {HTTPHeaders} [params.headers] - Set of HTTP headers to use during page loading.
|
||||||
// @param {Object} [params.disable] - Set of parameters to disable some page functionality or behavior.
|
// @param {Object} [params.ignore] - Set of parameters to ignore some page functionality or behavior.
|
||||||
// @param {Object[]} [params.disable.resources] - Collection of rules to disable resources during page load and navigation.
|
// @param {Object[]} [params.ignore.resources] - Collection of rules to ignore resources during page load and navigation.
|
||||||
// @param {String} [params.disable.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
|
// @param {String} [params.ignore.resources.*.url] - Resource url pattern. If set, requests for matching urls will be blocked. Wildcards ('*' -> zero or more, '?' -> exactly one) are allowed. Escape character is backslash. Omitting is equivalent to "*".
|
||||||
// @param {String} [params.disable.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
|
// @param {String} [params.ignore.resources.*.type] - Resource type. If set, requests for matching resource types will be blocked.
|
||||||
// @param {Object} [params.viewport] - Viewport params.
|
// @param {Object} [params.viewport] - Viewport params.
|
||||||
// @param {Int} [params.viewport.height] - Viewport height.
|
// @param {Int} [params.viewport.height] - Viewport height.
|
||||||
// @param {Int} [params.viewport.width] - Viewport width.
|
// @param {Int} [params.viewport.width] - Viewport width.
|
||||||
@ -191,16 +192,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
|
|||||||
res.Viewport = viewport
|
res.Viewport = viewport
|
||||||
}
|
}
|
||||||
|
|
||||||
disable, exists := obj.Get(values.NewString("disable"))
|
ignore, exists := obj.Get(values.NewString("ignore"))
|
||||||
|
|
||||||
if exists {
|
if exists {
|
||||||
disable, err := parseDisable(disable)
|
ignore, err := parseIgnore(ignore)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return res, err
|
return res, err
|
||||||
}
|
}
|
||||||
|
|
||||||
res.Disable = disable
|
res.Ignore = ignore
|
||||||
}
|
}
|
||||||
case types.String:
|
case types.String:
|
||||||
res.Driver = arg.(values.String).String()
|
res.Driver = arg.(values.String).String()
|
||||||
@ -408,16 +409,16 @@ func parseViewport(value core.Value) (*drivers.Viewport, error) {
|
|||||||
return res, nil
|
return res, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseDisable(value core.Value) (*drivers.Disable, error) {
|
func parseIgnore(value core.Value) (*drivers.Ignore, error) {
|
||||||
if err := core.ValidateType(value, types.Object); err != nil {
|
if err := core.ValidateType(value, types.Object); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
res := &drivers.Disable{}
|
res := &drivers.Ignore{}
|
||||||
|
|
||||||
disable := value.(*values.Object)
|
ignore := value.(*values.Object)
|
||||||
|
|
||||||
resources, exists := disable.Get("resources")
|
resources, exists := ignore.Get("resources")
|
||||||
|
|
||||||
if exists {
|
if exists {
|
||||||
if err := core.ValidateType(resources, types.Array); err != nil {
|
if err := core.ValidateType(resources, types.Array); err != nil {
|
||||||
@ -458,5 +459,43 @@ func parseDisable(value core.Value) (*drivers.Disable, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
statusCodes, exists := ignore.Get("statusCodes")
|
||||||
|
|
||||||
|
if exists {
|
||||||
|
if err := core.ValidateType(statusCodes, types.Array); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
statusCodes := statusCodes.(*values.Array)
|
||||||
|
|
||||||
|
res.StatusCodes = make([]drivers.StatusCodeFilter, 0, statusCodes.Length())
|
||||||
|
|
||||||
|
var e error
|
||||||
|
|
||||||
|
statusCodes.ForEach(func(el core.Value, idx int) bool {
|
||||||
|
if e = core.ValidateType(el, types.Object); e != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
pattern := el.(*values.Object)
|
||||||
|
|
||||||
|
url := pattern.MustGetOr("url", values.NewString(""))
|
||||||
|
code, codeExists := pattern.Get("code")
|
||||||
|
|
||||||
|
// code is required: record an error and stop iterating
|
||||||
|
if !codeExists {
|
||||||
|
e = errors.New("http code is required")
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
res.StatusCodes = append(res.StatusCodes, drivers.StatusCodeFilter{
|
||||||
|
URL: url.String(),
|
||||||
|
Code: int(values.ToInt(code)),
|
||||||
|
})
|
||||||
|
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
return res, nil
|
return res, nil
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user