2018-12-22 06:14:41 +02:00
|
|
|
package http
|
2018-09-18 22:42:38 +02:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"context"
|
2018-10-12 17:58:08 +02:00
|
|
|
"net/http"
|
|
|
|
"net/url"
|
|
|
|
|
2019-02-20 01:10:18 +02:00
|
|
|
"github.com/MontFerret/ferret/pkg/drivers"
|
2018-12-22 06:14:41 +02:00
|
|
|
"github.com/MontFerret/ferret/pkg/drivers/common"
|
2019-02-13 19:31:18 +02:00
|
|
|
"github.com/MontFerret/ferret/pkg/runtime/logging"
|
2018-09-18 22:42:38 +02:00
|
|
|
"github.com/MontFerret/ferret/pkg/runtime/values"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"github.com/pkg/errors"
|
|
|
|
"github.com/sethgrid/pester"
|
|
|
|
)
|
|
|
|
|
2019-02-20 01:10:18 +02:00
|
|
|
// DriverName is the identifier of the HTTP driver.
// NOTE(review): presumably used as the default for Options.Name - confirm in newOptions.
const DriverName = "http"
|
|
|
|
|
2018-12-22 06:14:41 +02:00
|
|
|
type Driver struct {
|
|
|
|
client *pester.Client
|
|
|
|
options *Options
|
2018-12-01 02:30:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func NewDriver(opts ...Option) *Driver {
|
|
|
|
drv := new(Driver)
|
|
|
|
drv.options = newOptions(opts)
|
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
if drv.options.Proxy == "" {
|
2018-10-08 03:23:36 +02:00
|
|
|
drv.client = pester.New()
|
|
|
|
} else {
|
|
|
|
client, err := newClientWithProxy(drv.options)
|
2018-09-18 22:42:38 +02:00
|
|
|
|
2018-10-08 03:23:36 +02:00
|
|
|
if err != nil {
|
|
|
|
drv.client = pester.New()
|
|
|
|
} else {
|
|
|
|
drv.client = pester.NewExtendedClient(client)
|
|
|
|
}
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
drv.client.Concurrency = drv.options.Concurrency
|
|
|
|
drv.client.MaxRetries = drv.options.MaxRetries
|
|
|
|
drv.client.Backoff = drv.options.Backoff
|
2018-10-08 03:23:36 +02:00
|
|
|
|
|
|
|
return drv
|
|
|
|
}
|
|
|
|
|
|
|
|
func newClientWithProxy(options *Options) (*http.Client, error) {
|
2019-09-05 17:49:21 +02:00
|
|
|
proxyURL, err := url.Parse(options.Proxy)
|
2018-10-08 03:23:36 +02:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
proxy := http.ProxyURL(proxyURL)
|
|
|
|
tr := &http.Transport{Proxy: proxy}
|
|
|
|
|
|
|
|
return &http.Client{Transport: tr}, nil
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2019-02-20 01:10:18 +02:00
|
|
|
func (drv *Driver) Name() string {
|
2019-09-05 17:49:21 +02:00
|
|
|
return drv.options.Name
|
2019-02-20 01:10:18 +02:00
|
|
|
}
|
|
|
|
|
2019-07-17 19:29:16 +02:00
|
|
|
func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTMLPage, error) {
|
2019-03-16 01:59:05 +02:00
|
|
|
req, err := http.NewRequest(http.MethodGet, params.URL, nil)
|
2018-09-18 22:42:38 +02:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2019-03-16 01:59:05 +02:00
|
|
|
logger := logging.FromContext(ctx)
|
|
|
|
|
2018-09-18 22:42:38 +02:00
|
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
|
|
|
|
req.Header.Set("Accept-Language", "en-US,en;q=0.9,ru;q=0.8")
|
|
|
|
req.Header.Set("Cache-Control", "no-cache")
|
|
|
|
req.Header.Set("Pragma", "no-cache")
|
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
if drv.options.Headers != nil && params.Headers == nil {
|
|
|
|
params.Headers = make(drivers.HTTPHeaders)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set default headers
|
|
|
|
for k, v := range drv.options.Headers {
|
|
|
|
_, exists := params.Headers[k]
|
2019-03-16 01:59:05 +02:00
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
// do not override user's set values
|
|
|
|
if !exists {
|
|
|
|
params.Headers[k] = v
|
2019-03-16 01:59:05 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
for k := range params.Headers {
|
|
|
|
req.Header.Add(k, params.Headers.Get(k))
|
|
|
|
|
|
|
|
logger.
|
|
|
|
Debug().
|
|
|
|
Timestamp().
|
|
|
|
Str("header", k).
|
|
|
|
Msg("set header")
|
|
|
|
}
|
|
|
|
|
|
|
|
if drv.options.Cookies != nil && params.Cookies == nil {
|
|
|
|
params.Cookies = make(drivers.HTTPCookies)
|
|
|
|
}
|
|
|
|
|
|
|
|
// set default cookies
|
|
|
|
for k, v := range drv.options.Cookies {
|
|
|
|
_, exists := params.Cookies[k]
|
2019-03-16 01:59:05 +02:00
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
// do not override user's set values
|
|
|
|
if !exists {
|
|
|
|
params.Cookies[k] = v
|
2019-03-16 01:59:05 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
for _, c := range params.Cookies {
|
|
|
|
req.AddCookie(fromDriverCookie(c))
|
|
|
|
|
|
|
|
logger.
|
|
|
|
Debug().
|
|
|
|
Timestamp().
|
|
|
|
Str("cookie", c.Name).
|
|
|
|
Msg("set cookie")
|
|
|
|
}
|
|
|
|
|
2018-11-22 03:38:27 +02:00
|
|
|
req = req.WithContext(ctx)
|
|
|
|
|
2019-03-16 01:59:05 +02:00
|
|
|
var ua string
|
|
|
|
|
|
|
|
if params.UserAgent != "" {
|
|
|
|
ua = common.GetUserAgent(params.UserAgent)
|
|
|
|
} else {
|
2019-09-05 17:49:21 +02:00
|
|
|
ua = common.GetUserAgent(drv.options.UserAgent)
|
2019-03-16 01:59:05 +02:00
|
|
|
}
|
2018-10-08 04:18:57 +02:00
|
|
|
|
2018-12-22 06:14:41 +02:00
|
|
|
logger.
|
|
|
|
Debug().
|
2019-02-20 01:10:18 +02:00
|
|
|
Timestamp().
|
2018-12-22 06:14:41 +02:00
|
|
|
Str("user-agent", ua).
|
|
|
|
Msg("using User-Agent")
|
|
|
|
|
2019-06-25 18:51:51 +02:00
|
|
|
if ua != "" {
|
|
|
|
req.Header.Set("User-Agent", ua)
|
|
|
|
}
|
|
|
|
|
2018-10-08 04:18:57 +02:00
|
|
|
resp, err := drv.client.Do(req)
|
2018-09-18 22:42:38 +02:00
|
|
|
|
|
|
|
if err != nil {
|
2019-03-16 01:59:05 +02:00
|
|
|
return nil, errors.Wrapf(err, "failed to retrieve a document %s", params.URL)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
2019-10-16 00:54:42 +02:00
|
|
|
if !drv.responseCodeAllowed(resp) {
|
2019-06-25 18:51:51 +02:00
|
|
|
return nil, errors.New(resp.Status)
|
|
|
|
}
|
|
|
|
|
2018-09-18 22:42:38 +02:00
|
|
|
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
|
|
|
|
|
|
if err != nil {
|
2019-03-16 01:59:05 +02:00
|
|
|
return nil, errors.Wrapf(err, "failed to parse a document %s", params.URL)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2019-09-05 17:49:21 +02:00
|
|
|
cookies, err := toDriverCookies(resp.Cookies())
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2019-09-30 20:09:51 +02:00
|
|
|
r := drivers.HTTPResponse{
|
|
|
|
StatusCode: resp.StatusCode,
|
|
|
|
Status: resp.Status,
|
|
|
|
Headers: drivers.HTTPHeaders(resp.Header),
|
|
|
|
}
|
|
|
|
|
2019-10-03 21:42:14 +02:00
|
|
|
return NewHTMLPage(doc, params.URL, &r, cookies)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2019-10-16 00:54:42 +02:00
|
|
|
func (drv *Driver) responseCodeAllowed(resp *http.Response) bool {
|
|
|
|
_, exists := drv.options.AllowedHTTPCodes[resp.StatusCode]
|
|
|
|
return exists
|
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
func (drv *Driver) Parse(_ context.Context, str values.String) (drivers.HTMLPage, error) {
|
2018-09-18 22:42:38 +02:00
|
|
|
buf := bytes.NewBuffer([]byte(str))
|
|
|
|
|
|
|
|
doc, err := goquery.NewDocumentFromReader(buf)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "failed to parse a document")
|
|
|
|
}
|
|
|
|
|
2019-09-30 20:09:51 +02:00
|
|
|
return NewHTMLPage(doc, "#blank", nil, nil)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2018-10-08 04:18:57 +02:00
|
|
|
func (drv *Driver) Close() error {
|
|
|
|
drv.client = nil
|
2018-09-18 22:42:38 +02:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|