2018-09-27 04:03:06 +02:00
|
|
|
package static
|
2018-09-18 22:42:38 +02:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"context"
|
|
|
|
"github.com/MontFerret/ferret/pkg/runtime/values"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"github.com/corpix/uarand"
|
|
|
|
"github.com/pkg/errors"
|
|
|
|
"github.com/sethgrid/pester"
|
2018-10-08 03:23:36 +02:00
|
|
|
"net/http"
|
|
|
|
"net/url"
|
2018-09-18 22:42:38 +02:00
|
|
|
)
|
|
|
|
|
2018-09-27 04:03:06 +02:00
|
|
|
type Driver struct {
|
2018-10-08 03:23:36 +02:00
|
|
|
client *pester.Client
|
|
|
|
options *Options
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2018-10-08 03:23:36 +02:00
|
|
|
func NewDriver(opts ...Option) *Driver {
|
|
|
|
drv := new(Driver)
|
|
|
|
drv.options = &Options{
|
|
|
|
concurrency: 3,
|
|
|
|
maxRetries: 5,
|
|
|
|
backoff: pester.ExponentialBackoff,
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, opt := range opts {
|
|
|
|
opt(drv.options)
|
|
|
|
}
|
|
|
|
|
|
|
|
if drv.options.proxy == "" {
|
|
|
|
drv.client = pester.New()
|
|
|
|
} else {
|
|
|
|
client, err := newClientWithProxy(drv.options)
|
2018-09-18 22:42:38 +02:00
|
|
|
|
2018-10-08 03:23:36 +02:00
|
|
|
if err != nil {
|
|
|
|
drv.client = pester.New()
|
|
|
|
} else {
|
|
|
|
drv.client = pester.NewExtendedClient(client)
|
|
|
|
}
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2018-10-08 03:23:36 +02:00
|
|
|
drv.client.Concurrency = drv.options.concurrency
|
|
|
|
drv.client.MaxRetries = drv.options.maxRetries
|
|
|
|
drv.client.Backoff = drv.options.backoff
|
|
|
|
|
|
|
|
return drv
|
|
|
|
}
|
|
|
|
|
|
|
|
func newClientWithProxy(options *Options) (*http.Client, error) {
|
|
|
|
proxyURL, err := url.Parse(options.proxy)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
proxy := http.ProxyURL(proxyURL)
|
|
|
|
tr := &http.Transport{Proxy: proxy}
|
|
|
|
|
|
|
|
return &http.Client{Transport: tr}, nil
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2018-10-07 04:33:39 +02:00
|
|
|
func (d *Driver) GetDocument(_ context.Context, targetURL values.String) (values.HTMLNode, error) {
|
2018-10-08 03:23:36 +02:00
|
|
|
u := targetURL.String()
|
|
|
|
req, err := http.NewRequest(http.MethodGet, u, nil)
|
2018-09-18 22:42:38 +02:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
|
|
|
|
req.Header.Set("Accept-Language", "en-US,en;q=0.9,ru;q=0.8")
|
|
|
|
req.Header.Set("Cache-Control", "no-cache")
|
|
|
|
req.Header.Set("Pragma", "no-cache")
|
|
|
|
req.Header.Set("User-Agent", uarand.GetRandom())
|
|
|
|
|
|
|
|
resp, err := d.client.Do(req)
|
|
|
|
|
|
|
|
if err != nil {
|
2018-10-08 03:23:36 +02:00
|
|
|
return nil, errors.Wrapf(err, "failed to retrieve a document %s", u)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
|
|
|
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
|
|
|
|
|
|
if err != nil {
|
2018-10-08 03:23:36 +02:00
|
|
|
return nil, errors.Wrapf(err, "failed to parse a document %s", u)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2018-10-08 03:23:36 +02:00
|
|
|
return NewHTMLDocument(u, doc)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2018-10-07 04:33:39 +02:00
|
|
|
func (d *Driver) ParseDocument(_ context.Context, str values.String) (values.HTMLNode, error) {
|
2018-09-18 22:42:38 +02:00
|
|
|
buf := bytes.NewBuffer([]byte(str))
|
|
|
|
|
|
|
|
doc, err := goquery.NewDocumentFromReader(buf)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "failed to parse a document")
|
|
|
|
}
|
|
|
|
|
2018-10-06 01:40:09 +02:00
|
|
|
return NewHTMLDocument("#string", doc)
|
2018-09-18 22:42:38 +02:00
|
|
|
}
|
|
|
|
|
2018-09-27 04:03:06 +02:00
|
|
|
func (d *Driver) Close() error {
|
2018-09-18 22:42:38 +02:00
|
|
|
d.client = nil
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|