1
0
mirror of https://github.com/MontFerret/ferret.git synced 2024-12-16 11:37:36 +02:00
ferret/pkg/stdlib/html/driver/static/static.go

78 lines
1.7 KiB
Go
Raw Normal View History

2018-09-27 04:03:06 +02:00
package static
2018-09-18 22:42:38 +02:00
import (
"bytes"
"context"
"github.com/MontFerret/ferret/pkg/runtime/values"
"github.com/PuerkitoBio/goquery"
"github.com/corpix/uarand"
"github.com/pkg/errors"
"github.com/sethgrid/pester"
httpx "net/http"
)
2018-09-27 04:03:06 +02:00
type Driver struct {
2018-09-18 22:42:38 +02:00
client *pester.Client
}
2018-09-27 04:03:06 +02:00
func NewDriver(setters ...Option) *Driver {
2018-09-18 22:42:38 +02:00
client := pester.New()
client.Concurrency = 3
client.MaxRetries = 5
client.Backoff = pester.ExponentialBackoff
for _, setter := range setters {
setter(client)
}
2018-09-27 04:03:06 +02:00
return &Driver{client}
2018-09-18 22:42:38 +02:00
}
2018-10-06 01:40:09 +02:00
func (d *Driver) GetDocument(_ context.Context, url string) (values.HTMLNode, error) {
2018-09-18 22:42:38 +02:00
req, err := httpx.NewRequest(httpx.MethodGet, url, nil)
if err != nil {
return nil, err
}
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.9,ru;q=0.8")
req.Header.Set("Cache-Control", "no-cache")
req.Header.Set("Pragma", "no-cache")
req.Header.Set("User-Agent", uarand.GetRandom())
resp, err := d.client.Do(req)
if err != nil {
return nil, errors.Wrapf(err, "failed to retrieve a document %s", url)
}
defer resp.Body.Close()
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, errors.Wrapf(err, "failed to parse a document %s", url)
}
2018-10-06 01:40:09 +02:00
return NewHTMLDocument(url, doc)
2018-09-18 22:42:38 +02:00
}
2018-10-06 01:40:09 +02:00
func (d *Driver) ParseDocument(_ context.Context, str string) (values.HTMLNode, error) {
2018-09-18 22:42:38 +02:00
buf := bytes.NewBuffer([]byte(str))
doc, err := goquery.NewDocumentFromReader(buf)
if err != nil {
return nil, errors.Wrap(err, "failed to parse a document")
}
2018-10-06 01:40:09 +02:00
return NewHTMLDocument("#string", doc)
2018-09-18 22:42:38 +02:00
}
2018-09-27 04:03:06 +02:00
func (d *Driver) Close() error {
2018-09-18 22:42:38 +02:00
d.client = nil
return nil
}