1
0
mirror of https://github.com/MontFerret/ferret.git synced 2024-12-16 11:37:36 +02:00
ferret/pkg/html/dynamic/document.go

820 lines
15 KiB
Go
Raw Normal View History

2018-09-27 04:03:06 +02:00
package dynamic
2018-09-18 22:42:38 +02:00
import (
"context"
2018-09-23 10:33:20 +02:00
"fmt"
"hash/fnv"
"sync"
"time"
"github.com/MontFerret/ferret/pkg/html/dynamic/eval"
"github.com/MontFerret/ferret/pkg/html/dynamic/events"
2018-09-18 22:42:38 +02:00
"github.com/MontFerret/ferret/pkg/runtime/core"
2018-09-28 06:28:33 +02:00
"github.com/MontFerret/ferret/pkg/runtime/logging"
2018-09-23 10:33:20 +02:00
"github.com/MontFerret/ferret/pkg/runtime/values"
2018-09-18 22:42:38 +02:00
"github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/input"
"github.com/mafredri/cdp/protocol/page"
2018-09-18 22:42:38 +02:00
"github.com/mafredri/cdp/rpcc"
2018-09-26 01:04:07 +02:00
"github.com/pkg/errors"
2018-09-28 06:28:33 +02:00
"github.com/rs/zerolog"
2018-09-18 22:42:38 +02:00
)
2018-10-06 02:42:23 +02:00
// BlankPageURL is the URL of an empty browser page. LoadHTMLDocument does not
// wait for a load event when asked for it, and Navigate falls back to it when
// called with an empty URL.
const BlankPageURL = "about:blank"
2018-10-05 03:37:28 +02:00
// ScreenshotFormat names the image encoding requested for a screenshot.
type ScreenshotFormat string

// ScreenshotArgs describes the region and encoding of a screenshot taken by
// HTMLDocument.CaptureScreenshot.
type ScreenshotArgs struct {
	// X is the horizontal offset of the clip region.
	X float64
	// Y is the vertical offset of the clip region.
	Y float64
	// Width is the width of the clip region.
	Width float64
	// Height is the height of the clip region.
	Height float64
	// Format is the image encoding of the result.
	Format ScreenshotFormat
	// Quality is the image quality forwarded to the browser.
	Quality int
}

// HTMLDocument represents a page loaded in a browser driven over the Chrome
// DevTools Protocol. The embedded mutex guards url and element, which are
// replaced whenever the page fires a load event.
type HTMLDocument struct {
	sync.Mutex
	// logger receives warnings and errors raised by the document.
	logger *zerolog.Logger
	// conn is the RPC connection, closed together with the document.
	conn *rpcc.Conn
	// client is the CDP client used for all browser calls.
	client *cdp.Client
	// events dispatches page events (load, error) to listeners.
	events *events.EventBroker
	// url is the URL of the currently loaded page.
	url values.String
	// element wraps the root node of the currently loaded page.
	element *HTMLElement
}
// Screenshot formats accepted by IsScreenshotFormatValid.
const (
	// ScreenshotFormatPNG selects PNG encoding.
	ScreenshotFormatPNG = ScreenshotFormat("png")
	// ScreenshotFormatJPEG selects JPEG encoding.
	ScreenshotFormatJPEG = ScreenshotFormat("jpeg")
)
func IsScreenshotFormatValid(format string) bool {
value := ScreenshotFormat(format)
return value == ScreenshotFormatPNG || value == ScreenshotFormatJPEG
2018-09-18 22:42:38 +02:00
}
func LoadHTMLDocument(
2018-09-18 22:42:38 +02:00
ctx context.Context,
conn *rpcc.Conn,
client *cdp.Client,
2018-09-18 22:42:38 +02:00
url string,
) (*HTMLDocument, error) {
2018-09-18 22:42:38 +02:00
if conn == nil {
return nil, core.Error(core.ErrMissedArgument, "connection")
}
if url == "" {
return nil, core.Error(core.ErrMissedArgument, "url")
}
var err error
2018-09-18 22:42:38 +02:00
2018-10-06 02:42:23 +02:00
if url != BlankPageURL {
2018-10-05 03:37:28 +02:00
err = waitForLoadEvent(ctx, client)
2018-09-18 22:42:38 +02:00
2018-10-05 03:37:28 +02:00
if err != nil {
return nil, err
}
2018-09-18 22:42:38 +02:00
}
node, err := getRootElement(ctx, client)
2018-09-18 22:42:38 +02:00
if err != nil {
return nil, errors.Wrap(err, "failed to get root element")
2018-09-18 22:42:38 +02:00
}
2018-09-25 23:58:57 +02:00
broker, err := createEventBroker(client)
if err != nil {
return nil, errors.Wrap(err, "failed to create event events")
}
logger := logging.FromContext(ctx)
rootElement, err := LoadElement(
ctx,
logger,
client,
broker,
node.Root.NodeID,
node.Root.BackendNodeID,
)
if err != nil {
return nil, errors.Wrap(err, "failed to load root element")
}
return NewHTMLDocument(
logger,
2018-09-28 06:28:33 +02:00
conn,
client,
broker,
values.NewString(url),
rootElement,
2018-09-28 06:28:33 +02:00
), nil
}
func NewHTMLDocument(
2018-09-28 06:28:33 +02:00
logger *zerolog.Logger,
2018-09-25 23:58:57 +02:00
conn *rpcc.Conn,
client *cdp.Client,
broker *events.EventBroker,
url values.String,
rootElement *HTMLElement,
) *HTMLDocument {
doc := new(HTMLDocument)
2018-09-28 06:28:33 +02:00
doc.logger = logger
2018-09-25 23:58:57 +02:00
doc.conn = conn
doc.client = client
doc.events = broker
doc.url = url
doc.element = rootElement
broker.AddEventListener(events.EventLoad, doc.handlePageLoad)
broker.AddEventListener(events.EventError, doc.handleError)
2018-09-25 23:58:57 +02:00
return doc
}
func (doc *HTMLDocument) MarshalJSON() ([]byte, error) {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.MarshalJSON()
2018-09-18 22:42:38 +02:00
}
func (doc *HTMLDocument) Type() core.Type {
return core.HTMLDocumentType
2018-09-18 22:42:38 +02:00
}
func (doc *HTMLDocument) String() string {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.url.String()
2018-09-18 22:42:38 +02:00
}
func (doc *HTMLDocument) Unwrap() interface{} {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element
}
func (doc *HTMLDocument) Hash() uint64 {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
2018-10-05 21:17:22 +02:00
h := fnv.New64a()
2018-09-25 23:58:57 +02:00
2018-10-05 21:17:22 +02:00
h.Write([]byte(doc.Type().String()))
h.Write([]byte(":"))
h.Write([]byte(doc.url))
2018-09-25 23:58:57 +02:00
2018-10-05 21:17:22 +02:00
return h.Sum64()
2018-09-25 23:58:57 +02:00
}
func (doc *HTMLDocument) Copy() core.Value {
2018-09-27 17:53:26 +02:00
return values.None
}
func (doc *HTMLDocument) Compare(other core.Value) int {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
2018-09-18 22:42:38 +02:00
switch other.Type() {
case core.HTMLDocumentType:
other := other.(*HTMLDocument)
2018-09-18 22:42:38 +02:00
return doc.url.Compare(other.url)
2018-09-18 22:42:38 +02:00
default:
if other.Type() > core.HTMLDocumentType {
2018-09-18 22:42:38 +02:00
return -1
}
return 1
}
}
2018-09-23 10:33:20 +02:00
func (doc *HTMLDocument) Close() error {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
2018-09-28 06:28:33 +02:00
var err error
err = doc.events.Stop()
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to stop event events")
2018-09-28 06:28:33 +02:00
}
err = doc.events.Close()
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to close event events")
2018-09-28 06:28:33 +02:00
}
err = doc.element.Close()
2018-09-25 23:58:57 +02:00
2018-09-28 06:28:33 +02:00
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to close root element")
}
err = doc.client.Page.Close(context.Background())
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to close browser page")
}
2018-09-25 23:58:57 +02:00
return doc.conn.Close()
}
func (doc *HTMLDocument) NodeType() values.Int {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.NodeType()
}
func (doc *HTMLDocument) NodeName() values.String {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.NodeName()
}
func (doc *HTMLDocument) Length() values.Int {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.Length()
}
func (doc *HTMLDocument) InnerText() values.String {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.InnerText()
}
func (doc *HTMLDocument) InnerHTML() values.String {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.InnerHTML()
2018-09-25 23:58:57 +02:00
}
func (doc *HTMLDocument) Value() core.Value {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.Value()
}
func (doc *HTMLDocument) GetAttributes() core.Value {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.GetAttributes()
}
func (doc *HTMLDocument) GetAttribute(name values.String) core.Value {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.GetAttribute(name)
}
func (doc *HTMLDocument) GetChildNodes() core.Value {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.GetChildNodes()
}
func (doc *HTMLDocument) GetChildNode(idx values.Int) core.Value {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.GetChildNode(idx)
}
func (doc *HTMLDocument) QuerySelector(selector values.String) core.Value {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.QuerySelector(selector)
}
func (doc *HTMLDocument) QuerySelectorAll(selector values.String) core.Value {
2018-09-25 23:58:57 +02:00
doc.Lock()
defer doc.Unlock()
return doc.element.QuerySelectorAll(selector)
}
// URL returns the document's current URL as a runtime value.
func (doc *HTMLDocument) URL() core.Value {
	doc.Lock()
	defer doc.Unlock()

	current := doc.url

	return current
}
func (doc *HTMLDocument) InnerHTMLBySelector(selector values.String) values.String {
doc.Lock()
defer doc.Unlock()
2018-09-28 04:03:35 +02:00
return doc.element.InnerHTMLBySelector(selector)
2018-09-28 04:03:35 +02:00
}
func (doc *HTMLDocument) InnerHTMLBySelectorAll(selector values.String) *values.Array {
doc.Lock()
defer doc.Unlock()
2018-09-28 04:03:35 +02:00
return doc.element.InnerHTMLBySelectorAll(selector)
2018-09-28 04:03:35 +02:00
}
func (doc *HTMLDocument) InnerTextBySelector(selector values.String) values.String {
doc.Lock()
defer doc.Unlock()
2018-09-28 04:03:35 +02:00
return doc.element.InnerHTMLBySelector(selector)
2018-09-28 04:03:35 +02:00
}
func (doc *HTMLDocument) InnerTextBySelectorAll(selector values.String) *values.Array {
doc.Lock()
defer doc.Unlock()
2018-09-28 04:03:35 +02:00
return doc.element.InnerTextBySelectorAll(selector)
2018-09-28 04:03:35 +02:00
}
// CountBySelector delegates to the root element's CountBySelector while
// holding the document lock.
func (doc *HTMLDocument) CountBySelector(selector values.String) values.Int {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.CountBySelector(selector)
}
func (doc *HTMLDocument) ClickBySelector(selector values.String) (values.Boolean, error) {
2018-09-25 23:58:57 +02:00
res, err := eval.Eval(
doc.client,
fmt.Sprintf(`
2018-09-28 03:41:41 +02:00
var el = document.querySelector(%s);
if (el == null) {
return false;
}
var evt = new window.MouseEvent('click', { bubbles: true });
el.dispatchEvent(evt);
2018-09-28 04:03:35 +02:00
return true;
`, eval.ParamString(selector.String())),
true,
false,
)
if err != nil {
return values.False, err
}
if res.Type() == core.BooleanType {
return res.(values.Boolean), nil
}
return values.False, nil
}
func (doc *HTMLDocument) ClickBySelectorAll(selector values.String) (values.Boolean, error) {
2018-09-28 04:03:35 +02:00
res, err := eval.Eval(
doc.client,
fmt.Sprintf(`
var elements = document.querySelectorAll(%s);
if (elements == null) {
return false;
}
elements.forEach((el) => {
var evt = new window.MouseEvent('click', { bubbles: true });
el.dispatchEvent(evt);
});
return true;
2018-09-28 03:41:41 +02:00
`, eval.ParamString(selector.String())),
true,
false,
)
if err != nil {
return values.False, err
}
if res.Type() == core.BooleanType {
return res.(values.Boolean), nil
}
return values.False, nil
}
func (doc *HTMLDocument) InputBySelector(selector values.String, value core.Value, delay values.Int) (values.Boolean, error) {
ctx := context.Background()
valStr := value.String()
2018-09-28 03:41:41 +02:00
res, err := eval.Eval(
doc.client,
fmt.Sprintf(`
2018-09-28 03:41:41 +02:00
var el = document.querySelector(%s);
if (el == null) {
return false;
}
el.focus();
2018-09-28 03:41:41 +02:00
return true;
`, eval.ParamString(selector.String())),
true,
false,
)
if err != nil {
return values.False, err
}
if res.Type() == core.BooleanType && res.(values.Boolean) == values.False {
return values.False, nil
}
delayMs := time.Duration(delay)
time.Sleep(delayMs * time.Millisecond)
for _, ch := range valStr {
for _, ev := range []string{"keyDown", "keyUp"} {
ke := input.NewDispatchKeyEventArgs(ev).SetText(string(ch))
if err := doc.client.Input.DispatchKeyEvent(ctx, ke); err != nil {
return values.False, err
}
time.Sleep(delayMs * time.Millisecond)
}
}
return values.True, nil
}
func (doc *HTMLDocument) WaitForSelector(selector values.String, timeout values.Int) error {
task := events.NewEvalWaitTask(
2018-09-23 10:33:20 +02:00
doc.client,
fmt.Sprintf(`
var el = document.querySelector(%s);
2018-09-23 10:33:20 +02:00
if (el != null) {
return true;
}
// null means we need to repeat
2018-09-23 10:33:20 +02:00
return null;
2018-09-28 03:41:41 +02:00
`, eval.ParamString(selector.String())),
2018-09-23 10:33:20 +02:00
time.Millisecond*time.Duration(timeout),
2018-09-25 23:58:57 +02:00
events.DefaultPolling,
2018-09-23 10:33:20 +02:00
)
_, err := task.Run()
return err
}
// WaitForClass runs an eval wait task, bounded by timeout milliseconds, whose
// script returns true once the first element matching selector carries the
// given class, false when no element matches, and null (meaning "try again")
// otherwise.
func (doc *HTMLDocument) WaitForClass(selector, class values.String, timeout values.Int) error {
	script := fmt.Sprintf(`
			var el = document.querySelector(%s);
			if (el == null) {
				return false;
			}
			var className = %s;
			var found = el.className.split(' ').find(i => i === className);
			if (found != null) {
				return true;
			}
			// null means we need to repeat
			return null;
		`,
		eval.ParamString(selector.String()),
		eval.ParamString(class.String()),
	)

	limit := time.Millisecond * time.Duration(timeout)

	waiter := events.NewEvalWaitTask(doc.client, script, limit, events.DefaultPolling)

	_, err := waiter.Run()

	return err
}
// WaitForClassAll runs an eval wait task, bounded by timeout milliseconds,
// whose script returns true once every element matching selector carries the
// given class, false when nothing matches, and null (meaning "try again")
// otherwise.
func (doc *HTMLDocument) WaitForClassAll(selector, class values.String, timeout values.Int) error {
	script := fmt.Sprintf(`
			var elements = document.querySelectorAll(%s);
			if (elements == null || elements.length === 0) {
				return false;
			}
			var className = %s;
			var foundCount = 0;
			elements.forEach((el) => {
				var found = el.className.split(' ').find(i => i === className);
				if (found != null) {
					foundCount++;
				}
			});
			if (foundCount === elements.length) {
				return true;
			}
			// null means we need to repeat
			return null;
		`,
		eval.ParamString(selector.String()),
		eval.ParamString(class.String()),
	)

	limit := time.Millisecond * time.Duration(timeout)

	waiter := events.NewEvalWaitTask(doc.client, script, limit, events.DefaultPolling)

	_, err := waiter.Run()

	return err
}
func (doc *HTMLDocument) WaitForNavigation(timeout values.Int) error {
// do not wait
if timeout == 0 {
return nil
}
onEvent := make(chan struct{})
2018-09-25 23:58:57 +02:00
listener := func(_ interface{}) {
close(onEvent)
2018-09-25 23:58:57 +02:00
}
defer doc.events.RemoveEventListener(events.EventLoad, listener)
doc.events.AddEventListener(events.EventLoad, listener)
select {
case <-onEvent:
return nil
case <-time.After(time.Millisecond * time.Duration(timeout)):
return core.ErrTimeout
2018-09-25 23:58:57 +02:00
}
}
2018-09-26 01:04:07 +02:00
2018-10-08 02:15:41 +02:00
func (doc *HTMLDocument) Navigate(url values.String, timeout values.Int) error {
2018-10-05 03:37:28 +02:00
if url == "" {
2018-10-06 02:42:23 +02:00
url = BlankPageURL
2018-10-05 03:37:28 +02:00
}
2018-09-26 01:04:07 +02:00
ctx := context.Background()
repl, err := doc.client.Page.Navigate(ctx, page.NewNavigateArgs(url.String()))
if err != nil {
return err
}
if repl.ErrorText != nil {
return errors.New(*repl.ErrorText)
}
2018-10-08 02:15:41 +02:00
return doc.WaitForNavigation(timeout)
}
// NavigateBack moves skip entries back in the page's navigation history
// (at least one) and waits for the navigation to finish. It returns false
// without an error when the page is already at the first entry or when skip
// would reach before it.
func (doc *HTMLDocument) NavigateBack(skip values.Int, timeout values.Int) (values.Boolean, error) {
	ctx := context.Background()

	history, err := doc.client.Page.GetNavigationHistory(ctx)

	if err != nil {
		return values.False, err
	}

	// we are in the beginning
	if history.CurrentIndex == 0 {
		return values.False, nil
	}

	steps := skip

	if steps < 1 {
		steps = 1
	}

	target := history.CurrentIndex - int(steps)

	if target < 0 {
		// TODO: Return error?
		return values.False, nil
	}

	entry := history.Entries[target]
	args := page.NewNavigateToHistoryEntryArgs(entry.ID)

	if err := doc.client.Page.NavigateToHistoryEntry(ctx, args); err != nil {
		return values.False, err
	}

	if err := doc.WaitForNavigation(timeout); err != nil {
		return values.False, err
	}

	return values.True, nil
}
// NavigateForward moves skip entries forward in the page's navigation history
// (at least one) and waits for the navigation to finish. It returns false
// without an error when the page is already at the last entry or when skip
// would reach past it.
func (doc *HTMLDocument) NavigateForward(skip values.Int, timeout values.Int) (values.Boolean, error) {
	ctx := context.Background()

	history, err := doc.client.Page.GetNavigationHistory(ctx)

	if err != nil {
		return values.False, err
	}

	last := len(history.Entries) - 1

	// nowhere to go forward
	if history.CurrentIndex == last {
		return values.False, nil
	}

	steps := skip

	if steps < 1 {
		steps = 1
	}

	target := int(steps) + history.CurrentIndex

	if target > last {
		// TODO: Return error?
		return values.False, nil
	}

	entry := history.Entries[target]
	args := page.NewNavigateToHistoryEntryArgs(entry.ID)

	if err := doc.client.Page.NavigateToHistoryEntry(ctx, args); err != nil {
		return values.False, err
	}

	if err := doc.WaitForNavigation(timeout); err != nil {
		return values.False, err
	}

	return values.True, nil
}
func (doc *HTMLDocument) PrintToPDF(params *page.PrintToPDFArgs) (core.Value, error) {
ctx := context.Background()
2018-10-14 03:08:18 +02:00
reply, err := doc.client.Page.PrintToPDF(ctx, params)
if err != nil {
return values.None, err
}
return values.NewBinary(reply.Data), nil
}
// CaptureScreenshot captures the region described by params and returns the
// image data as a binary value.
//
// params is normalised in place before the capture: negative offsets become 0,
// a non-positive width or height defaults to the layout viewport's client size
// minus the corresponding offset, and for JPEG a quality outside [0, 100] is
// reset to 100. A nil params yields a core.ErrMissedArgument error; a failure
// to read the layout metrics or to capture the screenshot is returned as is,
// with values.None as the value.
func (doc *HTMLDocument) CaptureScreenshot(params *ScreenshotArgs) (core.Value, error) {
	if params == nil {
		return values.None, core.Error(core.ErrMissedArgument, "params")
	}

	ctx := context.Background()

	metrics, err := doc.client.Page.GetLayoutMetrics(ctx)

	// metrics is dereferenced below, so a failed call must stop here.
	if err != nil {
		return values.None, err
	}

	// Out of range means below 0 OR above 100; the two can never hold at once.
	if params.Format == ScreenshotFormatJPEG && (params.Quality < 0 || params.Quality > 100) {
		params.Quality = 100
	}

	if params.X < 0 {
		params.X = 0
	}

	if params.Y < 0 {
		params.Y = 0
	}

	if params.Width <= 0 {
		params.Width = float64(metrics.LayoutViewport.ClientWidth) - params.X
	}

	if params.Height <= 0 {
		params.Height = float64(metrics.LayoutViewport.ClientHeight) - params.Y
	}

	clip := page.Viewport{
		X:      params.X,
		Y:      params.Y,
		Width:  params.Width,
		Height: params.Height,
		Scale:  1.0,
	}

	format := string(params.Format)

	screenshotArgs := page.CaptureScreenshotArgs{
		Format:  &format,
		Quality: &params.Quality,
		Clip:    &clip,
	}

	reply, err := doc.client.Page.CaptureScreenshot(ctx, &screenshotArgs)

	if err != nil {
		return values.None, err
	}

	return values.NewBinary(reply.Data), nil
}
2018-10-08 04:56:01 +02:00
func (doc *HTMLDocument) handlePageLoad(_ interface{}) {
2018-10-08 02:15:41 +02:00
doc.Lock()
defer doc.Unlock()
ctx, cancel := contextWithTimeout()
defer cancel()
node, err := getRootElement(ctx, doc.client)
2018-10-08 02:15:41 +02:00
if err != nil {
doc.logger.Error().
Timestamp().
Err(err).
Msg("failed to get root node after page load")
return
}
updated, err := LoadElement(
ctx,
2018-10-08 02:15:41 +02:00
doc.logger,
doc.client,
doc.events,
node.Root.NodeID,
node.Root.BackendNodeID,
2018-10-08 02:15:41 +02:00
)
if err != nil {
doc.logger.Error().
Timestamp().
Err(err).
Msg("failed to load root node after page load")
return
}
// close the prev element
doc.element.Close()
// create a new root element wrapper
doc.element = updated
2018-10-08 02:15:41 +02:00
doc.url = ""
if node.Root.BaseURL != nil {
doc.url = values.NewString(*node.Root.BaseURL)
2018-10-08 02:15:41 +02:00
}
}
2018-10-08 04:56:01 +02:00
func (doc *HTMLDocument) handleError(val interface{}) {
2018-10-08 02:15:41 +02:00
err, ok := val.(error)
if !ok {
return
}
doc.logger.Error().
Timestamp().
Err(err).
Msg("unexpected error")
2018-09-26 01:04:07 +02:00
}