1
0
mirror of https://github.com/MontFerret/ferret.git synced 2025-01-20 03:29:51 +02:00

653 lines
12 KiB
Go
Raw Normal View History

2018-09-26 22:03:06 -04:00
package dynamic
2018-09-18 16:42:38 -04:00
import (
"context"
2018-09-25 17:58:57 -04:00
"crypto/sha512"
2018-09-23 04:33:20 -04:00
"fmt"
2018-09-18 16:42:38 -04:00
"github.com/MontFerret/ferret/pkg/runtime/core"
2018-09-28 00:28:33 -04:00
"github.com/MontFerret/ferret/pkg/runtime/logging"
2018-09-23 04:33:20 -04:00
"github.com/MontFerret/ferret/pkg/runtime/values"
2018-09-26 22:03:06 -04:00
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/eval"
"github.com/MontFerret/ferret/pkg/stdlib/html/driver/dynamic/events"
"github.com/corpix/uarand"
2018-09-18 16:42:38 -04:00
"github.com/mafredri/cdp"
"github.com/mafredri/cdp/protocol/dom"
"github.com/mafredri/cdp/protocol/emulation"
"github.com/mafredri/cdp/protocol/page"
2018-09-18 16:42:38 -04:00
"github.com/mafredri/cdp/rpcc"
2018-09-25 19:04:07 -04:00
"github.com/pkg/errors"
2018-09-28 00:28:33 -04:00
"github.com/rs/zerolog"
2018-09-25 17:58:57 -04:00
"sync"
2018-09-23 04:33:20 -04:00
"time"
2018-09-18 16:42:38 -04:00
)
type HtmlDocument struct {
2018-09-25 17:58:57 -04:00
sync.Mutex
2018-09-28 00:28:33 -04:00
logger *zerolog.Logger
2018-09-25 17:58:57 -04:00
conn *rpcc.Conn
client *cdp.Client
events *events.EventBroker
url values.String
2018-09-25 17:58:57 -04:00
element *HtmlElement
2018-09-18 16:42:38 -04:00
}
2018-09-25 17:58:57 -04:00
func LoadHtmlDocument(
2018-09-18 16:42:38 -04:00
ctx context.Context,
conn *rpcc.Conn,
url string,
) (*HtmlDocument, error) {
if conn == nil {
return nil, core.Error(core.ErrMissedArgument, "connection")
}
if url == "" {
return nil, core.Error(core.ErrMissedArgument, "url")
}
client := cdp.NewClient(conn)
2018-09-25 19:04:07 -04:00
err := runBatch(
2018-09-18 16:42:38 -04:00
func() error {
return client.Page.Enable(ctx)
},
func() error {
return client.Page.SetLifecycleEventsEnabled(
ctx,
page.NewSetLifecycleEventsEnabledArgs(true),
)
},
2018-09-18 16:42:38 -04:00
func() error {
return client.DOM.Enable(ctx)
},
func() error {
return client.Runtime.Enable(ctx)
},
func() error {
return client.Emulation.SetUserAgentOverride(
ctx,
emulation.NewSetUserAgentOverrideArgs(uarand.GetRandom()),
)
},
2018-09-18 16:42:38 -04:00
)
if err != nil {
return nil, err
}
err = waitForLoadEvent(ctx, client)
2018-09-18 16:42:38 -04:00
if err != nil {
return nil, err
}
2018-09-26 22:03:06 -04:00
root, innerHtml, err := getRootElement(client)
2018-09-18 16:42:38 -04:00
if err != nil {
return nil, err
}
2018-09-25 17:58:57 -04:00
broker, err := createEventBroker(client)
if err != nil {
return nil, err
}
2018-09-28 00:28:33 -04:00
return NewHtmlDocument(
logging.From(ctx),
conn,
client,
broker,
root,
innerHtml,
), nil
}
2018-09-26 22:03:06 -04:00
func getRootElement(client *cdp.Client) (dom.Node, values.String, error) {
2018-09-18 16:42:38 -04:00
args := dom.NewGetDocumentArgs()
2018-09-25 19:04:07 -04:00
args.Depth = pointerInt(1) // lets load the entire document
2018-09-26 22:03:06 -04:00
ctx := context.Background()
2018-09-18 16:42:38 -04:00
2018-09-26 22:03:06 -04:00
d, err := client.DOM.GetDocument(ctx, args)
2018-09-18 16:42:38 -04:00
if err != nil {
2018-09-26 22:03:06 -04:00
return dom.Node{}, values.EmptyString, err
}
2018-09-26 22:03:06 -04:00
innerHtml, err := client.DOM.GetOuterHTML(ctx, dom.NewGetOuterHTMLArgs().SetNodeID(d.Root.NodeID))
2018-09-25 17:58:57 -04:00
if err != nil {
2018-09-26 22:03:06 -04:00
return dom.Node{}, values.EmptyString, err
2018-09-25 17:58:57 -04:00
}
2018-09-26 22:03:06 -04:00
return d.Root, values.NewString(innerHtml.OuterHTML), nil
2018-09-18 16:42:38 -04:00
}
2018-09-25 17:58:57 -04:00
func NewHtmlDocument(
2018-09-28 00:28:33 -04:00
logger *zerolog.Logger,
2018-09-25 17:58:57 -04:00
conn *rpcc.Conn,
client *cdp.Client,
broker *events.EventBroker,
2018-09-26 22:03:06 -04:00
root dom.Node,
innerHtml values.String,
2018-09-25 17:58:57 -04:00
) *HtmlDocument {
doc := new(HtmlDocument)
2018-09-28 00:28:33 -04:00
doc.logger = logger
2018-09-25 17:58:57 -04:00
doc.conn = conn
doc.client = client
doc.events = broker
2018-09-28 00:28:33 -04:00
doc.element = NewHtmlElement(doc.logger, client, broker, root.NodeID, root, innerHtml)
2018-09-25 17:58:57 -04:00
doc.url = ""
if root.BaseURL != nil {
doc.url = values.NewString(*root.BaseURL)
2018-09-25 17:58:57 -04:00
}
2018-09-25 17:58:57 -04:00
broker.AddEventListener("load", func(_ interface{}) {
doc.Lock()
defer doc.Unlock()
2018-09-18 16:42:38 -04:00
2018-09-26 22:03:06 -04:00
updated, innerHtml, err := getRootElement(client)
2018-09-25 17:58:57 -04:00
if err != nil {
2018-09-28 00:28:33 -04:00
doc.logger.Error().
Timestamp().
Err(err).
Msg("failed to get root node after page load")
2018-09-25 17:58:57 -04:00
return
}
2018-09-27 11:32:52 -04:00
// close the prev element
doc.element.Close()
2018-09-25 17:58:57 -04:00
// create a new root element wrapper
2018-09-28 00:28:33 -04:00
doc.element = NewHtmlElement(doc.logger, client, broker, updated.NodeID, updated, innerHtml)
2018-09-25 17:58:57 -04:00
doc.url = ""
if updated.BaseURL != nil {
doc.url = values.NewString(*updated.BaseURL)
2018-09-25 17:58:57 -04:00
}
})
return doc
}
func (doc *HtmlDocument) MarshalJSON() ([]byte, error) {
doc.Lock()
defer doc.Unlock()
return doc.element.MarshalJSON()
2018-09-18 16:42:38 -04:00
}
// Type reports the runtime type of the value, which is always
// core.HtmlDocumentType.
func (doc *HtmlDocument) Type() core.Type {
	return core.HtmlDocumentType
}
func (doc *HtmlDocument) String() string {
2018-09-25 17:58:57 -04:00
doc.Lock()
defer doc.Unlock()
return doc.url.String()
2018-09-18 16:42:38 -04:00
}
2018-09-25 17:58:57 -04:00
// Unwrap exposes the underlying value, which is the current root element.
func (doc *HtmlDocument) Unwrap() interface{} {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root
}
// Hash returns an integer digest derived from the document URL.
//
// The previous implementation returned the number of bytes written to the
// hasher (i.e. the URL length), so any two URLs of equal length collided.
func (doc *HtmlDocument) Hash() int {
	doc.Lock()
	defer doc.Unlock()

	sum := sha512.Sum512([]byte(doc.url))

	// Fold the leading 8 bytes of the digest (big-endian) into one integer.
	var folded uint64

	for _, b := range sum[:8] {
		folded = folded<<8 | uint64(b)
	}

	return int(folded)
}
2018-09-27 11:53:26 -04:00
// Clone does not copy the document; it always returns values.None.
func (doc *HtmlDocument) Clone() core.Value {
	return values.None
}
2018-09-18 16:42:38 -04:00
func (doc *HtmlDocument) Compare(other core.Value) int {
2018-09-25 17:58:57 -04:00
doc.Lock()
defer doc.Unlock()
2018-09-18 16:42:38 -04:00
switch other.Type() {
case core.HtmlDocumentType:
other := other.(*HtmlDocument)
return doc.url.Compare(other.url)
2018-09-18 16:42:38 -04:00
default:
if other.Type() > core.HtmlDocumentType {
return -1
}
return 1
}
}
2018-09-23 04:33:20 -04:00
2018-09-25 17:58:57 -04:00
func (doc *HtmlDocument) Close() error {
doc.Lock()
defer doc.Unlock()
2018-09-28 00:28:33 -04:00
var err error
err = doc.events.Stop()
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to stop event broker")
}
err = doc.events.Close()
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to close event broker")
}
err = doc.element.Close()
2018-09-25 17:58:57 -04:00
2018-09-28 00:28:33 -04:00
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to close root element")
}
err = doc.client.Page.Close(context.Background())
if err != nil {
doc.logger.Warn().
Timestamp().
Str("url", doc.url.String()).
Err(err).
Msg("failed to close browser page")
}
2018-09-25 17:58:57 -04:00
return doc.conn.Close()
}
// NodeType returns the node type reported by the current root element.
func (doc *HtmlDocument) NodeType() values.Int {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.NodeType()
}
// NodeName returns the node name reported by the current root element.
func (doc *HtmlDocument) NodeName() values.String {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.NodeName()
}
// Length returns the length reported by the current root element.
func (doc *HtmlDocument) Length() values.Int {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.Length()
}
// InnerText returns the inner text reported by the current root element.
func (doc *HtmlDocument) InnerText() values.String {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.InnerText()
}
// InnerHtml returns the inner HTML reported by the current root element.
func (doc *HtmlDocument) InnerHtml() values.String {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.InnerHtml()
}
// Value returns the value reported by the current root element.
func (doc *HtmlDocument) Value() core.Value {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.Value()
}
// GetAttributes returns the attributes of the current root element.
func (doc *HtmlDocument) GetAttributes() core.Value {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.GetAttributes()
}
// GetAttribute looks up the attribute called name on the current root element.
func (doc *HtmlDocument) GetAttribute(name values.String) core.Value {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.GetAttribute(name)
}
// GetChildNodes returns the child nodes of the current root element.
func (doc *HtmlDocument) GetChildNodes() core.Value {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.GetChildNodes()
}
// GetChildNode returns the child node at position idx of the current root
// element.
func (doc *HtmlDocument) GetChildNode(idx values.Int) core.Value {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.GetChildNode(idx)
}
// QuerySelector runs selector against the current root element and returns
// whatever that element's QuerySelector yields.
func (doc *HtmlDocument) QuerySelector(selector values.String) core.Value {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.QuerySelector(selector)
}
// QuerySelectorAll runs selector against the current root element and returns
// whatever that element's QuerySelectorAll yields.
func (doc *HtmlDocument) QuerySelectorAll(selector values.String) core.Value {
	doc.Lock()
	defer doc.Unlock()

	root := doc.element

	return root.QuerySelectorAll(selector)
}
// Url returns the current document URL.
//
// The URL is replaced by the "load" listener under the document lock, so it
// is read under the same lock here, like every other accessor of the type.
func (doc *HtmlDocument) Url() core.Value {
	doc.Lock()
	defer doc.Unlock()

	return doc.url
}
2018-09-27 22:03:35 -04:00
// InnerHtmlBySelector evaluates a script in the page that returns the inner
// HTML of the first element matching selector.
//
// It returns an empty string when nothing matches or when the script result
// is not a string, and the evaluation error when the script fails.
func (doc *HtmlDocument) InnerHtmlBySelector(selector values.String) (values.String, error) {
	// The DOM property is spelled innerHTML; "innerHtml" is undefined in
	// JavaScript, which made this method always return an empty string.
	res, err := eval.Eval(
		doc.client,
		fmt.Sprintf(`
			var el = document.querySelector(%s);
			if (el == null) {
				return "";
			}
			return el.innerHTML;
		`, eval.ParamString(selector.String())),
		true,
		false,
	)

	if err != nil {
		return values.EmptyString, err
	}

	if res.Type() == core.StringType {
		return res.(values.String), nil
	}

	return values.EmptyString, nil
}
// InnerHtmlBySelectorAll evaluates a script in the page that collects the
// inner HTML of every element matching selector.
//
// It returns an empty array when the script result is not an array, and an
// empty array together with the evaluation error when the script fails.
func (doc *HtmlDocument) InnerHtmlBySelectorAll(selector values.String) (*values.Array, error) {
	// The DOM property is spelled innerHTML; "innerHtml" is undefined in
	// JavaScript, which filled the result with undefined values.
	res, err := eval.Eval(
		doc.client,
		fmt.Sprintf(`
			var result = [];
			var elements = document.querySelectorAll(%s);
			if (elements == null) {
				return result;
			}
			elements.forEach((i) => {
				result.push(i.innerHTML);
			});
			return result;
		`, eval.ParamString(selector.String())),
		true,
		false,
	)

	if err != nil {
		return values.NewArray(0), err
	}

	if res.Type() == core.ArrayType {
		return res.(*values.Array), nil
	}

	return values.NewArray(0), nil
}
// InnerTextBySelector evaluates a script in the page that returns the inner
// text of the first element matching selector. An empty string is returned
// when nothing matches or when the script result is not a string.
func (doc *HtmlDocument) InnerTextBySelector(selector values.String) (values.String, error) {
	script := fmt.Sprintf(`
			var el = document.querySelector(%s);
			if (el == null) {
				return "";
			}
			return el.innerText;
		`, eval.ParamString(selector.String()))

	out, err := eval.Eval(doc.client, script, true, false)
	if err != nil {
		return values.EmptyString, err
	}

	if out.Type() != core.StringType {
		return values.EmptyString, nil
	}

	return out.(values.String), nil
}
// InnerTextBySelectorAll evaluates a script in the page that collects the
// inner text of every element matching selector. An empty array is returned
// when the script fails or when its result is not an array.
func (doc *HtmlDocument) InnerTextBySelectorAll(selector values.String) (*values.Array, error) {
	script := fmt.Sprintf(`
			var result = [];
			var elements = document.querySelectorAll(%s);
			if (elements == null) {
				return result;
			}
			elements.forEach((i) => {
				result.push(i.innerText);
			});
			return result;
		`, eval.ParamString(selector.String()))

	out, err := eval.Eval(doc.client, script, true, false)
	if err != nil {
		return values.NewArray(0), err
	}

	if out.Type() != core.ArrayType {
		return values.NewArray(0), nil
	}

	return out.(*values.Array), nil
}
func (doc *HtmlDocument) ClickBySelector(selector values.String) (values.Boolean, error) {
2018-09-25 17:58:57 -04:00
res, err := eval.Eval(
doc.client,
fmt.Sprintf(`
2018-09-27 21:41:41 -04:00
var el = document.querySelector(%s);
if (el == null) {
return false;
}
var evt = new window.MouseEvent('click', { bubbles: true });
el.dispatchEvent(evt);
2018-09-27 22:03:35 -04:00
return true;
`, eval.ParamString(selector.String())),
true,
false,
)
if err != nil {
return values.False, err
}
if res.Type() == core.BooleanType {
return res.(values.Boolean), nil
}
return values.False, nil
}
// ClickBySelectorAll dispatches a bubbling "click" mouse event on every
// element matching selector. values.False is returned when the script fails
// or when its result is not a boolean.
func (doc *HtmlDocument) ClickBySelectorAll(selector values.String) (values.Boolean, error) {
	script := fmt.Sprintf(`
			var elements = document.querySelectorAll(%s);
			if (elements == null) {
				return false;
			}
			elements.forEach((el) => {
				var evt = new window.MouseEvent('click', { bubbles: true });
				el.dispatchEvent(evt);
			});
			return true;
		`, eval.ParamString(selector.String()))

	out, err := eval.Eval(doc.client, script, true, false)
	if err != nil {
		return values.False, err
	}

	if out.Type() != core.BooleanType {
		return values.False, nil
	}

	return out.(values.Boolean), nil
}
// InputBySelector assigns the string form of value to the first element
// matching selector and dispatches a bubbling "input" event on it. The script
// yields false when nothing matches; values.False is also returned when the
// script result is not a boolean.
func (doc *HtmlDocument) InputBySelector(selector values.String, value core.Value) (values.Boolean, error) {
	target := eval.ParamString(selector.String())
	text := eval.ParamString(value.String())

	script := fmt.Sprintf(
		`
			var el = document.querySelector(%s);
			if (el == null) {
				return false;
			}
			var evt = new window.Event('input', { bubbles: true });
			el.value = %s
			el.dispatchEvent(evt);
			return true;
		`,
		target,
		text,
	)

	out, err := eval.Eval(doc.client, script, true, false)
	if err != nil {
		return values.False, err
	}

	if out.Type() != core.BooleanType {
		return values.False, nil
	}

	return out.(values.Boolean), nil
}
2018-09-23 04:33:20 -04:00
func (doc *HtmlDocument) WaitForSelector(selector values.String, timeout values.Int) error {
2018-09-25 17:58:57 -04:00
task := events.NewWaitTask(
2018-09-23 04:33:20 -04:00
doc.client,
fmt.Sprintf(`
2018-09-27 21:41:41 -04:00
el = document.querySelector(%s);
2018-09-23 04:33:20 -04:00
if (el != null) {
return true;
}
return null;
2018-09-27 21:41:41 -04:00
`, eval.ParamString(selector.String())),
2018-09-23 04:33:20 -04:00
time.Millisecond*time.Duration(timeout),
2018-09-25 17:58:57 -04:00
events.DefaultPolling,
2018-09-23 04:33:20 -04:00
)
_, err := task.Run()
return err
}
2018-09-25 17:58:57 -04:00
func (doc *HtmlDocument) WaitForNavigation(timeout values.Int) error {
timer := time.NewTimer(time.Millisecond * time.Duration(timeout))
onEvent := make(chan bool)
listener := func(_ interface{}) {
onEvent <- true
}
2018-09-25 17:58:57 -04:00
defer doc.events.RemoveEventListener("load", listener)
defer close(onEvent)
2018-09-25 17:58:57 -04:00
doc.events.AddEventListener("load", listener)
2018-09-25 17:58:57 -04:00
for {
select {
case <-onEvent:
timer.Stop()
2018-09-25 17:58:57 -04:00
return nil
case <-timer.C:
return core.ErrTimeout
}
}
}
2018-09-25 19:04:07 -04:00
// Navigate asks the page to open url and then waits for the load event.
// A transport error is returned as is; an error text reported in the
// navigation reply is turned into an error.
func (doc *HtmlDocument) Navigate(url values.String) error {
	ctx := context.Background()
	args := page.NewNavigateArgs(url.String())

	reply, err := doc.client.Page.Navigate(ctx, args)

	switch {
	case err != nil:
		return err
	case reply.ErrorText != nil:
		return errors.New(*reply.ErrorText)
	}

	return waitForLoadEvent(ctx, doc.client)
}