2018-12-22 06:14:41 +02:00
|
|
|
package cdp
|
2018-09-27 04:03:06 +02:00
|
|
|
|
|
|
|
import (
|
2018-10-09 02:20:40 +02:00
|
|
|
"bytes"
|
2018-09-27 04:03:06 +02:00
|
|
|
"context"
|
2018-11-12 21:53:36 +02:00
|
|
|
"errors"
|
2019-06-19 23:58:56 +02:00
|
|
|
"golang.org/x/net/html"
|
2019-02-13 19:31:18 +02:00
|
|
|
"strings"
|
2019-03-16 01:59:05 +02:00
|
|
|
"time"
|
2019-02-13 19:31:18 +02:00
|
|
|
|
2019-03-16 01:59:05 +02:00
|
|
|
"github.com/MontFerret/ferret/pkg/drivers"
|
2018-12-22 06:14:41 +02:00
|
|
|
"github.com/MontFerret/ferret/pkg/drivers/cdp/eval"
|
|
|
|
"github.com/MontFerret/ferret/pkg/drivers/common"
|
2018-09-27 04:03:06 +02:00
|
|
|
"github.com/MontFerret/ferret/pkg/runtime/values"
|
2019-06-20 19:21:48 +02:00
|
|
|
|
2018-10-09 02:20:40 +02:00
|
|
|
"github.com/PuerkitoBio/goquery"
|
2018-09-27 04:03:06 +02:00
|
|
|
"github.com/mafredri/cdp"
|
|
|
|
"github.com/mafredri/cdp/protocol/dom"
|
2019-03-16 01:59:05 +02:00
|
|
|
"github.com/mafredri/cdp/protocol/network"
|
2018-09-27 04:03:06 +02:00
|
|
|
"github.com/mafredri/cdp/protocol/page"
|
2018-11-12 21:53:36 +02:00
|
|
|
"github.com/mafredri/cdp/protocol/runtime"
|
2019-02-20 01:10:18 +02:00
|
|
|
"golang.org/x/sync/errgroup"
|
2018-09-27 04:03:06 +02:00
|
|
|
)
|
|
|
|
|
2019-03-16 01:59:05 +02:00
|
|
|
// emptyExpires is the zero time.Time. fromDriverCookie compares a cookie's
// Expires field against it to detect a cookie without an expiration.
var emptyExpires time.Time
|
|
|
|
|
2018-11-15 21:33:53 +02:00
|
|
|
// batchFunc is the signature of a unit of work accepted by runBatch.
type batchFunc = func() error
|
2018-09-27 04:03:06 +02:00
|
|
|
|
|
|
|
func runBatch(funcs ...batchFunc) error {
|
|
|
|
eg := errgroup.Group{}
|
|
|
|
|
|
|
|
for _, f := range funcs {
|
|
|
|
eg.Go(f)
|
|
|
|
}
|
|
|
|
|
|
|
|
return eg.Wait()
|
|
|
|
}
|
|
|
|
|
|
|
|
func parseAttrs(attrs []string) *values.Object {
|
|
|
|
var attr values.String
|
|
|
|
|
|
|
|
res := values.NewObject()
|
|
|
|
|
|
|
|
for _, el := range attrs {
|
2018-10-07 04:33:39 +02:00
|
|
|
el = strings.TrimSpace(el)
|
2018-09-27 04:03:06 +02:00
|
|
|
str := values.NewString(el)
|
|
|
|
|
|
|
|
if common.IsAttribute(el) {
|
|
|
|
attr = str
|
|
|
|
res.Set(str, values.EmptyString)
|
|
|
|
} else {
|
|
|
|
current, ok := res.Get(attr)
|
|
|
|
|
|
|
|
if ok {
|
2018-10-07 04:33:39 +02:00
|
|
|
if current.String() != "" {
|
|
|
|
res.Set(attr, current.(values.String).Concat(values.SpaceString).Concat(str))
|
|
|
|
} else {
|
|
|
|
res.Set(attr, str)
|
|
|
|
}
|
2018-09-27 04:03:06 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return res
|
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
func loadInnerHTML(ctx context.Context, client *cdp.Client, exec *eval.ExecutionContext, id HTMLElementIdentity, nodeType html.NodeType) (values.String, error) {
|
|
|
|
// not a document
|
|
|
|
if nodeType != html.DocumentNode {
|
|
|
|
var objID runtime.RemoteObjectID
|
2018-09-27 06:26:56 +02:00
|
|
|
|
2019-07-03 20:05:02 +02:00
|
|
|
if id.objectID != "" {
|
2019-06-19 23:58:56 +02:00
|
|
|
objID = id.objectID
|
2019-07-03 20:05:02 +02:00
|
|
|
} else {
|
2019-06-19 23:58:56 +02:00
|
|
|
repl, err := client.DOM.ResolveNode(ctx, dom.NewResolveNodeArgs().SetNodeID(id.nodeID))
|
2018-11-12 21:53:36 +02:00
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
2018-11-12 21:53:36 +02:00
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
if repl.Object.ObjectID == nil {
|
|
|
|
return "", errors.New("unable to resolve node")
|
|
|
|
}
|
|
|
|
|
|
|
|
objID = *repl.Object.ObjectID
|
2018-11-12 21:53:36 +02:00
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
res, err := exec.ReadProperty(ctx, objID, "innerHTML")
|
2018-11-22 05:45:00 +02:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
return values.NewString(res.String()), nil
|
2018-11-22 05:45:00 +02:00
|
|
|
}
|
|
|
|
|
2019-07-03 20:05:02 +02:00
|
|
|
repl, err := exec.EvalWithValue(ctx, "return document.documentElement.innerHTML")
|
2018-11-22 05:45:00 +02:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
return values.NewString(repl.String()), nil
|
2018-11-22 05:45:00 +02:00
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
func loadInnerHTMLByNodeID(ctx context.Context, client *cdp.Client, exec *eval.ExecutionContext, nodeID dom.NodeID) (values.String, error) {
|
|
|
|
node, err := client.DOM.DescribeNode(ctx, dom.NewDescribeNodeArgs().SetNodeID(nodeID))
|
2018-11-22 05:45:00 +02:00
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
if err != nil {
|
|
|
|
return values.EmptyString, err
|
|
|
|
}
|
2018-11-22 05:45:00 +02:00
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
return loadInnerHTML(ctx, client, exec, HTMLElementIdentity{
|
|
|
|
nodeID: nodeID,
|
|
|
|
}, common.ToHTMLType(node.Node.NodeType))
|
|
|
|
}
|
2018-11-22 05:45:00 +02:00
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
func loadInnerText(ctx context.Context, client *cdp.Client, exec *eval.ExecutionContext, id HTMLElementIdentity, nodeType html.NodeType) (values.String, error) {
|
|
|
|
// not a document
|
|
|
|
if nodeType != html.DocumentNode {
|
|
|
|
var objID runtime.RemoteObjectID
|
2018-11-22 05:45:00 +02:00
|
|
|
|
2019-07-03 20:05:02 +02:00
|
|
|
if id.objectID != "" {
|
2019-06-19 23:58:56 +02:00
|
|
|
objID = id.objectID
|
2019-07-03 20:05:02 +02:00
|
|
|
} else {
|
2019-06-19 23:58:56 +02:00
|
|
|
repl, err := client.DOM.ResolveNode(ctx, dom.NewResolveNodeArgs().SetNodeID(id.nodeID))
|
2018-11-22 05:45:00 +02:00
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
|
|
|
|
if repl.Object.ObjectID == nil {
|
|
|
|
return "", errors.New("unable to resolve node")
|
|
|
|
}
|
|
|
|
|
|
|
|
objID = *repl.Object.ObjectID
|
|
|
|
}
|
|
|
|
|
|
|
|
res, err := exec.ReadProperty(ctx, objID, "innerText")
|
2018-11-22 05:45:00 +02:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
|
|
|
|
return values.NewString(res.String()), err
|
|
|
|
}
|
|
|
|
|
2019-07-03 20:05:02 +02:00
|
|
|
repl, err := exec.EvalWithValue(ctx, "return document.documentElement.innerText")
|
2018-10-09 02:20:40 +02:00
|
|
|
|
|
|
|
if err != nil {
|
2018-10-11 18:39:03 +02:00
|
|
|
return "", err
|
2018-10-09 02:20:40 +02:00
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
return values.NewString(repl.String()), nil
|
2018-10-09 02:20:40 +02:00
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
//func loadInnerTextByNodeID(ctx context.Context, client *cdp.Client, exec *eval.ExecutionContext, nodeID dom.NodeID) (values.String, error) {
|
|
|
|
// node, err := client.DOM.DescribeNode(ctx, dom.NewDescribeNodeArgs().SetNodeID(nodeID))
|
|
|
|
//
|
|
|
|
// if err != nil {
|
|
|
|
// return values.EmptyString, err
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// return loadInnerText(ctx, client, exec, HTMLElementIdentity{
|
|
|
|
// nodeID: nodeID,
|
|
|
|
// }, common.ToHTMLType(node.Node.NodeType))
|
|
|
|
//}
|
|
|
|
|
2018-10-09 02:20:40 +02:00
|
|
|
func parseInnerText(innerHTML string) (values.String, error) {
|
|
|
|
buff := bytes.NewBuffer([]byte(innerHTML))
|
|
|
|
|
|
|
|
parsed, err := goquery.NewDocumentFromReader(buff)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return values.EmptyString, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return values.NewString(parsed.Text()), nil
|
|
|
|
}
|
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
func createChildrenArray(nodes []dom.Node) []HTMLElementIdentity {
|
|
|
|
children := make([]HTMLElementIdentity, len(nodes))
|
2018-09-27 06:26:56 +02:00
|
|
|
|
|
|
|
for idx, child := range nodes {
|
2019-06-19 23:58:56 +02:00
|
|
|
child := child
|
|
|
|
children[idx] = HTMLElementIdentity{
|
2019-07-03 20:05:02 +02:00
|
|
|
nodeID: child.NodeID,
|
2018-09-27 04:03:06 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-11 18:39:03 +02:00
|
|
|
return children
|
2018-09-27 04:03:06 +02:00
|
|
|
}
|
|
|
|
|
2019-03-16 01:59:05 +02:00
|
|
|
func fromDriverCookie(url string, cookie drivers.HTTPCookie) network.CookieParam {
|
|
|
|
sameSite := network.CookieSameSiteNotSet
|
|
|
|
|
|
|
|
switch cookie.SameSite {
|
|
|
|
case drivers.SameSiteLaxMode:
|
|
|
|
sameSite = network.CookieSameSiteLax
|
|
|
|
case drivers.SameSiteStrictMode:
|
|
|
|
sameSite = network.CookieSameSiteStrict
|
|
|
|
}
|
|
|
|
|
|
|
|
if cookie.Expires == emptyExpires {
|
|
|
|
cookie.Expires = time.Now().Add(time.Duration(24) + time.Hour)
|
|
|
|
}
|
|
|
|
|
|
|
|
normalizedURL := normalizeCookieURL(url)
|
|
|
|
|
|
|
|
return network.CookieParam{
|
|
|
|
URL: &normalizedURL,
|
|
|
|
Name: cookie.Name,
|
|
|
|
Value: cookie.Value,
|
|
|
|
Secure: &cookie.Secure,
|
|
|
|
Path: &cookie.Path,
|
|
|
|
Domain: &cookie.Domain,
|
|
|
|
HTTPOnly: &cookie.HTTPOnly,
|
|
|
|
SameSite: sameSite,
|
|
|
|
Expires: network.TimeSinceEpoch(cookie.Expires.Unix()),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func fromDriverCookieDelete(url string, cookie drivers.HTTPCookie) *network.DeleteCookiesArgs {
|
|
|
|
normalizedURL := normalizeCookieURL(url)
|
|
|
|
|
|
|
|
return &network.DeleteCookiesArgs{
|
|
|
|
URL: &normalizedURL,
|
|
|
|
Name: cookie.Name,
|
|
|
|
Path: &cookie.Path,
|
|
|
|
Domain: &cookie.Domain,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func toDriverCookie(c network.Cookie) drivers.HTTPCookie {
|
|
|
|
sameSite := drivers.SameSiteDefaultMode
|
|
|
|
|
|
|
|
switch c.SameSite {
|
|
|
|
case network.CookieSameSiteLax:
|
|
|
|
sameSite = drivers.SameSiteLaxMode
|
|
|
|
case network.CookieSameSiteStrict:
|
|
|
|
sameSite = drivers.SameSiteStrictMode
|
|
|
|
}
|
|
|
|
|
|
|
|
return drivers.HTTPCookie{
|
|
|
|
Name: c.Name,
|
|
|
|
Value: c.Value,
|
|
|
|
Path: c.Path,
|
|
|
|
Domain: c.Domain,
|
|
|
|
Expires: time.Unix(int64(c.Expires), 0),
|
|
|
|
SameSite: sameSite,
|
|
|
|
Secure: c.Secure,
|
|
|
|
HTTPOnly: c.HTTPOnly,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// normalizeCookieURL makes sure url carries an HTTP(S) scheme.
//
// A url that already starts with "http://" or "https://" is returned
// unchanged. The scheme is matched case-insensitively, because URL schemes
// are case-insensitive; previously an input such as "HTTPS://host" was
// turned into "http://HTTPS://host". Any other input, including the empty
// string, is returned with "http://" prepended.
func normalizeCookieURL(url string) string {
	const httpPrefix = "http://"
	const httpsPrefix = "https://"

	for _, prefix := range [...]string{httpPrefix, httpsPrefix} {
		// The prefixes are pure ASCII, so slicing by byte length is safe
		// whenever url is at least that long.
		if len(url) >= len(prefix) && strings.EqualFold(url[:len(prefix)], prefix) {
			return url
		}
	}

	return httpPrefix + url
}
|
2019-04-13 03:05:11 +02:00
|
|
|
|
2019-06-19 23:58:56 +02:00
|
|
|
func resolveFrame(ctx context.Context, client *cdp.Client, frame page.Frame) (dom.Node, runtime.ExecutionContextID, error) {
|
|
|
|
worldRepl, err := client.Page.CreateIsolatedWorld(ctx, page.NewCreateIsolatedWorldArgs(frame.ID))
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return dom.Node{}, -1, err
|
|
|
|
}
|
|
|
|
|
|
|
|
evalRes, err := client.Runtime.Evaluate(
|
|
|
|
ctx,
|
|
|
|
runtime.NewEvaluateArgs(eval.PrepareEval("return document")).
|
|
|
|
SetContextID(worldRepl.ExecutionContextID),
|
|
|
|
)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return dom.Node{}, -1, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if evalRes.ExceptionDetails != nil {
|
|
|
|
exception := *evalRes.ExceptionDetails
|
|
|
|
|
|
|
|
return dom.Node{}, -1, errors.New(exception.Text)
|
|
|
|
}
|
|
|
|
|
|
|
|
if evalRes.Result.ObjectID == nil {
|
|
|
|
return dom.Node{}, -1, errors.New("failed to resolve frame document")
|
|
|
|
}
|
|
|
|
|
|
|
|
req, err := client.DOM.RequestNode(ctx, dom.NewRequestNodeArgs(*evalRes.Result.ObjectID))
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return dom.Node{}, -1, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if req.NodeID == 0 {
|
|
|
|
return dom.Node{}, -1, errors.New("framed document is resolved with empty node id")
|
|
|
|
}
|
|
|
|
|
|
|
|
desc, err := client.DOM.DescribeNode(
|
|
|
|
ctx,
|
|
|
|
dom.
|
|
|
|
NewDescribeNodeArgs().
|
|
|
|
SetNodeID(req.NodeID).
|
|
|
|
SetDepth(1),
|
|
|
|
)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return dom.Node{}, -1, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returned node, by some reason, does not contain the NodeID
|
|
|
|
// So, we have to set it manually
|
|
|
|
desc.Node.NodeID = req.NodeID
|
|
|
|
|
|
|
|
return desc.Node, worldRepl.ExecutionContextID, nil
|
|
|
|
}
|