mirror of https://github.com/vimagick/dockerfiles.git synced 2024-11-28 09:08:36 +02:00

History

kev 5540773d96 update browserless		2024-07-16 18:33:42 +08:00
..
docker-compose-v2.yml	update browserless	2024-07-16 18:33:42 +08:00
docker-compose.yml	update	2023-11-24 17:13:11 +08:00
docker-stack.yml	fix browserless logging	2023-10-24 17:38:05 +08:00
README.md	update browserless	2024-06-06 16:03:48 +08:00

README.md

browserless

Browserless makes it easy to run your Puppeteer scripts in an optimized way. It takes care of installing the binaries and managing Chrome so you don't have to.

screenshot.js

'use strict';

const puppeteer = require('puppeteer-core');

// Connect to the browser exposed at ws://localhost:3000, load google.com,
// and save a full-page screenshot to google.png.
(async() => {
  const browser = await puppeteer.connect({browserWSEndpoint: 'ws://localhost:3000'});
  try {
    const page = await browser.newPage();
    await page.setViewport({width: 1280, height: 720});
    // networkidle2 lets the page settle before the selector wait below.
    await page.goto('https://www.google.com/', {waitUntil: 'networkidle2'});
    await page.waitForSelector('img[alt="Google"]');
    await page.screenshot({path: 'google.png', fullPage: true});
  } finally {
    // Always release the remote browser, even when a step above throws,
    // so a failed run does not leave a session open on the server.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure and exit non-zero instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

screenshot.py

#!/usr/bin/env python

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Chrome flags for the remote session: fixed window size, headless, no sandbox.
options = webdriver.ChromeOptions()
options.add_argument("--window-size=1024,768")
options.add_argument('--headless')
options.add_argument('--no-sandbox')

# Open a remote WebDriver session against the endpoint on localhost:3000.
driver = webdriver.Remote(
    command_executor='http://localhost:3000/webdriver',
    options=options
)

try:
    driver.get('https://www.google.com/')

    # Wait up to 10 seconds for the logo image to be present in the DOM.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'img[alt="Google"]'))
    )

    driver.save_screenshot('google.png')
finally:
    # Runs even when navigation or the wait raises, so the remote session is
    # not leaked. quit() ends the whole session, so a separate close() call
    # is unnecessary.
    driver.quit()

Up and Running

$ docker-compose up -d

$ crontab -l
0 */4 * * * docker restart browserless-browserless-1
5 *   * * * docker exec browserless-browserless-1 find /tmp -name '.com.google.Chrome.*' -mmin +60 -exec rm -rf {} \;

$ npm install puppeteer-core
$ node screenshot.js
$ imgcat google.png

$ pip install selenium
$ python screenshot.py
$ imgcat google.png

$ http :3000/screenshot \
       url=https://www.youtube.com \
       options[fullPage]:=true \
       gotoOptions[waitUntil]=networkidle2 > youtube.png
$ imgcat youtube.png

$ http :3000/scrape url=https://www.youtube.com elements[0][selector]=title debug[network]:=true |
    jq -r '.debug.network.outbound[].url' |
      xargs -r tldextract -j |
        jq -r 'select(.fqdn|length>0).fqdn' |
          sort -u
accounts.google.com
fonts.googleapis.com
fonts.gstatic.com
googleads.g.doubleclick.net
i.ytimg.com
www.youtube.com

Waiting for Condition (v2)

Async functions are supported for `waitForFunction`.

$ cat fn.js
async () => {
  // Poll once per second and settle as soon as #tryit-data exists and its
  // text contains "country".
  await new Promise((done) => {
    const timer = setInterval(() => {
      const target = document.querySelector('#tryit-data');
      if (!target || !target.innerText.includes('country')) {
        return; // not ready yet; try again on the next tick
      }
      clearInterval(timer);
      done();
    }, 1000);
  });
}

$ http '127.0.0.1:3000/content?token=1234567890&stealth&--proxy-server=http://x.x.x.x:8080' url='https://ipinfo.io' waitForFunction[fn]=@fn.js waitForFunction[timeout]:=10000 > ipinfo.html

Intercepting HTTP traffic (v2)

$ cat function.js
/**
 * Loads https://ipinfo.io/ and returns the parsed JSON body of the first
 * response whose URL contains the widget demo endpoint.
 *
 * @param {{ page: object }} context - object carrying the Puppeteer page to drive.
 * @returns {Promise<any>} the JSON payload of the intercepted response.
 */
export default async function ({ page }) {
  const url = 'https://ipinfo.io/widget/demo/';
  // Start listening before navigating so the response cannot be missed, and
  // await both together so a failed navigation does not leave the listener
  // promise to reject unhandled.
  const [res] = await Promise.all([
    page.waitForResponse((candidate) => candidate.url().includes(url)),
    page.goto('https://ipinfo.io/'),
  ]);
  return res.json();
}

$ http '127.0.0.1:3000/function?token=1234567890' Content-Type:application/javascript < function.js > ipinfo.json