2018-11-08 12:02:28 +02:00
|
|
|
browserless
|
|
|
|
===========
|
|
|
|
|
2018-11-13 11:38:17 +02:00
|
|
|
[Browserless][1] makes it easy to run your puppeteer scripts in an optimized
|
|
|
|
way. It takes care of all the binaries and the management of Chrome so you don't have
|
|
|
|
to.
|
|
|
|
|
2018-11-08 12:02:28 +02:00
|
|
|
## docker-compose.yml
|
|
|
|
|
|
|
|
```yaml
|
2023-02-22 12:38:45 +02:00
|
|
|
version: '3.8'
|
|
|
|
services:
|
|
|
|
browserless:
|
|
|
|
image: browserless/chrome:1-chrome-stable
|
|
|
|
ports:
|
|
|
|
- "3000:3000"
|
|
|
|
environment:
|
|
|
|
- DEBUG=browserless/chrome
|
|
|
|
- MAX_CONCURRENT_SESSIONS=100
|
|
|
|
- CONNECTION_TIMEOUT=300000
|
|
|
|
- MAX_QUEUE_LENGTH=100
|
|
|
|
- ENABLE_CORS=true
|
|
|
|
- ENABLE_DEBUG_VIEWER=true
|
|
|
|
- EXIT_ON_HEALTH_FAILURE=true
|
2023-11-24 11:13:11 +02:00
|
|
|
# TOKEN=4e2a9c32-2854-479a-a9f3-ba8899f2fdc1
|
2023-02-22 12:38:45 +02:00
|
|
|
restart: unless-stopped
|
2018-11-08 12:02:28 +02:00
|
|
|
```
|
|
|
|
|
|
|
|
## screenshot.js
|
|
|
|
|
|
|
|
```javascript
|
|
|
|
'use strict';
|
|
|
|
|
2023-02-22 12:38:45 +02:00
|
|
|
const puppeteer = require('puppeteer-core');
|
2018-11-08 12:02:28 +02:00
|
|
|
|
|
|
|
// Connect to the Browserless WebSocket endpoint, load the Google home page,
// and save a full-page screenshot to google.png.
(async () => {
  const browser = await puppeteer.connect({browserWSEndpoint: 'ws://localhost:3000'});
  try {
    const page = await browser.newPage();
    await page.setViewport({width: 1280, height: 720});
    await page.goto('https://www.google.com/', {waitUntil: 'networkidle2'});
    // Make sure the Google image is in the DOM before capturing.
    await page.waitForSelector('img[alt="Google"]');
    await page.screenshot({path: 'google.png', fullPage: true});
  } finally {
    // Always close, so a failed navigation or wait does not leave the
    // remote browser session open.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure and signal it to the shell instead of leaving the
  // rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
|
|
|
|
```
|
|
|
|
|
2023-02-22 12:38:45 +02:00
|
|
|
## screenshot.py
|
|
|
|
|
|
|
|
```python
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
from selenium import webdriver
|
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
|
|
|
|
# Chrome command-line flags passed to the remote browser.
options = webdriver.ChromeOptions()
options.add_argument("--window-size=1024,768")
options.add_argument('--headless')
options.add_argument('--no-sandbox')

# Open a WebDriver session against the service's /webdriver endpoint.
driver = webdriver.Remote(
    command_executor='http://localhost:3000/webdriver',
    options=options
)

try:
    driver.get('https://www.google.com/')

    # Wait up to 10 seconds for the Google image to be present in the DOM
    # before taking the screenshot.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'img[alt="Google"]'))
    )

    driver.save_screenshot('google.png')
finally:
    # quit() closes every window and ends the session, so a separate close()
    # is unnecessary; running it in finally releases the remote session even
    # when navigation or the wait fails.
    driver.quit()
|
|
|
|
```
|
|
|
|
|
2018-11-08 12:02:28 +02:00
|
|
|
## Up and Running
|
|
|
|
|
|
|
|
```bash
|
|
|
|
$ docker-compose up -d
|
2018-11-13 11:38:17 +02:00
|
|
|
|
2024-04-15 09:00:40 +02:00
|
|
|
$ crontab -l
|
|
|
|
0 */4 * * * docker restart browserless-browserless-1
|
|
|
|
5 * * * * docker exec browserless-browserless-1 find /tmp -name '.com.google.Chrome.*' -mmin +60 -exec rm -rf {} \;
|
|
|
|
|
2023-02-22 12:38:45 +02:00
|
|
|
$ npm install puppeteer-core
|
2018-11-08 12:02:28 +02:00
|
|
|
$ node screenshot.js
|
|
|
|
$ imgcat google.png
|
2018-11-13 11:38:17 +02:00
|
|
|
|
2023-02-22 12:38:45 +02:00
|
|
|
$ pip install selenium
|
|
|
|
$ python screenshot.py
|
|
|
|
$ imgcat google.png
|
|
|
|
|
2024-02-23 18:23:20 +02:00
|
|
|
$ http :3000/screenshot \
|
2018-11-13 11:38:17 +02:00
|
|
|
url=https://www.youtube.com \
|
2023-02-22 12:38:45 +02:00
|
|
|
options[fullPage]:=true \
|
|
|
|
gotoOptions[waitUntil]=networkidle2 > youtube.png
|
2018-11-13 11:38:17 +02:00
|
|
|
$ imgcat youtube.png
|
2024-02-23 18:23:20 +02:00
|
|
|
|
|
|
|
$ http :3000/scrape url=https://www.youtube.com elements[0][selector]=title debug[network]:=true |
|
|
|
|
jq -r '.debug.network.outbound[].url' |
|
|
|
|
xargs -r tldextract -j |
|
|
|
|
jq -r 'select(.fqdn|length>0).fqdn' |
|
|
|
|
sort -u
|
|
|
|
accounts.google.com
|
|
|
|
fonts.googleapis.com
|
|
|
|
fonts.gstatic.com
|
|
|
|
googleads.g.doubleclick.net
|
|
|
|
i.ytimg.com
|
|
|
|
www.youtube.com
|
2018-11-08 12:02:28 +02:00
|
|
|
```
|
2018-11-13 11:38:17 +02:00
|
|
|
|
|
|
|
[1]: https://docs.browserless.io/
|