mirror of
https://github.com/vimagick/dockerfiles.git
synced 2025-01-24 05:17:06 +02:00
update crawlee
This commit is contained in:
parent
30e5f41486
commit
50e7ed4679
@ -1,29 +0,0 @@
|
|||||||
# Specify the base Docker image. You can read more about
|
|
||||||
# the available images at https://crawlee.dev/docs/guides/docker-images
|
|
||||||
# You can also use any other image from Docker Hub.
|
|
||||||
FROM apify/actor-node:16
|
|
||||||
|
|
||||||
# Copy just package.json and package-lock.json
|
|
||||||
# to speed up the build using Docker layer cache.
|
|
||||||
COPY package*.json ./
|
|
||||||
|
|
||||||
# Install NPM packages, skip optional and development dependencies to
|
|
||||||
# keep the image small. Avoid logging too much and print the dependency
|
|
||||||
# tree for debugging
|
|
||||||
RUN npm --quiet set progress=false \
|
|
||||||
&& npm install --omit=dev --omit=optional \
|
|
||||||
&& echo "Installed NPM packages:" \
|
|
||||||
&& (npm list --omit=dev --all || true) \
|
|
||||||
&& echo "Node.js version:" \
|
|
||||||
&& node --version \
|
|
||||||
&& echo "NPM version:" \
|
|
||||||
&& npm --version
|
|
||||||
|
|
||||||
# Next, copy the remaining files and directories with the source code.
|
|
||||||
# Since we do this after NPM install, quick build will be really fast
|
|
||||||
# for most source file changes.
|
|
||||||
COPY . ./
|
|
||||||
|
|
||||||
|
|
||||||
# Run the image.
|
|
||||||
CMD npm start --silent
|
|
@ -5,23 +5,16 @@ crawlee
|
|||||||
scraping and browser automation library.
|
scraping and browser automation library.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ docker-compose build
|
$ docker run --rm -e PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 -e PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 -v $PWD:/tmp -w /tmp apify/actor-node:16 npx crawlee create -t cheerio-js my-crawler
|
||||||
Building crawlee
|
|
||||||
Successfully built xxxxxxxxxxxx
|
|
||||||
Successfully tagged crawlee:latest
|
|
||||||
|
|
||||||
$ docker-compose run --rm crawlee
|
$ docker-compose build my-crawler
|
||||||
INFO BasicCrawler: Starting the crawl
|
|
||||||
INFO BasicCrawler: Processing ...
|
|
||||||
Crawler finished.
|
|
||||||
|
|
||||||
$ tree data
|
$ docker-compose run --rm my-crawler
|
||||||
|
|
||||||
|
$ tree my-crawler/storage/
|
||||||
├── datasets
|
├── datasets
|
||||||
│ └── default
|
│ └── default
|
||||||
│ ├── 000000001.json
|
│ └── 000000001.json
|
||||||
│ ├── 000000002.json
|
|
||||||
│ ├── 000000003.json
|
|
||||||
│ └── 000000004.json
|
|
||||||
├── key_value_stores
|
├── key_value_stores
|
||||||
└── request_queues
|
└── request_queues
|
||||||
```
|
```
|
||||||
|
@ -1,7 +1,11 @@
|
|||||||
version: "3.8"
|
version: "3.8"
|
||||||
|
|
||||||
services:
|
services:
|
||||||
crawlee:
|
|
||||||
image: crawlee
|
my-crawler:
|
||||||
build: .
|
image: my-crawler
|
||||||
|
build:
|
||||||
|
context: my-crawler
|
||||||
|
dockerfile: Dockerfile
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/usr/src/app/storage
|
- ./my-crawler/storage:/usr/src/app/storage
|
||||||
|
@ -1,35 +0,0 @@
|
|||||||
import { BasicCrawler, Dataset } from 'crawlee';
|
|
||||||
|
|
||||||
// Create a BasicCrawler - the simplest crawler that enables
|
|
||||||
// users to implement the crawling logic themselves.
|
|
||||||
const crawler = new BasicCrawler({
|
|
||||||
// This function will be called for each URL to crawl.
|
|
||||||
async requestHandler({ request, sendRequest, log }) {
|
|
||||||
const { url } = request;
|
|
||||||
log.info(`Processing ${url}...`);
|
|
||||||
|
|
||||||
// Fetch the page HTML via the crawlee sendRequest utility method
|
|
||||||
// By default, the method will use the current request that is being handled, so you don't have to
|
|
||||||
// provide it yourself. You can also provide a custom request if you want.
|
|
||||||
const { body } = await sendRequest();
|
|
||||||
|
|
||||||
// Store the HTML and URL to the default dataset.
|
|
||||||
await Dataset.pushData({
|
|
||||||
url,
|
|
||||||
html: body,
|
|
||||||
});
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
// The initial list of URLs to crawl. Here we use just a few hard-coded URLs.
|
|
||||||
await crawler.addRequests([
|
|
||||||
'https://www.google.com',
|
|
||||||
'https://www.example.com',
|
|
||||||
'https://www.bing.com',
|
|
||||||
'https://www.wikipedia.com',
|
|
||||||
]);
|
|
||||||
|
|
||||||
// Run the crawler and wait for it to finish.
|
|
||||||
await crawler.run();
|
|
||||||
|
|
||||||
console.log('Crawler finished.');
|
|
@ -1,14 +0,0 @@
|
|||||||
{
|
|
||||||
"description": "Crawlee Demo Project",
|
|
||||||
"version": "0.0.1",
|
|
||||||
"license": "UNLICENSED",
|
|
||||||
"type": "module",
|
|
||||||
"main": "main.js",
|
|
||||||
"scripts": {
|
|
||||||
"start": "node main.js"
|
|
||||||
},
|
|
||||||
"dependencies": {
|
|
||||||
"crawlee": "*"
|
|
||||||
},
|
|
||||||
"repository": {}
|
|
||||||
}
|
|
Loading…
x
Reference in New Issue
Block a user