mirror of https://github.com/vimagick/dockerfiles.git (synced 2025-01-08 04:04:42 +02:00)

commit 50e7ed4679 (parent 30e5f41486)

    update crawlee
Dockerfile (deleted)
@@ -1,29 +0,0 @@
-# Specify the base Docker image. You can read more about
-# the available images at https://crawlee.dev/docs/guides/docker-images
-# You can also use any other image from Docker Hub.
-FROM apify/actor-node:16
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY package*.json ./
-
-# Install NPM packages, skip optional and development dependencies to
-# keep the image small. Avoid logging too much and print the dependency
-# tree for debugging
-RUN npm --quiet set progress=false \
-    && npm install --omit=dev --omit=optional \
-    && echo "Installed NPM packages:" \
-    && (npm list --omit=dev --all || true) \
-    && echo "Node.js version:" \
-    && node --version \
-    && echo "NPM version:" \
-    && npm --version
-
-# Next, copy the remaining files and directories with the source code.
-# Since we do this after NPM install, quick build will be really fast
-# for most source file changes.
-COPY . ./
-
-# Run the image.
-CMD npm start --silent
README.md
@@ -5,23 +5,16 @@ crawlee
 scraping and browser automation library.
 
 ```bash
-$ docker-compose build
-Building crawlee
-Successfully built xxxxxxxxxxxx
-Successfully tagged crawlee:latest
+$ docker run --rm -e PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 -e PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 -v $PWD:/tmp -w /tmp apify/actor-node:16 npx crawlee create -t cheerio-js my-crawler
 
-$ docker-compose run --rm crawlee
-INFO BasicCrawler: Starting the crawl
-INFO BasicCrawler: Processing ...
-Crawler finished.
+$ docker-compose build my-crawler
 
-$ tree data
+$ docker-compose run --rm my-crawler
+
+$ tree my-crawler/storage/
 ├── datasets
 │   └── default
-│       ├── 000000001.json
-│       ├── 000000002.json
-│       ├── 000000003.json
-│       └── 000000004.json
+│       └── 000000001.json
 ├── key_value_stores
 └── request_queues
 ```
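Note: instead of keeping a hand-written crawler in this directory, the updated workflow above scaffolds one with `npx crawlee create -t cheerio-js my-crawler`. For orientation, the cheerio-js template produces a crawler roughly along these lines (a minimal sketch, not the exact generated file; the start URL and the request cap here are illustrative):

```js
// Rough sketch of a cheerio-js style Crawlee crawler (illustrative, not the exact template output).
import { CheerioCrawler, Dataset } from 'crawlee';

const crawler = new CheerioCrawler({
    // Called once per downloaded page; `$` is the Cheerio handle for the HTML.
    async requestHandler({ request, $, enqueueLinks, log }) {
        const title = $('title').text();
        log.info(`Title of ${request.loadedUrl} is '${title}'`);

        // Results land in the default dataset, i.e. storage/datasets/default/.
        await Dataset.pushData({ url: request.loadedUrl, title });

        // Queue further links discovered on the page.
        await enqueueLinks();
    },
    maxRequestsPerCrawl: 20, // illustrative cap to keep a demo run small
});

await crawler.run(['https://crawlee.dev']);
```

Because the compose service mounts `./my-crawler/storage` into the container, running it via `docker-compose run --rm my-crawler` leaves those dataset files under `my-crawler/storage/` on the host, as shown by the `tree` output above.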
docker-compose.yml
@@ -1,7 +1,11 @@
 version: "3.8"
 
 services:
-  crawlee:
-    image: crawlee
-    build: .
+  my-crawler:
+    image: my-crawler
+    build:
+      context: my-crawler
+      dockerfile: Dockerfile
     volumes:
-      - ./data:/usr/src/app/storage
+      - ./my-crawler/storage:/usr/src/app/storage
main.js (deleted)
@@ -1,35 +0,0 @@
-import { BasicCrawler, Dataset } from 'crawlee';
-
-// Create a BasicCrawler - the simplest crawler that enables
-// users to implement the crawling logic themselves.
-const crawler = new BasicCrawler({
-    // This function will be called for each URL to crawl.
-    async requestHandler({ request, sendRequest, log }) {
-        const { url } = request;
-        log.info(`Processing ${url}...`);
-
-        // Fetch the page HTML via the crawlee sendRequest utility method
-        // By default, the method will use the current request that is being handled, so you don't have to
-        // provide it yourself. You can also provide a custom request if you want.
-        const { body } = await sendRequest();
-
-        // Store the HTML and URL to the default dataset.
-        await Dataset.pushData({
-            url,
-            html: body,
-        });
-    },
-});
-
-// The initial list of URLs to crawl. Here we use just a few hard-coded URLs.
-await crawler.addRequests([
-    'https://www.google.com',
-    'https://www.example.com',
-    'https://www.bing.com',
-    'https://www.wikipedia.com',
-]);
-
-// Run the crawler and wait for it to finish.
-await crawler.run();
-
-console.log('Crawler finished.');
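The removed demo above pushed each page into the default dataset, which is what produced the numbered JSON files under `storage/datasets/default/` in the README's `tree` listing. A minimal sketch of reading those items back with Crawlee's storage API (assuming it runs from the directory that contains `storage/`):

```js
// Sketch: read items back from the default dataset (assumes a ./storage directory from a previous run).
import { Dataset } from 'crawlee';

const dataset = await Dataset.open();      // the default dataset, backed by storage/datasets/default/
const { items } = await dataset.getData(); // each item corresponds to one of the numbered *.json files

for (const { url } of items) {
    console.log(url);
}
```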
package.json (deleted)
@@ -1,14 +0,0 @@
-{
-    "description": "Crawlee Demo Project",
-    "version": "0.0.1",
-    "license": "UNLICENSED",
-    "type": "module",
-    "main": "main.js",
-    "scripts": {
-        "start": "node main.js"
-    },
-    "dependencies": {
-        "crawlee": "*"
-    },
-    "repository": {}
-}