update crawlee: drop bundled demo project, scaffold via `npx crawlee create` (service renamed to my-crawler)

2025-11-29 22:38:35 +02:00 · 2023-03-08 15:51:09 +08:00
parent 30e5f41486
commit 50e7ed4679
5 changed files with 14 additions and 95 deletions
--- a/crawlee/Dockerfile
+++ b/crawlee/Dockerfile
@@ -1,29 +0,0 @@
-# Specify the base Docker image. You can read more about
-# the available images at https://crawlee.dev/docs/guides/docker-images
-# You can also use any other image from Docker Hub.
-FROM apify/actor-node:16
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY package*.json ./
-
-# Install NPM packages, skip optional and development dependencies to
-# keep the image small. Avoid logging too much and print the dependency
-# tree for debugging
-RUN npm --quiet set progress=false \
-    && npm install --omit=dev --omit=optional \
-    && echo "Installed NPM packages:" \
-    && (npm list --omit=dev --all || true) \
-    && echo "Node.js version:" \
-    && node --version \
-    && echo "NPM version:" \
-    && npm --version
-
-# Next, copy the remaining files and directories with the source code.
-# Since we do this after NPM install, quick build will be really fast
-# for most source file changes.
-COPY . ./
-
-
-# Run the image.
-CMD npm start --silent
--- a/crawlee/README.md
+++ b/crawlee/README.md
@@ -5,23 +5,16 @@ crawlee
 scraping and browser automation library.

 ```bash
-$ docker-compose build
-Building crawlee
-Successfully built xxxxxxxxxxxx
-Successfully tagged crawlee:latest
+$ docker run --rm -e PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 -e PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 -v $PWD:/tmp -w /tmp apify/actor-node:16 npx crawlee create -t cheerio-js my-crawler

-$ docker-compose run --rm crawlee
-INFO  BasicCrawler: Starting the crawl
-INFO  BasicCrawler: Processing ...
-Crawler finished.
+$ docker-compose build my-crawler

-$ tree data
+$ docker-compose run --rm my-crawler
+
+$ tree my-crawler/storage/
 ├── datasets
 │   └── default
-│       ├── 000000001.json
-│       ├── 000000002.json
-│       ├── 000000003.json
-│       └── 000000004.json
+│       └── 000000001.json
 ├── key_value_stores
 └── request_queues
 ```
--- a/crawlee/docker-compose.yml
+++ b/crawlee/docker-compose.yml
@@ -1,7 +1,11 @@
 version: "3.8"
+
 services:
-  crawlee:
-    image: crawlee
-    build: .
+
+  my-crawler:
+    image: my-crawler
+    build:
+      context: my-crawler
+      dockerfile: Dockerfile
    volumes:
-      - ./data:/usr/src/app/storage
+      - ./my-crawler/storage:/usr/src/app/storage
--- a/crawlee/main.js
+++ b/crawlee/main.js
@@ -1,35 +0,0 @@
-import { BasicCrawler, Dataset } from 'crawlee';
-
-// Create a BasicCrawler - the simplest crawler that enables
-// users to implement the crawling logic themselves.
-const crawler = new BasicCrawler({
-    // This function will be called for each URL to crawl.
-    async requestHandler({ request, sendRequest, log }) {
-        const { url } = request;
-        log.info(`Processing ${url}...`);
-
-        // Fetch the page HTML via the crawlee sendRequest utility method
-        // By default, the method will use the current request that is being handled, so you don't have to
-        // provide it yourself. You can also provide a custom request if you want.
-        const { body } = await sendRequest();
-
-        // Store the HTML and URL to the default dataset.
-        await Dataset.pushData({
-            url,
-            html: body,
-        });
-    },
-});
-
-// The initial list of URLs to crawl. Here we use just a few hard-coded URLs.
-await crawler.addRequests([
-    'https://www.google.com',
-    'https://www.example.com',
-    'https://www.bing.com',
-    'https://www.wikipedia.com',
-]);
-
-// Run the crawler and wait for it to finish.
-await crawler.run();
-
-console.log('Crawler finished.');
--- a/crawlee/package.json
+++ b/crawlee/package.json
@@ -1,14 +0,0 @@
-{
-    "description": "Crawlee Demo Project",
-    "version": "0.0.1",
-    "license": "UNLICENSED",
-    "type": "module",
-    "main": "main.js",
-    "scripts": {
-        "start": "node main.js"
-    },
-    "dependencies": {
-        "crawlee": "*"
-    },
-    "repository": {}
-}