feat: Capture statistics about the crawler run by janbuchar · Pull Request #142 · apify/crawlee-python

import asyncio
import logging

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.storages import Dataset, RequestQueue

# Show INFO-level records so the crawler's periodic statistics logs are visible.
logging.basicConfig(level=logging.INFO)


async def main() -> None:
    """Crawl crawlee.dev within its own domain, saving page titles and URLs.

    Opens the default request queue and dataset, seeds the queue with the
    start URL, runs a BeautifulSoup-based crawler, and prints the final
    run statistics returned by ``crawler.run()``.
    """
    request_queue = await RequestQueue.open()
    await request_queue.add_request('https://crawlee.dev')

    dataset = await Dataset.open()
    crawler = BeautifulSoupCrawler(request_provider=request_queue)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Only follow links that stay on the same domain as the current page.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)

        title_tag = context.soup.title
        await dataset.push_data({
            'title': title_tag.text if title_tag else '',
            'url': context.request.url,
        })

    stats = await crawler.run()
    print(stats)


if __name__ == '__main__':
    # Start the event loop only when executed as a script, not on import.
    asyncio.run(main())
$ python run.py 
INFO:crawlee.autoscaling.snapshotter:Setting max_memory_size of this run to 3.84 GB.
INFO:crawlee.statistics.statistics:crawlee.basic_crawler.basic_crawler request statistics {
  "requests_finished": 0,
  "requests_failed": 0,
  "retry_histogram": [
    0
  ],
  "request_avg_failed_duration": null,
  "request_avg_finished_duration": null,
  "requests_finished_per_minute": 0,
  "requests_failed_per_minute": 0,
  "request_total_duration": 0.0,
  "requests_total": 0,
  "crawler_runtime": 0.010923
}
INFO:crawlee.autoscaling.autoscaled_pool:current_concurrency = 0; desired_concurrency = 2; cpu = 0; mem = 0; event_loop = 0.0; client_info = 0.0
INFO:httpx:HTTP Request: GET https://crawlee.dev "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/javascript-rendering "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/typescript-project "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/avoid-blocking "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/cheerio-crawler-guide "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/jsdom-crawler-guide "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/core/class/AutoscaledPool "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/proxy-management "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/result-storage "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/request-storage "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/utils/namespace/social "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/utils "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/quick-start "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/deployment/aws-cheerio "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/deployment/gcp-cheerio "HTTP/1.1 200 OK"
INFO:crawlee.autoscaling.autoscaled_pool:Waiting for remaining tasks to finish
{
  "requests_finished": 16,
  "requests_failed": 0,
  "retry_histogram": [
    16
  ],
  "request_avg_failed_duration": null,
  "request_avg_finished_duration": 0.096618,
  "requests_finished_per_minute": 936,
  "requests_failed_per_minute": 0,
  "request_total_duration": 1.545896,
  "requests_total": 16,
  "crawler_runtime": 1.02534
}