feat: Capture statistics about the crawler run by janbuchar · Pull Request #142 · apify/crawlee-python
import asyncio
import logging

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.storages import Dataset, RequestQueue

logging.basicConfig(level=logging.INFO)


async def main() -> None:
    """Crawl crawlee.dev, staying on the same domain, and store page titles in a dataset.

    Prints the final run statistics returned by ``crawler.run()``.
    """
    queue = await RequestQueue.open()
    await queue.add_request('https://crawlee.dev')

    dataset = await Dataset.open()
    crawler = BeautifulSoupCrawler(request_provider=queue)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Only follow links within the start URL's domain.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)

        title_tag = context.soup.title
        await dataset.push_data({
            'title': title_tag.text if title_tag else '',
            'url': context.request.url,
        })

    stats = await crawler.run()
    print(stats)


if __name__ == '__main__':
    asyncio.run(main())
$ python run.py
INFO:crawlee.autoscaling.snapshotter:Setting max_memory_size of this run to 3.84 GB.
INFO:crawlee.statistics.statistics:crawlee.basic_crawler.basic_crawler request statistics {
"requests_finished": 0,
"requests_failed": 0,
"retry_histogram": [
0
],
"request_avg_failed_duration": null,
"request_avg_finished_duration": null,
"requests_finished_per_minute": 0,
"requests_failed_per_minute": 0,
"request_total_duration": 0.0,
"requests_total": 0,
"crawler_runtime": 0.010923
}
INFO:crawlee.autoscaling.autoscaled_pool:current_concurrency = 0; desired_concurrency = 2; cpu = 0; mem = 0; event_loop = 0.0; client_info = 0.0
INFO:httpx:HTTP Request: GET https://crawlee.dev "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/javascript-rendering "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/typescript-project "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/avoid-blocking "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/cheerio-crawler-guide "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/jsdom-crawler-guide "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/core/class/AutoscaledPool "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/proxy-management "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/result-storage "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/guides/request-storage "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/utils/namespace/social "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/utils "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/quick-start "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/deployment/aws-cheerio "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://crawlee.dev/docs/deployment/gcp-cheerio "HTTP/1.1 200 OK"
INFO:crawlee.autoscaling.autoscaled_pool:Waiting for remaining tasks to finish
{
"requests_finished": 16,
"requests_failed": 0,
"retry_histogram": [
16
],
"request_avg_failed_duration": null,
"request_avg_finished_duration": 0.096618,
"requests_finished_per_minute": 936,
"requests_failed_per_minute": 0,
"request_total_duration": 1.545896,
"requests_total": 16,
"crawler_runtime": 1.02534
}