feat: Add `SqlStorageClient` based on `sqlalchemy` v2+ by Mantisus · Pull Request #1339 · apify/crawlee-python
Have you done any performance comparisons with the memory and file-system storage clients? If not, could you please run some? For example, you could run the Parsel crawler on crawlee.dev, enqueue all links, and store the URL + title to the dataset.
MemoryStorageClient:
┌───────────────────────────────┬────────────┐ │ requests_finished │ 2363 │ │ requests_failed │ 0 │ │ retry_histogram │ [2363] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 224.7ms │ │ requests_finished_per_minute │ 4484 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 8min 51.0s │ │ requests_total │ 2363 │ │ crawler_runtime │ 31.62s │ └───────────────────────────────┴────────────┘
FileSystemStorageClient:
┌───────────────────────────────┬─────────────┐ │ requests_finished │ 2363 │ │ requests_failed │ 0 │ │ retry_histogram │ [2363] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 379.8ms │ │ requests_finished_per_minute │ 2489 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 14min 57.4s │ │ requests_total │ 2363 │ │ crawler_runtime │ 56.96s │ └───────────────────────────────┴─────────────┘
SqlStorageClient:
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import SqlStorageClient

# Database connection string; None presumably selects the client's
# default backend (SQLite in these benchmarks) — confirm against docs.
CONNECTION = None


async def main() -> None:
    """Crawl crawlee.dev with a SQL-backed storage client, saving URL + title."""
    async with SqlStorageClient(connection_string=CONNECTION) as storage_client:
        crawler = ParselCrawler(
            storage_client=storage_client,
            http_client=HttpxHttpClient(),
            concurrency_settings=ConcurrencySettings(desired_concurrency=20),
        )

        @crawler.router.default_handler
        async def request_handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Processing URL: {context.request.url}...')
            await context.push_data({
                'url': context.request.url,
                'title': context.selector.css('title::text').get(),
            })
            await context.enqueue_links()

        await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
SQLite
┌───────────────────────────────┬─────────────┐ │ requests_finished │ 2363 │ │ requests_failed │ 0 │ │ retry_histogram │ [2363] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 727.0ms │ │ requests_finished_per_minute │ 1460 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 28min 37.9s │ │ requests_total │ 2363 │ │ crawler_runtime │ 1min 37.1s │ └───────────────────────────────┴─────────────┘
PostgreSQL (a standard installation running in Docker, with no database settings tuned)
┌───────────────────────────────┬─────────────┐ │ requests_finished │ 2363 │ │ requests_failed │ 0 │ │ retry_histogram │ [2363] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 503.8ms │ │ requests_finished_per_minute │ 2144 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 19min 50.5s │ │ requests_total │ 2363 │ │ crawler_runtime │ 1min 6.1s │ └───────────────────────────────┴─────────────┘
SqlStorageClient (3 processes)
import asyncio
from concurrent.futures import ProcessPoolExecutor

from crawlee import ConcurrencySettings, service_locator
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import RequestQueue

# Database connection string shared by every worker process; None presumably
# selects the client's default backend — confirm against the client docs.
CONNECTION = None


async def run(queue_name: str) -> None:
    """Run one crawler instance against the shared request queue `queue_name`."""
    async with SqlStorageClient(connection_string=CONNECTION) as storage_client:
        # Register the SQL client globally so RequestQueue.open picks it up.
        service_locator.set_storage_client(storage_client)
        queue = await RequestQueue.open(name=queue_name)
        crawler = ParselCrawler(
            http_client=HttpxHttpClient(),
            request_manager=queue,
            concurrency_settings=ConcurrencySettings(desired_concurrency=20),
        )

        @crawler.router.default_handler
        async def request_handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Processing URL: {context.request.url}...')
            data = {
                'url': context.request.url,
                'title': context.selector.css('title::text').get(),
            }
            await context.push_data(data)
            await context.enqueue_links()

        await crawler.run(['https://crawlee.dev'])


def process_run(queue_name: str) -> None:
    """Synchronous entry point for a worker process: drive one crawler run."""
    asyncio.run(run(queue_name))


def multi_run(queue_name: str = 'multi') -> None:
    """Launch several crawler processes that all share one request queue.

    Args:
        queue_name: Name of the shared request queue each worker opens.
    """
    workers = 3
    with ProcessPoolExecutor(max_workers=workers) as executor:
        # Consume the map iterator: Executor.map re-raises a worker's exception
        # only when its result is retrieved. The original discarded the
        # iterator, so any worker failure was silently swallowed. Also replaces
        # the unused loop variable in `[queue_name for i in range(workers)]`.
        list(executor.map(process_run, [queue_name] * workers))


if __name__ == '__main__':
    multi_run()
SQLite
[ParselCrawler] INFO Final request statistics: ┌───────────────────────────────┬────────────┐ │ requests_finished │ 811 │ │ requests_failed │ 0 │ │ retry_histogram │ [811] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 964.6ms │ │ requests_finished_per_minute │ 669 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 13min 2.2s │ │ requests_total │ 811 │ │ crawler_runtime │ 1min 12.8s │ └───────────────────────────────┴────────────┘ [ParselCrawler] INFO Final request statistics: ┌───────────────────────────────┬─────────────┐ │ requests_finished │ 735 │ │ requests_failed │ 0 │ │ retry_histogram │ [735] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 930.9ms │ │ requests_finished_per_minute │ 606 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 11min 24.2s │ │ requests_total │ 735 │ │ crawler_runtime │ 1min 12.8s │ └───────────────────────────────┴─────────────┘ [ParselCrawler] INFO Final request statistics: ┌───────────────────────────────┬─────────────┐ │ requests_finished │ 817 │ │ requests_failed │ 0 │ │ retry_histogram │ [817] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 992.7ms │ │ requests_finished_per_minute │ 669 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 13min 31.0s │ │ requests_total │ 817 │ │ crawler_runtime │ 1min 13.3s │ └───────────────────────────────┴─────────────┘
PostgreSQL (a standard installation running in Docker, with no database settings tuned)
[ParselCrawler] INFO Final request statistics: ┌───────────────────────────────┬────────────┐ │ requests_finished │ 787 │ │ requests_failed │ 0 │ │ retry_histogram │ [787] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 609.6ms │ │ requests_finished_per_minute │ 1527 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 7min 59.7s │ │ requests_total │ 787 │ │ crawler_runtime │ 30.92s │ └───────────────────────────────┴────────────┘ [ParselCrawler] INFO Final request statistics: ┌───────────────────────────────┬───────────┐ │ requests_finished │ 783 │ │ requests_failed │ 0 │ │ retry_histogram │ [783] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 625.0ms │ │ requests_finished_per_minute │ 1494 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 8min 9.4s │ │ requests_total │ 783 │ │ crawler_runtime │ 31.45s │ └───────────────────────────────┴───────────┘ [ParselCrawler] INFO Final request statistics: ┌───────────────────────────────┬────────────┐ │ requests_finished │ 793 │ │ requests_failed │ 0 │ │ retry_histogram │ [793] │ │ request_avg_failed_duration │ None │ │ request_avg_finished_duration │ 604.0ms │ │ requests_finished_per_minute │ 1512 │ │ requests_failed_per_minute │ 0 │ │ request_total_duration │ 7min 58.9s │ │ requests_total │ 793 │ │ crawler_runtime │ 31.47s │ └───────────────────────────────┴────────────┘