feat: Add `SqlStorageClient` based on `sqlalchemy` v2+ by Mantisus · Pull Request #1339 · apify/crawlee-python

Have you done any performance comparisons with the memory and file-system storage clients? If not, could you please run some? For example, you could run the Parsel crawler on crawlee.dev, enqueue all links, and store the URL + title to the dataset.

MemoryStorageClient:

┌───────────────────────────────┬────────────┐
│ requests_finished             │ 2363       │
│ requests_failed               │ 0          │
│ retry_histogram               │ [2363]     │
│ request_avg_failed_duration   │ None       │
│ request_avg_finished_duration │ 224.7ms    │
│ requests_finished_per_minute  │ 4484       │
│ requests_failed_per_minute    │ 0          │
│ request_total_duration        │ 8min 51.0s │
│ requests_total                │ 2363       │
│ crawler_runtime               │ 31.62s     │
└───────────────────────────────┴────────────┘

FileSystemStorageClient:

┌───────────────────────────────┬─────────────┐
│ requests_finished             │ 2363        │
│ requests_failed               │ 0           │
│ retry_histogram               │ [2363]      │
│ request_avg_failed_duration   │ None        │
│ request_avg_finished_duration │ 379.8ms     │
│ requests_finished_per_minute  │ 2489        │
│ requests_failed_per_minute    │ 0           │
│ request_total_duration        │ 14min 57.4s │
│ requests_total                │ 2363        │
│ crawler_runtime               │ 56.96s      │
└───────────────────────────────┴─────────────┘

SqlStorageClient:

import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import SqlStorageClient

# Database connection string for SqlStorageClient.
# NOTE(review): presumably None makes the client fall back to its default
# backend (SQLite) — confirm against the SqlStorageClient docs.
CONNECTION = None


async def main() -> None:
    """Benchmark entry point: crawl crawlee.dev, storing each page's URL and title.

    Opens an SQL-backed storage client for the crawler's lifetime and runs a
    Parsel crawler with a desired concurrency of 20.
    """
    async with SqlStorageClient(connection_string=CONNECTION) as sql_client:
        crawler = ParselCrawler(
            storage_client=sql_client,
            http_client=HttpxHttpClient(),
            concurrency_settings=ConcurrencySettings(desired_concurrency=20),
        )

        @crawler.router.default_handler
        async def handle(context: ParselCrawlingContext) -> None:
            # Record the page URL and <title>, then follow every discovered link.
            context.log.info(f'Processing URL: {context.request.url}...')
            await context.push_data(
                {
                    'url': context.request.url,
                    'title': context.selector.css('title::text').get(),
                }
            )
            await context.enqueue_links()

        await crawler.run(['https://crawlee.dev'])


# Standard script entry point: run the async benchmark to completion.
if __name__ == '__main__':
    asyncio.run(main())

SQLite

┌───────────────────────────────┬─────────────┐
│ requests_finished             │ 2363        │
│ requests_failed               │ 0           │
│ retry_histogram               │ [2363]      │
│ request_avg_failed_duration   │ None        │
│ request_avg_finished_duration │ 727.0ms     │
│ requests_finished_per_minute  │ 1460        │
│ requests_failed_per_minute    │ 0           │
│ request_total_duration        │ 28min 37.9s │
│ requests_total                │ 2363        │
│ crawler_runtime               │ 1min 37.1s  │
└───────────────────────────────┴─────────────┘

PostgreSQL (default installation running in Docker, with no database-settings tuning)

┌───────────────────────────────┬─────────────┐
│ requests_finished             │ 2363        │
│ requests_failed               │ 0           │
│ retry_histogram               │ [2363]      │
│ request_avg_failed_duration   │ None        │
│ request_avg_finished_duration │ 503.8ms     │
│ requests_finished_per_minute  │ 2144        │
│ requests_failed_per_minute    │ 0           │
│ request_total_duration        │ 19min 50.5s │
│ requests_total                │ 2363        │
│ crawler_runtime               │ 1min 6.1s   │
└───────────────────────────────┴─────────────┘

SqlStorageClient (3 processes)

import asyncio
from concurrent.futures import ProcessPoolExecutor

from crawlee import ConcurrencySettings, service_locator
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import RequestQueue

# Database connection string shared by all worker processes.
# NOTE(review): presumably None makes the client fall back to its default
# backend (SQLite) — confirm against the SqlStorageClient docs.
CONNECTION = None

async def run(queue_name: str) -> None:
    """Worker coroutine: crawl crawlee.dev, pulling work from a shared queue.

    Every process calls this with the same `queue_name`, so all workers consume
    the one SQL-backed request queue.

    Args:
        queue_name: Name of the shared RequestQueue to open.
    """
    async with SqlStorageClient(connection_string=CONNECTION) as sql_client:
        # Register the SQL client globally so RequestQueue.open() uses it.
        service_locator.set_storage_client(sql_client)
        shared_queue = await RequestQueue.open(name=queue_name)

        crawler = ParselCrawler(
            http_client=HttpxHttpClient(),
            request_manager=shared_queue,
            concurrency_settings=ConcurrencySettings(desired_concurrency=20),
        )

        @crawler.router.default_handler
        async def handle(context: ParselCrawlingContext) -> None:
            # Record the page URL and <title>, then follow every discovered link.
            context.log.info(f'Processing URL: {context.request.url}...')
            await context.push_data(
                {
                    'url': context.request.url,
                    'title': context.selector.css('title::text').get(),
                }
            )
            await context.enqueue_links()

        await crawler.run(['https://crawlee.dev'])

def process_run(queue_name: str) -> None:
    """Synchronous entry point for each worker process.

    ProcessPoolExecutor can only submit picklable sync callables, so this
    wrapper drives the async crawl to completion with its own event loop.
    """
    crawl = run(queue_name)
    asyncio.run(crawl)

def multi_run(queue_name: str = 'multi', workers: int = 3) -> None:
    """Run several crawler processes that all consume the same named queue.

    Args:
        queue_name: Name of the shared RequestQueue each worker opens.
        workers: Number of worker processes to spawn (defaults to 3, matching
            the original benchmark setup).
    """
    with ProcessPoolExecutor(max_workers=workers) as executor:
        # [x] * n replaces the original comprehension with an unused loop
        # variable. Consuming the map iterator with list() is essential:
        # otherwise exceptions raised inside worker processes are silently
        # discarded instead of propagating here.
        list(executor.map(process_run, [queue_name] * workers))

# Standard script entry point. The __main__ guard is required here: each
# ProcessPoolExecutor worker re-imports this module, and without the guard
# every worker would recursively spawn its own pool.
if __name__ == '__main__':
    multi_run()

SQLite

[ParselCrawler] INFO  Final request statistics:
┌───────────────────────────────┬────────────┐
│ requests_finished             │ 811        │
│ requests_failed               │ 0          │
│ retry_histogram               │ [811]      │
│ request_avg_failed_duration   │ None       │
│ request_avg_finished_duration │ 964.6ms    │
│ requests_finished_per_minute  │ 669        │
│ requests_failed_per_minute    │ 0          │
│ request_total_duration        │ 13min 2.2s │
│ requests_total                │ 811        │
│ crawler_runtime               │ 1min 12.8s │
└───────────────────────────────┴────────────┘
[ParselCrawler] INFO  Final request statistics:
┌───────────────────────────────┬─────────────┐
│ requests_finished             │ 735         │
│ requests_failed               │ 0           │
│ retry_histogram               │ [735]       │
│ request_avg_failed_duration   │ None        │
│ request_avg_finished_duration │ 930.9ms     │
│ requests_finished_per_minute  │ 606         │
│ requests_failed_per_minute    │ 0           │
│ request_total_duration        │ 11min 24.2s │
│ requests_total                │ 735         │
│ crawler_runtime               │ 1min 12.8s  │
└───────────────────────────────┴─────────────┘
[ParselCrawler] INFO  Final request statistics:
┌───────────────────────────────┬─────────────┐
│ requests_finished             │ 817         │
│ requests_failed               │ 0           │
│ retry_histogram               │ [817]       │
│ request_avg_failed_duration   │ None        │
│ request_avg_finished_duration │ 992.7ms     │
│ requests_finished_per_minute  │ 669         │
│ requests_failed_per_minute    │ 0           │
│ request_total_duration        │ 13min 31.0s │
│ requests_total                │ 817         │
│ crawler_runtime               │ 1min 13.3s  │
└───────────────────────────────┴─────────────┘

PostgreSQL (default installation running in Docker, with no database-settings tuning)

[ParselCrawler] INFO  Final request statistics:
┌───────────────────────────────┬────────────┐
│ requests_finished             │ 787        │
│ requests_failed               │ 0          │
│ retry_histogram               │ [787]      │
│ request_avg_failed_duration   │ None       │
│ request_avg_finished_duration │ 609.6ms    │
│ requests_finished_per_minute  │ 1527       │
│ requests_failed_per_minute    │ 0          │
│ request_total_duration        │ 7min 59.7s │
│ requests_total                │ 787        │
│ crawler_runtime               │ 30.92s     │
└───────────────────────────────┴────────────┘
[ParselCrawler] INFO  Final request statistics:
┌───────────────────────────────┬───────────┐
│ requests_finished             │ 783       │
│ requests_failed               │ 0         │
│ retry_histogram               │ [783]     │
│ request_avg_failed_duration   │ None      │
│ request_avg_finished_duration │ 625.0ms   │
│ requests_finished_per_minute  │ 1494      │
│ requests_failed_per_minute    │ 0         │
│ request_total_duration        │ 8min 9.4s │
│ requests_total                │ 783       │
│ crawler_runtime               │ 31.45s    │
└───────────────────────────────┴───────────┘
[ParselCrawler] INFO  Final request statistics:
┌───────────────────────────────┬────────────┐
│ requests_finished             │ 793        │
│ requests_failed               │ 0          │
│ retry_histogram               │ [793]      │
│ request_avg_failed_duration   │ None       │
│ request_avg_finished_duration │ 604.0ms    │
│ requests_finished_per_minute  │ 1512       │
│ requests_failed_per_minute    │ 0          │
│ request_total_duration        │ 7min 58.9s │
│ requests_total                │ 793        │
│ crawler_runtime               │ 31.47s     │
└───────────────────────────────┴────────────┘