fix: Optimize memory consumption for `HttpxHttpClient`, fix proxy handling by Mantisus · Pull Request #905 · apify/crawlee-python
Something very strange is going on inside httpx: the changes made here reduce memory consumption from 1.7 GB to 120 MB for 1000 `httpx.AsyncClient` instances.
This is so weird that someone might want to double-check it and tell me where I went wrong.
To exclude the delays and errors that real proxies can cause, I tested using a local environment. Here is the configuration I used:
# compose.yaml
# Local test rig: an httpbin target plus toxiproxy, which exposes a large
# range of listener ports so each crawler proxy URL is a distinct endpoint.
services:
  target-server:
    image: kennethreitz/httpbin
    ports:
      - "8000:80"                  # httpbin reachable on localhost:8000
    networks:
      - test-net
  toxiproxy:
    image: ghcr.io/shopify/toxiproxy
    ports:
      - "8474:8474"                # toxiproxy admin API
      - "8001-10000:8001-10000"    # pool of ports for the generated proxies
    networks:
      - test-net
networks:
  test-net:
    driver: bridge
# crawlee_test.py
"""Memory-consumption stress test for ``HttpxHttpClient`` with many proxies.

Spins up ~1500 toxiproxy forwards to a local httpbin container, then runs a
ParselCrawler through them while logging the combined RSS of this process and
its children, to observe per-proxy ``httpx.AsyncClient`` memory growth.
"""

import asyncio
import os
from contextlib import suppress

import psutil
from toxiproxy import Toxiproxy

from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import ParsedHttpCrawlingContext, ParselCrawler
from crawlee.http_clients import HttpxHttpClient
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool

current_process = psutil.Process(os.getpid())


def log_memory_usage() -> float:
    """Return the RSS of this process plus all live children, in megabytes."""
    current_size_bytes = int(current_process.memory_info().rss)
    for child in current_process.children(recursive=True):
        # A child may exit between enumeration and measurement.
        with suppress(psutil.NoSuchProcess):
            current_size_bytes += int(child.memory_info().rss)
    return current_size_bytes / 1024 / 1024


async def setup_toxiproxy(proxy_count: int = 1000) -> list[str]:
    """Create ``proxy_count`` toxiproxy forwards to the httpbin container.

    Ports that cannot be bound are skipped; the loop keeps probing successive
    ports until the requested number of proxies exists.

    Returns the list of ``http://localhost:<port>`` proxy URLs.
    """
    toxiproxy = Toxiproxy()
    toxiproxy.destroy_all()
    proxies: list[str] = []
    i = 0
    while proxy_count > 0:
        port = 8001 + i
        try:
            toxiproxy.create(
                name=f'proxy_{i}',
                upstream='target-server:80',
                enabled=True,
                listen=f'0.0.0.0:{port}',
            )
        except Exception:  # Port unavailable or transient API error - try the next port.
            pass
        else:
            proxies.append(f'http://localhost:{port}')
            proxy_count -= 1
        i += 1
    print(f'Created {len(proxies)} proxies')
    return proxies


async def run() -> None:
    """Configure the crawler against the local proxies and run the stress test."""
    proxies = await setup_toxiproxy(1500)
    session_pool = SessionPool()
    http_client = HttpxHttpClient(
        headers={
            'accept-encoding': 'gzip, deflate, br, zstd',
            'user-agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
            ),
            'accept-language': 'en',
        },
    )
    crawler = ParselCrawler(
        http_client=http_client,
        concurrency_settings=ConcurrencySettings(min_concurrency=20, desired_concurrency=50),
        proxy_configuration=ProxyConfiguration(proxy_urls=proxies),
        max_requests_per_crawl=4000,
        session_pool=session_pool,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParsedHttpCrawlingContext) -> None:
        ip = context.parsed_content.jmespath('origin').get()
        memory = log_memory_usage()
        # NOTE(review): _client_by_proxy_url is a private attribute of
        # HttpxHttpClient; it is inspected here only to count cached clients.
        context.log.info(
            f'Processing {context.request.url} with response ip {ip}, {context.session.id} '
            f'{context.proxy_info.url} clients {len(http_client._client_by_proxy_url)} {memory:.2f} MB'
        )

    requests = [Request.from_url(url='http://httpbin.org/get', always_enqueue=True) for _ in range(5000)]
    await crawler.run(requests)


if __name__ == '__main__':
    asyncio.run(run())