fix: Optimize memory consumption for `HttpxHttpClient`, fix proxy handling by Mantisus · Pull Request #905 · apify/crawlee-python
Something very strange is going on inside httpx: the changes made here reduce memory consumption from 1.7 GB to 120 MB for 1000 `httpx.AsyncClient` instances.
This is so weird that someone might want to double-check it and tell me where I went wrong.
To exclude the delays and errors that real proxies can cause, I tested using a local environment. Here is the configuration I used:
# compose.yaml
# Local test rig: an httpbin target plus toxiproxy, which exposes a large
# range of listener ports so each crawler proxy URL is a distinct endpoint.
services:
  target-server:
    image: kennethreitz/httpbin
    ports:
      - "8000:80"                  # httpbin reachable on localhost:8000
    networks:
      - test-net
  toxiproxy:
    image: ghcr.io/shopify/toxiproxy
    ports:
      - "8474:8474"                # toxiproxy admin API
      - "8001-10000:8001-10000"    # pool of ports for the generated proxies
    networks:
      - test-net
networks:
  test-net:
    driver: bridge
# crawlee_test.py
"""Memory-consumption stress test for ``HttpxHttpClient`` with many proxies.

Spins up ~1500 toxiproxy forwards to a local httpbin container, then runs a
ParselCrawler through them while logging the combined RSS of this process and
its children, to observe per-proxy ``httpx.AsyncClient`` memory growth.
"""

import asyncio
import os
from contextlib import suppress

import psutil
from toxiproxy import Toxiproxy

from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import ParsedHttpCrawlingContext, ParselCrawler
from crawlee.http_clients import HttpxHttpClient
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool

current_process = psutil.Process(os.getpid())


def log_memory_usage() -> float:
    """Return the RSS of this process plus all live children, in megabytes."""
    current_size_bytes = int(current_process.memory_info().rss)
    for child in current_process.children(recursive=True):
        # A child may exit between enumeration and measurement.
        with suppress(psutil.NoSuchProcess):
            current_size_bytes += int(child.memory_info().rss)
    return current_size_bytes / 1024 / 1024


async def setup_toxiproxy(proxy_count: int = 1000) -> list[str]:
    """Create ``proxy_count`` toxiproxy forwards to the httpbin container.

    Ports that cannot be bound are skipped; the loop keeps probing successive
    ports until the requested number of proxies exists.

    Returns the list of ``http://localhost:<port>`` proxy URLs.
    """
    toxiproxy = Toxiproxy()
    toxiproxy.destroy_all()
    proxies: list[str] = []
    i = 0
    while proxy_count > 0:
        port = 8001 + i
        try:
            toxiproxy.create(
                name=f'proxy_{i}',
                upstream='target-server:80',
                enabled=True,
                listen=f'0.0.0.0:{port}',
            )
        except Exception:  # Port unavailable or transient API error - try the next port.
            pass
        else:
            proxies.append(f'http://localhost:{port}')
            proxy_count -= 1
        i += 1
    print(f'Created {len(proxies)} proxies')
    return proxies


async def run() -> None:
    """Configure the crawler against the local proxies and run the stress test."""
    proxies = await setup_toxiproxy(1500)
    session_pool = SessionPool()
    http_client = HttpxHttpClient(
        headers={
            'accept-encoding': 'gzip, deflate, br, zstd',
            'user-agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
            ),
            'accept-language': 'en',
        },
    )
    crawler = ParselCrawler(
        http_client=http_client,
        concurrency_settings=ConcurrencySettings(min_concurrency=20, desired_concurrency=50),
        proxy_configuration=ProxyConfiguration(proxy_urls=proxies),
        max_requests_per_crawl=4000,
        session_pool=session_pool,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParsedHttpCrawlingContext) -> None:
        ip = context.parsed_content.jmespath('origin').get()
        memory = log_memory_usage()
        # NOTE(review): _client_by_proxy_url is a private attribute of
        # HttpxHttpClient; it is inspected here only to count cached clients.
        context.log.info(
            f'Processing {context.request.url} with response ip {ip}, {context.session.id} '
            f'{context.proxy_info.url} clients {len(http_client._client_by_proxy_url)} {memory:.2f} MB'
        )

    requests = [Request.from_url(url='http://httpbin.org/get', always_enqueue=True) for _ in range(5000)]
    await crawler.run(requests)


if __name__ == '__main__':
    asyncio.run(run())