A ProxyConfiguration created with the proxy_urls parameter crashes PlaywrightCrawler
Test program.
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration

# If these go out of service then replace them with your own.
proxies = [
    'http://178.48.68.61:18080',
    'http://198.245.60.202:3128',
    'http://15.204.240.177:3128',
]

proxy_configuration_fails = ProxyConfiguration(proxy_urls=proxies)

proxy_configuration_succeeds = ProxyConfiguration(
    tiered_proxy_urls=[
        # No proxy tier. (Not needed, but optional in case you do not want to use any proxy on lowest tier.)
        [None],
        # lower tier, cheaper, preferred as long as they work
        proxies,
        # higher tier, more expensive, used as a fallback
    ]
)


async def main() -> None:
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=5,  # Limit the crawl to 5 requests.
        headless=False,  # Show the browser window.
        browser_type='firefox',  # Use the Firefox browser.
        proxy_configuration=proxy_configuration_fails,
        # proxy_configuration=proxy_configuration_succeeds,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Extract data from the page using Playwright API.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'content': (await context.page.content())[:100],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

    # Export the entire dataset to a JSON file.
    await crawler.export_data('results.json')

    # Or work with the data directly.
    data = await crawler.get_data()
    crawler.log.info(f'Extracted data: {data.items}')


if __name__ == '__main__':
    asyncio.run(main())
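Since the tiered_proxy_urls configuration above succeeds with the very same proxy list, one possible workaround (an untested assumption on my part, not a confirmed fix) is to wrap the flat list in a single tier instead of passing proxy_urls:

    # Hypothetical workaround: same proxies, routed through the tiered code path.
    proxy_configuration_workaround = ProxyConfiguration(tiered_proxy_urls=[proxies])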
Terminal output.
/Users/matecsaj/PycharmProjects/wat-crawlee/venv/bin/python /Users/matecsaj/Library/Application Support/JetBrains/PyCharm2024.3/scratches/scratch_4.py
[crawlee._autoscaling.snapshotter] INFO Setting max_memory_size of this run to 8.00 GB.
[crawlee.crawlers._playwright._playwright_crawler] INFO Current request statistics:
┌───────────────────────────────┬──────────┐
│ requests_finished │ 0 │
│ requests_failed │ 0 │
│ retry_histogram │ [0] │
│ request_avg_failed_duration │ None │
│ request_avg_finished_duration │ None │
│ requests_finished_per_minute │ 0 │
│ requests_failed_per_minute │ 0 │
│ request_total_duration │ 0.0 │
│ requests_total │ 0 │
│ crawler_runtime │ 0.038974 │
└───────────────────────────────┴──────────┘
[crawlee._autoscaling.autoscaled_pool] INFO current_concurrency = 0; desired_concurrency = 2; cpu = 0.0; mem = 0.0; event_loop = 0.0; client_info = 0.0
[crawlee.crawlers._playwright._playwright_crawler] ERROR Request failed and reached maximum retries
Traceback (most recent call last):
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/crawlers/_basic/_context_pipeline.py", line 65, in __call__
result = await middleware_instance.__anext__()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/crawlers/_playwright/_playwright_crawler.py", line 138, in _open_page
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/_utils/context.py", line 38, in async_wrapper
return await method(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_browser_pool.py", line 241, in new_page
return await self._get_new_page(page_id, plugin, proxy_info)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_browser_pool.py", line 270, in _get_new_page
page = await asyncio.wait_for(
^^^^^^^^^^^^^^^^^^^^^^^
...<5 lines>...
)
^
File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/asyncio/tasks.py", line 507, in wait_for
return await fut
^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_playwright_browser_controller.py", line 119, in new_page
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_playwright_browser_controller.py", line 174, in _create_browser_context
if browser_new_context_options['proxy']:
~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^
KeyError: 'proxy'
[crawlee._autoscaling.autoscaled_pool] INFO Waiting for remaining tasks to finish
[crawlee.crawlers._playwright._playwright_crawler] INFO Error analysis: total_errors=3 unique_errors=1
[crawlee.crawlers._playwright._playwright_crawler] INFO Final request statistics:
┌───────────────────────────────┬───────────┐
│ requests_finished │ 0 │
│ requests_failed │ 1 │
│ retry_histogram │ [0, 0, 1] │
│ request_avg_failed_duration │ 0.025703 │
│ request_avg_finished_duration │ None │
│ requests_finished_per_minute │ 0 │
│ requests_failed_per_minute │ 14 │
│ request_total_duration │ 0.025703 │
│ requests_total │ 1 │
│ crawler_runtime │ 4.189647 │
└───────────────────────────────┴───────────┘
[crawlee.storages._dataset] WARN Attempting to export an empty dataset - no file will be created
[crawlee.crawlers._playwright._playwright_crawler] INFO Extracted data: []
Process finished with exit code 0
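The KeyError comes from _create_browser_context in crawlee/browsers/_playwright_browser_controller.py, which indexes browser_new_context_options['proxy'] directly; when no 'proxy' key was ever set, the lookup raises. Here is a minimal standalone sketch of that failure mode using a plain dict (only the names browser_new_context_options and 'proxy' are taken from the traceback, the rest is illustrative):

    browser_new_context_options: dict = {}  # the 'proxy' key was never set

    try:
        # Direct indexing, as on the failing line in _create_browser_context:
        if browser_new_context_options['proxy']:
            pass
    except KeyError as exc:
        print(f'KeyError: {exc}')  # reproduces: KeyError: 'proxy'

    # A defensive lookup does not raise when the key is absent:
    if browser_new_context_options.get('proxy'):
        pass

If that reading is right, guarding the lookup with .get('proxy') would avoid the crash, but that is an inference from the traceback alone, not from the library source.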