Error handler does not work
I am using the latest version of crawlee, Python 3.11, on Windows 11; I tried both Chromium and Firefox. Here is a simple example:
P.S. There is also an error — `ValueError: Cannot close the browser while there are open pages.` — and I don't know how to fix that.
"""Minimal reproduction: navigating to a non-resolvable host should route the
failure into ``failed_request_handler``."""
import asyncio
from datetime import timedelta as td

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext

urls = ["https://randomname32482395f.com"]

crawler = PlaywrightCrawler(
    headless=False,
    browser_type='firefox',  # tried chromium as well
    request_handler_timeout=td(seconds=30),
)


@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    # Crawlee owns the page lifecycle and closes the page after the handler
    # returns. Closing it manually (`await context.page.close()`) leaves the
    # browser pool's open-page bookkeeping out of sync, which is what raises
    # "ValueError: Cannot close the browser while there are open pages." at
    # shutdown — so do NOT close the page here.
    print(await context.page.title())


@crawler.failed_request_handler
async def error_handler(context: PlaywrightCrawlingContext, error: Exception) -> None:
    # NOTE(review): the DNS failure surfaces from `page.goto()` while the
    # context pipeline is still initializing (ContextPipelineInitializationError
    # in the traceback), i.e. before the request handler runs; whether such
    # pre-handler failures reach this hook depends on the installed crawlee
    # version — confirm against the release in use.
    print(f"Error processing {context.request.url}")


async def test() -> None:
    """Enqueue the URLs and run the crawler to completion."""
    await crawler.add_requests(urls)
    await crawler.run()


if __name__ == "__main__":
    asyncio.run(test())
expected result: "Error processing https://randomname32482395f.com"
obtained result:
[crawlee.statistics.statistics] INFO crawlee.playwright_crawler.playwright_crawler request statistics {
"requests_finished": 0,
"requests_failed": 0,
"retry_histogram": [
0
],
"request_avg_failed_duration": null,
"request_avg_finished_duration": null,
"requests_finished_per_minute": 0,
"requests_failed_per_minute": 0,
"request_total_duration": 0.0,
"requests_total": 0,
"crawler_runtime": 0.008034
}
[crawlee.autoscaling.autoscaled_pool] INFO current_concurrency = 0; desired_concurrency = 2; cpu = 0.0; mem = 0.0; event_loop = 0.0; client_info = 0.0
[crawlee.playwright_crawler.playwright_crawler] ERROR Request failed and reached maximum retries
Traceback (most recent call last):
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\context_pipeline.py", line 62, in __call__
result = await middleware_instance.__anext__()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\playwright_crawler\playwright_crawler.py", line 68, in _page_goto
await crawlee_page.page.goto(context.request.url)
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\async_api\_generated.py", line 8657, in goto
await self._impl_obj.goto(
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_page.py", line 519, in goto
return await self._main_frame.goto(**locals_to_params(locals()))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_frame.py", line 145, in goto
await self._channel.send("goto", locals_to_params(locals()))
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_connection.py", line 59, in send
return await self._connection.wrap_api_call(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_connection.py", line 514, in wrap_api_call
raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
playwright._impl._errors.Error: Page.goto: NS_ERROR_UNKNOWN_HOST
Call log:
navigating to "https://randomname32482395f.com/", waiting until "load"
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 717, in __run_task_function
await wait_for(
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\_utils\wait.py", line 37, in wait_for
return await asyncio.wait_for(operation(), timeout.total_seconds())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 489, in wait_for
return fut.result()
^^^^^^^^^^^^
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 849, in __run_request_handler
await self._context_pipeline(crawling_context, self.router)
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\context_pipeline.py", line 70, in __call__
raise ContextPipelineInitializationError(e, crawling_context) from e
crawlee.basic_crawler.errors.ContextPipelineInitializationError: (Error('Page.goto: NS_ERROR_UNKNOWN_HOST\nCall log:\nnavigating to "https://randomname32482395f.com/", waiting until "load"\n'), BasicCrawlingContext(request=Request(url='https://randomname32482395f.com', unique_key='https://randomname32482395f.com', method='get', payload=None, headers={}, user_data={'__crawlee': {'state': <RequestState.REQUEST_HANDLER: 3>}}, retry_count=2, no_retry=False, loaded_url=None, handled_at=None, id='9UKWVpeUSFHmghR', json_=None, order_no=None), session=<Session id='2058HIh7GZ' max_age=datetime.timedelta(seconds=3000) user_data={} max_error_score=3.0 error_score_decrement=0.5 created_at=datetime.datetime(2024, 7, 14, 11, 55, 40, 206931, tzinfo=datetime.timezone.utc) usage_count=0 max_usage_count=50 error_score=0.0 cookies={} blocked_status_codes=[401, 403, 429]>, proxy_info=None, send_request=<function BasicCrawler._prepare_send_request_function.<locals>.send_request at 0x000001C77D93D1C0>, add_requests=<bound method RequestHandlerRunResult.add_requests of RequestHandlerRunResult(add_requests_calls=[])>, push_data=<bound method BasicCrawler._push_data of <crawlee.playwright_crawler.playwright_crawler.PlaywrightCrawler object at 0x000001C7782F5590>>, log=<Logger crawlee.playwright_crawler.playwright_crawler (INFO)>))
[crawlee.autoscaling.autoscaled_pool] INFO Waiting for remaining tasks to finish
Traceback (most recent call last):
File "c:\Users\user\Desktop\parsera\check.py", line 28, in <module>
asyncio.run(test())
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\runners.py", line 190, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 654, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "c:\Users\user\Desktop\parsera\check.py", line 25, in test
await crawler.run()
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 350, in run
await run_task
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 378, in _run_crawler
async with AsyncExitStack() as exit_stack:
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\contextlib.py", line 745, in __aexit__
raise exc_details[1]
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\contextlib.py", line 728, in __aexit__
cb_suppress = await cb(*exc_details)
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\browsers\browser_pool.py", line 173, in __aexit__
await browser.close(force=True)
File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\browsers\playwright_browser_controller.py", line 96, in close
raise ValueError('Cannot close the browser while there are open pages.')
ValueError: Cannot close the browser while there are open pages.
Exception ignored in: <function BaseSubprocessTransport.__del__ at 0x000001C77A2E1BC0>
Traceback (most recent call last):
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_subprocess.py", line 126, in __del__
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_subprocess.py", line 104, in close
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\proactor_events.py", line 109, in close
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 762, in call_soon
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 520, in _check_closed
RuntimeError: Event loop is closed