fix: Align `Request.state` transitions with `Request` lifecycle by Mantisus · Pull Request #1601 · apify/crawlee-python

Expand Up @@ -7,10 +7,11 @@
import pytest
from crawlee import ConcurrencySettings, Request from crawlee import ConcurrencySettings, Request, RequestState from crawlee.crawlers import HttpCrawler from crawlee.sessions import SessionPool from crawlee.statistics import Statistics from crawlee.storages import RequestQueue from tests.unit.server_endpoints import HELLO_WORLD
if TYPE_CHECKING: Expand Down Expand Up @@ -577,3 +578,57 @@ async def request_handler(context: HttpCrawlingContext) -> None: assert len(kvs_content) == 1 assert content_key.endswith('.html') assert kvs_content[content_key] == HELLO_WORLD.decode('utf8')

async def test_request_state(server_url: URL) -> None:
    """Verify that `Request.state` tracks the request lifecycle through each crawler stage.

    Records the observed state inside the pre-navigation hook, the default handler,
    the error handler, and the failed-request handler, then checks the final persisted
    state for both a successful request and one that raises in its handler.
    """
    request_queue = await RequestQueue.open(alias='http_request_state')
    crawler = HttpCrawler(request_manager=request_queue)

    ok_request = Request.from_url(str(server_url))
    # A freshly created request must start out unprocessed.
    assert ok_request.state == RequestState.UNPROCESSED

    failing_request = Request.from_url(str(server_url / 'error'), user_data={'cause_error': True})

    # Observed states, keyed first by request, then by the lifecycle stage name.
    observed: dict[str, dict[str, RequestState]] = {
        ok_request.unique_key: {},
        failing_request.unique_key: {},
    }

    @crawler.pre_navigation_hook
    async def pre_navigation_hook(context: BasicCrawlingContext) -> None:
        observed[context.request.unique_key]['pre_navigation'] = context.request.state

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        if context.request.user_data.get('cause_error'):
            raise ValueError('Caused error as requested')
        observed[context.request.unique_key]['request_handler'] = context.request.state

    @crawler.error_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        observed[context.request.unique_key]['error_handler'] = context.request.state

    @crawler.failed_request_handler
    async def failed_request_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        observed[context.request.unique_key]['failed_request_handler'] = context.request.state

    await crawler.run([ok_request, failing_request])

    # The successful request ends up DONE and passed through navigation + handler states.
    handled_ok = await request_queue.get_request(ok_request.unique_key)
    assert handled_ok is not None
    assert handled_ok.state == RequestState.DONE
    assert observed[ok_request.unique_key] == {
        'pre_navigation': RequestState.BEFORE_NAV,
        'request_handler': RequestState.REQUEST_HANDLER,
    }

    # The failing request ends up ERROR and passed through the error-handling states.
    handled_failing = await request_queue.get_request(failing_request.unique_key)
    assert handled_failing is not None
    assert handled_failing.state == RequestState.ERROR
    assert observed[failing_request.unique_key] == {
        'pre_navigation': RequestState.BEFORE_NAV,
        'error_handler': RequestState.ERROR_HANDLER,
        'failed_request_handler': RequestState.ERROR,
    }

    await request_queue.drop()