refactor!: update status code handling by Mantisus · Pull Request #1028 · apify/crawlee-python
Expand Up
@@ -12,7 +12,6 @@
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.statistics import StatisticsState
from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult Expand Down Expand Up @@ -57,16 +56,6 @@ def __init__( ) -> None: self._parser = parser self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = [] kwargs.setdefault('additional_http_error_status_codes', ()) kwargs.setdefault('ignore_http_error_status_codes', ())
kwargs.setdefault( 'http_client', HttpxHttpClient( additional_http_error_status_codes=kwargs['additional_http_error_status_codes'], ignore_http_error_status_codes=kwargs['ignore_http_error_status_codes'], ), )
if '_context_pipeline' not in kwargs: raise ValueError( Expand Down Expand Up @@ -111,8 +100,9 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC ContextPipeline() .compose(self._execute_pre_navigation_hooks) .compose(self._make_http_request) .compose(self._handle_status_code_response) .compose(self._parse_http_response) .compose(self._handle_blocked_request) .compose(self._handle_blocked_request_by_content) )
async def _execute_pre_navigation_hooks( Expand Down Expand Up @@ -216,10 +206,32 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
async def _handle_status_code_response(
    self, context: HttpCrawlingContext
) -> AsyncGenerator[HttpCrawlingContext, None]:
    """Validate the HTTP status code and raise appropriate exceptions if needed.

    Args:
        context: The current crawling context containing the HTTP response.

    Raises:
        SessionError: If the status code indicates the session is blocked.
        HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.
        HttpClientStatusCodeError: If the status code represents a client error.

    Yields:
        The original crawling context if no errors are detected.
    """
    status_code = context.http_response.status_code

    # The session-blocked check is opt-in (`_retry_on_blocked`) and runs before the generic error check.
    if self._retry_on_blocked:
        self._raise_for_session_blocked_status_code(context.session, status_code)

    self._raise_for_error_status_code(status_code)

    yield context
async def _handle_blocked_request_by_content( self, context: ParsedHttpCrawlingContext[TParseResult] ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: """Try to detect if the request is blocked based on the HTTP status code or the parsed response content. """Try to detect if the request is blocked based on the parsed response content.
Args: context: The current crawling context. Expand All @@ -228,14 +240,10 @@ async def _handle_blocked_request( SessionError: If the request is considered blocked.
Yields: The original crawling context if no errors are detected. The original crawling context if no blocking is detected. """ if self._retry_on_blocked: status_code = context.http_response.status_code if self._is_session_blocked_status_code(context.session, status_code): raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') if blocked_info := self._parser.is_blocked(context.parsed_content): raise SessionError(blocked_info.reason) if self._retry_on_blocked and (blocked_info := self._parser.is_blocked(context.parsed_content)): raise SessionError(blocked_info.reason) yield context
def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None: Expand Down
from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult Expand Down Expand Up @@ -57,16 +56,6 @@ def __init__( ) -> None: self._parser = parser self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = [] kwargs.setdefault('additional_http_error_status_codes', ()) kwargs.setdefault('ignore_http_error_status_codes', ())
kwargs.setdefault( 'http_client', HttpxHttpClient( additional_http_error_status_codes=kwargs['additional_http_error_status_codes'], ignore_http_error_status_codes=kwargs['ignore_http_error_status_codes'], ), )
if '_context_pipeline' not in kwargs: raise ValueError( Expand Down Expand Up @@ -111,8 +100,9 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC ContextPipeline() .compose(self._execute_pre_navigation_hooks) .compose(self._make_http_request) .compose(self._handle_status_code_response) .compose(self._parse_http_response) .compose(self._handle_blocked_request) .compose(self._handle_blocked_request_by_content) )
async def _execute_pre_navigation_hooks( Expand Down Expand Up @@ -216,10 +206,32 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
async def _handle_status_code_response(
    self, context: HttpCrawlingContext
) -> AsyncGenerator[HttpCrawlingContext, None]:
    """Validate the HTTP status code and raise appropriate exceptions if needed.

    Args:
        context: The current crawling context containing the HTTP response.

    Raises:
        SessionError: If the status code indicates the session is blocked.
        HttpStatusCodeError: If the status code represents a server error or is explicitly configured as an error.
        HttpClientStatusCodeError: If the status code represents a client error.

    Yields:
        The original crawling context if no errors are detected.
    """
    status_code = context.http_response.status_code

    # The session-blocked check is opt-in (`_retry_on_blocked`) and runs before the generic error check.
    if self._retry_on_blocked:
        self._raise_for_session_blocked_status_code(context.session, status_code)

    self._raise_for_error_status_code(status_code)

    yield context
async def _handle_blocked_request_by_content( self, context: ParsedHttpCrawlingContext[TParseResult] ) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]: """Try to detect if the request is blocked based on the HTTP status code or the parsed response content. """Try to detect if the request is blocked based on the parsed response content.
Args: context: The current crawling context. Expand All @@ -228,14 +240,10 @@ async def _handle_blocked_request( SessionError: If the request is considered blocked.
Yields: The original crawling context if no errors are detected. The original crawling context if no blocking is detected. """ if self._retry_on_blocked: status_code = context.http_response.status_code if self._is_session_blocked_status_code(context.session, status_code): raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') if blocked_info := self._parser.is_blocked(context.parsed_content): raise SessionError(blocked_info.reason) if self._retry_on_blocked and (blocked_info := self._parser.is_blocked(context.parsed_content)): raise SessionError(blocked_info.reason) yield context
def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[None]]) -> None: Expand Down