feat: blocking detection for playwright crawler by vdusek · Pull Request #328 · apify/crawlee-python
Expand Up
@@ -5,7 +5,9 @@
from typing_extensions import Unpack
from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.basic_crawler.errors import SessionError from crawlee.browsers import BrowserPool from crawlee.enqueue_strategy import EnqueueStrategy from crawlee.models import BaseRequestData Expand All @@ -18,7 +20,24 @@
class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]): """A crawler that fetches the request URL using `Playwright`.""" """A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
`PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of requests, request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific methods and properties, like the `page` property for user data extraction, and the `enqueue_links` method for crawling other pages.
This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers to download web pages and extract data. For websites that do not require JavaScript, consider using `BeautifulSoupCrawler`, which uses raw HTTP requests and is much faster.
`PlaywrightCrawler` opens a new browser page (i.e., tab) for each `Request` object and invokes the user-provided request handler function via the `Router`. Users can interact with the page and extract the data using the Playwright API.
Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, is internally managed by the `BrowserPool`. """
def __init__( self, Expand Down Expand Up @@ -50,19 +69,42 @@ def __init__(
self._browser_pool = browser_pool
kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto) # Compose the context pipeline with the Playwright-specific context enhancer. kwargs['_context_pipeline'] = ( ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request) ) kwargs['_additional_context_managers'] = [self._browser_pool]
kwargs.setdefault('_logger', logging.getLogger(__name__))
super().__init__(**kwargs)
async def _page_goto(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightCrawlingContext, None]: async def _make_http_request( self, context: BasicCrawlingContext, ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: """Enhance the crawling context with making an HTTP request using Playwright.
Args: context: The basic crawling context to be enhanced.
Raises: ValueError: If the browser pool is not initialized. SessionError: If the URL cannot be loaded by the browser.
Yields: An enhanced crawling context with Playwright-specific features. """ if self._browser_pool is None: raise ValueError('Browser pool is not initialized.')
# Create a new browser page, navigate to the URL and get response. crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info) await crawlee_page.page.goto(context.request.url) response = await crawlee_page.page.goto(context.request.url)
if response is None: raise SessionError(f'Failed to load the URL: {context.request.url}')
# Set the loaded URL to the actual URL after redirection. context.request.loaded_url = crawlee_page.page.url
async def enqueue_links( Expand All @@ -72,6 +114,7 @@ async def enqueue_links( user_data: dict | None = None, **kwargs: Unpack[AddRequestsKwargs], ) -> None: """The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function.""" kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
requests = list[BaseRequestData]() Expand Down Expand Up @@ -102,7 +145,43 @@ async def enqueue_links( proxy_info=context.proxy_info, log=context.log, page=crawlee_page.page, response=response, enqueue_links=enqueue_links, )
await crawlee_page.page.close()
async def _handle_blocked_request(
    self,
    crawling_context: PlaywrightCrawlingContext,
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
    """Pass the crawling context through unless the session appears to be blocked.

    The checks run only when `self._retry_on_blocked` is truthy. First the HTTP status
    code of the response is handed to the session (if there is one), then the page is
    queried for each selector in `RETRY_CSS_SELECTORS`.

    Args:
        crawling_context: The crawling context to be checked for blocking.

    Raises:
        SessionError: If the session is considered blocked based on the HTTP status code
            or on selectors matched in the page content.

    Yields:
        The original crawling context if no blocking was detected.
    """
    # Blocking detection is disabled: hand the context over untouched.
    if not self._retry_on_blocked:
        yield crawling_context
        return

    status_code = crawling_context.response.status
    session = crawling_context.session

    # The status code check is only possible when a session is attached to the context.
    if session and session.is_blocked_status_code(status_code=status_code):
        raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')

    # Collect every selector from RETRY_CSS_SELECTORS that the page query returns a truthy result for.
    matched_selectors = []
    for selector in RETRY_CSS_SELECTORS:
        if await crawling_context.page.query_selector(selector):
            matched_selectors.append(selector)

    # Any match in the page content is treated as a blocked session.
    if matched_selectors:
        raise SessionError(
            'Assuming the session is blocked - '
            f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
        )

    yield crawling_context
from typing_extensions import Unpack
from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline from crawlee.basic_crawler.errors import SessionError from crawlee.browsers import BrowserPool from crawlee.enqueue_strategy import EnqueueStrategy from crawlee.models import BaseRequestData Expand All @@ -18,7 +20,24 @@
class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]): """A crawler that fetches the request URL using `Playwright`.""" """A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
`PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of requests, request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific methods and properties, like the `page` property for user data extraction, and the `enqueue_links` method for crawling other pages.
This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers to download web pages and extract data. For websites that do not require JavaScript, consider using `BeautifulSoupCrawler`, which uses raw HTTP requests and is much faster.
`PlaywrightCrawler` opens a new browser page (i.e., tab) for each `Request` object and invokes the user-provided request handler function via the `Router`. Users can interact with the page and extract the data using the Playwright API.
Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, is internally managed by the `BrowserPool`. """
def __init__( self, Expand Down Expand Up @@ -50,19 +69,42 @@ def __init__(
self._browser_pool = browser_pool
kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto) # Compose the context pipeline with the Playwright-specific context enhancer. kwargs['_context_pipeline'] = ( ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request) ) kwargs['_additional_context_managers'] = [self._browser_pool]
kwargs.setdefault('_logger', logging.getLogger(__name__))
super().__init__(**kwargs)
async def _page_goto(self, context: BasicCrawlingContext) -> AsyncGenerator[PlaywrightCrawlingContext, None]: async def _make_http_request( self, context: BasicCrawlingContext, ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: """Enhance the crawling context with making an HTTP request using Playwright.
Args: context: The basic crawling context to be enhanced.
Raises: ValueError: If the browser pool is not initialized. SessionError: If the URL cannot be loaded by the browser.
Yields: An enhanced crawling context with Playwright-specific features. """ if self._browser_pool is None: raise ValueError('Browser pool is not initialized.')
# Create a new browser page, navigate to the URL and get response. crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info) await crawlee_page.page.goto(context.request.url) response = await crawlee_page.page.goto(context.request.url)
if response is None: raise SessionError(f'Failed to load the URL: {context.request.url}')
# Set the loaded URL to the actual URL after redirection. context.request.loaded_url = crawlee_page.page.url
async def enqueue_links( Expand All @@ -72,6 +114,7 @@ async def enqueue_links( user_data: dict | None = None, **kwargs: Unpack[AddRequestsKwargs], ) -> None: """The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function.""" kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)
requests = list[BaseRequestData]() Expand Down Expand Up @@ -102,7 +145,43 @@ async def enqueue_links( proxy_info=context.proxy_info, log=context.log, page=crawlee_page.page, response=response, enqueue_links=enqueue_links, )
await crawlee_page.page.close()
async def _handle_blocked_request(
    self,
    crawling_context: PlaywrightCrawlingContext,
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
    """Pass the crawling context through unless the session appears to be blocked.

    The checks run only when `self._retry_on_blocked` is truthy. First the HTTP status
    code of the response is handed to the session (if there is one), then the page is
    queried for each selector in `RETRY_CSS_SELECTORS`.

    Args:
        crawling_context: The crawling context to be checked for blocking.

    Raises:
        SessionError: If the session is considered blocked based on the HTTP status code
            or on selectors matched in the page content.

    Yields:
        The original crawling context if no blocking was detected.
    """
    # Blocking detection is disabled: hand the context over untouched.
    if not self._retry_on_blocked:
        yield crawling_context
        return

    status_code = crawling_context.response.status
    session = crawling_context.session

    # The status code check is only possible when a session is attached to the context.
    if session and session.is_blocked_status_code(status_code=status_code):
        raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')

    # Collect every selector from RETRY_CSS_SELECTORS that the page query returns a truthy result for.
    matched_selectors = []
    for selector in RETRY_CSS_SELECTORS:
        if await crawling_context.page.query_selector(selector):
            matched_selectors.append(selector)

    # Any match in the page content is treated as a blocked session.
    if matched_selectors:
        raise SessionError(
            'Assuming the session is blocked - '
            f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
        )

    yield crawling_context