fix: Improve error handling for `RobotsTxtFile.load` by Mantisus · Pull Request #1524 · apify/crawlee-python

22 changes: 17 additions & 5 deletions src/crawlee/_utils/robots.py

@@ -1,5 +1,6 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING

from protego import Protego
@@ -15,6 +16,9 @@
from crawlee.proxy_configuration import ProxyInfo


logger = getLogger(__name__)


class RobotsTxtFile:
    def __init__(
        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        response = await http_client.send_request(url, proxy_info=proxy_info)
        body = (
            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
        )
        try:
            response = await http_client.send_request(url, proxy_info=proxy_info)

            body = (
                b'User-agent: *\nAllow: /'
                if is_status_code_client_error(response.status_code)
                else await response.read()
            )
            robots = Protego.parse(body.decode('utf-8'))

        except Exception as e:
            logger.warning(f'Failed to fetch robots.txt from "{url}" with error: "{e}"')

        robots = Protego.parse(body.decode('utf-8'))
            robots = Protego.parse('User-agent: *\nAllow: /')

        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
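For reference, a minimal sketch of how the new fallback behaves, assuming `RobotsTxtFile.load` is given the robots.txt URL directly and that `is_allowed` is the query method; `HttpxHttpClient` and the example URLs are only illustrative:

import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # The domain does not resolve, so the request raises; with this change the
    # exception is logged and an allow-all robots.txt is used instead of propagating.
    robots = await RobotsTxtFile.load(
        'https://budplaceholder.com/robots.txt',
        http_client=HttpxHttpClient(),
    )

    # The fallback 'User-agent: *\nAllow: /' permits every URL.
    assert robots.is_allowed('https://budplaceholder.com/any-page')


asyncio.run(main())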


36 changes: 35 additions & 1 deletion tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

@@ -6,7 +6,7 @@
import pytest

from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import RequestQueue

if TYPE_CHECKING:
@@ -167,6 +167,40 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
}


async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
    visit = mock.Mock()
    fail = mock.Mock()
    crawler = BeautifulSoupCrawler(
        http_client=http_client,
        respect_robots_txt_file=True,
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        fail(context.request.url)

    await crawler.run([str(server_url / 'problematic_links')])

    visited = {call[0][0] for call in visit.call_args_list}
    failed = {call[0][0] for call in fail.call_args_list}

    # The mailto: link must be skipped.
    # Fetching robots.txt for https://avatars.githubusercontent.com/apify is problematic, but the URL itself is valid for the crawler.
    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

    # budplaceholder.com does not exist, so the request fails.
    assert failed == {
        'https://budplaceholder.com/',
    }


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
    skip = mock.Mock()

36 changes: 35 additions & 1 deletion tests/unit/crawlers/_parsel/test_parsel_crawler.py

@@ -14,7 +14,7 @@
from yarl import URL

from crawlee._request import RequestOptions
from crawlee.crawlers import ParselCrawlingContext
from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext
from crawlee.http_clients._base import HttpClient


@@ -261,6 +261,40 @@ async def request_handler(context: ParselCrawlingContext) -> None:
}


async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
    visit = mock.Mock()
    fail = mock.Mock()
    crawler = ParselCrawler(
        http_client=http_client,
        respect_robots_txt_file=True,
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        fail(context.request.url)

    await crawler.run([str(server_url / 'problematic_links')])

    visited = {call[0][0] for call in visit.call_args_list}
    failed = {call[0][0] for call in fail.call_args_list}

    # The mailto: link must be skipped.
    # Fetching robots.txt for https://avatars.githubusercontent.com/apify is problematic, but the URL itself is valid for the crawler.
    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

    # budplaceholder.com does not exist, so the request fails.
    assert failed == {
        'https://budplaceholder.com/',
    }


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
    skip = mock.Mock()

35 changes: 34 additions & 1 deletion tests/unit/crawlers/_playwright/test_playwright_crawler.py

@@ -48,7 +48,7 @@
from crawlee._request import RequestOptions
from crawlee._types import HttpMethod, HttpPayload
from crawlee.browsers._types import BrowserType
from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import BasicCrawlingContext, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


@pytest.mark.parametrize(
@@ -671,6 +671,39 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
}


async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
    visit = mock.Mock()
    fail = mock.Mock()
    crawler = PlaywrightCrawler(
        respect_robots_txt_file=True,
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        fail(context.request.url)

    await crawler.run([str(server_url / 'problematic_links')])

    visited = {call[0][0] for call in visit.call_args_list}
    failed = {call[0][0] for call in fail.call_args_list}

    # The mailto: link must be skipped.
    # Fetching robots.txt for https://avatars.githubusercontent.com/apify is problematic, but the URL itself is valid for the crawler.
    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

    # budplaceholder.com does not exist, so the request fails.
    assert failed == {
        'https://budplaceholder.com/',
    }


async def test_on_skipped_request(server_url: URL) -> None:
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)
    skip = mock.Mock()

10 changes: 10 additions & 0 deletions tests/unit/server.py

@@ -18,6 +18,7 @@
    GENERIC_RESPONSE,
    HELLO_WORLD,
    INCAPSULA,
    PROBLEMATIC_LINKS,
    ROBOTS_TXT,
    SECONDARY_INDEX,
    START_ENQUEUE,
@@ -102,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
        'page_1': generic_response_endpoint,
        'page_2': generic_response_endpoint,
        'page_3': generic_response_endpoint,
        'problematic_links': problematic_links_endpoint,
        'set_cookies': set_cookies,
        'set_complex_cookies': set_complex_cookies,
        'cookies': get_cookies,
@@ -287,6 +289,14 @@ async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    )


async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests with a page containing problematic links."""
await send_html_response(
send,
PROBLEMATIC_LINKS,
)


async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests that should redirect to a specified full URL."""
query_params = get_query_params(scope.get('query_string', b''))

10 changes: 10 additions & 0 deletions tests/unit/server_endpoints.py

@@ -35,6 +35,16 @@
</iframe>
</body></html>"""

PROBLEMATIC_LINKS = b"""\
<html><head>
<title>Hello</title>
</head>
<body>
<a href="https://budplaceholder.com/">Placeholder</a>
<a href="mailto:test@test.com">test@test.com</a>
<a href="https://avatars.githubusercontent.com/apify">Apify avatar</a>
</body></html>"""

GENERIC_RESPONSE = b"""\
<html><head>
<title>Hello</title>