fix: Improve error handling for `RobotsTxtFile.load` by Mantisus · Pull Request #1524 · apify/crawlee-python

22 changes: 17 additions & 5 deletions src/crawlee/_utils/robots.py

@@ -1,5 +1,6 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING

from protego import Protego
@@ -15,6 +16,9 @@
from crawlee.proxy_configuration import ProxyInfo


logger = getLogger(__name__)


class RobotsTxtFile:
    def __init__(
        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        response = await http_client.send_request(url, proxy_info=proxy_info)
        body = (
            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
        )
        try:
            response = await http_client.send_request(url, proxy_info=proxy_info)

            body = (
                b'User-agent: *\nAllow: /'
                if is_status_code_client_error(response.status_code)
                else await response.read()
            )
            robots = Protego.parse(body.decode('utf-8'))

        except Exception as e:
            logger.warning(f'Failed to fetch robots.txt from "{url}" with error: "{e}"')

        robots = Protego.parse(body.decode('utf-8'))
            robots = Protego.parse('User-agent: *\nAllow: /')

        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
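For reference, a minimal sketch of how the new fallback behaves, assuming `RobotsTxtFile.load` is given the robots.txt URL directly and that `is_allowed` is the query method; `HttpxHttpClient` and the example URLs are only illustrative:

import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # The domain does not resolve, so the request raises; with this change the
    # exception is logged and an allow-all robots.txt is used instead of propagating.
    robots = await RobotsTxtFile.load(
        'https://budplaceholder.com/robots.txt',
        http_client=HttpxHttpClient(),
    )

    # The fallback 'User-agent: *\nAllow: /' permits every URL.
    assert robots.is_allowed('https://budplaceholder.com/any-page')


asyncio.run(main())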


36 changes: 35 additions & 1 deletion tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

@@ -6,7 +6,7 @@
import pytest

from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import RequestQueue

if TYPE_CHECKING:
@@ -167,6 +167,40 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
}


async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
    visit = mock.Mock()
    fail = mock.Mock()
    crawler = BeautifulSoupCrawler(
        http_client=http_client,
        respect_robots_txt_file=True,
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        fail(context.request.url)

    await crawler.run([str(server_url / 'problematic_links')])

    visited = {call[0][0] for call in visit.call_args_list}
    failed = {call[0][0] for call in fail.call_args_list}

    # The mailto: link must be skipped.
    # Fetching robots.txt for https://avatars.githubusercontent.com/apify is problematic, but the URL itself is valid for the crawler.
    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

    # budplaceholder.com does not exist, so the request fails.
    assert failed == {
        'https://budplaceholder.com/',
    }


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
    skip = mock.Mock()

36 changes: 35 additions & 1 deletion tests/unit/crawlers/_parsel/test_parsel_crawler.py

@@ -14,7 +14,7 @@
from yarl import URL

from crawlee._request import RequestOptions
from crawlee.crawlers import ParselCrawlingContext
from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext
from crawlee.http_clients._base import HttpClient


@@ -261,6 +261,40 @@ async def request_handler(context: ParselCrawlingContext) -> None:
}


async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
    visit = mock.Mock()
    fail = mock.Mock()
    crawler = ParselCrawler(
        http_client=http_client,
        respect_robots_txt_file=True,
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        fail(context.request.url)

    await crawler.run([str(server_url / 'problematic_links')])

    visited = {call[0][0] for call in visit.call_args_list}
    failed = {call[0][0] for call in fail.call_args_list}

    # The mailto: link must be skipped.
    # Fetching robots.txt for https://avatars.githubusercontent.com/apify is problematic, but the URL itself is valid for the crawler.
    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

    # budplaceholder.com does not exist, so the request fails.
    assert failed == {
        'https://budplaceholder.com/',
    }


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
    skip = mock.Mock()

35 changes: 34 additions & 1 deletion tests/unit/crawlers/_playwright/test_playwright_crawler.py

@@ -48,7 +48,7 @@
from crawlee._request import RequestOptions
from crawlee._types import HttpMethod, HttpPayload
from crawlee.browsers._types import BrowserType
from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
from crawlee.crawlers import BasicCrawlingContext, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


@pytest.mark.parametrize(
@@ -671,6 +671,39 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
}


async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
    """Test the crawler behavior with links that may cause problems when retrieving robots.txt."""
    visit = mock.Mock()
    fail = mock.Mock()
    crawler = PlaywrightCrawler(
        respect_robots_txt_file=True,
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(strategy='all')

    @crawler.failed_request_handler
    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
        fail(context.request.url)

    await crawler.run([str(server_url / 'problematic_links')])

    visited = {call[0][0] for call in visit.call_args_list}
    failed = {call[0][0] for call in fail.call_args_list}

    # The mailto: link must be skipped.
    # Fetching robots.txt for https://avatars.githubusercontent.com/apify is problematic, but the URL itself is valid for the crawler.
    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}

    # budplaceholder.com does not exist, so the request fails.
    assert failed == {
        'https://budplaceholder.com/',
    }


async def test_on_skipped_request(server_url: URL) -> None:
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)
    skip = mock.Mock()

10 changes: 10 additions & 0 deletions tests/unit/server.py

@@ -18,6 +18,7 @@
    GENERIC_RESPONSE,
    HELLO_WORLD,
    INCAPSULA,
    PROBLEMATIC_LINKS,
    ROBOTS_TXT,
    SECONDARY_INDEX,
    START_ENQUEUE,
@@ -102,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
        'page_1': generic_response_endpoint,
        'page_2': generic_response_endpoint,
        'page_3': generic_response_endpoint,
        'problematic_links': problematic_links_endpoint,
        'set_cookies': set_cookies,
        'set_complex_cookies': set_complex_cookies,
        'cookies': get_cookies,
@@ -287,6 +289,14 @@ async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    )


async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests with a page containing problematic links."""
await send_html_response(
send,
PROBLEMATIC_LINKS,
)


async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
"""Handle requests that should redirect to a specified full URL."""
query_params = get_query_params(scope.get('query_string', b''))

10 changes: 10 additions & 0 deletions tests/unit/server_endpoints.py

@@ -35,6 +35,16 @@
</iframe>
</body></html>"""

PROBLEMATIC_LINKS = b"""\
<html><head>
<title>Hello</title>
</head>
<body>
<a href="https://budplaceholder.com/">Placeholder</a>
<a href="mailto:test@test.com">test@test.com</a>
<a href="https://avatars.githubusercontent.com/apify">Apify avatar</a>
</body></html>"""

GENERIC_RESPONSE = b"""\
<html><head>
<title>Hello</title>