Issues with payload in POST requests
Description
When using the crawlee.http_crawler._http_crawler module, I'm encountering a HttpStatusCodeError with a 404 status code. This error occurs during the _make_http_request operation and causes the crawler to reach the maximum number of retries.
The 404 error in the crawler seems to be related to the way I'm handling the request payload.
Steps to Reproduce
- Initialize a new Crawlee project and set up an `HttpCrawler` with either the `CurlImpersonateHttpClient` or the `HttpxHttpClient`:
```python
import asyncio
import json

from crawlee import Request
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # http_client = CurlImpersonateHttpClient(
    #     persist_cookies_per_session=True,
    # )
    # Or use HttpxHttpClient:
    http_client = HttpxHttpClient(
        persist_cookies_per_session=True,
    )

    crawler = HttpCrawler(
        http_client=http_client,
        max_requests_per_crawl=20,
    )

    url = "https://www.viagogo.es/Entradas-Deportes/Futbol/Real-Madrid-C-F-Entradas/E-153769088"

    payload = {
        'ShowAllTickets': True,
        'HideDuplicateTicketsV2': False,
        'Quantity': 2,
        'IsInitialQuantityChange': False,
        'PageSize': 20,
        'CurrentPage': 2,
        'SortBy': 'NEWPRICE',
        'SortDirection': 0,
        'Sections': '',
        'Rows': '',
        'Seats': '',
        'SeatTypes': '',
        'TicketClasses': '',
        'ListingNotes': '',
        'PriceRange': '0,100',
        'InstantDelivery': False,
        'EstimatedFees': True,
        'BetterValueTickets': True,
        'PriceOption': '',
        'HasFlexiblePricing': False,
        'ExcludeSoldListings': False,
        'RemoveObstructedView': False,
        'NewListingsOnly': False,
        'PriceDropListingsOnly': False,
        'SelectBestListing': False,
        'ConciergeTickets': False,
        'Method': 'IndexSh'
    }

    payload_bytes = json.dumps(payload).encode()
    print(f"0. Start: {payload_bytes}")

    initial_req = Request.from_url(
        url=url,
        method="POST",
        payload=payload_bytes,
        use_extended_unique_key=True,
    )

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Handle the response here
        pass

    # Run the crawler
    await crawler.run([initial_req])


if __name__ == '__main__':
    asyncio.run(main())
```
-
- The crawler encounters a `404` error during the `crawl` operation and raises the following traceback:

```
[crawlee.http_crawler._http_crawler] ERROR Request failed and reached maximum retries
Traceback (most recent call last):
  File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/basic_crawler/_context_pipeline.py", line 62, in __call__
    result = await middleware_instance.__anext__()
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/http_crawler/_http_crawler.py", line 101, in _make_http_request
    result = await self._http_client.crawl(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/http_clients/curl_impersonate.py", line 152, in crawl
    self._raise_for_error_status_code(
  File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/http_clients/_base.py", line 153, in _raise_for_error_status_code
    raise HttpStatusCodeError('Error status code returned', status_code)
crawlee.errors.HttpStatusCodeError: Error status code returned (status code: 404).
```
But if I pass the request as this:
```python
initial_req = Request.from_url(
    url=url,
    method="POST",
    payload=payload,  # payload as dictionary
    use_extended_unique_key=True,
)
```
The error is the following:
Traceback (most recent call last):
File "/Development/scraper/src/test_clients.py", line 175, in <module>
asyncio.run(main())
File "/Library/Application Support/uv/python/cpython-3.12.7-macos-x86_64-none/lib/python3.12/asyncio/runners.py", line 194, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/Library/Application Support/uv/python/cpython-3.12.7-macos-x86_64-none/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Application Support/uv/python/cpython-3.12.7-macos-x86_64-none/lib/python3.12/asyncio/base_events.py", line 687, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/Development/scraper/src/test_clients.py", line 159, in main
initial_req = Request.from_url(
^^^^^^^^^^^^^^^^^
File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/_request.py", line 321, in from_url
unique_key = unique_key or compute_unique_key(
^^^^^^^^^^^^^^^^^^^
File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/_utils/requests.py", line 126, in compute_unique_key
payload_hash = _get_payload_hash(payload)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/_utils/requests.py", line 151, in _get_payload_hash
return compute_short_hash(payload_in_bytes)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Development/scraper/.venv/lib/python3.12/site-packages/crawlee/_utils/crypto.py", line 17, in compute_short_hash
hash_object = sha256(data)
^^^^^^^^^^^^
TypeError: object supporting the buffer API required
Environment
- Python version: 3.12
- Crawlee version: 0.4.0
