BasicCrawler | API | Crawlee for Python · Fast, reliable Python web crawlers.
Index
Methods
- __init__(*, configuration, event_manager, storage_client, request_manager, session_pool, proxy_configuration, http_client, request_handler, max_request_retries, max_requests_per_crawl, max_session_rotations, max_crawl_depth, use_session_pool, retry_on_blocked, additional_http_error_status_codes, ignore_http_error_status_codes, concurrency_settings, request_handler_timeout, statistics, abort_on_error, keep_alive, configure_logging, statistics_log_format, respect_robots_txt_file, status_message_logging_interval, status_message_callback, id, _context_pipeline, _additional_context_managers, _logger): None
Parameters
optionalkeyword-onlyconfiguration: Configuration | None = None
optionalkeyword-onlyevent_manager: EventManager | None = None
optionalkeyword-onlystorage_client: StorageClient | None = None
optionalkeyword-onlyrequest_manager: RequestManager | None = None
optionalkeyword-onlysession_pool: SessionPool | None = None
optionalkeyword-onlyproxy_configuration: ProxyConfiguration | None = None
optionalkeyword-onlyhttp_client: HttpClient | None = None
optionalkeyword-onlyrequest_handler: Callable[[TCrawlingContext], Awaitable[None]] | None = None
optionalkeyword-onlymax_request_retries: int = 3
optionalkeyword-onlymax_requests_per_crawl: int | None = None
optionalkeyword-onlymax_session_rotations: int = 10
optionalkeyword-onlymax_crawl_depth: int | None = None
optionalkeyword-onlyuse_session_pool: bool = True
optionalkeyword-onlyretry_on_blocked: bool = True
optionalkeyword-onlyadditional_http_error_status_codes: Iterable[int] | None = None
optionalkeyword-onlyignore_http_error_status_codes: Iterable[int] | None = None
optionalkeyword-onlyconcurrency_settings: ConcurrencySettings | None = None
optionalkeyword-onlyrequest_handler_timeout: timedelta = timedelta(minutes=1)
optionalkeyword-onlystatistics: Statistics[TStatisticsState] | None = None
optionalkeyword-onlyabort_on_error: bool = False
optionalkeyword-onlykeep_alive: bool = False
optionalkeyword-onlyconfigure_logging: bool = True
optionalkeyword-onlystatistics_log_format: Literal['table', 'inline'] = 'table'
optionalkeyword-onlyrespect_robots_txt_file: bool = False
optionalkeyword-onlystatus_message_logging_interval: timedelta = timedelta(seconds=10)
optionalkeyword-onlystatus_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] | None = None
optionalkeyword-onlyid: int | None = None
optionalkeyword-only_context_pipeline: ContextPipeline[TCrawlingContext] | None = None
optionalkeyword-only_additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None
optionalkeyword-only_logger: logging.Logger | None = None
Returns None
- async add_requests(requests, *, forefront, batch_size, wait_time_between_batches, wait_for_all_requests_to_be_added, wait_for_all_requests_to_be_added_timeout): None
Parameters
requests: Sequence[str | Request]
optionalkeyword-onlyforefront: bool = False
optionalkeyword-onlybatch_size: int = 1000
optionalkeyword-onlywait_time_between_batches: timedelta = timedelta(0)
optionalkeyword-onlywait_for_all_requests_to_be_added: bool = False
optionalkeyword-onlywait_for_all_requests_to_be_added_timeout: timedelta | None = None
Returns None
- async export_data(path, dataset_id, dataset_name, dataset_alias): None
Parameters
path: str | Path
optionaldataset_id: str | None = None
optionaldataset_name: str | None = None
optionaldataset_alias: str | None = None
Returns None
- async get_data(dataset_id, dataset_name, dataset_alias, *, offset, limit, clean, desc, fields, omit, unwind, skip_empty, skip_hidden, flatten, view): DatasetItemsListPage
Parameters
optionaldataset_id: str | None = None
optionaldataset_name: str | None = None
optionaldataset_alias: str | None = None
optionalkeyword-onlyoffset: int
optionalkeyword-onlylimit: int | None
optionalkeyword-onlyclean: bool
optionalkeyword-onlydesc: bool
optionalkeyword-onlyfields: list[str]
optionalkeyword-onlyomit: list[str]
optionalkeyword-onlyunwind: list[str]
optionalkeyword-onlyskip_empty: bool
optionalkeyword-onlyskip_hidden: bool
optionalkeyword-onlyflatten: list[str]
optionalkeyword-onlyview: str
Returns DatasetItemsListPage
- async get_dataset(*, id, name, alias): Dataset
Parameters
optionalkeyword-onlyid: str | None = None
optionalkeyword-onlyname: str | None = None
optionalkeyword-onlyalias: str | None = None
Returns Dataset
- async get_key_value_store(*, id, name, alias): KeyValueStore
Parameters
optionalkeyword-onlyid: str | None = None
optionalkeyword-onlyname: str | None = None
optionalkeyword-onlyalias: str | None = None
Returns KeyValueStore
- async get_request_manager(): RequestManager
- on_skipped_request(callback): SkippedRequestCallback
- router(router): None
Parameters
router: Router[TCrawlingContext]
Returns None
- async run(requests, *, purge_request_queue): FinalStatistics
Parameters
optionalrequests: Sequence[str | Request] | None = None
optionalkeyword-onlypurge_request_queue: bool = True
Returns FinalStatistics
- stop(reason): None
Parameters
optionalreason: str = 'Stop was called externally.'
Returns None
- async use_state(default_value): dict[str, JsonSerializable]
Parameters
optionaldefault_value: dict[str, JsonSerializable] | None = None
Returns dict[str, JsonSerializable]