Crawler doesn't respect the `configuration` argument
Consider this sample program:
```python
import asyncio

from crawlee.configuration import Configuration
from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext


async def default_handler(context: ParselCrawlingContext) -> None:
    for category in context.selector.xpath(
        '//div[@class="side_categories"]//ul/li/ul/li/a'
    ):
        await context.push_data({"category": category.xpath("normalize-space()").get()})


async def main() -> None:
    config = Configuration(persist_storage=False, write_metadata=False)
    crawler = ParselCrawler(request_handler=default_handler, configuration=config)
    await crawler.run(["https://books.toscrape.com"])
    data = await crawler.get_data()
    print(data.items)


if __name__ == "__main__":
    asyncio.run(main())
```
The `configuration` argument given to `ParselCrawler` is not respected: during the run it still creates the `./storage` directory and persists all the (meta)data. I have to work around it by overriding the global configuration like this:
```python
import asyncio

from crawlee.configuration import Configuration
from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext


async def default_handler(context: ParselCrawlingContext) -> None:
    for category in context.selector.xpath(
        '//div[@class="side_categories"]//ul/li/ul/li/a'
    ):
        await context.push_data({"category": category.xpath("normalize-space()").get()})


async def main() -> None:
    config = Configuration.get_global_configuration()
    config.persist_storage = False
    config.write_metadata = False
    crawler = ParselCrawler(request_handler=default_handler)
    await crawler.run(["https://books.toscrape.com"])
    data = await crawler.get_data()
    print(data.items)


if __name__ == "__main__":
    asyncio.run(main())
```