How can I reduce RAM usage when starting a crawl with a large list of URLs?
A very long list of start URLs consumes a significant amount of RAM for the crawler's entire runtime. I tried converting get_urls() into a generator, but crawler.run() did not accept it. What is the recommended approach? The full, list-based script is below.
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


def get_urls():
    urls = []
    ids = range(1, 100000)
    for product_id in ids:
        url = f"https://example.com/product/{product_id}"
        urls.append(url)
    return urls


async def crawl_example() -> None:
    # PlaywrightCrawler crawls the web using a headless browser controlled by the Playwright library.
    crawler = PlaywrightCrawler()

    # Define a request handler to process each crawled page and attach it to the crawler using a decorator.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract relevant data from the page context.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
        }

        # Store the extracted data.
        await context.push_data(data)

        # Extract links from the current page and add them to the crawling queue.
        # await context.enqueue_links()

    # Add initial URLs to the queue and start the crawl.
    await crawler.run(get_urls())


asyncio.run(crawl_example())
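For reference, the generator variant I tried looked roughly like this (a sketch reconstructed from memory; only get_urls() changes, the rest of the script stays the same):

def get_urls():
    # Yield URLs lazily instead of materializing the whole list in memory.
    for product_id in range(1, 100000):
        yield f"https://example.com/product/{product_id}"

Passing this generator to crawler.run() fails, presumably because run() expects a concrete sequence of URLs or requests rather than an arbitrary iterable. Is there a supported way to feed start URLs lazily, or in batches, so the full list never has to live in memory at once?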