diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
index a3e303f088..6c1fbb63f8 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -191,6 +191,7 @@ async def extract_links(
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')
 
             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
 
@@ -209,7 +210,9 @@ async def extract_links(
                 skipped = iter([])
 
             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )
 
                 if transform_request_function:
                     transform_request_options = transform_request_function(request_options)
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index cc6664cbd6..1190d5ac73 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1038,7 +1038,12 @@ def _enqueue_links_filter_iterator(
         warning_flag = True
 
         for request in request_iterator:
-            target_url = request.url if isinstance(request, Request) else request
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)
 
             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index ffe5d95e43..0beb04a375 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -399,6 +399,7 @@ async def extract_links(
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')
 
             elements = await context.page.query_selector_all(selector)
             links_iterator: Iterator[str] = iter(
@@ -417,17 +418,19 @@ async def extract_links(
                 skipped = iter([])
 
             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )
 
                 if transform_request_function:
-                    transform_request_option = transform_request_function(request_option)
-                    if transform_request_option == 'skip':
+                    transform_request_options = transform_request_function(request_options)
+                    if transform_request_options == 'skip':
                         continue
-                    if transform_request_option != 'unchanged':
-                        request_option = transform_request_option
+                    if transform_request_options != 'unchanged':
+                        request_options = transform_request_options
 
                 try:
-                    request = Request.from_url(**request_option)
+                    request = Request.from_url(**request_options)
                 except ValidationError as exc:
                     context.log.debug(
                         f'Skipping URL "{url}" due to invalid format: {exc}. '
diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
index 90abd5aab1..04046153c0 100644
--- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
+++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -7,7 +7,7 @@
 
 import pytest
 
-from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
+from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason
 from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storages import RequestQueue
 
@@ -409,3 +409,21 @@ async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url:
     assert result.requests_failed == 0
     assert result.requests_finished == 1
     assert request_handler.call_count == 1
+
+
+async def test_enqueue_strategy_after_redirect(server_url: URL, redirect_server_url: URL) -> None:
+    crawler = BeautifulSoupCrawler()
+
+    handler_calls = mock.AsyncMock()
+
+    target_url = str(server_url.with_path('redirect').with_query(url=str(redirect_server_url)))
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        await handler_calls(context.request.url)
+        await context.enqueue_links(requests=[Request.from_url(target_url)], strategy='same-origin')
+
+    await crawler.run([str(server_url)])
+
+    assert handler_calls.called
+    assert handler_calls.call_count == 1
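Note: a minimal usage sketch of the behaviour this patch targets (the start URL and handler below are illustrative only, not part of the change). With the strategy now persisted on each enqueued Request as enqueue_strategy, a 'same-origin' filter keeps applying even after a request redirects to another origin:

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # The strategy passed here is stored on every enqueued Request, so a
        # request whose final (post-redirect) URL lands on a different origin
        # is skipped instead of being handled.
        await context.enqueue_links(strategy='same-origin')

    # Placeholder start URL for illustration.
    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())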