@@ -191,6 +191,7 @@ async def extract_links(
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')

             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))

@@ -209,7 +210,9 @@ async def extract_links(
                 skipped = iter([])

             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )

                 if transform_request_function:
                     transform_request_options = transform_request_function(request_options)
7 changes: 6 additions & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1038,7 +1038,12 @@ def _enqueue_links_filter_iterator(
         warning_flag = True

         for request in request_iterator:
-            target_url = request.url if isinstance(request, Request) else request
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)

             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
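For reference, a standalone sketch of what the enqueue strategies compare, since the change above wires the chosen strategy onto each Request. This is simplified and illustrative only; should_enqueue is a made-up helper, not the crawler's real filter:

from urllib.parse import urlparse


def should_enqueue(origin_url: str, target_url: str, strategy: str) -> bool:
    # Illustrative comparison of two URLs under a given strategy.
    origin, target = urlparse(origin_url), urlparse(target_url)
    if strategy == 'all':
        return True
    if strategy == 'same-origin':
        return (origin.scheme, origin.hostname, origin.port) == (target.scheme, target.hostname, target.port)
    if strategy == 'same-hostname':
        return origin.hostname == target.hostname
    if strategy == 'same-domain':
        # Naive "registrable domain" comparison; the real implementation is more careful.
        def base(hostname: str | None) -> str:
            return '.'.join(hostname.split('.')[-2:]) if hostname else ''

        return base(origin.hostname) == base(target.hostname)
    return False


# e.g. should_enqueue('https://a.example.com/x', 'https://b.example.com/y', 'same-domain') -> True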
15 changes: 9 additions & 6 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -399,6 +399,7 @@ async def extract_links(
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')

             elements = await context.page.query_selector_all(selector)
             links_iterator: Iterator[str] = iter(
@@ -417,17 +418,19 @@ async def extract_links(
                 skipped = iter([])

             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )

                 if transform_request_function:
-                    transform_request_option = transform_request_function(request_option)
-                    if transform_request_option == 'skip':
+                    transform_request_options = transform_request_function(request_options)
+                    if transform_request_options == 'skip':
                         continue
-                    if transform_request_option != 'unchanged':
-                        request_option = transform_request_option
+                    if transform_request_options != 'unchanged':
+                        request_options = transform_request_options

                 try:
-                    request = Request.from_url(**request_option)
+                    request = Request.from_url(**request_options)
                 except ValidationError as exc:
                     context.log.debug(
                         f'Skipping URL "{url}" due to invalid format: {exc}. '
21 changes: 20 additions & 1 deletion tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -7,7 +7,7 @@

 import pytest

-from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
+from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason
 from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storages import RequestQueue

@@ -409,3 +409,22 @@ async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url:
     assert result.requests_failed == 0
     assert result.requests_finished == 1
     assert request_handler.call_count == 1
+
+
+async def test_enqueue_strategy_after_redirect(server_url: URL, redirect_server_url: URL) -> None:
+    crawler = BeautifulSoupCrawler()
+
+    handler_calls = mock.AsyncMock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        await handler_calls(context.request.url)
+
+        target_url = str(server_url.with_path('redirect').with_query(url=str(redirect_server_url)))
+
+        await context.enqueue_links(requests=[Request.from_url(target_url)], strategy='same-origin')
+
+    await crawler.run([str(server_url)])
+
+    assert handler_calls.called
+    assert handler_calls.call_count == 1
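A minimal usage sketch of the behavior the test above exercises, seen from a user's side; the start URL is a placeholder, not part of the test suite:

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # With the strategy stored on each enqueued Request (enqueue_strategy),
        # the 'same-origin' rule keeps being enforced even when an enqueued link
        # redirects to a different origin.
        await context.enqueue_links(strategy='same-origin')

    await crawler.run(['https://example.com'])  # placeholder start URL


asyncio.run(main())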