3 changes: 2 additions & 1 deletion pyproject.toml
@@ -98,6 +98,7 @@ crawlee = "crawlee._cli:cli"
 
 [dependency-groups]
 dev = [
+    "anyio<5.0.0",
     "apify_client", # For e2e tests.
     "build<2.0.0", # For e2e tests.
     "dycw-pytest-only<3.0.0",
@@ -112,7 +113,7 @@ dev = [
     "pytest-timeout<3.0.0",
     "pytest-xdist<4.0.0",
     "pytest<9.0.0",
-    "ruff~=0.14.0",
+    "ruff~=0.15.0",
     "setuptools", # setuptools are used by pytest, but not explicitly required
     "ty~=0.0.0",
     "types-beautifulsoup4<5.0.0",
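
Note: `anyio<5.0.0` is added to the dev group because the updated tests below use `anyio.Path`, and the ruff pin moves from the 0.14.x to the 0.15.x series. A minimal sketch of what the compatible-release specifier `~=0.15.0` allows, using the third-party `packaging` library (an assumption for illustration, not part of this PR):

# Sketch only: `packaging` is used here purely to illustrate the version range.
from packaging.specifiers import SpecifierSet

spec = SpecifierSet('~=0.15.0')  # equivalent to >=0.15.0, ==0.15.*
print('0.15.7' in spec)  # True: any 0.15.x release satisfies the pin
print('0.16.0' in spec)  # False: the next minor series is excluded
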
2 changes: 1 addition & 1 deletion src/crawlee/_request.py
@@ -211,7 +211,7 @@ class Request(BaseModel):
 
     user_data: Annotated[
         dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
-        Field(alias='userData', default_factory=lambda: UserData()),
+        Field(alias='userData', default_factory=UserData),
         PlainValidator(user_data_adapter.validate_python),
         PlainSerializer(
             lambda instance: user_data_adapter.dump_python(
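
Note: Pydantic's `default_factory` accepts any zero-argument callable, so the `UserData` class itself can replace `lambda: UserData()`. A self-contained sketch of the same pattern with stand-in models (not crawlee's real classes):

# Illustrative sketch only: the models here are stand-ins, not crawlee's classes.
from pydantic import BaseModel, Field


class UserDataSketch(BaseModel):
    label: str | None = None


class RequestSketch(BaseModel):
    # A class is already a zero-argument callable, so it can serve as the factory
    # directly; this is equivalent to default_factory=lambda: UserDataSketch().
    user_data: UserDataSketch = Field(alias='userData', default_factory=UserDataSketch)


print(RequestSketch().user_data)  # label=None
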
4 changes: 2 additions & 2 deletions src/crawlee/_types.py
@@ -68,8 +68,8 @@ class HttpHeaders(RootModel, Mapping[str, str]):
     else:
         root: Annotated[
             dict[str, str],
-            PlainValidator(lambda value: _normalize_headers(value)),
-            Field(default_factory=lambda: dict[str, str]()),
+            PlainValidator(_normalize_headers),
+            Field(default_factory=dict),
         ]
 
     def __getitem__(self, key: str) -> str:
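
Note: the same simplification applies here; `PlainValidator` and `Field(default_factory=...)` both take plain callables, so `_normalize_headers` and `dict` are passed directly. A hedged sketch with a stand-in normalizer (crawlee's actual `_normalize_headers` may behave differently):

# Illustrative sketch only: a stand-in normalizer, not crawlee's implementation.
from typing import Annotated

from pydantic import Field, PlainValidator, RootModel


def normalize_headers(value: dict[str, str]) -> dict[str, str]:
    # Lower-case and sort header names; the real function may do more.
    return {key.lower(): val for key, val in sorted(value.items())}


class HeadersSketch(RootModel):
    # Both PlainValidator and default_factory take plain callables, so the function
    # and the dict constructor are passed without lambda wrappers.
    root: Annotated[
        dict[str, str],
        PlainValidator(normalize_headers),
        Field(default_factory=dict),
    ]


print(HeadersSketch({'Accept': 'text/html'}).root)  # {'accept': 'text/html'}
print(HeadersSketch().root)  # {}
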
@@ -203,7 +203,7 @@ async def extract_links(
         links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
-            skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
+            skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
         else:
             skipped = iter([])
 
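
Note: `partition` only needs a one-argument predicate, and the bound method `robots_txt_file.is_allowed` already is one. A self-contained sketch of the idea with a toy partition helper and robots checker (both stand-ins, not crawlee's real implementations):

# Illustrative sketch only: a tiny predicate-based partition and a fake robots checker.
from collections.abc import Callable, Iterable, Iterator


def partition(pred: Callable[[str], bool], items: Iterable[str]) -> tuple[Iterator[str], Iterator[str]]:
    rejected: list[str] = []
    accepted: list[str] = []
    for item in items:
        (accepted if pred(item) else rejected).append(item)
    return iter(rejected), iter(accepted)


class RobotsSketch:
    def is_allowed(self, url: str) -> bool:
        return '/private/' not in url


robots = RobotsSketch()
urls = ['https://example.com/', 'https://example.com/private/page']

# The bound method is already a one-argument callable, so no lambda wrapper is needed.
skipped, allowed = partition(robots.is_allowed, urls)
print(list(skipped))  # ['https://example.com/private/page']
print(list(allowed))  # ['https://example.com/']
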
2 changes: 1 addition & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1386,7 +1386,7 @@ async def __run_task_function(self) -> None:
         request_manager = await self.get_request_manager()
 
         request = await wait_for(
-            lambda: request_manager.fetch_next_request(),
+            request_manager.fetch_next_request,
             timeout=self._internal_timeout,
             timeout_message=f'Fetching next request failed after {self._internal_timeout.total_seconds()} seconds',
             logger=self._logger,
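
Note: `wait_for` receives a zero-argument callable that returns an awaitable, so the bound coroutine method replaces the lambda. A minimal stand-in sketch built on `asyncio.wait_for` (crawlee's own helper additionally handles the timeout message and logger):

# Illustrative sketch only: a minimal wait_for stand-in, not crawlee's helper.
import asyncio
from collections.abc import Awaitable, Callable
from datetime import timedelta


async def wait_for(operation: Callable[[], Awaitable[str]], *, timeout: timedelta) -> str:
    return await asyncio.wait_for(operation(), timeout=timeout.total_seconds())


class QueueSketch:
    async def fetch_next_request(self) -> str:
        await asyncio.sleep(0)
        return 'request-1'


async def main() -> None:
    queue = QueueSketch()
    # The bound coroutine method is itself a zero-argument callable returning an awaitable.
    print(await wait_for(queue.fetch_next_request, timeout=timedelta(seconds=5)))


asyncio.run(main())
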
2 changes: 1 addition & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -409,7 +409,7 @@ async def extract_links(
         links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
-            skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
+            skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
         else:
             skipped = iter([])
 
4 changes: 2 additions & 2 deletions src/crawlee/http_clients/_httpx.py
@@ -272,7 +272,7 @@ def _build_request(
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
+            timeout=timeout or httpx.USE_CLIENT_DEFAULT,
         )
 
     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
@@ -329,7 +329,7 @@ def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders
         )
         explicit_headers = explicit_headers or HttpHeaders()
         headers = common_headers | user_agent_header | explicit_headers
-        return headers if headers else None
+        return headers or None
 
     @staticmethod
     def _is_proxy_error(error: httpx.TransportError) -> bool:
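
Note: the `x if x else default` to `x or default` rewrites here (and in `_cookies.py` below) are pure truthiness simplifications; since `x if x else y` and `x or y` evaluate truthiness identically, the two forms are interchangeable. A quick illustration:

# Equivalent truthiness checks: both expressions pick the fallback for falsy values.
timeout = None
print(timeout if timeout else 'USE_CLIENT_DEFAULT')  # USE_CLIENT_DEFAULT
print(timeout or 'USE_CLIENT_DEFAULT')               # USE_CLIENT_DEFAULT

headers: dict[str, str] = {}
print(headers if headers else None)  # None
print(headers or None)               # None
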
2 changes: 1 addition & 1 deletion src/crawlee/sessions/_cookies.py
@@ -143,7 +143,7 @@ def _convert_cookie_to_dict(self, cookie: Cookie) -> CookieParam:
         """
         cookie_dict = CookieParam(
             name=cookie.name,
-            value=cookie.value if cookie.value else '',
+            value=cookie.value or '',
             domain=cookie.domain,
             path=cookie.path,
             secure=cookie.secure,
@@ -757,7 +757,7 @@ async def _get_request_files(cls, path_to_rq: Path) -> list[Path]:
         await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)
 
         # List all the json files.
-        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))
+        files = list(await asyncio.to_thread(path_to_rq.glob, '*.json'))
 
         # Filter out metadata file and non-file entries.
         filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
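
Note: `asyncio.to_thread` forwards extra positional and keyword arguments to the callable, so the lambda wrapper is unnecessary; the surrounding `list(...)` then materializes the generator that `glob` returns. A small self-contained sketch of the same call shape:

# Illustrative sketch only: mirrors the call shape above on a temporary directory.
import asyncio
import tempfile
from pathlib import Path


async def main() -> None:
    with tempfile.TemporaryDirectory() as tmp:
        (Path(tmp) / 'a.json').write_text('{}')
        (Path(tmp) / 'b.txt').write_text('not json')

        # Arguments after the callable are passed through to it in the worker thread.
        files = list(await asyncio.to_thread(Path(tmp).glob, '*.json'))
        print([f.name for f in files])  # ['a.json']


asyncio.run(main())
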
6 changes: 3 additions & 3 deletions src/crawlee/storages/_storage_instance_manager.py
@@ -23,17 +23,17 @@ class _StorageCache:
     """Cache for storage instances."""
 
     by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
+        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))
     )
     """Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key']."""
 
     by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
+        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))
     )
     """Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']"""
 
     by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
+        default_factory=lambda: defaultdict(lambda: defaultdict(defaultdict))
     )
     """Cache for storage instances by alias. Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']"""
 
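
Note: `defaultdict` is itself a zero-argument callable that produces an empty `defaultdict` with no default factory, so it can serve as the innermost factory directly. A short sketch of the resulting three-level cache shape:

# Illustrative sketch only: shows the nesting with `defaultdict` as the innermost factory.
from collections import defaultdict

by_id = defaultdict(lambda: defaultdict(defaultdict))

# Missing keys at the first two levels are created on access; the innermost
# defaultdict has no default factory, so it behaves like a plain dict here.
by_id['Dataset']['some_id']['some_additional_cache_key'] = object()
print(by_id['Dataset']['some_id'])
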
8 changes: 5 additions & 3 deletions tests/unit/test_configuration.py
@@ -2,6 +2,8 @@
 
 from typing import TYPE_CHECKING
 
+from anyio import Path as AnyioPath
+
 from crawlee import service_locator
 from crawlee.configuration import Configuration
 from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
@@ -45,7 +47,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
     await crawler.run([str(server_url)])
 
     # Verify that no files were created in the storage directory.
-    content = list(tmp_path.iterdir())
+    content = [path async for path in AnyioPath(tmp_path).iterdir()]
     assert content == [], 'Expected the storage directory to be empty, but it is not.'
 
 
@@ -70,7 +72,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
     await crawler.run([str(server_url)])
 
     # Verify that files were created in the storage directory.
-    content = list(tmp_path.iterdir())
+    content = [path async for path in AnyioPath(tmp_path).iterdir()]
     assert content != [], 'Expected the storage directory to contain files, but it does not.'
 
 
@@ -93,5 +95,5 @@ async def default_handler(context: HttpCrawlingContext) -> None:
     await crawler.run([str(server_url)])
 
     # Verify that files were created in the storage directory.
-    content = list(tmp_path.iterdir())
+    content = [path async for path in AnyioPath(tmp_path).iterdir()]
     assert content != [], 'Expected the storage directory to contain files, but it does not.'
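
Note: `anyio.Path.iterdir()` yields directory entries asynchronously, which is why the tests switch from `tmp_path.iterdir()` to an async comprehension. A minimal standalone sketch (assuming the `anyio` dependency added above):

# Illustrative sketch only: async directory listing with anyio.Path, mirroring the test change.
import tempfile
from pathlib import Path

import anyio
from anyio import Path as AnyioPath


async def main() -> None:
    with tempfile.TemporaryDirectory() as tmp:
        (Path(tmp) / 'example.txt').write_text('hello')

        # anyio.Path dispatches the blocking filesystem calls to worker threads.
        content = [path async for path in AnyioPath(tmp).iterdir()]
        print([p.name for p in content])  # ['example.txt']


anyio.run(main)
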
47 changes: 24 additions & 23 deletions uv.lock

Some generated files are not rendered by default.