74 changes: 62 additions & 12 deletions databusclient/api/deploy.py
@@ -215,6 +215,41 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str,
return cvs, format_extension, compression, sha256sum, content_length


def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
"""
Extract file info from a pre-parsed distribution dictionary.

Parameters
----------
dist_dict : dict
A dictionary with keys: url, variants, formatExtension, compression
(as returned by parse_distribution_str in cli.py)

Returns
-------
Tuple containing:
- cvs: Dict of content variants
- format_extension: File format extension
- compression: Compression type
- sha256sum: SHA-256 hash of file
- content_length: File size in bytes
"""
url = dist_dict.get("url", "")
cvs = dist_dict.get("variants", {})
format_extension = dist_dict.get("formatExtension") or "file"
compression = dist_dict.get("compression") or "none"

# Check if sha256sum and content_length are provided
sha256sum = dist_dict.get("sha256sum")
content_length = dist_dict.get("byteSize")

# If not provided, load from URL
if sha256sum is None or content_length is None:
sha256sum, content_length = _load_file_stats(url)

Comment on lines +218 to +249

⚠️ Potential issue | 🟡 Minor

Validate required url before fallback download.

If a caller passes a dict without url, _load_file_stats("") raises a low-signal requests error. Fail fast with a clear exception before any network call.

🛠️ Proposed fix
-def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
+def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
     """
@@
-    url = dist_dict.get("url", "")
+    url = dist_dict.get("url")
+    if not url:
+        raise BadArgumentException("Distribution dict missing required 'url' field.")
🤖 Prompt for AI Agents
In `@databusclient/api/deploy.py` around lines 218 - 249, The function
_get_file_info_from_dict currently calls _load_file_stats(url) even when url is
empty; add an explicit validation in _get_file_info_from_dict to fail fast: if
either sha256sum or content_length is missing and the local variable url is
falsy/empty, raise a clear exception (e.g., ValueError) indicating the missing
required "url" instead of calling _load_file_stats; keep the existing fallback
to _load_file_stats only when url is present. Ensure references to url,
sha256sum, content_length, _get_file_info_from_dict and _load_file_stats are
used so reviewers can locate the change.

return cvs, format_extension, compression, sha256sum, content_length
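
For reference, a minimal usage sketch of the new helper (hypothetical URL and placeholder hash; when both sha256sum and byteSize are present, no network call is made):

dist = {
    "url": "https://example.org/data.ttl",  # hypothetical URL
    "variants": {"lang": "en"},
    "formatExtension": "ttl",
    "compression": "none",
    "sha256sum": "0" * 64,  # placeholder hash
    "byteSize": 1024,
}
cvs, ext, comp, shasum, length = _get_file_info_from_dict(dist)
# -> ({"lang": "en"}, "ttl", "none", "000...0", 1024)
# Omitting sha256sum or byteSize triggers the _load_file_stats(url) fallback,
# which downloads the file to compute them.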


def create_distribution(
url: str,
cvs: Dict[str, str],
@@ -314,7 +349,7 @@ def create_dataset(
artifact_version_abstract: str,
artifact_version_description: str,
license_url: str,
distributions: List[str],
distributions: Union[List[str], List[Dict]],
attribution: str = None,
derived_from: str = None,
group_title: str = None,
@@ -338,8 +373,10 @@
Artifact & Version Description: used for BOTH artifact and version. Supports Markdown. Updating it changes both artifact and version metadata.
license_url: str
The license of the dataset as a URI.
distributions: str
Distribution information string as it is in the CLI. Can be created by running the create_distribution function
distributions: Union[List[str], List[Dict]]
Distribution information. Can be either:
- List[str]: Legacy format with pipe-separated strings (created by create_distribution function)
- List[Dict]: Pre-parsed dictionaries with keys: url, variants, formatExtension, compression
attribution: str
OPTIONAL! The attribution information for the Dataset
derived_from: str
@@ -368,15 +405,28 @@
artifact_id = _versionId.rsplit("/", 1)[0]

distribution_list = []
for dst_string in distributions:
__url = str(dst_string).split("|")[0]
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = get_file_info(dst_string)
for dst in distributions:
# Check if distribution is a pre-parsed dict or a legacy string
if isinstance(dst, dict):
# New format: pre-parsed dictionary from parse_distribution_str()
__url = dst.get("url", "")
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = _get_file_info_from_dict(dst)
else:
# Legacy format: pipe-separated string
__url = str(dst).split("|")[0]
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = get_file_info(dst)

if not cvs and len(distributions) > 1:
raise BadArgumentException(
93 changes: 93 additions & 0 deletions databusclient/api/queries.py
Contributor

File seems useless? Is ONTOLOGIES_QUERY or parse_content_variants_string used at all?

Contributor Author

Thanks for the feedback. The usage is currently indirect, which I agree is not very clear. I’ll add clarification (or refactor) to make the purpose and usage of ONTOLOGIES_QUERY / parse_content_variants_string more explicit.

@@ -0,0 +1,93 @@
"""
SPARQL Queries for Databus Python Client

This module contains SPARQL queries used for interacting with the DBpedia Databus.
"""

# Query to fetch ontologies with proper content variant aggregation
# Uses GROUP_CONCAT to handle multiple content variants per distribution
ONTOLOGIES_QUERY = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX databus: <https://databus.dbpedia.org/>
PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
PREFIX dataid-cv: <http://dataid.dbpedia.org/ns/cv#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
?group ?art ?version ?title ?publisher ?comment ?description
?license ?file ?extension ?type ?bytes ?shasum
(GROUP_CONCAT(DISTINCT ?variantStr; separator=", ") AS ?contentVariants)
WHERE {
?dataset dataid:account databus:ontologies .
?dataset dataid:group ?group .
?dataset dataid:artifact ?art.
?dataset dcat:distribution ?distribution .
?dataset dct:license ?license .
?dataset dct:publisher ?publisher .
?dataset rdfs:comment ?comment .
?dataset dct:description ?description .
?dataset dct:title ?title .
?distribution dcat:downloadURL ?file .
?distribution dataid:formatExtension ?extension .
?distribution dataid-cv:type ?type .
?distribution dcat:byteSize ?bytes .
?distribution dataid:sha256sum ?shasum .
?dataset dct:hasVersion ?version .

# Excludes dev versions
FILTER (!regex(?art, "--DEV"))

# OPTIONAL: Check for variants, but don't fail if none exist
OPTIONAL {
?distribution dataid:contentVariant ?cv .
BIND(STR(?cv) AS ?variantStr)
}

}
GROUP BY ?group ?art ?version ?title ?publisher ?comment ?description ?license ?file ?extension ?type ?bytes ?shasum
ORDER BY ?version
"""


def parse_content_variants_string(variants_str: str) -> dict:
"""
Parse a comma-separated content variants string from SPARQL GROUP_CONCAT result.

Parameters
----------
variants_str : str
Comma-separated string of content variants, e.g., "lang=en, type=full, sorted"

Returns
-------
dict
Dictionary of parsed content variants. For key=value pairs, both the key
and value are returned as strings (no type conversion is performed, so
"true" remains the string "true", not a boolean). For standalone values
without an "=" sign, the value is recorded as the boolean ``True``.

Example: "lang=en, type=full, sorted" -> {"lang": "en", "type": "full", "sorted": True}

Notes
-----
- All values from key=value pairs are kept as strings. If you need boolean
or numeric conversion, perform it after calling this function.
- Standalone items (e.g., "sorted") are stored with boolean ``True`` as
their value, indicating presence rather than a specific string value.
"""
if not variants_str or variants_str.strip() == "":
return {}

variants = {}
for part in variants_str.split(","):
part = part.strip()
if "=" in part:
key, value = part.split("=", 1)
variants[key.strip()] = value.strip()
elif part:
# Handle standalone values (no key=value format)
variants[part] = True

return variants
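
To make the indirect usage concrete, here is an illustrative sketch of how ONTOLOGIES_QUERY and parse_content_variants_string could be combined. It assumes the third-party SPARQLWrapper package and the public endpoint https://databus.dbpedia.org/sparql, neither of which is part of this PR:

from SPARQLWrapper import SPARQLWrapper, JSON

from databusclient.api.queries import ONTOLOGIES_QUERY, parse_content_variants_string

sparql = SPARQLWrapper("https://databus.dbpedia.org/sparql")  # assumed endpoint
sparql.setQuery(ONTOLOGIES_QUERY)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for binding in results["results"]["bindings"]:
    # GROUP_CONCAT collapses all variants of a distribution into one string;
    # parse_content_variants_string turns it back into a dict.
    variants_str = binding.get("contentVariants", {}).get("value", "")
    print(binding["file"]["value"], parse_content_variants_string(variants_str))
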
53 changes: 53 additions & 0 deletions databusclient/cli.py
Contributor

I would prefer to keep cli.py as compact as possible and move logic (methods) always to the according CLI option (in this case, deploy.py).

Contributor Author

I would prefer to keep cli.py as compact as possible and move logic (methods) always to the according CLI option (in this case, deploy.py).

Sure @Integer-Ctrl, I will incorporate the suggested changes.

@@ -11,6 +11,51 @@
from databusclient.extensions import webdav


def parse_distribution_str(dist_str: str):
"""
Parses a distribution string with format:
URL|key=value|...|.extension

Returns a dictionary suitable for the deploy API.
"""
parts = dist_str.split('|')
url = parts[0].strip()

variants = {}
format_ext = None
compression = None

# Iterate over the modifiers (everything after the URL)
for part in parts[1:]:
part = part.strip()

# Case 1: Extension (starts with .)
if part.startswith('.'):
# purely heuristic: if it looks like compression (gz, zip, br), treat as compression
# otherwise treat as format extension
if part.lower() in ['.gz', '.zip', '.br', '.tar', '.zst']:
compression = part.lstrip('.') # remove leading dot for API compatibility if needed
else:
format_ext = part.lstrip('.')

# Case 2: Content Variant (key=value)
elif '=' in part:
key, value = part.split('=', 1)
variants[key.strip()] = value.strip()

# Case 3: Standalone tag. It is unclear whether to treat this as a boolean
# variant or to ignore it; for now, warn about the unrecognized modifier.
else:
print(f"WARNING: Unrecognized modifier '{part}' in distribution. Expected '.ext' or 'key=val'.")

return {
"url": url,
"variants": variants,
"formatExtension": format_ext,
"compression": compression
}
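
A quick illustration of the parser on a hypothetical distribution string (".gz" matches the compression heuristic, ".ttl" falls through to the format extension):

parsed = parse_distribution_str("https://example.org/data.ttl|lang=en|.ttl|.gz")
# -> {"url": "https://example.org/data.ttl",
#     "variants": {"lang": "en"},
#     "formatExtension": "ttl",
#     "compression": "gz"}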


@click.group()
def app():
"""Databus Client CLI.
@@ -85,14 +130,22 @@
click.echo("[MODE] Classic deploy with distributions")
click.echo(f"Deploying dataset version: {version_id}")

# --- CHANGE START ---
# Parse the input strings into structured objects
parsed_distributions = [parse_distribution_str(d) for d in distributions]

# Note: api_deploy.create_dataset now accepts this list of dicts
dataid = api_deploy.create_dataset(
version_id, title, abstract, description, license_url, parsed_distributions
version_id=version_id,

Check failure on line 140 in databusclient/cli.py (GitHub Actions / build, Ruff): databusclient/cli.py:140:13: SyntaxError: Expected ',', found name
artifact_version_title=title,
artifact_version_abstract=abstract,
artifact_version_description=description,
license_url=license_url,
distributions=distributions
)
# --- CHANGE END ---
Comment on lines +133 to +147

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

# Check the actual file content at the specified lines
cat -n databusclient/cli.py | sed -n '130,150p'

Repository: dbpedia/databus-python-client

Length of output: 1099


🏁 Script executed:

# Also search for the create_dataset function signature to understand expected parameters
rg -n "def create_dataset" --type py -A 10

Repository: dbpedia/databus-python-client

Length of output: 764


🏁 Script executed:

# Find where api_deploy is defined/imported to understand the module
rg -n "api_deploy" databusclient/cli.py | head -20

Repository: dbpedia/databus-python-client

Length of output: 396


Fix syntax error in create_dataset call and use correct variable.

Line 139 is missing a comma before the keyword arguments on line 140, and you're passing distributions instead of parsed_distributions on line 145. This won't parse and breaks the CLI.

Fix
-        dataid = api_deploy.create_dataset(
-            version_id, title, abstract, description, license_url, parsed_distributions
-            version_id=version_id,
-            artifact_version_title=title,
-            artifact_version_abstract=abstract,
-            artifact_version_description=description,
-            license_url=license_url,
-            distributions=distributions
-        )
+        dataid = api_deploy.create_dataset(
+            version_id=version_id,
+            artifact_version_title=title,
+            artifact_version_abstract=abstract,
+            artifact_version_description=description,
+            license_url=license_url,
+            distributions=parsed_distributions,
+        )
🧰 Tools
🪛 GitHub Actions: Python CI (Lint & pytest)

[error] 140-140: Ruff check failed with SyntaxError: Expected ',', found name at databusclient/cli.py:140:13. Command: 'poetry run ruff check --output-format=github .'

🪛 GitHub Check: build

[failure] 140-140: Ruff
databusclient/cli.py:140:13: SyntaxError: Expected ',', found name

🤖 Prompt for AI Agents
In `@databusclient/cli.py` around lines 133 - 147, The create_dataset call in
cli.py has a syntax error (missing comma) and passes the wrong variable: remove
the duplicate positional args (first line with version_id, title, abstract,
description, license_url, parsed_distributions) or add the missing comma and
replace the final distributions argument with parsed_distributions so
api_deploy.create_dataset receives the structured list; update the call that
references api_deploy.create_dataset (and ensure you constructed
parsed_distributions via parse_distribution_str) to use keyword args version_id
/ artifact_version_title / artifact_version_abstract /
artifact_version_description / license_url and
distributions=parsed_distributions.


api_deploy.deploy(dataid=dataid, api_key=apikey)
return

1 change: 1 addition & 0 deletions tests/test_deploy.py
@@ -12,6 +12,7 @@
BadArgumentException,
)


EXAMPLE_URL = "https://raw.githubusercontent.com/dbpedia/databus/608482875276ef5df00f2360a2f81005e62b58bd/server/app/api/swagger.yml"

