74 changes: 62 additions & 12 deletions databusclient/api/deploy.py
@@ -215,6 +215,41 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str,
return cvs, format_extension, compression, sha256sum, content_length


def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
"""
Extract file info from a pre-parsed distribution dictionary.

Parameters
----------
dist_dict : dict
A dictionary with keys: url, variants, formatExtension, compression
(as returned by parse_distribution_str in cli.py)

Returns
-------
Tuple containing:
- cvs: Dict of content variants
- format_extension: File format extension
- compression: Compression type
- sha256sum: SHA-256 hash of file
- content_length: File size in bytes
"""
url = dist_dict.get("url", "")
cvs = dist_dict.get("variants", {})
format_extension = dist_dict.get("formatExtension") or "file"
compression = dist_dict.get("compression") or "none"

# Check if sha256sum and content_length are provided
sha256sum = dist_dict.get("sha256sum")
content_length = dist_dict.get("byteSize")

# If not provided, load from URL
if sha256sum is None or content_length is None:
sha256sum, content_length = _load_file_stats(url)

Comment on lines +218 to +249

⚠️ Potential issue | 🟡 Minor

Validate required url before fallback download.

If a caller passes a dict without url, _load_file_stats("") raises a low-signal requests error. Fail fast with a clear exception before any network call.

🛠️ Proposed fix
-def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
+def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
     """
@@
-    url = dist_dict.get("url", "")
+    url = dist_dict.get("url")
+    if not url:
+        raise BadArgumentException("Distribution dict missing required 'url' field.")
🤖 Prompt for AI Agents
In `@databusclient/api/deploy.py` around lines 218 - 249, The function
_get_file_info_from_dict currently calls _load_file_stats(url) even when url is
empty; add an explicit validation in _get_file_info_from_dict to fail fast: if
either sha256sum or content_length is missing and the local variable url is
falsy/empty, raise a clear exception (e.g., ValueError) indicating the missing
required "url" instead of calling _load_file_stats; keep the existing fallback
to _load_file_stats only when url is present. Ensure references to url,
sha256sum, content_length, _get_file_info_from_dict and _load_file_stats are
used so reviewers can locate the change.

return cvs, format_extension, compression, sha256sum, content_length
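
For reference, a minimal usage sketch of the new helper (hypothetical URL and placeholder hash; when both sha256sum and byteSize are present, no network call is made):

dist = {
    "url": "https://example.org/data.ttl",  # hypothetical URL
    "variants": {"lang": "en"},
    "formatExtension": "ttl",
    "compression": "none",
    "sha256sum": "0" * 64,  # placeholder hash
    "byteSize": 1024,
}
cvs, ext, comp, shasum, length = _get_file_info_from_dict(dist)
# -> ({"lang": "en"}, "ttl", "none", "000...0", 1024)
# Omitting sha256sum or byteSize triggers the _load_file_stats(url) fallback,
# which downloads the file to compute them.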


def create_distribution(
url: str,
cvs: Dict[str, str],
@@ -314,7 +349,7 @@ def create_dataset(
artifact_version_abstract: str,
artifact_version_description: str,
license_url: str,
distributions: List[str],
distributions: Union[List[str], List[Dict]],
attribution: str = None,
derived_from: str = None,
group_title: str = None,
@@ -338,8 +373,10 @@
Artifact & Version Description: used for BOTH artifact and version. Supports Markdown. Updating it changes both artifact and version metadata.
license_url: str
The license of the dataset as a URI.
distributions: str
Distribution information string as it is in the CLI. Can be created by running the create_distribution function
distributions: Union[List[str], List[Dict]]
Distribution information. Can be either:
- List[str]: Legacy format with pipe-separated strings (created by create_distribution function)
- List[Dict]: Pre-parsed dictionaries with keys: url, variants, formatExtension, compression
attribution: str
OPTIONAL! The attribution information for the Dataset
derived_from: str
@@ -368,15 +405,28 @@
artifact_id = _versionId.rsplit("/", 1)[0]

distribution_list = []
for dst_string in distributions:
__url = str(dst_string).split("|")[0]
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = get_file_info(dst_string)
for dst in distributions:
# Check if distribution is a pre-parsed dict or a legacy string
if isinstance(dst, dict):
# New format: pre-parsed dictionary from parse_distribution_str()
__url = dst.get("url", "")
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = _get_file_info_from_dict(dst)
else:
# Legacy format: pipe-separated string
__url = str(dst).split("|")[0]
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = get_file_info(dst)

if not cvs and len(distributions) > 1:
raise BadArgumentException(
93 changes: 93 additions & 0 deletions databusclient/api/queries.py
Contributor

File seems useless? Is ONTOLOGIES_QUERY or parse_content_variants_string used at all?

Contributor Author

Thanks for the feedback. The usage is currently indirect, which I agree is not very clear. I’ll add clarification (or refactor) to make the purpose and usage of ONTOLOGIES_QUERY / parse_content_variants_string more explicit.

@@ -0,0 +1,93 @@
"""
SPARQL Queries for Databus Python Client

This module contains SPARQL queries used for interacting with the DBpedia Databus.
"""

# Query to fetch ontologies with proper content variant aggregation
# Uses GROUP_CONCAT to handle multiple content variants per distribution
ONTOLOGIES_QUERY = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX databus: <https://databus.dbpedia.org/>
PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
PREFIX dataid-cv: <http://dataid.dbpedia.org/ns/cv#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
?group ?art ?version ?title ?publisher ?comment ?description
?license ?file ?extension ?type ?bytes ?shasum
(GROUP_CONCAT(DISTINCT ?variantStr; separator=", ") AS ?contentVariants)
WHERE {
?dataset dataid:account databus:ontologies .
?dataset dataid:group ?group .
?dataset dataid:artifact ?art.
?dataset dcat:distribution ?distribution .
?dataset dct:license ?license .
?dataset dct:publisher ?publisher .
?dataset rdfs:comment ?comment .
?dataset dct:description ?description .
?dataset dct:title ?title .
?distribution dcat:downloadURL ?file .
?distribution dataid:formatExtension ?extension .
?distribution dataid-cv:type ?type .
?distribution dcat:byteSize ?bytes .
?distribution dataid:sha256sum ?shasum .
?dataset dct:hasVersion ?version .

# Excludes dev versions
FILTER (!regex(?art, "--DEV"))

# OPTIONAL: Check for variants, but don't fail if none exist
OPTIONAL {
?distribution dataid:contentVariant ?cv .
BIND(STR(?cv) AS ?variantStr)
}

}
GROUP BY ?group ?art ?version ?title ?publisher ?comment ?description ?license ?file ?extension ?type ?bytes ?shasum
ORDER BY ?version
"""


def parse_content_variants_string(variants_str: str) -> dict:
"""
Parse a comma-separated content variants string from SPARQL GROUP_CONCAT result.

Parameters
----------
variants_str : str
Comma-separated string of content variants, e.g., "lang=en, type=full, sorted"

Returns
-------
dict
Dictionary of parsed content variants. For key=value pairs, both the key
and value are returned as strings (no type conversion is performed, so
"true" remains the string "true", not a boolean). For standalone values
without an "=" sign, the value is recorded as the boolean ``True``.

Example: "lang=en, type=full, sorted" -> {"lang": "en", "type": "full", "sorted": True}

Notes
-----
- All values from key=value pairs are kept as strings. If you need boolean
or numeric conversion, perform it after calling this function.
- Standalone items (e.g., "sorted") are stored with boolean ``True`` as
their value, indicating presence rather than a specific string value.
"""
if not variants_str or variants_str.strip() == "":
return {}

variants = {}
for part in variants_str.split(","):
part = part.strip()
if "=" in part:
key, value = part.split("=", 1)
variants[key.strip()] = value.strip()
elif part:
# Handle standalone values (no key=value format)
variants[part] = True

return variants
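
To make the indirect usage concrete, here is an illustrative sketch of how ONTOLOGIES_QUERY and parse_content_variants_string could be combined. It assumes the third-party SPARQLWrapper package and the public endpoint https://databus.dbpedia.org/sparql, neither of which is part of this PR:

from SPARQLWrapper import SPARQLWrapper, JSON

from databusclient.api.queries import ONTOLOGIES_QUERY, parse_content_variants_string

sparql = SPARQLWrapper("https://databus.dbpedia.org/sparql")  # assumed endpoint
sparql.setQuery(ONTOLOGIES_QUERY)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for binding in results["results"]["bindings"]:
    # GROUP_CONCAT collapses all variants of a distribution into one string;
    # parse_content_variants_string turns it back into a dict.
    variants_str = binding.get("contentVariants", {}).get("value", "")
    print(binding["file"]["value"], parse_content_variants_string(variants_str))
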
53 changes: 53 additions & 0 deletions databusclient/cli.py
Contributor

I would prefer to keep cli.py as compact as possible and move logic (methods) always to the according CLI option (in this case, deploy.py).

Contributor Author

I would prefer to keep cli.py as compact as possible and move logic (methods) always to the according CLI option (in this case, deploy.py).

Sure @Integer-Ctrl, I will incorporate the suggested changes.

@@ -11,6 +11,51 @@
from databusclient.extensions import webdav


def parse_distribution_str(dist_str: str):
"""
Parses a distribution string with format:
URL|key=value|...|.extension

Returns a dictionary suitable for the deploy API.
"""
parts = dist_str.split('|')
url = parts[0].strip()

variants = {}
format_ext = None
compression = None

# Iterate over the modifiers (everything after the URL)
for part in parts[1:]:
part = part.strip()

# Case 1: Extension (starts with .)
if part.startswith('.'):
# purely heuristic: if it looks like compression (gz, zip, br), treat as compression
# otherwise treat as format extension
if part.lower() in ['.gz', '.zip', '.br', '.tar', '.zst']:
compression = part.lstrip('.') # remove leading dot for API compatibility if needed
else:
format_ext = part.lstrip('.')

# Case 2: Content Variant (key=value)
elif '=' in part:
key, value = part.split('=', 1)
variants[key.strip()] = value.strip()

# Case 3: Standalone tag. It is unclear whether to treat this as a boolean
# variant or to ignore it; for now, warn about the unrecognized modifier.
else:
print(f"WARNING: Unrecognized modifier '{part}' in distribution. Expected '.ext' or 'key=val'.")

return {
"url": url,
"variants": variants,
"formatExtension": format_ext,
"compression": compression
}
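
A quick illustration of the parser on a hypothetical distribution string (".gz" matches the compression heuristic, ".ttl" falls through to the format extension):

parsed = parse_distribution_str("https://example.org/data.ttl|lang=en|.ttl|.gz")
# -> {"url": "https://example.org/data.ttl",
#     "variants": {"lang": "en"},
#     "formatExtension": "ttl",
#     "compression": "gz"}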


@click.group()
def app():
"""Databus Client CLI.
@@ -85,14 +130,22 @@
click.echo("[MODE] Classic deploy with distributions")
click.echo(f"Deploying dataset version: {version_id}")

# --- CHANGE START ---
# Parse the input strings into structured objects
parsed_distributions = [parse_distribution_str(d) for d in distributions]

# Note: api_deploy.create_dataset now accepts this list of dicts
dataid = api_deploy.create_dataset(
version_id, title, abstract, description, license_url, parsed_distributions
version_id=version_id,

Check failure on line 140 in databusclient/cli.py (GitHub Actions / build, Ruff): databusclient/cli.py:140:13: SyntaxError: Expected ',', found name
artifact_version_title=title,
artifact_version_abstract=abstract,
artifact_version_description=description,
license_url=license_url,
distributions=distributions
)
# --- CHANGE END ---
Comment on lines +133 to +147

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

# Check the actual file content at the specified lines
cat -n databusclient/cli.py | sed -n '130,150p'

Repository: dbpedia/databus-python-client

Length of output: 1099


🏁 Script executed:

# Also search for the create_dataset function signature to understand expected parameters
rg -n "def create_dataset" --type py -A 10

Repository: dbpedia/databus-python-client

Length of output: 764


🏁 Script executed:

# Find where api_deploy is defined/imported to understand the module
rg -n "api_deploy" databusclient/cli.py | head -20

Repository: dbpedia/databus-python-client

Length of output: 396


Fix syntax error in create_dataset call and use correct variable.

Line 139 is missing a comma before the keyword arguments on line 140, and you're passing distributions instead of parsed_distributions on line 145. This won't parse and breaks the CLI.

Fix
-        dataid = api_deploy.create_dataset(
-            version_id, title, abstract, description, license_url, parsed_distributions
-            version_id=version_id,
-            artifact_version_title=title,
-            artifact_version_abstract=abstract,
-            artifact_version_description=description,
-            license_url=license_url,
-            distributions=distributions
-        )
+        dataid = api_deploy.create_dataset(
+            version_id=version_id,
+            artifact_version_title=title,
+            artifact_version_abstract=abstract,
+            artifact_version_description=description,
+            license_url=license_url,
+            distributions=parsed_distributions,
+        )
🧰 Tools
🪛 GitHub Actions: Python CI (Lint & pytest)

[error] 140-140: Ruff check failed with SyntaxError: Expected ',', found name at databusclient/cli.py:140:13. Command: 'poetry run ruff check --output-format=github .'

🪛 GitHub Check: build

[failure] 140-140: Ruff
databusclient/cli.py:140:13: SyntaxError: Expected ',', found name

🤖 Prompt for AI Agents
In `@databusclient/cli.py` around lines 133 - 147, The create_dataset call in
cli.py has a syntax error (missing comma) and passes the wrong variable: remove
the duplicate positional args (first line with version_id, title, abstract,
description, license_url, parsed_distributions) or add the missing comma and
replace the final distributions argument with parsed_distributions so
api_deploy.create_dataset receives the structured list; update the call that
references api_deploy.create_dataset (and ensure you constructed
parsed_distributions via parse_distribution_str) to use keyword args version_id
/ artifact_version_title / artifact_version_abstract /
artifact_version_description / license_url and
distributions=parsed_distributions.


api_deploy.deploy(dataid=dataid, api_key=apikey)
return

1 change: 1 addition & 0 deletions tests/test_deploy.py
@@ -12,6 +12,7 @@
BadArgumentException,
)


EXAMPLE_URL = "https://raw.githubusercontent.com/dbpedia/databus/608482875276ef5df00f2360a2f81005e62b58bd/server/app/api/swagger.yml"

