Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 62 additions & 12 deletions databusclient/api/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,41 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str,
return cvs, format_extension, compression, sha256sum, content_length


def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
"""
Extract file info from a pre-parsed distribution dictionary.

Parameters
----------
dist_dict : dict
A dictionary with keys: url, variants, formatExtension, compression
(as returned by parse_distribution_str in cli.py)

Returns
-------
Tuple containing:
- cvs: Dict of content variants
- format_extension: File format extension
- compression: Compression type
- sha256sum: SHA-256 hash of file
- content_length: File size in bytes
"""
url = dist_dict.get("url", "")
cvs = dist_dict.get("variants", {})
format_extension = dist_dict.get("formatExtension") or "file"
compression = dist_dict.get("compression") or "none"

# Check if sha256sum and content_length are provided
sha256sum = dist_dict.get("sha256sum")
content_length = dist_dict.get("byteSize")

# If not provided, load from URL
if sha256sum is None or content_length is None:
sha256sum, content_length = _load_file_stats(url)

return cvs, format_extension, compression, sha256sum, content_length


def create_distribution(
url: str,
cvs: Dict[str, str],
Expand Down Expand Up @@ -272,7 +307,7 @@ def create_dataset(
abstract: str,
description: str,
license_url: str,
distributions: List[str],
distributions: Union[List[str], List[Dict]],
attribution: str = None,
derived_from: str = None,
group_title: str = None,
Expand All @@ -296,8 +331,10 @@ def create_dataset(
A long description of the dataset. Markdown syntax is supported
license_url: str
The license of the dataset as a URI.
distributions: str
Distribution information string as it is in the CLI. Can be created by running the create_distribution function
distributions: Union[List[str], List[Dict]]
Distribution information. Can be either:
- List[str]: Legacy format with pipe-separated strings (created by create_distribution function)
- List[Dict]: Pre-parsed dictionaries with keys: url, variants, formatExtension, compression
attribution: str
OPTIONAL! The attribution information for the Dataset
derived_from: str
Expand Down Expand Up @@ -326,15 +363,28 @@ def create_dataset(
artifact_id = _versionId.rsplit("/", 1)[0]

distribution_list = []
for dst_string in distributions:
__url = str(dst_string).split("|")[0]
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = get_file_info(dst_string)
for dst in distributions:
# Check if distribution is a pre-parsed dict or a legacy string
if isinstance(dst, dict):
# New format: pre-parsed dictionary from parse_distribution_str()
__url = dst.get("url", "")
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = _get_file_info_from_dict(dst)
else:
# Legacy format: pipe-separated string
__url = str(dst).split("|")[0]
(
cvs,
formatExtension,
compression,
sha256sum,
content_length,
) = get_file_info(dst)

if not cvs and len(distributions) > 1:
raise BadArgumentException(
Expand Down
93 changes: 93 additions & 0 deletions databusclient/api/queries.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

File seems useless? Is ONTOLOGIES_QUERY or parse_content_variants_string used at all?

Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""
SPARQL Queries for Databus Python Client

This module contains SPARQL queries used for interacting with the DBpedia Databus.
"""

# SPARQL query selecting dataset metadata and per-distribution file details
# for datasets whose `dataid:account` is `databus:ontologies`.
# Content variants are matched in an OPTIONAL block and folded with
# GROUP_CONCAT into one ", "-separated ?contentVariants value, so a
# distribution with several variants does not produce one row per variant and
# a distribution with none is still returned.
# Artifacts matching "--DEV" are filtered out; results are ordered by ?version.
# NOTE(review): regex() is applied to ?art directly. If ?art is bound to an
# IRI rather than a literal, endpoints that follow SPARQL 1.1 strictly may
# require STR(?art) here -- confirm against the target endpoint.
ONTOLOGIES_QUERY = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX databus: <https://databus.dbpedia.org/>
PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
PREFIX dataid-cv: <http://dataid.dbpedia.org/ns/cv#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
?group ?art ?version ?title ?publisher ?comment ?description
?license ?file ?extension ?type ?bytes ?shasum
(GROUP_CONCAT(DISTINCT ?variantStr; separator=", ") AS ?contentVariants)
WHERE {
?dataset dataid:account databus:ontologies .
?dataset dataid:group ?group .
?dataset dataid:artifact ?art.
?dataset dcat:distribution ?distribution .
?dataset dct:license ?license .
?dataset dct:publisher ?publisher .
?dataset rdfs:comment ?comment .
?dataset dct:description ?description .
?dataset dct:title ?title .
?distribution dcat:downloadURL ?file .
?distribution dataid:formatExtension ?extension .
?distribution dataid-cv:type ?type .
?distribution dcat:byteSize ?bytes .
?distribution dataid:sha256sum ?shasum .
?dataset dct:hasVersion ?version .

# Excludes dev versions
FILTER (!regex(?art, "--DEV"))

# OPTIONAL: Check for variants, but don't fail if none exist
OPTIONAL {
?distribution dataid:contentVariant ?cv .
BIND(STR(?cv) AS ?variantStr)
}

}
GROUP BY ?group ?art ?version ?title ?publisher ?comment ?description ?license ?file ?extension ?type ?bytes ?shasum
ORDER BY ?version
"""


def parse_content_variants_string(variants_str: str) -> dict:
    """
    Turn a comma-separated content-variant string into a dictionary.

    The input is the kind of value a SPARQL GROUP_CONCAT produces, e.g.
    "lang=en, type=full, sorted".

    Parameters
    ----------
    variants_str : str
        Comma-separated content variants. Empty, whitespace-only or falsy
        input yields an empty dictionary.

    Returns
    -------
    dict
        One entry per non-empty item. An item containing "=" is split at the
        first "=" into key and value, both stripped and kept as strings (no
        type conversion, so "true" stays the string "true"). An item without
        "=" is stored under its own text with the boolean ``True`` as value.

        Example: "lang=en, type=full, sorted" -> {"lang": "en", "type": "full", "sorted": True}

    Notes
    -----
    - Convert values to booleans or numbers after calling this function if
      needed; none is done here.
    - ``True`` for a standalone item marks presence, not a string value.
    """
    if not variants_str:
        return {}

    parsed = {}
    for token in (raw.strip() for raw in variants_str.split(",")):
        if not token:
            # Blank items (e.g. from ", ,") carry no information.
            continue
        name, separator, content = token.partition("=")
        # No "=" found -> standalone flag recorded as True.
        parsed[name.strip()] = content.strip() if separator else True

    return parsed
54 changes: 53 additions & 1 deletion databusclient/cli.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer to keep cli.py as compact as possible and always move logic (methods) to the module of the corresponding CLI option (in this case, deploy.py).

Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,51 @@
from databusclient.extensions import webdav


def parse_distribution_str(dist_str: str) -> dict:
    """
    Parse a distribution string of the form ``URL|key=value|...|.extension``.

    The first ``|``-separated field is the URL. Every following field is a
    modifier, interpreted after stripping surrounding whitespace:

    - starts with ``.``: a compression type if it is one of ``.gz``, ``.zip``,
      ``.br``, ``.tar``, ``.zst`` (case-insensitive), otherwise the format
      extension; leading dots are removed from the stored value
    - contains ``=``: a content variant, split at the first ``=``
    - empty (e.g. from a trailing ``|`` or ``||``): skipped
    - anything else: a warning is printed and the modifier is ignored

    Parameters
    ----------
    dist_str : str
        The distribution string as given on the command line.

    Returns
    -------
    dict
        A dictionary suitable for the deploy API with the keys ``url`` (str),
        ``variants`` (dict of str to str), ``formatExtension`` (str or None)
        and ``compression`` (str or None).
    """
    # Purely heuristic: dotted modifiers in this set count as compression,
    # every other dotted modifier is taken as the format extension.
    compression_exts = ('.gz', '.zip', '.br', '.tar', '.zst')

    url, *modifiers = dist_str.split('|')

    variants = {}
    format_ext = None
    compression = None

    for raw in modifiers:
        part = raw.strip()

        if not part:
            # Empty segment: nothing to interpret, and not worth a warning.
            continue

        if part.startswith('.'):
            # Stored without the leading dot for API compatibility.
            if part.lower() in compression_exts:
                compression = part.lstrip('.')
            else:
                format_ext = part.lstrip('.')
        elif '=' in part:
            key, value = part.split('=', 1)
            variants[key.strip()] = value.strip()
        else:
            # NOTE(review): other modifier shapes (e.g. a checksum/size token)
            # only produce this warning; confirm whether any should be parsed.
            print(f"WARNING: Unrecognized modifier '{part}' in distribution. Expected '.ext' or 'key=val'.")

    return {
        "url": url.strip(),
        "variants": variants,
        "formatExtension": format_ext,
        "compression": compression,
    }


@click.group()
def app():
"""Databus Client CLI"""
Expand Down Expand Up @@ -81,9 +126,16 @@ def deploy(
click.echo("[MODE] Classic deploy with distributions")
click.echo(f"Deploying dataset version: {version_id}")

# --- CHANGE START ---
# Parse the input strings into structured objects
parsed_distributions = [parse_distribution_str(d) for d in distributions]

# Note: api_deploy.create_dataset now accepts this list of dicts
dataid = api_deploy.create_dataset(
version_id, title, abstract, description, license_url, distributions
version_id, title, abstract, description, license_url, parsed_distributions
)
# --- CHANGE END ---

api_deploy.deploy(dataid=dataid, api_key=apikey)
return

Expand Down
1 change: 1 addition & 0 deletions tests/test_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
BadArgumentException,
)


EXAMPLE_URL = "https://raw.githubusercontent.com/dbpedia/databus/608482875276ef5df00f2360a2f81005e62b58bd/server/app/api/swagger.yml"


Expand Down
Loading
Loading