diff --git a/databusclient/api/deploy.py b/databusclient/api/deploy.py
index ef8ebf5..770234f 100644
--- a/databusclient/api/deploy.py
+++ b/databusclient/api/deploy.py
@@ -173,6 +173,43 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str, int]:
     return cvs, format_extension, compression, sha256sum, content_length
 
 
+def _get_file_info_from_dict(dist_dict: Dict[str, Any]) -> Tuple[Dict[str, str], str, str, str, int]:
+    """
+    Extract file info from a pre-parsed distribution dictionary.
+
+    Parameters
+    ----------
+    dist_dict : dict
+        A dictionary with keys: url, variants, formatExtension, compression
+        (as returned by parse_distribution_str in cli.py). The optional keys
+        sha256sum and byteSize are used when present; otherwise the file
+        stats are fetched from the URL.
+
+    Returns
+    -------
+    Tuple containing:
+        - cvs: Dict of content variants
+        - format_extension: File format extension
+        - compression: Compression type
+        - sha256sum: SHA-256 hash of file
+        - content_length: File size in bytes
+    """
+    url = dist_dict.get("url", "")
+    cvs = dist_dict.get("variants", {})
+    format_extension = dist_dict.get("formatExtension") or "file"
+    compression = dist_dict.get("compression") or "none"
+
+    # Check if sha256sum and content_length are provided
+    sha256sum = dist_dict.get("sha256sum")
+    content_length = dist_dict.get("byteSize")
+
+    # If not provided, load from URL
+    if sha256sum is None or content_length is None:
+        sha256sum, content_length = _load_file_stats(url)
+
+    return cvs, format_extension, compression, sha256sum, content_length
+
+
 def create_distribution(
     url: str,
     cvs: Dict[str, str],
@@ -272,7 +309,7 @@ def create_dataset(
     abstract: str,
     description: str,
     license_url: str,
-    distributions: List[str],
+    distributions: Union[List[str], List[Dict]],
     attribution: str = None,
     derived_from: str = None,
     group_title: str = None,
@@ -296,8 +333,10 @@ def create_dataset(
         A long description of the dataset. Markdown syntax is supported
     license_url: str
         The license of the dataset as a URI.
-    distributions: str
-        Distribution information string as it is in the CLI. Can be created by running the create_distribution function
+    distributions: Union[List[str], List[Dict]]
+        Distribution information. Can be either:
+        - List[str]: Legacy format with pipe-separated strings (created by the create_distribution function)
+        - List[Dict]: Pre-parsed dictionaries with keys: url, variants, formatExtension, compression
     attribution: str
         OPTIONAL! The attribution information for the Dataset
     derived_from: str
@@ -326,15 +365,28 @@ def create_dataset(
     artifact_id = _versionId.rsplit("/", 1)[0]
 
     distribution_list = []
-    for dst_string in distributions:
-        __url = str(dst_string).split("|")[0]
-        (
-            cvs,
-            formatExtension,
-            compression,
-            sha256sum,
-            content_length,
-        ) = get_file_info(dst_string)
+    for dst in distributions:
+        # Check if distribution is a pre-parsed dict or a legacy string
+        if isinstance(dst, dict):
+            # New format: pre-parsed dictionary from parse_distribution_str()
+            __url = dst.get("url", "")
+            (
+                cvs,
+                formatExtension,
+                compression,
+                sha256sum,
+                content_length,
+            ) = _get_file_info_from_dict(dst)
+        else:
+            # Legacy format: pipe-separated string
+            __url = str(dst).split("|")[0]
+            (
+                cvs,
+                formatExtension,
+                compression,
+                sha256sum,
+                content_length,
+            ) = get_file_info(dst)
 
         if not cvs and len(distributions) > 1:
             raise BadArgumentException(
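Note on usage: the sketch below shows how the two accepted distribution forms reach
create_dataset after this change. The version ID and URLs are placeholders, and
create_dataset downloads each file to compute its SHA-256 and byte size unless the
dict supplies sha256sum and byteSize.

    from databusclient.api.deploy import create_dataset

    # Legacy form: pipe-separated string, handled by get_file_info()
    legacy = "https://example.org/data_en.ttl|lang=en|.ttl|.gz"

    # New form: pre-parsed dict, handled by _get_file_info_from_dict()
    parsed = {
        "url": "https://example.org/data_de.ttl",
        "variants": {"lang": "de"},
        "formatExtension": "ttl",
        "compression": "gz",
        "sha256sum": "0" * 64,  # optional: together with byteSize, skips the download
        "byteSize": 1234,
    }

    # Entries are dispatched per item by isinstance, so the two forms can be mixed
    dataid = create_dataset(
        version_id="https://databus.example.org/user/group/artifact/2024.01.01",
        title="Example",
        abstract="Example abstract",
        description="Example description",
        license_url="https://example.org/license",
        distributions=[legacy, parsed],
    )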
diff --git a/databusclient/api/queries.py b/databusclient/api/queries.py
new file mode 100644
index 0000000..0abb7f1
--- /dev/null
+++ b/databusclient/api/queries.py
@@ -0,0 +1,91 @@
+"""
+SPARQL Queries for Databus Python Client
+
+This module contains SPARQL queries used for interacting with the DBpedia Databus.
+"""
+
+# Query to fetch ontologies with proper content variant aggregation
+# Uses GROUP_CONCAT to handle multiple content variants per distribution
+ONTOLOGIES_QUERY = """
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX databus: <https://databus.dbpedia.org/>
+PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
+PREFIX dataid-cv: <http://dataid.dbpedia.org/ns/cv#>
+PREFIX dct: <http://purl.org/dc/terms/>
+PREFIX dcat: <http://www.w3.org/ns/dcat#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+
+SELECT DISTINCT
+  ?group ?art ?version ?title ?publisher ?comment ?description
+  ?license ?file ?extension ?type ?bytes ?shasum
+  (GROUP_CONCAT(DISTINCT ?variantStr; separator=", ") AS ?contentVariants)
+WHERE {
+  ?dataset dataid:account databus:ontologies .
+  ?dataset dataid:group ?group .
+  ?dataset dataid:artifact ?art .
+  ?dataset dcat:distribution ?distribution .
+  ?dataset dct:license ?license .
+  ?dataset dct:publisher ?publisher .
+  ?dataset rdfs:comment ?comment .
+  ?dataset dct:description ?description .
+  ?dataset dct:title ?title .
+  ?distribution dcat:downloadURL ?file .
+  ?distribution dataid:formatExtension ?extension .
+  ?distribution dataid-cv:type ?type .
+  ?distribution dcat:byteSize ?bytes .
+  ?distribution dataid:sha256sum ?shasum .
+  ?dataset dct:hasVersion ?version .
+
+  # Exclude dev versions
+  FILTER (!regex(?art, "--DEV"))
+
+  # OPTIONAL: check for variants, but don't fail if none exist
+  OPTIONAL {
+    ?distribution dataid:contentVariant ?cv .
+    BIND(STR(?cv) AS ?variantStr)
+  }
+
+}
+GROUP BY ?group ?art ?version ?title ?publisher ?comment ?description ?license ?file ?extension ?type ?bytes ?shasum
+ORDER BY ?version
+"""
+
+
+def parse_content_variants_string(variants_str: str) -> dict:
+    """
+    Parse a comma-separated content variants string from a SPARQL GROUP_CONCAT result.
+
+    Parameters
+    ----------
+    variants_str : str
+        Comma-separated string of content variants, e.g., "lang=en, type=full, sorted"
+
+    Returns
+    -------
+    dict
+        Dictionary of parsed content variants, e.g.:
+        "lang=en, type=full, sorted" -> {"lang": "en", "type": "full", "sorted": True}
+
+    Notes
+    -----
+    - Values from key=value pairs are kept as strings; no type conversion is
+      performed, so "true" stays the string "true", not a boolean. Convert
+      after calling this function if you need booleans or numbers.
+    - Standalone items without an "=" sign (e.g., "sorted") are stored with
+      the boolean ``True`` as their value, indicating presence rather than a
+      specific string value.
+    """
+    if not variants_str or variants_str.strip() == "":
+        return {}
+
+    variants = {}
+    for part in variants_str.split(","):
+        part = part.strip()
+        if "=" in part:
+            key, value = part.split("=", 1)
+            variants[key.strip()] = value.strip()
+        elif part:
+            # Standalone value (no key=value format): record presence as True
+            variants[part] = True
+
+    return variants
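Usage sketch for the query module: ONTOLOGIES_QUERY can be sent to a Databus SPARQL
endpoint over plain HTTP using the standard SPARQL protocol; the endpoint URL below
is an assumption, not part of this change.

    import requests

    from databusclient.api.queries import ONTOLOGIES_QUERY, parse_content_variants_string

    resp = requests.post(
        "https://databus.dbpedia.org/sparql",  # assumed endpoint; adjust to your Databus
        data={"query": ONTOLOGIES_QUERY},
        headers={"Accept": "application/sparql-results+json"},
        timeout=60,
    )
    resp.raise_for_status()

    # Each binding maps a variable name to {"type": ..., "value": ...}
    for row in resp.json()["results"]["bindings"]:
        variants = parse_content_variants_string(row.get("contentVariants", {}).get("value", ""))
        print(row["file"]["value"], variants)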
diff --git a/databusclient/cli.py b/databusclient/cli.py
index 069408e..aa1ff7f 100644
--- a/databusclient/cli.py
+++ b/databusclient/cli.py
@@ -11,6 +11,50 @@
 from databusclient.extensions import webdav
 
 
+def parse_distribution_str(dist_str: str):
+    """
+    Parse a distribution string of the form:
+        URL|key=value|...|.extension
+
+    Returns a dictionary suitable for the deploy API.
+    """
+    parts = dist_str.split('|')
+    url = parts[0].strip()
+
+    variants = {}
+    format_ext = None
+    compression = None
+
+    # Iterate over the modifiers (everything after the URL)
+    for part in parts[1:]:
+        part = part.strip()
+
+        # Case 1: extension (starts with a dot)
+        if part.startswith('.'):
+            # Heuristic: known compression suffixes are treated as compression;
+            # any other dotted suffix is treated as the format extension.
+            if part.lower() in ['.gz', '.zip', '.br', '.tar', '.zst']:
+                compression = part.lstrip('.')  # the API expects the bare name without the dot
+            else:
+                format_ext = part.lstrip('.')
+
+        # Case 2: content variant (key=value)
+        elif '=' in part:
+            key, value = part.split('=', 1)
+            variants[key.strip()] = value.strip()
+
+        # Case 3: standalone tag, which is not supported; warn and skip it
+        else:
+            print(f"WARNING: Unrecognized modifier '{part}' in distribution. Expected '.ext' or 'key=val'.")
+
+    return {
+        "url": url,
+        "variants": variants,
+        "formatExtension": format_ext,
+        "compression": compression,
+    }
+
+
 @click.group()
 def app():
     """Databus Client CLI"""
@@ -81,9 +125,13 @@ def deploy(
     click.echo("[MODE] Classic deploy with distributions")
     click.echo(f"Deploying dataset version: {version_id}")
 
+    # Parse the input strings into structured dicts understood by the deploy API
+    parsed_distributions = [parse_distribution_str(d) for d in distributions]
+
     dataid = api_deploy.create_dataset(
-        version_id, title, abstract, description, license_url, distributions
+        version_id, title, abstract, description, license_url, parsed_distributions
    )
+
     api_deploy.deploy(dataid=dataid, api_key=apikey)
 
     return
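For illustration, the parser's output on a typical CLI argument (values are
examples only):

    from databusclient.cli import parse_distribution_str

    parse_distribution_str("https://example.org/data.nt|lang=en|type=parsed|.nt|.gz")
    # -> {
    #      "url": "https://example.org/data.nt",
    #      "variants": {"lang": "en", "type": "parsed"},
    #      "formatExtension": "nt",
    #      "compression": "gz",
    #    }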
diff --git a/tests/test_deploy.py b/tests/test_deploy.py
index aada04c..ad5605f 100644
--- a/tests/test_deploy.py
+++ b/tests/test_deploy.py
@@ -12,6 +12,7 @@
     BadArgumentException,
 )
 
+
 EXAMPLE_URL = "https://raw.githubusercontent.com/dbpedia/databus/608482875276ef5df00f2360a2f81005e62b58bd/server/app/api/swagger.yml"
 
 
diff --git a/tests/test_parse_distribution.py b/tests/test_parse_distribution.py
new file mode 100644
index 0000000..2a293eb
--- /dev/null
+++ b/tests/test_parse_distribution.py
@@ -0,0 +1,275 @@
+"""Tests for parse_distribution_str function in cli.py"""
+
+from unittest.mock import patch
+
+import pytest
+
+from databusclient.cli import parse_distribution_str
+from databusclient.api.deploy import (
+    create_dataset,
+    _get_file_info_from_dict,
+)
+
+
+class TestParseDistributionStr:
+    """Unit tests for parse_distribution_str function."""
+
+    # -------------------------------------------------------------------------
+    # URL Extraction Tests
+    # -------------------------------------------------------------------------
+
+    def test_basic_url_extraction(self):
+        """Test that URL is correctly extracted from distribution string."""
+        result = parse_distribution_str("http://example.com/data.json")
+        assert result["url"] == "http://example.com/data.json"
+
+    def test_url_with_modifiers(self):
+        """Test URL extraction when modifiers are present."""
+        result = parse_distribution_str("http://example.com/data.json|lang=en|.json")
+        assert result["url"] == "http://example.com/data.json"
+
+    # -------------------------------------------------------------------------
+    # Content Variant Parsing Tests
+    # -------------------------------------------------------------------------
+
+    def test_single_variant(self):
+        """Test parsing a single key=value variant."""
+        result = parse_distribution_str("http://example.com/file|lang=en")
+        assert result["variants"] == {"lang": "en"}
+
+    def test_multiple_variants(self):
+        """Test parsing multiple key=value variants."""
+        result = parse_distribution_str("http://example.com/file|lang=en|type=full|quality=high")
+        assert result["variants"] == {
+            "lang": "en",
+            "type": "full",
+            "quality": "high",
+        }
+
+    def test_variant_with_equals_in_value(self):
+        """Test variant where value contains equals sign."""
+        result = parse_distribution_str("http://example.com/file|filter=a=b")
+        assert result["variants"] == {"filter": "a=b"}
+
+    def test_empty_variants(self):
+        """Test that empty variants dict is returned when no variants present."""
+        result = parse_distribution_str("http://example.com/file.json|.json")
+        assert result["variants"] == {}
+
+    # -------------------------------------------------------------------------
+    # Format Extension Tests
+    # -------------------------------------------------------------------------
+
+    def test_json_extension(self):
+        """Test .json format extension detection."""
+        result = parse_distribution_str("http://example.com/file|.json")
+        assert result["formatExtension"] == "json"
+
+    def test_ttl_extension(self):
+        """Test .ttl format extension detection."""
+        result = parse_distribution_str("http://example.com/file|.ttl")
+        assert result["formatExtension"] == "ttl"
+
+    def test_csv_extension(self):
+        """Test .csv format extension detection."""
+        result = parse_distribution_str("http://example.com/file|.csv")
+        assert result["formatExtension"] == "csv"
+
+    def test_xml_extension(self):
+        """Test .xml format extension detection."""
+        result = parse_distribution_str("http://example.com/file|.xml")
+        assert result["formatExtension"] == "xml"
+
+    def test_no_extension(self):
+        """Test that formatExtension is None when not provided."""
+        result = parse_distribution_str("http://example.com/file|lang=en")
+        assert result["formatExtension"] is None
+
+    # -------------------------------------------------------------------------
+    # Compression Detection Tests
+    # -------------------------------------------------------------------------
+
+    def test_gz_compression(self):
+        """Test .gz compression detection."""
+        result = parse_distribution_str("http://example.com/file|.gz")
+        assert result["compression"] == "gz"
+
+    def test_zip_compression(self):
+        """Test .zip compression detection."""
+        result = parse_distribution_str("http://example.com/file|.zip")
+        assert result["compression"] == "zip"
+
+    def test_br_compression(self):
+        """Test .br (brotli) compression detection."""
+        result = parse_distribution_str("http://example.com/file|.br")
+        assert result["compression"] == "br"
+
+    def test_tar_compression(self):
+        """Test .tar compression detection."""
+        result = parse_distribution_str("http://example.com/file|.tar")
+        assert result["compression"] == "tar"
+
+    def test_zst_compression(self):
+        """Test .zst (zstandard) compression detection."""
+        result = parse_distribution_str("http://example.com/file|.zst")
+        assert result["compression"] == "zst"
+
+    def test_no_compression(self):
+        """Test that compression is None when not provided."""
+        result = parse_distribution_str("http://example.com/file|.json")
+        assert result["compression"] is None
+
+    # -------------------------------------------------------------------------
+    # Combined Modifiers Tests
+    # -------------------------------------------------------------------------
+
+    def test_full_distribution_string(self):
+        """Test parsing a complete distribution string with all modifiers."""
+        result = parse_distribution_str(
+            "http://mysite.com/data.json|lang=fr|quality=high|.json|.gz"
+        )
+        assert result == {
+            "url": "http://mysite.com/data.json",
+            "variants": {"lang": "fr", "quality": "high"},
+            "formatExtension": "json",
+            "compression": "gz",
+        }
+
+    def test_order_independence(self):
+        """Test that order of modifiers doesn't affect parsing."""
+        result = parse_distribution_str(
+            "http://example.com/file|.gz|lang=en|.json|type=full"
+        )
+        assert result["variants"] == {"lang": "en", "type": "full"}
+        assert result["formatExtension"] == "json"
+        assert result["compression"] == "gz"
+
+    # -------------------------------------------------------------------------
+    # Edge Cases
+    # -------------------------------------------------------------------------
+
+    def test_whitespace_handling(self):
+        """Test that whitespace is properly stripped."""
+        result = parse_distribution_str("http://example.com/file | lang = en | .json ")
+        assert result["url"] == "http://example.com/file"
+        assert result["variants"] == {"lang": "en"}
+        assert result["formatExtension"] == "json"
+
+    def test_standalone_tag_warning(self, capsys):
+        """Test that standalone tags (without =) produce a warning."""
+        result = parse_distribution_str("http://example.com/file|unknown_tag")
+        captured = capsys.readouterr()
+        assert "WARNING" in captured.out
+        assert "unknown_tag" in captured.out
+        # Standalone tags should not be added to variants
+        assert "unknown_tag" not in result["variants"]
+
+    def test_url_only(self):
+        """Test parsing URL without any modifiers."""
+        result = parse_distribution_str("http://example.com/data.json")
+        assert result == {
+            "url": "http://example.com/data.json",
+            "variants": {},
+            "formatExtension": None,
+            "compression": None,
+        }
+
+
+class TestIntegrationWithDeployAPI:
+    """Integration tests verifying parsed dicts work with api_deploy functions."""
+
+    @patch("databusclient.api.deploy._load_file_stats")
+    def test_get_file_info_from_dict_basic(self, mock_load_stats):
+        """Test _get_file_info_from_dict with parsed distribution dict."""
+        mock_load_stats.return_value = ("abc123" * 10 + "abcd", 12345)
+
+        parsed = parse_distribution_str(
+            "http://example.com/data.json|lang=en|type=full|.json|.gz"
+        )
+        cvs, ext, comp, sha, size = _get_file_info_from_dict(parsed)
+
+        assert cvs == {"lang": "en", "type": "full"}
+        assert ext == "json"
+        assert comp == "gz"
+        assert sha == "abc123" * 10 + "abcd"
+        assert size == 12345
+
+    @patch("databusclient.api.deploy._load_file_stats")
+    def test_get_file_info_from_dict_defaults(self, mock_load_stats):
+        """Test default values when extension/compression not specified."""
+        mock_load_stats.return_value = ("sha256hash", 1000)
+
+        parsed = parse_distribution_str("http://example.com/data|lang=en")
+        cvs, ext, comp, sha, size = _get_file_info_from_dict(parsed)
+
+        # Should use defaults
+        assert ext == "file"  # default when not specified
+        assert comp == "none"  # default when not specified
+
+    @patch("databusclient.api.deploy._load_file_stats")
+    def test_create_dataset_with_dict_distributions(self, mock_load_stats):
+        """Test create_dataset accepts parsed dict distributions."""
+        fake_sha = "a" * 64
+        mock_load_stats.return_value = (fake_sha, 5000)
+
+        parsed_dist = parse_distribution_str(
+            "http://example.com/file.json|lang=en|.json"
+        )
+
+        dataset = create_dataset(
+            version_id="https://databus.example.org/user/group/artifact/2024.01.01/",
+            title="Test Dataset",
+            abstract="Test abstract",
+            description="Test description",
+            license_url="https://example.org/license",
+            distributions=[parsed_dist],
+        )
+
+        # Verify dataset structure
+        assert "@context" in dataset
+        assert "@graph" in dataset
+
+        # Find distribution in graph
+        graphs = dataset["@graph"]
+        version_graph = next(
+            (g for g in graphs if "@type" in g and "Version" in g.get("@type", [])),
+            None,
+        )
+        assert version_graph is not None
+        assert "distribution" in version_graph
+
+        dist = version_graph["distribution"][0]
+        assert dist["downloadURL"] == "http://example.com/file.json"
+        assert dist["formatExtension"] == "json"
+        assert dist["dcv:lang"] == "en"
+
+    @patch("databusclient.api.deploy._load_file_stats")
+    def test_create_dataset_multiple_distributions(self, mock_load_stats):
+        """Test create_dataset with multiple distributions (variants distinguish them)."""
+        fake_sha = "b" * 64
+        mock_load_stats.return_value = (fake_sha, 3000)
+
+        dist1 = parse_distribution_str("http://example.com/en.json|lang=en|.json")
+        dist2 = parse_distribution_str("http://example.com/de.json|lang=de|.json")
+
+        dataset = create_dataset(
+            version_id="https://databus.example.org/user/group/artifact/2024.01.01/",
+            title="Test Dataset",
+            abstract="Test abstract",
+            description="Test description",
+            license_url="https://example.org/license",
+            distributions=[dist1, dist2],
+        )
+
+        # Both distributions should be present
+        graphs = dataset["@graph"]
+        version_graph = next(
+            (g for g in graphs if "@type" in g and "Version" in g.get("@type", [])),
+            None,
+        )
+        distributions = version_graph["distribution"]
+        assert len(distributions) == 2
+
+        # Verify different language variants
+        langs = {d["dcv:lang"] for d in distributions}
+        assert langs == {"en", "de"}
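Since _load_file_stats is patched in the integration tests, the whole file runs
offline:

    pytest -q tests/test_parse_distribution.py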