feature added: --validate checksum flag #44
@@ -12,6 +12,54 @@
```python
    get_databus_id_parts_from_file_url,
)

from databusclient.extensions.webdav import compute_sha256_and_length


def _extract_checksum_from_node(node) -> str | None:
```
**Contributor:**
This will work in most cases, but note that the fallback logic may also return any 64-character hex string found anywhere in the node. In edge cases, this could match a value that is not actually a SHA-256 checksum. To reduce the risk of false positives, it would be better to first restrict extraction to checksum-related keys (e.g. "checksum", "sha256sum", "sha256", "databus:checksum", as you already do around line 49), and only then validate the value format. For example, after selecting a candidate value via a known checksum key, you could re-check it with a regex like `^[0-9a-fA-F]{64}$`.
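A minimal sketch of the stricter extraction the reviewer is suggesting (the key list mirrors the PR; the helper name and regex check are illustrative, not part of the diff):

```python
import re

# Keys that plausibly hold a SHA-256 checksum (mirrors the PR's key list).
CHECKSUM_KEYS = ("checksum", "sha256sum", "sha256", "databus:checksum")
SHA256_RE = re.compile(r"[0-9a-fA-F]{64}")


def _extract_checksum_strict(node: dict) -> str | None:
    """Only look at known checksum keys, then validate the value format."""
    for key in CHECKSUM_KEYS:
        value = node.get(key)
        # Unwrap a JSON-LD value object like {"@value": "..."}
        if isinstance(value, dict):
            value = value.get("@value")
        if isinstance(value, str) and SHA256_RE.fullmatch(value.strip()):
            return value.strip()
    return None
```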
```python
    """
    Try to extract a 64-char hex checksum from a JSON-LD file node.
    Handles these common shapes:
    - checksum or sha256sum fields as plain string
    - checksum fields as dict with '@value'
    - nested values (recursively search strings for a 64-char hex)
    """
    def find_in_value(v):
        if isinstance(v, str):
            s = v.strip()
            if len(s) == 64 and all(c in "0123456789abcdefABCDEF" for c in s):
                return s
        if isinstance(v, dict):
            # common JSON-LD value object
            if "@value" in v and isinstance(v["@value"], str):
                res = find_in_value(v["@value"])
                if res:
                    return res
            # try all nested dict values
            for vv in v.values():
                res = find_in_value(vv)
                if res:
                    return res
        if isinstance(v, list):
            for item in v:
                res = find_in_value(item)
                if res:
                    return res
        return None

    # direct keys to try first
    for key in ("checksum", "sha256sum", "sha256", "databus:checksum"):
        if key in node:
            res = find_in_value(node[key])
            if res:
                return res

    # fallback: search all values recursively for a 64-char hex string
    for v in node.values():
        res = find_in_value(v)
        if res:
            return res
    return None


# Hosts that require Vault token based authentication. Central source of truth.
VAULT_REQUIRED_HOSTS = {
```
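As a quick illustration of the node shapes the docstring lists, a few hypothetical nodes (invented for this example, not real Databus metadata) and what the extractor returns:

```python
# Hypothetical examples of the node shapes the docstring describes.
plain_node = {"sha256sum": "a" * 64}
value_obj_node = {"checksum": {"@value": "B" * 64}}
nested_node = {"meta": [{"digest": {"@value": "0123456789abcdef" * 4}}]}

assert _extract_checksum_from_node(plain_node) == "a" * 64
assert _extract_checksum_from_node(value_obj_node) == "B" * 64
# The fallback search also finds the hex string under an unrelated key,
# which is exactly the false-positive risk the reviewer points out above:
assert _extract_checksum_from_node(nested_node) == "0123456789abcdef" * 4
```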
@@ -32,6 +80,8 @@
```python
    databus_key=None,
    auth_url=None,
    client_id=None,
    validate_checksum: bool = False,
    expected_checksum: str | None = None,
) -> None:
    """
    Download a file from the internet with a progress bar using tqdm.
```
@@ -183,6 +233,27 @@
```python
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        raise IOError("Downloaded size does not match Content-Length header")

    # --- 6. Optional checksum validation ---
    if validate_checksum:
        # reuse compute_sha256_and_length from webdav extension
        try:
            actual, _ = compute_sha256_and_length(filename)
        except (OSError, IOError) as e:
            print(f"WARNING: error computing checksum for {filename}: {e}")
            actual = None

        if expected_checksum is None:
            print(f"WARNING: no expected checksum available for {filename}; skipping validation")
        elif actual is None:
            print(f"WARNING: could not compute checksum for {filename}; skipping validation")
        else:
            if actual.lower() != expected_checksum.lower():
                try:
                    os.remove(filename)  # delete corrupted file
                except OSError:
                    pass
                raise IOError(
                    f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}"
                )


def _download_files(
    urls: List[str],
```
@@ -191,6 +262,8 @@
```python
    databus_key: str = None,
    auth_url: str = None,
    client_id: str = None,
    validate_checksum: bool = False,
    checksums: dict | None = None,
) -> None:
    """
    Download multiple files from the databus.
```
@@ -204,13 +277,18 @@
```python
    - client_id: Client ID for token exchange
    """
    for url in urls:
        expected = None
        if checksums and isinstance(checksums, dict):
            expected = checksums.get(url)
        _download_file(
            url=url,
            localDir=localDir,
            vault_token_file=vault_token_file,
            databus_key=databus_key,
            auth_url=auth_url,
            client_id=client_id,
            validate_checksum=validate_checksum,
            expected_checksum=expected,
        )
```
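To illustrate the new plumbing end to end, a hypothetical call (URL, directory, and digest are placeholders; the digest shown is the SHA-256 of the empty string):

```python
file_url = "https://databus.example.org/user/group/artifact/2024.01.01/data.csv"

_download_files(
    [file_url],
    "./downloads",
    validate_checksum=True,
    # url -> expected SHA-256 digest
    checksums={file_url: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"},
)
```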
@@ -358,6 +436,7 @@
```python
    databus_key: str = None,
    auth_url: str = None,
    client_id: str = None,
    validate_checksum: bool = False
) -> None:
    """
    Download all files in a databus collection.
```
@@ -375,13 +454,53 @@
```python
    file_urls = _get_file_download_urls_from_sparql_query(
        endpoint, query, databus_key=databus_key
    )

    # If checksum validation requested, attempt to build url->checksum mapping
    # by fetching the Version JSON-LD for each file's version. We group files
    # by their version URI to avoid fetching the same metadata repeatedly.
    checksums: dict = {}
    if validate_checksum:
        # Map version_uri -> list of file urls
        versions_map: dict = {}
        for fu in file_urls:
            try:
                h, acc, grp, art, ver, f = get_databus_id_parts_from_file_url(fu)
            except Exception:
                continue
            if ver is None:
                continue
            if h is None or acc is None or grp is None or art is None:
```
**Comment on lines +465 to +472 — Contributor:**
It is often more appropriate to use meaningful variable names for public/shared projects. Could you please write out some abbreviations? For example: h -> host, f -> file, fu -> ?, ... You should find a good balance between writing things out and using abbreviations, where appropriate. This applies to all code. I'll just comment it here.
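A hypothetical version of the grouping loop with the reviewer's renaming applied, assuming `get_databus_id_parts_from_file_url` returns `(host, account, group, artifact, version, file)` in that order (the helper name is invented for illustration; `file_name` is unused here, as `f` is in the PR):

```python
def _group_file_urls_by_version(file_urls: list[str]) -> dict[str, list[str]]:
    """Same grouping as above, with the single-letter names written out."""
    versions_map: dict[str, list[str]] = {}
    for file_url in file_urls:
        try:
            host, account, group, artifact, version, file_name = (
                get_databus_id_parts_from_file_url(file_url)
            )
        except Exception:
            continue
        if version is None:
            continue
        if host is None or account is None or group is None or artifact is None:
            continue
        version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}"
        versions_map.setdefault(version_uri, []).append(file_url)
    return versions_map
```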
```python
                continue
            version_uri = f"https://{h}/{acc}/{grp}/{art}/{ver}"
            versions_map.setdefault(version_uri, []).append(fu)

        # Fetch each version's JSON-LD once and extract checksums for its files
        for version_uri, urls_in_version in versions_map.items():
            try:
                json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
                jd = json.loads(json_str)
                graph = jd.get("@graph", [])
                for node in graph:
                    if node.get("@type") == "Part":
                        file_uri = node.get("file")
                        if not isinstance(file_uri, str):
                            continue
                        expected = _extract_checksum_from_node(node)
                        if expected and file_uri in urls_in_version:
                            checksums[file_uri] = expected
            except Exception:
                # Best-effort: if fetching a version fails, skip it
                continue

    _download_files(
        list(file_urls),
        localDir,
        vault_token_file=vault_token,
        databus_key=databus_key,
        auth_url=auth_url,
        client_id=client_id,
        validate_checksum=validate_checksum,
        checksums=checksums if checksums else None,
    )
```
@@ -392,6 +511,7 @@
```python
    databus_key: str = None,
    auth_url: str = None,
    client_id: str = None,
    validate_checksum: bool = False,
) -> None:
    """
    Download all files in a databus artifact version.
```

@@ -406,13 +526,31 @@
```python
    """
    json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
    file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
    # build url -> checksum mapping from JSON-LD when available
    checksums: dict = {}
    try:
        json_dict = json.loads(json_str)
        graph = json_dict.get("@graph", [])
        for node in graph:
            if node.get("@type") == "Part":
                file_uri = node.get("file")
                if not isinstance(file_uri, str):
                    continue
                expected = _extract_checksum_from_node(node)
                if expected:
                    checksums[file_uri] = expected
    except Exception:
        checksums = {}

    _download_files(
        file_urls,
        localDir,
        vault_token_file=vault_token_file,
        databus_key=databus_key,
        auth_url=auth_url,
        client_id=client_id,
        validate_checksum=validate_checksum,
        checksums=checksums,
    )
```
@@ -424,6 +562,7 @@
```python
    databus_key: str = None,
    auth_url: str = None,
    client_id: str = None,
    validate_checksum: bool = False,
) -> None:
    """
    Download files in a databus artifact.
```

@@ -445,13 +584,31 @@
```python
        print(f"Downloading version: {version_uri}")
        json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
        file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
        # extract checksums for this version
        checksums: dict = {}
        try:
            jd = json.loads(json_str)
            graph = jd.get("@graph", [])
            for node in graph:
                if node.get("@type") == "Part":
                    file_uri = node.get("file")
                    if not isinstance(file_uri, str):
                        continue
                    expected = _extract_checksum_from_node(node)
                    if expected:
                        checksums[file_uri] = expected
        except Exception:
            checksums = {}

        _download_files(
            file_urls,
            localDir,
            vault_token_file=vault_token_file,
            databus_key=databus_key,
            auth_url=auth_url,
            client_id=client_id,
            validate_checksum=validate_checksum,
            checksums=checksums,
        )
```
@@ -527,6 +684,7 @@
```python
    databus_key: str = None,
    auth_url: str = None,
    client_id: str = None,
    validate_checksum: bool = False,
) -> None:
    """
    Download files in a databus group.
```

@@ -552,6 +710,7 @@
```python
        databus_key=databus_key,
        auth_url=auth_url,
        client_id=client_id,
        validate_checksum=validate_checksum,
    )
```
@@ -598,6 +757,7 @@
```python
    all_versions=None,
    auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token",
    client_id="vault-token-exchange",
    validate_checksum: bool = False
) -> None:
    """
    Download datasets from databus.
```
@@ -638,16 +798,36 @@
```python
            databus_key,
            auth_url,
            client_id,
            validate_checksum=validate_checksum,
        )
    elif file is not None:
        print(f"Downloading file: {databusURI}")
        # Try to fetch expected checksum from the parent Version metadata
        expected = None
        if validate_checksum:
            try:
                version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}"
                json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
                json_dict = json.loads(json_str)
                graph = json_dict.get("@graph", [])
                for node in graph:
                    if node.get("file") == databusURI or node.get("@id") == databusURI:
                        expected = _extract_checksum_from_node(node)
                        if expected:
                            break
            except Exception as e:
                print(f"WARNING: Could not fetch checksum for single file: {e}")

        # Call the worker to download the single file (passes expected checksum)
        _download_file(
            databusURI,
            localDir,
            vault_token_file=token,
            databus_key=databus_key,
            auth_url=auth_url,
            client_id=client_id,
            validate_checksum=validate_checksum,
            expected_checksum=expected,
        )
    elif version is not None:
        print(f"Downloading version: {databusURI}")
```
@@ -658,6 +838,7 @@
```python
            databus_key=databus_key,
            auth_url=auth_url,
            client_id=client_id,
            validate_checksum=validate_checksum,
        )
    elif artifact is not None:
        print(
```
@@ -671,6 +852,7 @@
```python
            databus_key=databus_key,
            auth_url=auth_url,
            client_id=client_id,
            validate_checksum=validate_checksum,
        )
    elif group is not None and group != "collections":
        print(
```
@@ -684,6 +866,7 @@
```python
            databus_key=databus_key,
            auth_url=auth_url,
            client_id=client_id,
            validate_checksum=validate_checksum,
        )
    elif account is not None:
        print("accountId not supported yet")  # TODO
```
@@ -697,6 +880,8 @@
```python
    # query as argument
    else:
        print("QUERY {}", databusURI.replace("\n", " "))
        if validate_checksum:
            print("WARNING: Checksum validation is not supported for user-defined SPARQL queries.")
```
**Comment on lines +883 to +884 — Contributor:**
But it seems it is supported? See
```python
        if uri_endpoint is None:  # endpoint is required for queries (--databus)
            raise ValueError("No endpoint given for query")
        res = _get_file_download_urls_from_sparql_query(
```
@@ -709,4 +894,5 @@
```python
            databus_key=databus_key,
            auth_url=auth_url,
            client_id=client_id,
            validate_checksum=validate_checksum,
        )
```
**Contributor:**
Nice reuse 👍
Can you move `compute_sha256_and_length` to `utils.py`? It would make it clearer that it is a shared function.
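For reference, a minimal sketch of what such a shared helper in `utils.py` could look like. Its exact implementation lives in the webdav extension; the behavior assumed here, a `(hex digest, byte length)` tuple, is inferred from how the PR unpacks its return value:

```python
import hashlib


def compute_sha256_and_length(path: str, chunk_size: int = 1 << 20) -> tuple[str, int]:
    """Stream a file and return its SHA-256 hex digest and size in bytes."""
    digest = hashlib.sha256()
    length = 0
    with open(path, "rb") as f:
        # read in chunks so large downloads don't have to fit in memory
        while chunk := f.read(chunk_size):
            digest.update(chunk)
            length += len(chunk)
    return digest.hexdigest(), length
```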