Documentation/Consistency Review (#3329)

benjeffery · web-flow · commit 179963c01726 · 2025-11-24T17:08:06.000Z
* Check table sequence length is finite

* Clarify data model and validity documentation

* Document VCF export behaviour and limits

* Clarify file-format compatibility and text output

* Clarify site statistics handling of missing data

* Support metadata decoding in parse_edges

* Fix Tree and TreeSequence docstrings

* Review comments
diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c
@@ -6575,6 +6575,14 @@ test_empty_tree_kc(void)
     ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
     CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);
     tsk_treeseq_free(&ts);
+    tables.sequence_length = NAN;
+    ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
+    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);
+    tsk_treeseq_free(&ts);
+    tables.sequence_length = INFINITY;
+    ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
+    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);
+    tsk_treeseq_free(&ts);
     tables.sequence_length = 1.0;
     ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
     CU_ASSERT_EQUAL_FATAL(ret, 0);
diff --git a/c/tskit/tables.c b/c/tskit/tables.c
@@ -11078,7 +11078,7 @@ tsk_table_collection_check_integrity(
                    | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES;
     }
 
-    if (self->sequence_length <= 0) {
+    if (!tsk_isfinite(self->sequence_length) || self->sequence_length <= 0) {
         ret = tsk_trace_error(TSK_ERR_BAD_SEQUENCE_LENGTH);
         goto out;
     }
diff --git a/docs/data-model.md b/docs/data-model.md
@@ -356,14 +356,10 @@ required for a valid set of populations.
 
 #### Provenance Table
 
-:::{todo}
-Document the provenance table.
-:::
-
 | Column    | Type  | Description                                                             |
 | :-------- | ----- | ----------------------------------------------------------------------: |
 | timestamp | char  | Timestamp in [ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. |
-| record    | char  | Provenance record.                                                      |
+| record    | char  | Provenance record as JSON.                                              |
 
 
 (sec_metadata_definition)=
@@ -374,10 +370,16 @@ Each table (excluding provenance) has a metadata column for storing and passing
 information that tskit does not use or interpret. See {ref}`sec_metadata` for details.
 The metadata columns are {ref}`binary columns <sec_tables_api_binary_columns>`.
 
-When using the {ref}`sec_text_file_format`, to ensure that metadata can be safely
-interchanged, each row is [base 64 encoded](https://en.wikipedia.org/wiki/Base64).
-Thus, binary information can be safely printed and exchanged, but may not be
-human readable.
+When using the {ref}`sec_text_file_format`, metadata values are written as opaque
+text. By default, :meth:`TreeSequence.dump_text` will base64-encode metadata values
+that are stored as raw bytes (when ``base64_metadata=True``) so that binary data can
+be safely printed and exchanged; in this case :func:`tskit.load_text` will base64-decode
+the corresponding text fields back to bytes. When metadata has already been decoded
+to a structured Python object (for example via a metadata schema), the textual
+representation written by :meth:`TreeSequence.dump_text` is the ``repr`` of that
+object, and :func:`tskit.load_text` does not attempt to reconstruct the original
+structured value from this representation. For reliable metadata round-tripping,
+prefer the native binary tree sequence file format over the text formats.
 
 The tree sequence itself also has metadata stored as a byte array.
 
@@ -399,6 +401,10 @@ error message. Some more complex requirements may not be detectable at load-time
 and errors may not occur until certain operations are attempted.
 These are documented below.
 
+At the tree-sequence level, we require that the coordinate space has a finite,
+strictly positive length; that is, the `sequence_length` attribute must be a
+finite value greater than zero.
+
 The Python API also provides tools that can transform a collection of
 tables into a valid collection of tables, so long as they are logically
 consistent, see {ref}`sec_tables_api_creating_valid_tree_sequence`.
@@ -410,7 +416,8 @@ consistent, see {ref}`sec_tables_api_creating_valid_tree_sequence`.
 
 Individuals are a basic type in a tree sequence and are not defined with
 respect to any other tables. Individuals can have a reference to their parent
-individuals, if present these references must be valid or null (-1).
+individuals, if present these references must be valid or null (-1). An
+individual cannot list itself as its own parent.
 
 A valid tree sequence does not require individuals to be sorted in any
 particular order, and sorting a set of tables using {meth}`TableCollection.sort`
@@ -424,6 +431,7 @@ using {meth}`TableCollection.sort_individuals`.
 Given a valid set of individuals and populations, the requirements for
 each node are:
 
+- `time` must be a finite (non-NaN, non-infinite) value;
 - `population` must either be null (-1) or refer to a valid population ID;
 - `individual` must either be null (-1) or refer to a valid individual ID.
 
@@ -443,7 +451,7 @@ has no effect on nodes.
 Given a valid set of nodes and a sequence length {math}`L`, the simple
 requirements for each edge are:
 
-- We must have {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`;
+- We must have finite coordinates with {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`;
 - `parent` and `child` must be valid node IDs;
 - `time[parent]` > `time[child]`;
 - edges must be unique (i.e., no duplicate edges are allowed).
@@ -480,7 +488,7 @@ properties are fulfilled.
 Given a valid set of nodes and a sequence length {math}`L`, the simple
 requirements for a valid set of sites are:
 
-- We must have {math}`0 \leq` `position` {math}`< L`;
+- We must have a finite coordinate with {math}`0 \leq` `position` {math}`< L`;
 - `position` values must be unique.
 
 For simplicity and algorithmic efficiency, sites must also:
@@ -546,19 +554,33 @@ will always fail. Use `tskit.is_unknown_time` to detect unknown values.
 
 #### Migration requirements
 
-Given a valid set of nodes and edges, the requirements for a value set of
+Given a valid set of nodes and edges, the requirements for a valid set of
 migrations are:
 
-- `left` and `right` must lie within the tree sequence coordinate space (i.e.,
-  from 0 to `sequence_length`).
-- `time` must be strictly between the time of its `node` and the time of any
-  ancestral node from which that node inherits on the segment `[left, right)`.
-- The `population` of any such ancestor matching `source`, if another
-  `migration` does not intervene.
+- `left` and `right` must be finite values that lie within the tree sequence
+  coordinate space (i.e., from 0 to `sequence_length`), with {math}`0 \leq`
+  `left` {math}`<` `right` {math}`\leq L`;
+- `node` must be a valid node ID;
+- if population references are checked, `source` and `dest` must be valid
+  population IDs;
+- `time` must be a finite value.
 
-To enable efficient processing, migrations must also be:
+To enable efficient processing, migrations must also be sorted by
+nondecreasing `time` value.
 
-- Sorted by nondecreasing `time` value.
+Conceptually, a migration records that a segment of ancestry for the given
+`node` moves between populations along the tree. In typical demographic
+models we expect:
+
+- `time` to lie strictly between the time of the migrating `node` and the time
+  of any ancestral node from which that node inherits on the segment
+  `[left, right)`;
+- the `population` of any such ancestor to match the `source` population,
+  until another `migration` intervenes.
+
+These conceptual relationships are not currently validated. It is
+the responsibility of code that creates migrations to satisfy them where
+required.
 
 Note in particular that there is no requirement that adjacent migration records
 should be "squashed". That is, we can have two records `m1` and `m2`
@@ -582,8 +604,10 @@ There are no requirements on a population table.
 The `timestamp` column of a provenance table should be in
 [ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format.
 
-The `record` should be valid JSON with structure defined in the Provenance
-Schema section (TODO).
+The `record` column stores a JSON document describing how and where the tree sequence
+was produced. For tree sequences generated by tskit and related tools, this JSON is
+expected to conform to the :ref:`provenance schema <sec_provenance_schema>` described
+in {ref}`sec_provenance`. 
 
 
 (sec_table_indexes)=
@@ -1148,4 +1172,3 @@ you won't see those parts of the tree sequence that are unrelated to the samples
 If you need to get those, too, you could either
 work with the {meth}`TreeSequence.edge_diffs` directly,
 or iterate over all nodes (instead of over {meth}`Tree.nodes`).
-
diff --git a/docs/export.md b/docs/export.md
@@ -319,6 +319,18 @@ the individual IDs are 2 and 3. See the
 these default labels.
 :::
 
+If some individuals have no associated nodes, they are omitted from the
+VCF output. By default, only nodes that are marked as samples contribute
+to the VCF genotypes; to include non-sample nodes as well (e.g., internal
+nodes that have been marked as individuals), set
+``include_non_sample_nodes=True`` when calling :meth:`TreeSequence.write_vcf`.
+
+:::{note}
+At present, :meth:`TreeSequence.write_vcf` only supports sites with up to
+9 distinct alleles; attempting to write a site with more than 9 alleles
+will result in a :class:`ValueError`.
+:::
+
 (sec_export_vcf_individual_names)=
 
 ### Individual names
@@ -369,31 +381,67 @@ id and individual id, and two underscores will throw an error.
 
 ### Modifying coordinates
 
-Tskit supports continuous genome coordinates, but VCF only supports discrete
-genome positions. Thus, when we convert a tree sequence that has sites
-at continuous positions we must discretise the coordinates in some way.
-
-The ``position_transform`` argument provides a way to flexibly translate
-the genomic location of sites in tskit to the appropriate value in VCF.
-There are two fundamental differences in the way that tskit and VCF define
-genomic coordinates. The first is that tskit uses floating point values
-to encode positions, whereas VCF uses integers. Thus, if the tree sequence
-contains positions at non-integral locations there is an information loss
-incurred by translating to VCF. By default, we round the site positions
-to the nearest integer, such that there may be several sites with the
-same integer position in the output. The second difference between VCF
-and tskit is that VCF is defined to be a 1-based coordinate system, whereas
-tskit uses 0-based. However, how coordinates are transformed depends
-on the VCF parser, and so we do **not** account for this change in
-coordinate system by default.
+Tree sequence site positions can be floating point values, whereas VCF
+requires positive integers. The ``position_transform`` argument
+controls how tskit maps coordinates into VCF. Translating non-integer
+positions necessarily loses precision; by default we round to the nearest
+integer, so multiple sites may share the same output position. 
+Furthermore, tskit's coordinate system starts at zero,
+whereas the VCF standard requires positions to be positive,
+and so a site at position 0 is not valid in the VCF standard.
+Because VCF parsers differ, we do **not** do anything to account for this.
+
+The simplest resolution of this discrepancy in convention between tskit and VCF
+positions is deal with any site at position 0 as a special case (for instance,
+by discarding or ignoring it).
+A different interpretation of this difference between tskit's position
+and VCF's POS field
+is that they are different coordinate systems: tskit coordinates are
+"distance to the right of the left end of the chromosome",
+while VCF coordinates are "which number site, counting from the left end
+of the chromosome and starting at one".
+Under this interpretation, the solution is to supply an explicit
+``position_transform`` that adds 1 to the coordinate when outputting
+to VCF (or, using the ``"legacy"`` option described below). However, this can
+easily lead to off-by-one errors converting between the coordinate systems,
+so should only be used if you really are using 0-based coordinates for your
+tree sequence.
+
+:::{warning}
+Most VCF tools cannot deal with a POS value of 0. If your tree
+sequence contains a site with position 0, this will likely cause an error.
+:::
+
+Internally, the coordinates used in the VCF output are obtained by applying
+the ``position_transform`` function to the array of site positions (and, for
+the contig length, to the tree sequence :attr:`.TreeSequence.sequence_length`).
+This function must return a one-dimensional array of the same length as its
+input; otherwise a :class:`ValueError` is raised. In addition to accepting a
+callable, tskit also supports the string value ``"legacy"`` here, which
+selects the pre-0.2.0 behaviour used by the original VCF exporter:
+positions are rounded to the nearest integer, starting at 1, and are forced
+to be strictly increasing by incrementing ties.
+
+The VCF specification does not allow positions to be 0. By default, if any
+transformed position is 0, :meth:`TreeSequence.write_vcf` will raise a
+:class:`ValueError`. If you wish to retain these records you can either:
+
+- set ``allow_position_zero=True`` to write such sites anyway;
+- mask the offending sites using the ``site_mask`` argument; or
+- choose a ``position_transform`` that maps 0 to a valid positive position.
+
+For example, to shift all coordinates by 1, we could define:
+
+```{code-cell}
+def one_based_positions(positions):
+    return [int(round(x)) + 1 for x in positions]
+
+ts.write_vcf(sys.stdout, position_transform=one_based_positions)
+```
 
 :::{note}
 The msprime 0.x legacy API simulates using continuous coordinates. It may
 be simpler to update your code to use the msprime 1.0 API (which uses
 discrete coordinates by default) than to work out how to transform
 coordinates in a way that is suitable for your application.
 :::
-
-:::{todo}
-Provide some examples of using position transform.
-:::
diff --git a/docs/file-formats.md b/docs/file-formats.md
@@ -43,9 +43,13 @@ e.g. ``python -m kastore ls file.trees``.
 
 ### Legacy Versions
 
-Tree sequence files written by older versions of tskit are not readable by
-newer versions of tskit. For tskit releases<0.6.2, `tskit upgrade`
-will convert older tree sequence files to the latest version.
+Tree sequence files are versioned. This version of tskit can read
+``.trees`` files produced by earlier releases that use the same *major*
+file format version (see ``format/version`` in a kastore listing).
+Files written using the pre-kastore HDF5 format (for example, by
+msprime < 0.6.0 or tskit < 0.6.2) cannot be read directly. To convert
+such legacy files, use the ``tskit upgrade`` command from an older
+tskit version (< 0.6.2) to produce a modern ``.trees`` file.
 
 
 (sec_text_file_format)=
@@ -234,8 +238,14 @@ ts.dump_text(sites=sys.stdout)
 The mutation text format must contain the columns `site`,
 `node` and `derived_state`. The `time`, `parent` and `metadata` columns
 may also be optionally present (but `parent` must be specified if
-more than one mutation occurs at the same site). If `time` is absent
-`UNKNOWN_TIME` will be used to fill the column. See the
+more than one mutation occurs at the same site). If the `time` column
+is absent, the mutation times in the resulting tree sequence are set to
+{data}`tskit.UNKNOWN_TIME`, which is a numeric value that behaves like NaN.
+Unknown mutation times written out by
+{meth}`TreeSequence.dump_text` are represented in the text file by the
+literal string ``\"unknown\"`` in the `time` column, and
+{func}`tskit.load_text` treats this string as `UNKNOWN_TIME` on input.
+See the
 {ref}`mutation table definitions <sec_mutation_table_definition>`
 for details on these columns.
 
@@ -280,5 +290,4 @@ ts.dump_text(populations=sys.stdout)
 ```
 
 The `metadata` contains base64-encoded data (in this case, the strings
-`pop1` and `pop1`).
-
+`pop1` and `pop2`).
diff --git a/docs/stats.md b/docs/stats.md
@@ -35,11 +35,14 @@ Please see the {ref}`tutorial <tutorials:sec_tutorial_stats>` for examples of th
 statistics API in use.
 
 :::{warning}
-{ref}`sec_data_model_missing_data` is not currently
-handled correctly by site statistics defined here, as we always
-assign missing data to be equal to the ancestral state. Later
-versions will add this behaviour as an option and will account
-for the presence of missing data by default.
+Site statistics defined here currently treat :ref:`missing data<sec_data_model_missing_data>`
+in the same way as in earlier versions of tskit: when computing site-based
+statistics, isolated samples without mutations directly above them are treated
+as carrying the ancestral allele rather than as missing. Future versions of
+tskit may expose options to treat missing data differently in statistics; for
+now, if you need explicit control over how missing data is handled you should
+use the low-level genotype/variant APIs (for example with
+``isolated_as_missing=True``) together with your own summary logic.
 :::
 
 (sec_stats_available)=
diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py
@@ -217,6 +217,35 @@ def test_nodes(self):
         for a, b in zip(expected, n):
             assert a.encode("utf8") == b.metadata
 
+    @pytest.mark.parametrize(
+        "base64_metadata,metadata_text,expected",
+        [(True, "YWJj", b"abc"), (False, "plain", b"plain")],
+    )
+    def test_edges_metadata(self, base64_metadata, metadata_text, expected):
+        edges = io.StringIO(
+            f"""\
+        left    right    parent    child    metadata
+        0.0     1.0      2         0,1      {metadata_text}
+        """
+        )
+        table = tskit.parse_edges(
+            edges, strict=False, encoding="utf8", base64_metadata=base64_metadata
+        )
+        assert len(table) == 2
+        for row in table:
+            assert row.metadata == expected
+
+    def test_edges_without_metadata_column(self):
+        edges = io.StringIO(
+            """\
+        left    right    parent    child
+        0.0     1.0      2         3
+        """
+        )
+        table = tskit.parse_edges(edges, strict=False, encoding="utf8")
+        assert len(table) == 1
+        assert table[0].metadata == b""
+
     def test_sites(self):
         sites = io.StringIO(
             """\
diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py
@@ -2907,6 +2907,12 @@ def test_round_trip(self):
         b = tables.tree_sequence()
         assert a.tables == b.tables
 
+    @pytest.mark.parametrize("sequence_length", [np.inf, np.nan])
+    def test_nonfinite_sequence_length(self, sequence_length):
+        tables = tskit.TableCollection(sequence_length=sequence_length)
+        with pytest.raises(tskit.LibraryError, match="TSK_ERR_BAD_SEQUENCE_LENGTH"):
+            tables.tree_sequence()
+
 
 class TestMutationTimeErrors:
     def test_younger_than_node_below(self):
diff --git a/python/tskit/trees.py b/python/tskit/trees.py

Original file line number	Diff line number	Diff line change
`@@ -11078,7 +11078,7 @@ tsk_table_collection_check_integrity(`
`11078`	`11078`	`\| TSK_CHECK_MIGRATION_ORDERING \| TSK_CHECK_INDEXES;`
`11079`	`11079`	`}`
`11080`	`11080`
`11081`		`- if (self->sequence_length <= 0) {`
	`11081`	`+ if (!tsk_isfinite(self->sequence_length) \|\| self->sequence_length <= 0) {`
`11082`	`11082`	`ret = tsk_trace_error(TSK_ERR_BAD_SEQUENCE_LENGTH);`
`11083`	`11083`	`goto out;`
`11084`	`11084`	`}`