diff --git a/.gitignore b/.gitignore index a0715aea..225bf7aa 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ testext test/python/__pycache__/ .Rhistory vcpkg/ -*.log \ No newline at end of file +*.log +examples/quickstart/*.parquet diff --git a/CMakeLists.txt b/CMakeLists.txt index 47640ba4..3ca9e2fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,7 @@ set(EXTENSION_SOURCES src/index/rtree_index_create_physical.cpp src/index/rtree_index_scan.cpp src/index/rtree_optimize_scan.cpp + src/temporal/temporal_parquet.cpp ) build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) diff --git a/Makefile b/Makefile index bc17d050..a9485498 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,39 @@ include extension-ci-tools/makefiles/duckdb_extension.Makefile # both MEOS (meos_initialize_timezone) and DuckDB (DBConfig::SetOptionByName # "TimeZone") to Europe/Brussels. Tests pass on any OS timezone — the # extension is the single source of truth, no TZ env var needed. +# +# LoadInternal also calls ExtensionHelper::AutoLoadExtension(db, "icu") so +# the timezone option is honoured. Autoload looks for the extension on disk +# at $HOME/.duckdb/extensions///icu.duckdb_extension +# and falls back to a hub download. That fails both inside the linux_amd64 +# test docker container (empty path, no network egress) and on the macOS +# osx_arm64 test runner (hub icu not reliably resolvable). We copy the +# icu.duckdb_extension that was built locally as part of this extension's +# build (declared in extension_config.cmake) into the expected path, +# matched to the DuckDB platform string, before running the unittester. +DUCKDB_VERSION_TAG := v1.4.4 + +define stage_icu + @if [ -f ./build/$(1)/extension/icu/icu.duckdb_extension ]; then \ + case "$$(uname -s)-$$(uname -m)" in \ + Linux-x86_64) platform=linux_amd64 ;; \ + Linux-aarch64) platform=linux_arm64 ;; \ + Darwin-arm64) platform=osx_arm64 ;; \ + Darwin-x86_64) platform=osx_amd64 ;; \ + *) platform=$$(uname -m) ;; \ + esac; \ + target=$$HOME/.duckdb/extensions/$(DUCKDB_VERSION_TAG)/$$platform; \ + mkdir -p "$$target" && cp -f ./build/$(1)/extension/icu/icu.duckdb_extension "$$target/" && \ + echo "Staged icu.duckdb_extension at $$target/"; \ + fi +endef + test_release_internal: + $(call stage_icu,release) ./build/release/$(TEST_PATH) "$(PROJ_DIR)test/*" test_debug_internal: + $(call stage_icu,debug) ./build/debug/$(TEST_PATH) "$(PROJ_DIR)test/*" test_reldebug_internal: - ./build/reldebug/$(TEST_PATH) "$(PROJ_DIR)test/*" \ No newline at end of file + $(call stage_icu,reldebug) + ./build/reldebug/$(TEST_PATH) "$(PROJ_DIR)test/*" diff --git a/docs/beta-testing-edge-to-cloud.md b/docs/beta-testing-edge-to-cloud.md new file mode 100644 index 00000000..4ef88a49 --- /dev/null +++ b/docs/beta-testing-edge-to-cloud.md @@ -0,0 +1,212 @@ +# Beta-Testing Guide: Edge-to-Cloud Temporal Data Lake + +This guide has two sections. + +- **[Part 1 — For all beta testers](#part-1--for-all-beta-testers)**: what to + install, what to run, what to check, where to send feedback. +- **[Part 2 — For MobilityDB committers](#part-2--for-mobilitydb-committers)**: + PR / branch / implementation status, known engineering limitations. + +--- + +## Part 1 — For all beta testers + +### What you are testing + +The **edge-to-cloud pipeline** for MobilityDB temporal data: + +1. Raw GPS pings (CSV or inline values) are loaded into DuckDB. +2. They are assembled into typed `tgeogpointSeq` trajectories — geodetic + (spheroidal-metre) sequences backed by MEOS. +3. The trajectories are written to a **TemporalParquet** shard: a standard + Parquet file whose `BYTE_ARRAY` column carries MEOS-WKB values and whose + file footer contains a `temporal` metadata key describing each column's + type, encoding, and CRS. +4. The same shard is queryable on DuckDB, MobilityDB (PostgreSQL), and Spark — + using an identical named-function SQL dialect. + +### Time budget + +Scenario A (synthetic data, no CSV): **~15 minutes** including the build. +Scenario B (your own GPS CSV): add ~10 minutes. + +### Install + +```bash +git clone --recurse-submodules --branch feat/edge-to-cloud-quickstart \ + https://github.com/MobilityDB/MobilityDuck.git +cd MobilityDuck +make # first build: 5–10 min (downloads MEOS + dependencies) + # subsequent builds: ~30 s +``` + +After the build, a DuckDB shell with MobilityDuck pre-loaded is at +`./build/release/duckdb`. + +> A community extension (one-line `INSTALL`) is coming once this beta +> validates the feature. For now, build from source is required. + +### Scenario A — Zero-data quickstart + +Generates 5 synthetic North Sea vessels from inline data — no CSV, no +download. Demonstrates the full pipeline in under 2 seconds. + +```bash +TZ=UTC ./build/release/duckdb -c ".read examples/quickstart/quickstart.sql" +``` + +**Expected output:** + +Query A — geodetic distance and peak speed per vessel: +``` +┌───────────┬────────────┬──────────┬──────────────┐ +│ entity_id │ ping_count │ length_m │ max_speed_ms │ +├───────────┼────────────┼──────────┼──────────────┤ +│ 5 │ 12 │ 172001.0 │ 26.22 │ +│ 1 │ 12 │ 170169.0 │ 25.93 │ +│ 2 │ 12 │ 158771.0 │ 24.21 │ +│ 3 │ 12 │ 83644.0 │ 12.7 │ +│ 4 │ 12 │ 37155.0 │ 5.64 │ +└───────────┴────────────┴──────────┴──────────────┘ +``` + +Key checkpoints: +- Distances are in **metres**, not degrees (vessel 5 ≈ 172 km — not 1.55°). +- Vessel 3 (Skagerrak) is present here but must **not** appear in Query B. + +Query B — vessels that passed through the Copenhagen bounding box: +``` +┌───────────┐ +│ entity_id │ +├───────────┤ +│ 1 │ +│ 2 │ +│ 4 │ +│ 5 │ +└───────────┘ +``` + +Query C — trip duration (all 5 vessels: 12 pings × 10 min = 1 h 50 min): +``` +┌───────────┬───────────────┐ +│ entity_id │ trip_duration │ +├───────────┼───────────────┤ +│ 1 │ 01:50:00 │ (all five rows identical) +└───────────┴───────────────┘ +``` + +### Scenario B — Same queries on MobilityDB (portability check) + +```bash +psql -d -f examples/quickstart/quickstart_mobilitydb.sql +``` + +Queries A, B, and C must produce **identical values** to Scenario A. +This is the portability claim: one named-function SQL file, two platforms. + +### Scenario C — Your own GPS data + +```bash +# Edit the five CONFIGURE macros at the top of the template: +$EDITOR examples/generic-ingest/generic_ingest.sql +# Set: csv_path, col_entity_id, col_lon, col_lat, col_ts + +# Run: +TZ=UTC ./build/release/duckdb -c ".read examples/generic-ingest/generic_ingest.sql" +``` + +Output: `trajectories.parquet` in the current directory, readable by +MobilityDB, MobilitySpark, and PyMEOS without any MobilityDuck installation. + +### Scenario D — Real-world AIS data (optional, ~1 million pings) + +Download one day of Danish AIS data from the Maritime Authority: + + +Place the downloaded CSV (e.g. `aisdk-2026-02-26.csv`) at any convenient path, +then edit the path at the top of the demo file before running: + +```bash +# Set the CSV path in the demo file (one line to edit): +sed -i "s|../../meos/examples/data/aisdk-2026-02-26.csv|/path/to/your.csv|" \ + examples/ais-data-lake/ais_data_lake.sql + +TZ=UTC ./build/release/duckdb -c ".read examples/ais-data-lake/ais_data_lake.sql" +``` + +The demo filters to Class A vessels and a 1-hour window, so it completes in +under 30 seconds on a laptop even with a full-day file. + +### How to report feedback + +Open an issue or leave a comment on the beta thread: + + +Please include: +- Platform + OS version +- Output of `gcc --version` or `clang --version` +- For build failures: the last 20 lines of `make` output +- For wrong results: the full query + actual vs expected output +- Any ergonomic friction (confusing errors, missing functions, surprising behaviour) + +--- + +## Part 2 — For MobilityDB committers + +### PR and branch + +The feature lands via **MobilityDuck PR #113**: + + +Branch: `feat/edge-to-cloud-quickstart` (1 commit on top of `main`). + +Related RFC threads: +- [Issue #830](https://github.com/MobilityDB/MobilityDB/issues/830) — TemporalParquet spec +- [PR #911](https://github.com/MobilityDB/MobilityDB/pull/911) — TemporalParquet doc PR +- [PR #917](https://github.com/MobilityDB/MobilityDB/pull/917) — edge-to-cloud SQL portability RFC +- [Discussion #861](https://github.com/MobilityDB/MobilityDB/discussions/861) — portable SQL naming +- [Discussion #913](https://github.com/MobilityDB/MobilityDB/discussions/913) — Temporal Data Lake architecture + +### Test suite + +```bash +make test # 1446 assertions across 36 test files, all must pass +``` + +The new file `test/sql/tgeogpoint.test` (16 assertions) is the regression +guard for the SRID + geodetic-flag fix. + +### Implementation status + +| Function / type | Status | +|---|---| +| `TGEOGPOINT` construction + string parse | done | +| `TGEOGPOINT` Parquet round-trip (`asBinary` / `tgeogpointFromBinary`) | done | +| `eIntersects(GEOMETRY, tgeogpoint)` and all 12 `(GEOMETRY, temporal)` predicates | done (geodetic fix: `geom_to_geog()`) | +| `temporalFooter(MAP)` → TemporalParquet JSON | done | +| `asBinary`/`fromBinary` for spans, spansets, tgeometry, tcbuffer, tnpoint, tpose, th3index | **not yet wired** | +| Automatic footer injection on `COPY TO '*.parquet'` | **not yet** — call `KV_METADATA {'temporal': temporalFooter(...)}` explicitly | +| `tIntersects(GEOMETRY, tgeogpoint)` | **not yet** — MEOS roundoff error on geodetic sequences | +| `tDwithin(GEOMETRY, tgeogpoint, dist)` | **not yet** — only planar | + +### Geodetic fix — what changed + +Two bugs in `src/geo/tgeompoint_functions.cpp` (all 12 `(GEOMETRY, temporal)` functions): + +**Bug 1** — SRID was hardcoded 0 before the temporal value was deserialized. +`tgeogpoint` has SRID 4326; the geometry had SRID 0 → "Operation on mixed SRID". +Fix: `srid = tspatial_srid(tgeom)` after deserialization. + +**Bug 2** — `FLAGS_SET_GEODETIC(gs->gflags, 1)` alone corrupts the 2D bbox layout +(`FLAGS_NDIMS_BOX` changes from 2 → 3, shifting geometry data read offset by 16 bytes). +Fix: `geom_to_geog(gs)` (public MEOS API) properly rebuilds the GSERIALIZED with a +3D bounding box and GEODETIC=1, mirroring PostGIS's implicit `geometry → geography` cast. + +### Review checklist (committer) + +- [ ] `make test`: 1446 assertions pass +- [ ] `docs/tgeogpoint-design.md` — "Spatial Predicates" section is accurate +- [ ] `examples/quickstart/quickstart.sql` — readable and self-contained for a new user +- [ ] `examples/generic-ingest/generic_ingest.sql` — instructions clear, macros well-named +- [ ] No `Co-Authored-By` or internal planning references in commit messages +- [ ] Confirm `temporalFooter()` output matches the TemporalParquet spec in PR #911 diff --git a/docs/testing-tz-neutral-policy.md b/docs/testing-tz-neutral-policy.md new file mode 100644 index 00000000..3f44549a --- /dev/null +++ b/docs/testing-tz-neutral-policy.md @@ -0,0 +1,166 @@ +# Timezone testing policy — MEOS ecosystem + +**Applies to:** MobilityDB, MobilityDuck, PyMEOS, JMEOS, meos-rs, and any other binding or tool using MEOS. + +## Problem + +MEOS formats timestamps using the PostgreSQL-derived internal timezone, which is +thread-local and defaults to the system/POSIX timezone. A test that hardcodes the +UTC offset `+00` in its expected value will fail on any machine where the system +timezone differs: + +```sql +-- Written on a UTC machine; breaks on UTC+1, UTC+2, etc. +SELECT tint '[1@2000-01-01]'::TEXT +---- +[1@2000-01-01 00:00:00+00] ← passes only on UTC systems +``` + +`+00` is the worst choice because it looks like "no offset" but is actually a +hardcoded assumption. Any other fixed offset (`+01`, `-08`) at least makes it +obvious that a specific timezone is required. + +## The root fix: pin the test timezone to something non-UTC + +MEOS reads the `TZ` environment variable at first initialisation +(via `select_default_timezone → getenv("TZ")`). Setting it before the test +process starts is the correct, lightweight fix — no code changes, no per-thread +hacks, no effect on production behaviour. + +``` +TZ=Europe/Brussels # UTC+1 winter / UTC+2 summer +``` + +This is directly analogous to PostgreSQL's own practice of using +`America/Los_Angeles` (`PST8PDT`, UTC-8/UTC-7) for its regression tests. +Non-UTC offsets matter because they expose bugs that UTC silently hides +(sign errors, DST boundary logic, off-by-an-hour conversions, etc.). + +## Platform-specific approach + +Different test frameworks have different capabilities; apply the right +tool for each. + +### pg_regress (MobilityDB PostgreSQL) + +pg_regress compares plain-text expected files line by line. There is no +programmatic comparison hook, so hardcoded offsets are **unavoidable**. +The correct approach is: + +1. Set `PGTZ=Europe/Brussels` (or the project's chosen zone) in the regress + environment. +2. Use only winter dates in test fixtures to get a stable `+01` offset — avoid + dates that cross DST boundaries. +3. Expected files contain `+01` consistently; CI sets the same env variable. + +### DuckDB sqllogictest (MobilityDuck) + +Two approaches are available — prefer them in the order listed: + +#### a) Numeric/boolean value accessors (first choice) + +Accessor functions return non-timestamp types; the result never contains an +offset at all: + +```sql +SELECT minValue(tint '[1@2000-01-01, 2@2000-01-02, 3@2000-01-03]') -- 1 +SELECT maxValue(tint '[1@2000-01-01, 2@2000-01-02, 3@2000-01-03]') -- 3 +SELECT startValue(tbool '[t@2000-01-01, f@2000-01-02]') -- true +SELECT endValue(tbool '[t@2000-01-01, f@2000-01-02]') -- false +SELECT round(minValue(tfloat '[1.5@2000-01-01, 3.5@2000-01-02]'), 6) -- 1.5 +SELECT duration(ttext '[hello@2000-01-01, world@2000-01-02]') -- 1 day +``` + +#### b) Nosort cross-validation (DuckDB sqllogictest only) + +Both queries go through MEOS at the same timezone, so even if their output +contains an offset, the two sides are always equal — the comparison is +TZ-neutral: + +```sql +query IT nosort label +SELECT id, tintFromBinary(val)::VARCHAR FROM tbl ORDER BY id + +query IT nosort label +SELECT id, tintFromBinary(asBinary(tintFromBinary(val)))::VARCHAR FROM tbl ORDER BY id +``` + +This pattern is **not portable** to pg_regress, pytest, JUnit, or Spark tests. + +#### c) BLOB byte comparison (for metadata round-trips) + +```sql +SELECT value = expected_json_string::BLOB AS ok +FROM parquet_kv_metadata(file) +WHERE key = 'my_key'::BLOB +``` + +### pytest (PyMEOS) + +Use Python accessor methods that return non-timestamp types: + +```python +assert t.min_value() == 1 +assert t.max_value() == 3 +assert t.start_value() == True +assert t.duration() == timedelta(days=2) +``` + +### JUnit / Kotlin (JMEOS) + +Same principle — compare domain values, not formatted strings: + +```java +assertEquals(1, t.minValue()); +assertEquals(3, t.maxValue()); +assertEquals(Duration.ofDays(2), t.duration()); +``` + +### Spark + +Set `spark.sql.session.timeZone` to the project's chosen zone and use that +offset consistently in expected strings, or extract domain values with the +MEOS UDF equivalents before asserting. + +## What NOT to do + +```sql +-- ✗ Hardcoded UTC offset — fails on non-UTC systems +SELECT tint '[1@2000-01-01]'::TEXT +---- +[1@2000-01-01 00:00:00+00] + +-- ✗ Hardcoded single-TZ offset — fails everywhere else +SELECT tint '[1@2000-01-01]'::TEXT +---- +[1@2000-01-01 00:00:00+01] + +-- ✗ Forcing MEOS timezone in extension/binding code — breaks users +meos_initialize_timezone("UTC"); // per-thread in DuckDB wrapper → bad + +-- ✗ DuckDB-only nosort used in pg_regress or pytest test +``` + +## Using +00 in INPUT (always fine) + +Using `+00` in **input** literals anchors the absolute UTC time regardless of +the display timezone. This is correct and encouraged: + +```sql +-- ✓ Input literal anchors to UTC; only the *display* will shift with TZ +COPY (SELECT asBinary(tgeompoint '[POINT(1 2)@2026-01-01 00:00:00+00]') AS traj) +TO 'file.parquet' (FORMAT PARQUET) +``` + +## Migration status (2026-05-07) + +| File / test suite | Framework | Status | +|---|---|---| +| `test/sql/parquet/temporal_parquet.test` | DuckDB sqllogictest | ✓ Fully TZ-neutral (accessors + nosort) | +| `test/sql/parquet/` (new tests) | DuckDB sqllogictest | ✓ Policy in force | +| `test/sql/parity/*.test` | DuckDB sqllogictest | ✗ Still use `+00` — needs `TZ=Europe/Brussels` sweep | +| `test/sql/tint.test`, `tfloat.test`, … | DuckDB sqllogictest | ✗ Still use `+00` — needs sweep | +| `test/sql/tgeompoint.test`, `tgeometry.test` | DuckDB sqllogictest | ✗ Still use `+00` — needs sweep | +| `test/sql/stbox.test` | DuckDB sqllogictest | ✗ Still use `+00` — needs sweep | +| MobilityDB `expected/*.out` | pg_regress | ✓ Uses America/Los_Angeles (PST8PDT) — existing approach is correct | +| PyMEOS tests | pytest | ✗ Audit needed — accessor approach may not be consistently applied | diff --git a/docs/tgeogpoint-design.md b/docs/tgeogpoint-design.md index a44840e6..c1ff8544 100644 --- a/docs/tgeogpoint-design.md +++ b/docs/tgeogpoint-design.md @@ -57,6 +57,36 @@ The last point is important: a Parquet file written by `asBinary(tgeogpointSeq(. MobilityDuck can be read with `tgeogpointFromBinary` in MobilityDB and vice-versa — the MEOS-WKB type tag carries the geodetic flag. +## Spatial Predicates with GEOMETRY Input + +Because DuckDB has no `geography` type, spatial predicates that compare a `TGEOGPOINT` +against a region use the plain `GEOMETRY` type: + +```sql +SELECT entity_id +FROM trajectories +WHERE eIntersects( + ST_GeomFromText('POLYGON((11.5 55.0,13.5 55.0,13.5 56.5,11.5 56.5,11.5 55.0))'), + traj +); +``` + +MobilityDuck transparently converts the `GEOMETRY` to a proper geodetic GSERIALIZED using +MEOS's `geom_to_geog()` when the opposing temporal type is a geodetic one (i.e. when +`MEOS_FLAGS_GET_GEODETIC(tgeom->flags)` is true). This mirrors what PostgreSQL does when +an implicit `geometry → geography` cast is applied in MobilityDB. + +**Root causes fixed (commit `3441566`, 2026-05-07):** + +| Bug | Symptom | Fix | +|---|---|---| +| SRID hardcoded 0 in `(GEOMETRY, temporal)` direction | "Operation on mixed SRID" | Deserialize `tgeom` first; use `tspatial_srid(tgeom)` | +| Geodetic flag mismatch | "Operation on mixed planar and geodetic coordinates" | Call `geom_to_geog(gs)` to rebuild GSERIALIZED with valid 3D bbox + GEODETIC=1 | + +Applies to all 12 `(GEOMETRY, temporal)` overloads: `eIntersects/eContains/eDisjoint/ +eTouches` (ever/always variants) and `tIntersects/tContains/tDisjoint/tTouches/tDwithin` +families. + ## Usage ```sql @@ -66,6 +96,14 @@ SELECT length(tgeogpointSeq( )) FROM ais_raw GROUP BY mmsi; +-- Region intersection — GEOMETRY is auto-promoted to geodetic: +SELECT mmsi +FROM trajectories +WHERE eIntersects( + ST_GeomFromText('POLYGON((xmin ymin,xmax ymin,xmax ymax,xmin ymax,xmin ymin))'), + traj +); + -- Round-trip through Parquet: COPY (SELECT mmsi, asBinary(traj) AS traj FROM trajectories) TO 'ais.parquet' (FORMAT PARQUET); diff --git a/duckdb b/duckdb index 6ddac802..0b83e5d2 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 6ddac802ffa9bcfbcc3f5f0d71de5dff9b0bc250 +Subproject commit 0b83e5d2f68bc02dfefde74b846bd039f078affa diff --git a/examples/generic-ingest/generic_ingest.sql b/examples/generic-ingest/generic_ingest.sql new file mode 100644 index 00000000..c28add16 --- /dev/null +++ b/examples/generic-ingest/generic_ingest.sql @@ -0,0 +1,149 @@ +-- generic_ingest.sql — TemporalParquet ingest template (bring your own data) +-- +-- Converts any lon/lat/timestamp CSV into a TemporalParquet shard. +-- Edit the CONFIGURE macros below, then run from the MobilityDuck root: +-- TZ=UTC ./build/release/duckdb -c ".read examples/generic-ingest/generic_ingest.sql" +-- +-- Output: a self-describing Parquet file with MEOS-WKB trajectory column and +-- TemporalParquet footer metadata, readable by MobilityDB, MobilitySpark, PyMEOS. +-- +-- ───────────────────────────────────────────────────────────────────────────── +-- PREREQUISITES (build from source required — community extension coming soon) +-- ───────────────────────────────────────────────────────────────────────────── +-- +-- git clone --recurse-submodules https://github.com/MobilityDB/MobilityDuck.git +-- cd MobilityDuck +-- make # installs vcpkg + MEOS, builds the extension +-- +-- The two LOAD lines below assume a local build at ../../build/release/. +-- ───────────────────────────────────────────────────────────────────────────── + +LOAD '../../build/release/extension/mobilityduck/mobilityduck.duckdb_extension'; +LOAD '../../build/release/extension/parquet/parquet.duckdb_extension'; + +-- ───────────────────────────────────────────────────────────────────────────── +-- CONFIGURE: data source +-- ───────────────────────────────────────────────────────────────────────────── + +-- Path to your CSV file (wildcards accepted: 'data/*.csv') +-- The CSV must have a header row. +CREATE OR REPLACE MACRO csv_path() AS 'your_data.csv'; + +-- Output Parquet shard path +CREATE OR REPLACE MACRO output_path() AS 'trajectories.parquet'; + +-- ───────────────────────────────────────────────────────────────────────────── +-- CONFIGURE: column mapping +-- Replace these macro bodies with your actual column names. +-- ───────────────────────────────────────────────────────────────────────────── + +-- Column in your CSV that uniquely identifies each moving object +-- (vessel MMSI, vehicle ID, user ID, sensor tag, …) +CREATE OR REPLACE MACRO col_entity_id() AS 'entity_id'; + +-- Column containing longitude in WGS-84 decimal degrees (−180 … 180) +CREATE OR REPLACE MACRO col_lon() AS 'longitude'; + +-- Column containing latitude in WGS-84 decimal degrees (−90 … 90) +CREATE OR REPLACE MACRO col_lat() AS 'latitude'; + +-- Column containing the observation timestamp +-- DuckDB parses ISO-8601, Unix epoch (integer), and most common formats. +CREATE OR REPLACE MACRO col_ts() AS 'timestamp'; + +-- Minimum number of pings per entity to include in output (filters sparse tracks) +CREATE OR REPLACE MACRO min_pings() AS 3; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 1: load and validate raw pings +-- +-- Drops rows with out-of-range coordinates and deduplicates (entity, ts) pairs +-- (common in AIS/GPS feeds that emit duplicate messages at the same timestamp). +-- ───────────────────────────────────────────────────────────────────────────── + +CREATE OR REPLACE TABLE raw_pings AS +SELECT + CAST(columns(col_entity_id()) AS BIGINT) AS entity_id, + CAST(columns(col_lon()) AS DOUBLE) AS lon, + CAST(columns(col_lat()) AS DOUBLE) AS lat, + CAST(columns(col_ts()) AS TIMESTAMPTZ) AS ts +FROM read_csv_auto(csv_path(), header = true, nullstr = '') +WHERE TRY_CAST(columns(col_lon()) AS DOUBLE) BETWEEN -180 AND 180 + AND TRY_CAST(columns(col_lat()) AS DOUBLE) BETWEEN -90 AND 90 +QUALIFY ROW_NUMBER() OVER ( + PARTITION BY CAST(columns(col_entity_id()) AS BIGINT), + CAST(columns(col_ts()) AS TIMESTAMPTZ) + ORDER BY CAST(columns(col_ts()) AS TIMESTAMPTZ) +) = 1; + +SELECT count(*) AS raw_pings, count(DISTINCT entity_id) AS entities FROM raw_pings; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 2: build tgeogpointSeq trajectories +-- +-- One geodetic sequence per entity, ordered by timestamp. +-- Entities with fewer than min_pings() observations are excluded. +-- ───────────────────────────────────────────────────────────────────────────── + +CREATE OR REPLACE TABLE trajectories AS +SELECT + entity_id, + tgeogpointSeq( + list(TGEOGPOINT(ST_Point(lon, lat), ts) ORDER BY ts) + ) AS traj +FROM raw_pings +GROUP BY entity_id +HAVING count(*) >= min_pings(); + +SELECT count(*) AS trajectories FROM trajectories; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 3: write TemporalParquet shard +-- +-- The TemporalParquet footer (KV_METADATA 'temporal') declares traj as a +-- tgeogpoint column encoded with MEOS-WKB. Any MEOS-aware reader can +-- reconstruct the typed value from the BYTE_ARRAY column without a schema file. +-- ───────────────────────────────────────────────────────────────────────────── + +COPY ( + SELECT + entity_id, + asBinary(traj) AS traj, + numInstants(traj) AS ping_count + FROM trajectories +) +TO output_path() ( + FORMAT PARQUET, + ROW_GROUP_SIZE 1000, + KV_METADATA {'temporal': temporalFooter(MAP {'traj': 'tgeogpoint'})} +); + +-- Verify schema: traj must appear as BYTE_ARRAY +SELECT name, type FROM parquet_schema(output_path()) +WHERE name NOT IN ('duckdb_schema'); + +-- Verify footer +SELECT value = temporalFooter(MAP {'traj': 'tgeogpoint'})::BLOB AS footer_ok +FROM parquet_kv_metadata(output_path()) +WHERE key = 'temporal'::BLOB; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 4: quick sanity analytics on the written shard +-- These same queries run unchanged on MobilityDB and MobilitySpark. +-- ───────────────────────────────────────────────────────────────────────────── + +-- Top 10 entities by geodetic trajectory length (metres) +SELECT + entity_id, + ping_count, + round(length(tgeogpointFromBinary(traj))) AS length_m, + round(maxValue(speed(tgeogpointFromBinary(traj))), 2) AS max_speed_ms +FROM read_parquet(output_path()) +ORDER BY length_m DESC +LIMIT 10; + +-- Distribution of ping counts +SELECT ping_count, count(*) AS entities +FROM read_parquet(output_path()) +GROUP BY ping_count +ORDER BY ping_count; diff --git a/examples/quickstart/quickstart.sql b/examples/quickstart/quickstart.sql new file mode 100644 index 00000000..8a4e0b72 --- /dev/null +++ b/examples/quickstart/quickstart.sql @@ -0,0 +1,160 @@ +-- quickstart.sql — Edge-to-Cloud Temporal Data Lake demo (no external data needed) +-- +-- Demonstrates the full TemporalParquet pipeline with synthetic GPS trajectories: +-- 1. Generate 5 vessels × 12 pings from inline VALUES (no CSV required) +-- 2. Build tgeogpointSeq trajectories — geodetic WGS-84, length in metres +-- 3. Write TemporalParquet shard: asBinary() + temporalFooter() metadata +-- 4. Query the shard: length, speed, region intersection, trip duration +-- +-- Companion file: quickstart_mobilitydb.sql — same queries on PostgreSQL/MobilityDB. +-- +-- ───────────────────────────────────────────────────────────────────────────── +-- PREREQUISITES (build from source required — community extension coming soon) +-- ───────────────────────────────────────────────────────────────────────────── +-- +-- git clone --recurse-submodules https://github.com/MobilityDB/MobilityDuck.git +-- cd MobilityDuck +-- make # installs vcpkg + MEOS, builds the extension +-- # first build ~5-10 min; subsequent builds ~30 s +-- +-- Then run this file from the MobilityDuck root: +-- TZ=UTC ./build/release/duckdb -c ".read examples/quickstart/quickstart.sql" +-- +-- Or from the examples/quickstart/ directory: +-- TZ=UTC ../../build/release/duckdb :memory: -f quickstart.sql +-- +-- The two LOAD lines below assume a local build at ../../build/release/. +-- Adjust the paths if your build directory differs. +-- ───────────────────────────────────────────────────────────────────────────── + +LOAD '../../build/release/extension/mobilityduck/mobilityduck.duckdb_extension'; +LOAD '../../build/release/extension/parquet/parquet.duckdb_extension'; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 1: generate synthetic pings (no external data needed) +-- +-- Five vessels depart from different positions in the North Sea / Kattegat area +-- and move linearly for 12 pings at 10-minute intervals (1h50m total). +-- Coordinates are in WGS-84 decimal degrees. +-- +-- Vessel coverage of the Copenhagen bounding box (lon 11.5–13.5, lat 55.0–56.5): +-- 1 → approaches from west, enters box around ping 7 +-- 2 → approaches from east, enters box around ping 3 +-- 3 → stays in Skagerrak, never enters box ← useful negative case +-- 4 → starts inside box, stays inside throughout +-- 5 → approaches from southwest, enters box around ping 10 +-- ───────────────────────────────────────────────────────────────────────────── + +CREATE OR REPLACE TABLE raw_pings AS +SELECT + entity_id, + round(start_lon + delta_lon * step, 6) AS lon, + round(start_lat + delta_lat * step, 6) AS lat, + -- to_timestamp(unix_epoch) avoids TIMESTAMPTZ+INTERVAL operator ambiguity + -- 1768464000 = 2026-01-15 08:00:00 UTC + to_timestamp(1768464000 + step * 600) AS ts +FROM (VALUES + -- entity_id start_lon start_lat delta_lon delta_lat + (1, 10.00, 55.50, 0.23, 0.05), + (2, 14.00, 56.00, -0.18, -0.08), + (3, 8.50, 57.50, 0.06, -0.06), + (4, 12.10, 55.20, 0.04, 0.02), + (5, 9.50, 54.50, 0.22, 0.06) +) t(entity_id, start_lon, start_lat, delta_lon, delta_lat), +generate_series(0, 11) g(step); + +SELECT entity_id, count(*) AS pings FROM raw_pings GROUP BY entity_id ORDER BY entity_id; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 2: build tgeogpointSeq trajectories +-- +-- TGEOGPOINT(geometry, timestamptz) creates a geodetic instant. +-- tgeogpointSeq(list(... ORDER BY ts)) assembles them into a linear sequence. +-- The geodetic flag means length() and speed() return metres and m/s. +-- ───────────────────────────────────────────────────────────────────────────── + +CREATE OR REPLACE TABLE trajectories AS +SELECT + entity_id, + tgeogpointSeq( + list(TGEOGPOINT(ST_Point(lon, lat), ts) ORDER BY ts) + ) AS traj +FROM raw_pings +GROUP BY entity_id; + +SELECT count(*) AS trajectories FROM trajectories; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 3: write TemporalParquet shard +-- +-- asBinary() → portable MEOS-WKB BLOB (BYTE_ARRAY in Parquet) +-- temporalFooter() → TemporalParquet JSON metadata injected via KV_METADATA +-- +-- Any MEOS-WKB-aware reader (MobilityDB, MobilitySpark, PyMEOS) can decode +-- the traj column using the base_type declared in the footer. +-- ───────────────────────────────────────────────────────────────────────────── + +COPY ( + SELECT + entity_id, + asBinary(traj) AS traj, + numInstants(traj) AS ping_count + FROM trajectories +) +TO 'edge_to_cloud_demo.parquet' ( + FORMAT PARQUET, + ROW_GROUP_SIZE 1000, + KV_METADATA {'temporal': temporalFooter(MAP {'traj': 'tgeogpoint'})} +); + +-- Verify the Parquet schema: traj must land as BYTE_ARRAY +SELECT name, type +FROM parquet_schema('edge_to_cloud_demo.parquet') +WHERE name NOT IN ('duckdb_schema'); + +-- Verify the TemporalParquet footer is embedded correctly +SELECT value = temporalFooter(MAP {'traj': 'tgeogpoint'})::BLOB AS footer_ok +FROM parquet_kv_metadata('edge_to_cloud_demo.parquet') +WHERE key = 'temporal'::BLOB; + +-- ───────────────────────────────────────────────────────────────────────────── +-- Step 4: analytics on the Parquet shard +-- +-- All queries use tgeogpointFromBinary() to reconstruct the typed value from +-- the BLOB column. The same named-function queries run unchanged on +-- MobilityDB and MobilitySpark — see quickstart_mobilitydb.sql. +-- ───────────────────────────────────────────────────────────────────────────── + +-- Query A: total distance and maximum speed per vessel +-- length() returns geodetic metres (spheroidal WGS-84 via Vincenty/Haversine) +-- speed() returns a tfloat of instantaneous speed in m/s; maxValue() extracts the peak +SELECT + entity_id, + ping_count, + round(length(tgeogpointFromBinary(traj))) AS length_m, + round(maxValue(speed(tgeogpointFromBinary(traj))), 2) AS max_speed_ms +FROM read_parquet('edge_to_cloud_demo.parquet') +ORDER BY length_m DESC; + +-- Query B: vessels that entered the Copenhagen bounding box +-- lon 11.5–13.5, lat 55.0–56.5 (approx. Øresund / Danish straits region) +-- eIntersects returns true if the trajectory ever enters the polygon. +-- DuckDB GEOMETRY is automatically promoted to geodetic when matched against +-- a tgeogpoint, so no SRID annotation on the polygon is required. +SELECT entity_id +FROM ( + SELECT entity_id, tgeogpointFromBinary(traj) AS traj + FROM read_parquet('edge_to_cloud_demo.parquet') +) +WHERE eIntersects( + ST_GeomFromText('POLYGON((11.5 55.0,13.5 55.0,13.5 56.5,11.5 56.5,11.5 55.0))'), + traj +) +ORDER BY entity_id; + +-- Query C: trip duration (timezone-independent interval) +SELECT + entity_id, + duration(tgeogpointFromBinary(traj))::VARCHAR AS trip_duration +FROM read_parquet('edge_to_cloud_demo.parquet') +ORDER BY entity_id; diff --git a/examples/quickstart/quickstart_mobilitydb.sql b/examples/quickstart/quickstart_mobilitydb.sql new file mode 100644 index 00000000..52408746 --- /dev/null +++ b/examples/quickstart/quickstart_mobilitydb.sql @@ -0,0 +1,159 @@ +-- quickstart_mobilitydb.sql — Edge-to-Cloud demo on PostgreSQL / MobilityDB +-- +-- Companion to quickstart.sql. Builds the same five synthetic trajectories and +-- runs the same three analytics queries — proving the portable named-function +-- SQL dialect produces identical results on both DuckDB and PostgreSQL. +-- +-- Requirements: PostgreSQL with MobilityDB and PostGIS installed. +-- Run as: psql -d -f quickstart_mobilitydb.sql +-- +-- Reading a TemporalParquet shard written by MobilityDuck: +-- Install pg_parquet (https://github.com/CrunchyData/pg_parquet), then: +-- CREATE EXTENSION pg_parquet; +-- SELECT tgeogpointFromBinary(traj), ping_count +-- FROM parquet.read('edge_to_cloud_demo.parquet'); +-- Replace the WITH trajs CTE below with that table expression. + +-- ───────────────────────────────────────────────────────────────────────────── +-- Construction (MobilityDB syntax) +-- +-- Key difference from DuckDB: +-- DuckDB: tgeogpointSeq(list(TGEOGPOINT(ST_Point(lon,lat), ts) ORDER BY ts)) +-- MobilityDB: tgeogpointseq(array_agg( +-- format('SRID=4326;POINT(%s %s)@%s',lon,lat,ts)::tgeogpoint +-- ORDER BY ts)) +-- +-- The analytics queries (A, B, C) below are identical on both platforms. +-- ───────────────────────────────────────────────────────────────────────────── + +WITH raw AS ( + SELECT + entity_id, + round((start_lon + delta_lon * s)::numeric, 6)::float8 AS lon, + round((start_lat + delta_lat * s)::numeric, 6)::float8 AS lat, + TIMESTAMPTZ '2026-01-15 08:00:00+00' + (s * INTERVAL '10 minutes') AS ts + FROM (VALUES + -- entity_id start_lon start_lat delta_lon delta_lat + (1, 10.00, 55.50, 0.23, 0.05), + (2, 14.00, 56.00, -0.18, -0.08), + (3, 8.50, 57.50, 0.06, -0.06), + (4, 12.10, 55.20, 0.04, 0.02), + (5, 9.50, 54.50, 0.22, 0.06) + ) t(entity_id, start_lon, start_lat, delta_lon, delta_lat), + generate_series(0, 11) g(s) +), +trajs AS ( + SELECT + entity_id, + tgeogpointseq( + array_agg( + format('SRID=4326;POINT(%s %s)@%s', lon, lat, ts)::tgeogpoint + ORDER BY ts + ) + ) AS traj + FROM raw + GROUP BY entity_id +) +-- ───────────────────────────────────────────────────────────────────────────── +-- Query A: total distance and maximum speed per vessel +-- Identical to DuckDB — length() returns geodetic metres, speed() returns m/s +-- ───────────────────────────────────────────────────────────────────────────── +SELECT + entity_id, + round(length(traj)) AS length_m, + round(maxValue(speed(traj))::numeric, 2) AS max_speed_ms +FROM trajs +ORDER BY length_m DESC; + +-- Expected (same as DuckDB): +-- entity_id | length_m | max_speed_ms +-- -----------+----------+-------------- +-- 5 | 172001 | 26.22 +-- 1 | 170169 | 25.93 +-- 2 | 158771 | 24.21 +-- 3 | 83644 | 12.70 +-- 4 | 37155 | 5.64 + + +-- ───────────────────────────────────────────────────────────────────────────── +-- Query B: vessels that entered the Copenhagen bounding box +-- MobilityDB uses ST_GeomFromText with SRID=4326 prefix (EWKT). +-- DuckDB requires GEODSTBOX workaround (DuckDB geometry type carries no SRID). +-- Both return the same four vessels. +-- ───────────────────────────────────────────────────────────────────────────── +WITH raw AS ( + SELECT + entity_id, + round((start_lon + delta_lon * s)::numeric, 6)::float8 AS lon, + round((start_lat + delta_lat * s)::numeric, 6)::float8 AS lat, + TIMESTAMPTZ '2026-01-15 08:00:00+00' + (s * INTERVAL '10 minutes') AS ts + FROM (VALUES + (1, 10.00, 55.50, 0.23, 0.05), + (2, 14.00, 56.00, -0.18, -0.08), + (3, 8.50, 57.50, 0.06, -0.06), + (4, 12.10, 55.20, 0.04, 0.02), + (5, 9.50, 54.50, 0.22, 0.06) + ) t(entity_id, start_lon, start_lat, delta_lon, delta_lat), + generate_series(0, 11) g(s) +), +trajs AS ( + SELECT entity_id, + tgeogpointseq(array_agg( + format('SRID=4326;POINT(%s %s)@%s', lon, lat, ts)::tgeogpoint + ORDER BY ts)) AS traj + FROM raw GROUP BY entity_id +) +SELECT entity_id +FROM trajs +WHERE eIntersects( + ST_GeomFromText('SRID=4326;POLYGON((11.5 55.0,13.5 55.0,13.5 56.5,11.5 56.5,11.5 55.0))'), + traj +) +ORDER BY entity_id; + +-- Expected (same as DuckDB): +-- entity_id +-- ----------- +-- 1 +-- 2 +-- 4 +-- 5 + + +-- ───────────────────────────────────────────────────────────────────────────── +-- Query C: trip duration (timezone-independent interval) +-- ───────────────────────────────────────────────────────────────────────────── +WITH raw AS ( + SELECT + entity_id, + round((start_lon + delta_lon * s)::numeric, 6)::float8 AS lon, + round((start_lat + delta_lat * s)::numeric, 6)::float8 AS lat, + TIMESTAMPTZ '2026-01-15 08:00:00+00' + (s * INTERVAL '10 minutes') AS ts + FROM (VALUES + (1, 10.00, 55.50, 0.23, 0.05), + (2, 14.00, 56.00, -0.18, -0.08), + (3, 8.50, 57.50, 0.06, -0.06), + (4, 12.10, 55.20, 0.04, 0.02), + (5, 9.50, 54.50, 0.22, 0.06) + ) t(entity_id, start_lon, start_lat, delta_lon, delta_lat), + generate_series(0, 11) g(s) +), +trajs AS ( + SELECT entity_id, + tgeogpointseq(array_agg( + format('SRID=4326;POINT(%s %s)@%s', lon, lat, ts)::tgeogpoint + ORDER BY ts)) AS traj + FROM raw GROUP BY entity_id +) +SELECT entity_id, duration(traj) AS trip_duration +FROM trajs +ORDER BY entity_id; + +-- Expected (same as DuckDB): +-- entity_id | trip_duration +-- -----------+--------------- +-- 1 | 01:50:00 +-- 2 | 01:50:00 +-- 3 | 01:50:00 +-- 4 | 01:50:00 +-- 5 | 01:50:00 diff --git a/extension-ci-tools b/extension-ci-tools index 32eb753d..b808e513 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit 32eb753d9b660bf90bdca42652cf40c1ef64bf67 +Subproject commit b808e5130cb6e1341ab968a2a9c0f5f236dd2ec8 diff --git a/src/geo/tgeogpoint_ops.cpp b/src/geo/tgeogpoint_ops.cpp index 91136138..b127ac4d 100644 --- a/src/geo/tgeogpoint_ops.cpp +++ b/src/geo/tgeogpoint_ops.cpp @@ -136,10 +136,23 @@ template void TgeoGeoIntExec(DataChunk &args, ExpressionState &, Vector &result) { BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), - [&](string_t t_blob, string_t g_blob, ValidityMask &mask, idx_t idx) { + [&](string_t a, string_t b, ValidityMask &mask, idx_t idx) { + // DuckDB's alias-erased function resolution can route either + // (TGEO*, GEOM) or (GEOM, TGEO*) calls into this executor (see + // BlobLooksLikeTemporal in geo_util.hpp). Detect which blob is + // actually the Temporal so the rest of the body sees the + // expected (t_blob, g_blob) order. + const bool a_is_temporal = BlobLooksLikeTemporal(a); + string_t t_blob = a_is_temporal ? a : b; + string_t g_blob = a_is_temporal ? b : a; Temporal *t = DecodeTemporalCopy(t_blob); int32 srid = tspatial_srid(t); GSERIALIZED *gs = GeometryToGSerialized(g_blob, srid); + if (MEOS_FLAGS_GET_GEODETIC(t->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int r = FN(t, gs); free(t); free(gs); if (r < 0) { mask.SetInvalid(idx); return false; } diff --git a/src/geo/tgeography_ops.cpp b/src/geo/tgeography_ops.cpp index 1eee6ed0..58489281 100644 --- a/src/geo/tgeography_ops.cpp +++ b/src/geo/tgeography_ops.cpp @@ -137,10 +137,23 @@ template void TgeoGeoIntExec(DataChunk &args, ExpressionState &, Vector &result) { BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), - [&](string_t t_blob, string_t g_blob, ValidityMask &mask, idx_t idx) { + [&](string_t a, string_t b, ValidityMask &mask, idx_t idx) { + // DuckDB's alias-erased function resolution can route either + // (TGEO*, GEOM) or (GEOM, TGEO*) calls into this executor (see + // BlobLooksLikeTemporal in geo_util.hpp). Detect which blob is + // actually the Temporal so the rest of the body sees the + // expected (t_blob, g_blob) order. + const bool a_is_temporal = BlobLooksLikeTemporal(a); + string_t t_blob = a_is_temporal ? a : b; + string_t g_blob = a_is_temporal ? b : a; Temporal *t = DecodeTemporalCopy(t_blob); int32 srid = tspatial_srid(t); GSERIALIZED *gs = GeometryToGSerialized(g_blob, srid); + if (MEOS_FLAGS_GET_GEODETIC(t->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int r = FN(t, gs); free(t); free(gs); if (r < 0) { mask.SetInvalid(idx); return false; } diff --git a/src/geo/tgeometry_ops.cpp b/src/geo/tgeometry_ops.cpp index 2db1613a..c094961f 100644 --- a/src/geo/tgeometry_ops.cpp +++ b/src/geo/tgeometry_ops.cpp @@ -137,10 +137,23 @@ template void TgeoGeoIntExec(DataChunk &args, ExpressionState &, Vector &result) { BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), - [&](string_t t_blob, string_t g_blob, ValidityMask &mask, idx_t idx) { + [&](string_t a, string_t b, ValidityMask &mask, idx_t idx) { + // DuckDB's alias-erased function resolution can route either + // (TGEO*, GEOM) or (GEOM, TGEO*) calls into this executor (see + // BlobLooksLikeTemporal in geo_util.hpp). Detect which blob is + // actually the Temporal so the rest of the body sees the + // expected (t_blob, g_blob) order. + const bool a_is_temporal = BlobLooksLikeTemporal(a); + string_t t_blob = a_is_temporal ? a : b; + string_t g_blob = a_is_temporal ? b : a; Temporal *t = DecodeTemporalCopy(t_blob); int32 srid = tspatial_srid(t); GSERIALIZED *gs = GeometryToGSerialized(g_blob, srid); + if (MEOS_FLAGS_GET_GEODETIC(t->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int r = FN(t, gs); free(t); free(gs); if (r < 0) { mask.SetInvalid(idx); return false; } diff --git a/src/geo/tgeompoint_functions.cpp b/src/geo/tgeompoint_functions.cpp index 9092e6e4..e23464c0 100644 --- a/src/geo/tgeompoint_functions.cpp +++ b/src/geo/tgeompoint_functions.cpp @@ -1236,6 +1236,11 @@ void TgeompointFunctions::Tgeo_at_geom(DataChunk &args, ExpressionState &state, free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } Temporal *ret = tgeo_at_geom(tgeom, gs); if (!ret) { @@ -1504,12 +1509,6 @@ void TgeompointFunctions::Econtains_geo_tgeo(DataChunk &args, ExpressionState &s BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, ValidityMask &mask, idx_t idx) -> bool { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t*)malloc(tgeom_data_size); @@ -1517,10 +1516,21 @@ void TgeompointFunctions::Econtains_geo_tgeo(DataChunk &args, ExpressionState &s Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + int ret = econtains_geo_tgeo(gs, tgeom); free(tgeom); free(gs); @@ -1540,12 +1550,6 @@ void TgeompointFunctions::Acontains_geo_tgeo(DataChunk &args, ExpressionState &s BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, ValidityMask &mask, idx_t idx) -> bool { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t*)malloc(tgeom_data_size); @@ -1553,10 +1557,21 @@ void TgeompointFunctions::Acontains_geo_tgeo(DataChunk &args, ExpressionState &s Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + int ret = acontains_geo_tgeo(gs, tgeom); free(tgeom); free(gs); @@ -1595,6 +1610,11 @@ void TgeompointFunctions::Edisjoint_geo_tgeo(DataChunk &args, ExpressionState &s free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = ea_disjoint_geo_tgeo_dispatch(gs, tgeom, true); free(tgeom); @@ -1634,6 +1654,11 @@ void TgeompointFunctions::Edisjoint_tgeo_geo(DataChunk &args, ExpressionState &s free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = edisjoint_tgeo_geo(tgeom, gs); free(tgeom); @@ -1713,6 +1738,11 @@ void TgeompointFunctions::Adisjoint_geo_tgeo(DataChunk &args, ExpressionState &s free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } // Same as MobilityDB EA_spatialrel_geo_tspatial(..., &ea_disjoint_geo_tgeo, ALWAYS) int ret = ea_disjoint_geo_tgeo_dispatch(gs, tgeom, false); @@ -1753,6 +1783,11 @@ void TgeompointFunctions::Adisjoint_tgeo_geo(DataChunk &args, ExpressionState &s free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = adisjoint_tgeo_geo(tgeom, gs); free(tgeom); @@ -1854,12 +1889,6 @@ void TgeompointFunctions::Eintersects_geo_tgeo(DataChunk &args, ExpressionState BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, ValidityMask &mask, idx_t idx) -> bool { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t*)malloc(tgeom_data_size); @@ -1867,10 +1896,21 @@ void TgeompointFunctions::Eintersects_geo_tgeo(DataChunk &args, ExpressionState Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + int ret = ea_intersects_geo_tgeo_dispatch(gs, tgeom, true); free(tgeom); free(gs); @@ -1906,6 +1946,11 @@ void TgeompointFunctions::Eintersects_tgeo_geo(DataChunk &args, ExpressionState free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = eintersects_tgeo_geo(tgeom, gs); free(tgeom); @@ -1926,12 +1971,6 @@ void TgeompointFunctions::Aintersects_geo_tgeo(DataChunk &args, ExpressionState BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, ValidityMask &mask, idx_t idx) -> bool { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -1939,9 +1978,20 @@ void TgeompointFunctions::Aintersects_geo_tgeo(DataChunk &args, ExpressionState Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = ea_intersects_geo_tgeo_dispatch(gs, tgeom, false); free(tgeom); free(gs); @@ -1977,6 +2027,11 @@ void TgeompointFunctions::Aintersects_tgeo_geo(DataChunk &args, ExpressionState free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = aintersects_tgeo_geo(tgeom, gs); free(tgeom); @@ -2129,6 +2184,11 @@ void TgeompointFunctions::Etouches_tpoint_geo(DataChunk &args, ExpressionState & free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = etouches_tpoint_geo(tgeom, gs); free(tgeom); @@ -2165,6 +2225,11 @@ void TgeompointFunctions::Atouches_tpoint_geo(DataChunk &args, ExpressionState & free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = atouches_tpoint_geo(tgeom, gs); free(tgeom); @@ -2241,6 +2306,11 @@ void TgeompointFunctions::Edwithin_tgeo_geo(DataChunk &args, ExpressionState &st free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = edwithin_tgeo_geo(tgeom, gs, dist); free(tgeom); @@ -2280,6 +2350,11 @@ void TgeompointFunctions::Edwithin_geo_tgeo(DataChunk &args, ExpressionState &st free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = ea_dwithin_geo_tgeo_dispatch(gs, tgeom, dist, true); free(tgeom); @@ -2319,6 +2394,11 @@ void TgeompointFunctions::Adwithin_tgeo_geo(DataChunk &args, ExpressionState &st free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = adwithin_tgeo_geo(tgeom, gs, dist); free(tgeom); @@ -2358,6 +2438,11 @@ void TgeompointFunctions::Adwithin_geo_tgeo(DataChunk &args, ExpressionState &st free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } int ret = ea_dwithin_geo_tgeo_dispatch(gs, tgeom, dist, false); free(tgeom); @@ -2421,12 +2506,6 @@ void TgeompointFunctions::Tcontains_geo_tgeo(DataChunk &args, ExpressionState &s const idx_t count = args.size(); auto eval = [&](string_t geometry_blob, string_t tgeom_blob, bool restr, bool at_value, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2434,10 +2513,21 @@ void TgeompointFunctions::Tcontains_geo_tgeo(DataChunk &args, ExpressionState &s Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = tcontains_geo_tgeo(gs, tgeom, restr, at_value); free(tgeom); free(gs); @@ -2483,12 +2573,6 @@ void TgeompointFunctions::Tdisjoint_geo_tgeo(DataChunk &args, ExpressionState &s BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2496,10 +2580,21 @@ void TgeompointFunctions::Tdisjoint_geo_tgeo(DataChunk &args, ExpressionState &s Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = tdisjoint_geo_tgeo(gs, tgeom, restr, at_value); free(tgeom); free(gs); @@ -2528,12 +2623,6 @@ void TgeompointFunctions::Tdisjoint_tgeo_geo(DataChunk &args, ExpressionState &s BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t tgeom_blob, string_t geometry_blob, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2541,10 +2630,21 @@ void TgeompointFunctions::Tdisjoint_tgeo_geo(DataChunk &args, ExpressionState &s Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = tdisjoint_tgeo_geo(tgeom, gs, restr, at_value); free(tgeom); free(gs); @@ -2622,12 +2722,6 @@ void TgeompointFunctions::Tintersects_geo_tgeo(DataChunk &args, ExpressionState BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2635,10 +2729,21 @@ void TgeompointFunctions::Tintersects_geo_tgeo(DataChunk &args, ExpressionState Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = tintersects_geo_tgeo(gs, tgeom, restr, at_value); free(tgeom); free(gs); @@ -2667,12 +2772,6 @@ void TgeompointFunctions::Tintersects_tgeo_geo(DataChunk &args, ExpressionState BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t tgeom_blob, string_t geometry_blob, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2680,10 +2779,21 @@ void TgeompointFunctions::Tintersects_tgeo_geo(DataChunk &args, ExpressionState Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = tintersects_tgeo_geo(tgeom, gs, restr, at_value); free(tgeom); free(gs); @@ -2761,12 +2871,6 @@ void TgeompointFunctions::Ttouches_geo_tgeo(DataChunk &args, ExpressionState &st BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2774,10 +2878,21 @@ void TgeompointFunctions::Ttouches_geo_tgeo(DataChunk &args, ExpressionState &st Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = ttouches_geo_tgeo(gs, tgeom, restr, at_value); free(tgeom); free(gs); @@ -2806,12 +2921,6 @@ void TgeompointFunctions::Ttouches_tgeo_geo(DataChunk &args, ExpressionState &st BinaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], result, args.size(), [&](string_t tgeom_blob, string_t geometry_blob, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2819,10 +2928,21 @@ void TgeompointFunctions::Ttouches_tgeo_geo(DataChunk &args, ExpressionState &st Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = ttouches_tgeo_geo(tgeom, gs, restr, at_value); free(tgeom); free(gs); @@ -2917,6 +3037,11 @@ void TgeompointFunctions::Tdwithin_tgeo_geo(DataChunk &args, ExpressionState &st free(tgeom); throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } Temporal *ret = tdwithin_tgeo_geo(tgeom, gs, dist, restr, at_value); free(tgeom); @@ -2946,12 +3071,6 @@ void TgeompointFunctions::Tdwithin_geo_tgeo(DataChunk &args, ExpressionState &st TernaryExecutor::ExecuteWithNulls( args.data[0], args.data[1], args.data[2], result, args.size(), [&](string_t geometry_blob, string_t tgeom_blob, double dist, ValidityMask &mask, idx_t idx) -> string_t { - int32 srid = 0; - GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); - if (!gs) { - throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); - } - const uint8_t *tgeom_data = reinterpret_cast(tgeom_blob.GetData()); size_t tgeom_data_size = tgeom_blob.GetSize(); uint8_t *tgeom_data_copy = (uint8_t *)malloc(tgeom_data_size); @@ -2959,10 +3078,21 @@ void TgeompointFunctions::Tdwithin_geo_tgeo(DataChunk &args, ExpressionState &st Temporal *tgeom = reinterpret_cast(tgeom_data_copy); if (!tgeom) { free(tgeom_data_copy); - free(gs); throw InvalidInputException("Invalid TGEOMPOINT data: null pointer"); } + int32 srid = tspatial_srid(tgeom); + GSERIALIZED *gs = GeometryToGSerialized(geometry_blob, srid); + if (!gs) { + free(tgeom); + throw InvalidInputException("Invalid geometry format: " + geometry_blob.GetString()); + } + if (MEOS_FLAGS_GET_GEODETIC(tgeom->flags)) { + GSERIALIZED *gs_geog = geom_to_geog(gs); + free(gs); + gs = gs_geog; + } + Temporal *ret = tdwithin_geo_tgeo(gs, tgeom, dist, restr, at_value); free(tgeom); free(gs); diff --git a/src/include/geo_util.hpp b/src/include/geo_util.hpp index 4505d9ea..b832bc6f 100644 --- a/src/include/geo_util.hpp +++ b/src/include/geo_util.hpp @@ -11,6 +11,35 @@ namespace duckdb { +// Defensive arg-order detection for spatial-relation executors. +// +// DuckDB function resolution treats GEOMETRY, TGEOMPOINT, TGEOGPOINT, +// TGEOMETRY, TGEOGRAPHY as alias-equivalent because each is a +// LogicalType::BLOB with an alias label. For a call like +// `eIntersects(GEOMETRY, TGEOGPOINT)`, every two-arg `eIntersects` +// overload (declared as {GEOMETRY, TGEO*} / {TGEO*, GEOMETRY} / +// {TGEO*, TGEO*}) scores equally at the BLOB level — earlier-registered +// wins, so the executor that runs may be the wrong direction. +// +// A Temporal blob's layout is `{ int32 vl_len_; uint8 temptype; uint8 +// subtype; int16 flags; ... }`. We probe byte 4 (temptype) against +// `tspatial_type` — a pure predicate returning true only for +// T_TGEOMPOINT / T_TGEOGPOINT / T_TGEOMETRY / T_TGEOGRAPHY / T_TRGEOMETRY. +// A DuckDB GEOMETRY blob's byte 4 sits in its WKB header and never +// matches one of those MeosType enum values. +// +// Confirmed via gdb backtrace: the constant-folder calls TgeoGeoIntExec +// which assumes args.data[0]=Temporal — wrong when alias-erasure routes +// a (GEOMETRY, TGEOGPOINT) call here. This probe lets us silently swap +// roles instead of failing inside MEOS's tspatial_srid. +inline bool BlobLooksLikeTemporal(string_t blob) { + if (blob.GetSize() < sizeof(Temporal)) { + return false; + } + uint8_t temptype = static_cast(blob.GetData()[4]); + return tspatial_type(static_cast(temptype)); +} + inline GSERIALIZED* GeometryToGSerialized(string_t geometry_blob, int32_t srid) { vector wkb_buffer; WKBWriter::Write(geometry_blob, wkb_buffer); diff --git a/src/include/temporal/temporal_parquet.hpp b/src/include/temporal/temporal_parquet.hpp new file mode 100644 index 00000000..7018f905 --- /dev/null +++ b/src/include/temporal/temporal_parquet.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include "duckdb/function/scalar_function.hpp" +#include "duckdb/main/extension/extension_loader.hpp" + +namespace duckdb { + +struct TemporalParquetFunctions { + static void Register(ExtensionLoader &loader); +}; + +} // namespace duckdb diff --git a/src/include/tydef.hpp b/src/include/tydef.hpp index b7b28109..b9860ca0 100644 --- a/src/include/tydef.hpp +++ b/src/include/tydef.hpp @@ -11,11 +11,27 @@ extern "C" { #include } -// Forward-compat alias for the meosType → MeosType rename (MobilityDB -// pr785-sync-script). Vcpkg's MEOS exposes `MeosType`; existing -// MobilityDuck code still uses `meosType`. This alias bridges the two -// without touching every reference site. -using meosType = MeosType; +// MEOS naming history: `meosType` is the **pre-consolidation** spelling +// and `MeosType` is the **post-consolidation** target (the rename is +// part of the upstream consolidation sweep, not yet reached by the +// vcpkg pin). The current pin +// (`vcpkg_ports/meos/portfile.cmake` REF f11b7443ee98…) is still +// pre-consolidation and exposes `meosType` — see +// meos/include/temporal/meos_catalog.h, where line 121 declares +// `} meosType;`. MobilityDuck's source consistently uses +// `meosType` (verified via `grep -rn '\bmeosType\b' src/`), which +// matches the pin, so no alias is needed today. +// +// An earlier version of this file added `using meosType = MeosType;` +// as a forward-looking bridge for the eventual consolidation bump. +// That alias references `MeosType`, which the current pin does NOT +// yet expose, so it broke the build: +// "'MeosType' does not name a type; did you mean 'meosType'?". +// +// When the MEOS pin is bumped past the consolidation point, restore +// a bridge here (`using meosType = MeosType;` becomes valid then) or +// sweep the source `meosType → MeosType` in one PR — whichever the +// project prefers at that time. namespace duckdb { diff --git a/src/mobilityduck_extension.cpp b/src/mobilityduck_extension.cpp index 0eb7ebd4..fddcf43a 100644 --- a/src/mobilityduck_extension.cpp +++ b/src/mobilityduck_extension.cpp @@ -30,6 +30,7 @@ #include #include "index/rtree_module.hpp" #include "single_tile_getters.hpp" +#include "temporal/temporal_parquet.hpp" #include #include @@ -337,6 +338,9 @@ static void LoadInternal(ExtensionLoader &loader) { // Single-tile getters depend on TBOX, STBOX, and the spatial GEOMETRY // type being registered first. SingleTileGetters::RegisterScalarFunctions(loader); + + // TemporalParquet footer helper for COPY ... TO '*.parquet' KV_METADATA. + TemporalParquetFunctions::Register(loader); } void MobilityduckExtension::Load(ExtensionLoader &loader) { diff --git a/src/temporal/set.cpp b/src/temporal/set.cpp index b803498a..1858e489 100644 --- a/src/temporal/set.cpp +++ b/src/temporal/set.cpp @@ -945,6 +945,14 @@ static inline Set *date_to_set_duckdb(DateADT d) { return date_to_set(ToMeosDate(duckdb::date_t(d))); } +// macOS LP64: int64 (long) and int64_t (long long) are the same width but +// distinct types, so clang rejects passing bigint_to_set where a +// Set *(*)(int64_t) is expected as a non-type template arg. The cast is a +// no-op on Linux. See SetUnionScalarFunction below. +static inline Set *bigint_to_set_duckdb(int64_t i) { + return bigint_to_set(static_cast(i)); +} + struct SetPtrState { Set *accumulated; }; @@ -1069,7 +1077,7 @@ void SetTypes::RegisterSetUnionAgg(ExtensionLoader &loader) { LogicalType::INTEGER, SetTypes::intset())); set_union_set.AddFunction( AggregateFunction::UnaryAggregateDestructor>( + SetUnionScalarFunction>( LogicalType::BIGINT, SetTypes::bigintset())); set_union_set.AddFunction( AggregateFunction::UnaryAggregateDestructorblob = blob; r->vsize = vsize; r->vorigin = vorigin; - return r; + return unique_ptr(std::move(r)); } bool Equals(const FunctionData &other_p) const override { auto &other = other_p.Cast(); diff --git a/src/temporal/temporal_parquet.cpp b/src/temporal/temporal_parquet.cpp new file mode 100644 index 00000000..9fd0ea30 --- /dev/null +++ b/src/temporal/temporal_parquet.cpp @@ -0,0 +1,61 @@ +#include "temporal/temporal_parquet.hpp" +#include "duckdb/common/vector_operations/unary_executor.hpp" +#include "duckdb/function/scalar_function.hpp" +#include "duckdb/main/extension/extension_loader.hpp" + +namespace duckdb { + +static void TemporalFooterFun(DataChunk &args, ExpressionState &state, Vector &result) { + auto count = args.size(); + auto &map_vec = args.data[0]; + + auto &keys_child = MapVector::GetKeys(map_vec); + auto &vals_child = MapVector::GetValues(map_vec); + auto child_count = ListVector::GetListSize(map_vec); + + keys_child.Flatten(child_count); + vals_child.Flatten(child_count); + auto *keys_data = FlatVector::GetData(keys_child); + auto *vals_data = FlatVector::GetData(vals_child); + auto &keys_validity = FlatVector::Validity(keys_child); + auto &vals_validity = FlatVector::Validity(vals_child); + + UnifiedVectorFormat map_data; + map_vec.ToUnifiedFormat(count, map_data); + auto *list_entries = UnifiedVectorFormat::GetData(map_data); + auto &map_validity = map_data.validity; + + auto *result_data = FlatVector::GetData(result); + auto &result_validity = FlatVector::Validity(result); + + for (idx_t i = 0; i < count; i++) { + idx_t idx = map_data.sel->get_index(i); + if (!map_validity.RowIsValid(idx)) { + result_validity.SetInvalid(i); + continue; + } + const auto &entry = list_entries[idx]; + std::string json = "{\"version\":\"1.0.0\",\"columns\":{"; + bool first = true; + for (idx_t j = entry.offset; j < entry.offset + entry.length; j++) { + if (!keys_validity.RowIsValid(j) || !vals_validity.RowIsValid(j)) continue; + if (!first) json += ","; + first = false; + std::string col_name = keys_data[j].GetString(); + std::string base_type = vals_data[j].GetString(); + json += "\"" + col_name + "\":{\"encoding\":\"MEOS-WKB\"," + "\"encoding_version\":\"1.0\"," + "\"base_type\":\"" + base_type + "\"}"; + } + json += "}}"; + result_data[i] = StringVector::AddString(result, json); + } +} + +void TemporalParquetFunctions::Register(ExtensionLoader &loader) { + auto map_type = LogicalType::MAP(LogicalType::VARCHAR, LogicalType::VARCHAR); + loader.RegisterFunction( + ScalarFunction("temporalFooter", {map_type}, LogicalType::VARCHAR, TemporalFooterFun)); +} + +} // namespace duckdb diff --git a/test/sql/parquet/temporal_parquet.test b/test/sql/parquet/temporal_parquet.test index 84a2a68d..d6d14d47 100644 --- a/test/sql/parquet/temporal_parquet.test +++ b/test/sql/parquet/temporal_parquet.test @@ -198,3 +198,66 @@ FROM read_parquet('__TEST_DIR__/tgeogpoint.parquet') WHERE vessel_id = 1 ---- 90 + +# ============================================================================= +# tgeogpoint — geodetic (spheroidal) round-trip; asBinary must preserve type tag +# ============================================================================= + +statement ok +COPY ( + SELECT 1 AS vessel_id, + asBinary(tgeogpointSeq( + list(TGEOGPOINT(ST_Point(lon, lat), ts) ORDER BY ts) + )) AS traj + FROM (VALUES + (4.35, 50.85, TIMESTAMPTZ '2026-01-01 00:00:00+00'), + (5.57, 50.63, TIMESTAMPTZ '2026-01-01 02:00:00+00') + ) t(lon, lat, ts) +) +TO '__TEST_DIR__/tgeogpoint.parquet' (FORMAT PARQUET) + +# Must land as BYTE_ARRAY +query T +SELECT type FROM parquet_schema('__TEST_DIR__/tgeogpoint.parquet') +WHERE name = 'traj' +---- +BYTE_ARRAY + +# Geodetic length survives round-trip: Brussels→Liège ~89500 m +query I +SELECT round(length(tgeogpointFromBinary(traj)) / 1000) AS length_km +FROM read_parquet('__TEST_DIR__/tgeogpoint.parquet') +WHERE vessel_id = 1 +---- +90 + +# ============================================================================= +# temporalFooter — JSON metadata builder for KV_METADATA embedding +# ============================================================================= + +# Single-column map produces correct TemporalParquet JSON +query T +SELECT temporalFooter(MAP {'traj': 'tgeogpoint'}) +---- +{"version":"1.0.0","columns":{"traj":{"encoding":"MEOS-WKB","encoding_version":"1.0","base_type":"tgeogpoint"}}} + +# Multi-column map preserves insertion order +query T +SELECT temporalFooter(MAP {'temperature': 'tfloat', 'position': 'tgeompoint'}) +---- +{"version":"1.0.0","columns":{"temperature":{"encoding":"MEOS-WKB","encoding_version":"1.0","base_type":"tfloat"},"position":{"encoding":"MEOS-WKB","encoding_version":"1.0","base_type":"tgeompoint"}}} + +# KV_METADATA round-trip: footer survives Parquet file-level metadata +statement ok +COPY (SELECT 1 AS id) +TO '__TEST_DIR__/footer_meta.parquet' ( + FORMAT PARQUET, + KV_METADATA {'temporal': temporalFooter(MAP {'traj': 'tgeogpoint'})} +) + +query T +SELECT value = temporalFooter(MAP {'traj': 'tgeogpoint'})::BLOB AS ok +FROM parquet_kv_metadata('__TEST_DIR__/footer_meta.parquet') +WHERE key = 'temporal'::BLOB +---- +true diff --git a/test/sql/tgeogpoint.test b/test/sql/tgeogpoint.test new file mode 100644 index 00000000..30e08098 --- /dev/null +++ b/test/sql/tgeogpoint.test @@ -0,0 +1,161 @@ +require mobilityduck + +# ────────────────────────────────────────────────────────────────────────────── +# Construction +# ────────────────────────────────────────────────────────────────────────────── + +# MobilityDuck initializes MEOS with `Europe/Brussels` timezone at extension +# load time (see `mobilityduck_extension.cpp` ConfigureMeosSridCsvOnce path). +# Brussels is UTC+1 in winter (no DST); test fixtures use January dates so +# the offset is deterministically `+01`. See `docs/testing-tz-neutral-policy.md`. + +query I +SELECT asText(TGEOGPOINT(ST_Point(1, 2), to_timestamp(946684800))); +---- +POINT(1 2)@2000-01-01 01:00:00+01 + +query I +SELECT asEWKT(TGEOGPOINT(ST_Point(1, 2), to_timestamp(946684800))); +---- +SRID=4326;POINT(1 2)@2000-01-01 01:00:00+01 + +query I +SELECT asText(tgeogpointSeq(ARRAY[ + TGEOGPOINT(ST_Point(0, 0), to_timestamp(946684800)), + TGEOGPOINT(ST_Point(0, 1), to_timestamp(946771200)) +])); +---- +[POINT(0 0)@2000-01-01 01:00:00+01, POINT(0 1)@2000-01-02 01:00:00+01] + +query I +SELECT asText(tgeogpoint 'SRID=4326;Point(1 2)@2000-01-01'); +---- +POINT(1 2)@2000-01-01 00:00:00+01 + +# ────────────────────────────────────────────────────────────────────────────── +# Geodetic length — must return metres, not degrees +# ────────────────────────────────────────────────────────────────────────────── + +query I +SELECT round(length(tgeogpointSeq(ARRAY[ + TGEOGPOINT(ST_Point(0, 0), to_timestamp(946684800)), + TGEOGPOINT(ST_Point(0, 1), to_timestamp(946771200)) +]))) AS len_m; +---- +110574.0 + +# ────────────────────────────────────────────────────────────────────────────── +# eIntersects(GEOMETRY, tgeogpoint) — (geo, temporal) direction +# +# Regression for the SRID=0 + geodetic-flag-mismatch bugs: +# Bug 1: srid was hardcoded 0 before tgeom deserialization → mixed-SRID error +# Bug 2: FLAGS_SET_GEODETIC alone corrupts bbox layout → wrong results / NULL +# Fix: use tspatial_srid(tgeom) for SRID; use geom_to_geog() for flag+bbox. +# ────────────────────────────────────────────────────────────────────────────── + +query I +SELECT eIntersects( + ST_GeomFromText('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'), + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)) +); +---- +true + +query I +SELECT eIntersects( + ST_GeomFromText('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'), + TGEOGPOINT(ST_Point(5, 5), to_timestamp(946684800)) +); +---- +false + +query I +SELECT eIntersects( + ST_GeomFromText('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'), + tgeogpointSeq(ARRAY[ + TGEOGPOINT(ST_Point(-1, 1), to_timestamp(946684800)), + TGEOGPOINT(ST_Point(3, 1), to_timestamp(946771200)) + ]) +); +---- +true + +# ────────────────────────────────────────────────────────────────────────────── +# eDisjoint(GEOMETRY, tgeogpoint) — (geo, temporal) direction +# ────────────────────────────────────────────────────────────────────────────── + +query I +SELECT eDisjoint( + ST_GeomFromText('POLYGON((10 10, 20 10, 20 20, 10 20, 10 10))'), + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)) +); +---- +true + +query I +SELECT eDisjoint( + ST_GeomFromText('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'), + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)) +); +---- +false + +# ────────────────────────────────────────────────────────────────────────────── +# eIntersects / eDisjoint — (tgeogpoint, GEOMETRY) direction +# ────────────────────────────────────────────────────────────────────────────── + +query I +SELECT eIntersects( + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)), + ST_GeomFromText('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))') +); +---- +true + +query I +SELECT eDisjoint( + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)), + ST_GeomFromText('POLYGON((10 10, 20 10, 20 20, 10 20, 10 10))') +); +---- +true + +query I +SELECT eDisjoint( + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)), + ST_GeomFromText('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))') +); +---- +false + +# ────────────────────────────────────────────────────────────────────────────── +# eIntersects — (tgeogpoint, tgeogpoint) +# ────────────────────────────────────────────────────────────────────────────── + +query I +SELECT eIntersects( + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)), + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)) +); +---- +true + +query I +SELECT eIntersects( + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946684800)), + TGEOGPOINT(ST_Point(5, 5), to_timestamp(946684800)) +); +---- +false + +# ────────────────────────────────────────────────────────────────────────────── +# duration +# ────────────────────────────────────────────────────────────────────────────── + +query I +SELECT duration(tgeogpointSeq(ARRAY[ + TGEOGPOINT(ST_Point(0, 0), to_timestamp(946684800)), + TGEOGPOINT(ST_Point(1, 1), to_timestamp(946771200)) +]))::VARCHAR; +---- +1 day