diff --git a/Makefile b/Makefile index bc17d050..a9485498 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,39 @@ include extension-ci-tools/makefiles/duckdb_extension.Makefile # both MEOS (meos_initialize_timezone) and DuckDB (DBConfig::SetOptionByName # "TimeZone") to Europe/Brussels. Tests pass on any OS timezone — the # extension is the single source of truth, no TZ env var needed. +# +# LoadInternal also calls ExtensionHelper::AutoLoadExtension(db, "icu") so +# the timezone option is honoured. Autoload looks for the extension on disk +# at $HOME/.duckdb/extensions///icu.duckdb_extension +# and falls back to a hub download. That fails both inside the linux_amd64 +# test docker container (empty path, no network egress) and on the macOS +# osx_arm64 test runner (hub icu not reliably resolvable). We copy the +# icu.duckdb_extension that was built locally as part of this extension's +# build (declared in extension_config.cmake) into the expected path, +# matched to the DuckDB platform string, before running the unittester. +DUCKDB_VERSION_TAG := v1.4.4 + +define stage_icu + @if [ -f ./build/$(1)/extension/icu/icu.duckdb_extension ]; then \ + case "$$(uname -s)-$$(uname -m)" in \ + Linux-x86_64) platform=linux_amd64 ;; \ + Linux-aarch64) platform=linux_arm64 ;; \ + Darwin-arm64) platform=osx_arm64 ;; \ + Darwin-x86_64) platform=osx_amd64 ;; \ + *) platform=$$(uname -m) ;; \ + esac; \ + target=$$HOME/.duckdb/extensions/$(DUCKDB_VERSION_TAG)/$$platform; \ + mkdir -p "$$target" && cp -f ./build/$(1)/extension/icu/icu.duckdb_extension "$$target/" && \ + echo "Staged icu.duckdb_extension at $$target/"; \ + fi +endef + test_release_internal: + $(call stage_icu,release) ./build/release/$(TEST_PATH) "$(PROJ_DIR)test/*" test_debug_internal: + $(call stage_icu,debug) ./build/debug/$(TEST_PATH) "$(PROJ_DIR)test/*" test_reldebug_internal: - ./build/reldebug/$(TEST_PATH) "$(PROJ_DIR)test/*" \ No newline at end of file + $(call stage_icu,reldebug) + ./build/reldebug/$(TEST_PATH) "$(PROJ_DIR)test/*" diff --git a/src/include/index/rtree_module.hpp b/src/include/index/rtree_module.hpp index b16c05de..106931b3 100644 --- a/src/include/index/rtree_module.hpp +++ b/src/include/index/rtree_module.hpp @@ -87,6 +87,11 @@ class TRTreeIndex : public BoundIndex { MeosType bbox_meostype; size_t bbox_size_; LogicalType column_type_; + // Multi-entry (MEST) split bound for temporal columns: each temporal + // value is indexed as up to this many tight per-segment bounding + // boxes. <= 1 degenerates to the single minimum bounding box (the + // pre-MEST behaviour). Tunable via WITH (max_boxes = N) on the index. + int max_boxes_ = 8; size_t current_size_ = 0; size_t current_capacity_ = 0; diff --git a/src/include/tydef.hpp b/src/include/tydef.hpp index b7b28109..b9860ca0 100644 --- a/src/include/tydef.hpp +++ b/src/include/tydef.hpp @@ -11,11 +11,27 @@ extern "C" { #include } -// Forward-compat alias for the meosType → MeosType rename (MobilityDB -// pr785-sync-script). Vcpkg's MEOS exposes `MeosType`; existing -// MobilityDuck code still uses `meosType`. This alias bridges the two -// without touching every reference site. -using meosType = MeosType; +// MEOS naming history: `meosType` is the **pre-consolidation** spelling +// and `MeosType` is the **post-consolidation** target (the rename is +// part of the upstream consolidation sweep, not yet reached by the +// vcpkg pin). The current pin +// (`vcpkg_ports/meos/portfile.cmake` REF f11b7443ee98…) is still +// pre-consolidation and exposes `meosType` — see +// meos/include/temporal/meos_catalog.h, where line 121 declares +// `} meosType;`. MobilityDuck's source consistently uses +// `meosType` (verified via `grep -rn '\bmeosType\b' src/`), which +// matches the pin, so no alias is needed today. +// +// An earlier version of this file added `using meosType = MeosType;` +// as a forward-looking bridge for the eventual consolidation bump. +// That alias references `MeosType`, which the current pin does NOT +// yet expose, so it broke the build: +// "'MeosType' does not name a type; did you mean 'meosType'?". +// +// When the MEOS pin is bumped past the consolidation point, restore +// a bridge here (`using meosType = MeosType;` becomes valid then) or +// sweep the source `meosType → MeosType` in one PR — whichever the +// project prefers at that time. namespace duckdb { diff --git a/src/index/rtree_index_scan.cpp b/src/index/rtree_index_scan.cpp index 623fc5d1..db2003b8 100644 --- a/src/index/rtree_index_scan.cpp +++ b/src/index/rtree_index_scan.cpp @@ -28,12 +28,17 @@ BindInfo TRTreeIndexScanBindInfo(const optional_ptr bind_data_p) { struct RTreeIndexScanGlobalState : public GlobalTableFunctionState { DataChunk all_columns; vector projection_ids; + vector scanned_types; ColumnFetchState fetch_state; TableScanState local_storage_state; vector column_ids; unique_ptr index_state; - Vector row_ids = Vector(LogicalType::ROW_TYPE); + // rowid is BIGINT. Use the LogicalTypeId enumerator, never the + // LogicalType::ROW_TYPE static const member: ODR-using it from this + // extension TU emits a second definition that clashes with libduckdb + // ("multiple definition of duckdb::LogicalType::ROW_TYPE" at link). + Vector row_ids = Vector(LogicalType(LogicalTypeId::BIGINT)); }; static unique_ptr RTreeIndexScanInitGlobal(ClientContext &context, @@ -50,10 +55,24 @@ static unique_ptr RTreeIndexScanInitGlobal(ClientConte storage_t col_id = id; if (id != DConstants::INVALID_INDEX) { col_id = bind_data.table.GetColumn(LogicalIndex(id)).StorageOid(); + result->scanned_types.push_back(bind_data.table.GetColumn(LogicalIndex(id)).Type()); + } else { + // rowid column: BIGINT (see row_ids note re: ROW_TYPE ODR clash) + result->scanned_types.emplace_back(LogicalType(LogicalTypeId::BIGINT)); } result->column_ids.emplace_back(col_id); } + // Honour projection pushdown exactly like DuckDB's table_scan: when the + // optimizer removes filter-only columns, fetch the full scanned set into + // all_columns (which MUST be initialized with the scanned column types, + // else Fetch hits "Expected vector of type X, found Y") and reference + // the projected subset out of it. + if (input.CanRemoveFilterColumns()) { + result->projection_ids = input.projection_ids; + result->all_columns.Initialize(context, result->scanned_types); + } + // Initialize the storage scan state result->local_storage_state.Initialize(result->column_ids, context, input.filters); local_storage.InitializeScan(bind_data.table.GetStorage(), result->local_storage_state.local_state, input.filters); diff --git a/src/index/rtree_module.cpp b/src/index/rtree_module.cpp index 9b9fdaac..c931f087 100644 --- a/src/index/rtree_module.cpp +++ b/src/index/rtree_module.cpp @@ -121,7 +121,14 @@ TRTreeIndex::TRTreeIndex(const string &name, IndexConstraintType constraint_type if (!rtree_) { throw InternalException("Failed to create MEOS RTree"); } - + + // MEST split bound: WITH (max_boxes = N). Default 8; N <= 1 reproduces + // the pre-MEST single-box index. Only affects temporal columns. + auto mb_it = options_.find("max_boxes"); + if (mb_it != options_.end() && !mb_it->second.IsNull()) { + max_boxes_ = mb_it->second.GetValue(); + } + function_matcher = MakeFunctionMatcher(); } @@ -222,16 +229,11 @@ ErrorData TRTreeIndex::Insert(IndexLock &lock, DataChunk &data, Vector &row_ids) box = (STBox*)malloc(stbox_size); memcpy(box, stbox_data, stbox_size); - - int32_t box_srid = stbox_srid(box); - if (box_srid != 0) { - STBox *normalized_box = stbox_set_srid(box, 0); - if (normalized_box) { - free(box); - box = normalized_box; - } - } - } + // No SRID normalisation: index keys keep their natural SRID + // and && requires matching SRID, mirroring MobilityDB/PostGIS + // GiST. Normalising here but not at search time (or vice + // versa) makes ensure_same_srid reject every overlap. + } else { continue; } @@ -312,26 +314,18 @@ void TRTreeIndex::Construct(DataChunk &expression_result, Vector &row_identifier if (indexes_temporal) { const Temporal *temp = reinterpret_cast(data); - // For temporal-spatial types extract the bbox and strip the - // SRID so index keys agree with the SRID-stripped query box - // used at search time (InitializeScan strips it too). - if (bbox_meostype == T_STBOX) { - STBox *box = tspatial_to_stbox(temp); - if (!box) { - continue; - } - if (stbox_srid(box) != 0) { - STBox *normalized = stbox_set_srid(box, 0); - if (normalized) { - free(box); - box = normalized; - } - } - rtree_insert(rtree_, box, static_cast(row_data[i])); - free(box); - } else { - rtree_insert_temporal(rtree_, temp, static_cast(row_data[i])); - } + const int id = static_cast(row_data[i]); + // Multi-entry (MEST): index the temporal as up to max_boxes_ + // tight per-segment bounding boxes sharing this id; the + // splitter degenerates to a single minimum bounding box for + // instants or max_boxes_ <= 1, byte-identical to the pre-MEST + // single-box path. The produced boxes carry the temporal's + // own SRID, exactly like rtree_insert_temporal / + // tspatial_to_stbox. Do NOT pre-normalise the SRID: + // tspatial_set_srid(temp, 0) makes the SRID "unknown" and the + // stbox conversion inside the splitter then raises + // "The SRID cannot be unknown" at index-build time. + rtree_insert_temporal_split(rtree_, temp, id, max_boxes_); continue; } @@ -342,18 +336,9 @@ void TRTreeIndex::Construct(DataChunk &expression_result, Vector &row_identifier void *box = malloc(data_size); memcpy(box, data, data_size); - - if (bbox_meostype == T_STBOX) { - STBox *stbox = (STBox *) box; - int32_t box_srid = stbox_srid(stbox); - if (box_srid != 0) { - STBox *normalized_box = stbox_set_srid(stbox, 0); - if (normalized_box) { - free(box); - box = normalized_box; - } - } - } + // No SRID normalisation (see Construct/InitializeScan): keep the + // natural SRID so index keys and the query box agree; && requires + // matching SRID, mirroring MobilityDB/PostGIS GiST. void *target = (char *) boxes + (i * bbox_size_); memcpy(target, box, bbox_size_); @@ -417,21 +402,10 @@ unique_ptr TRTreeIndex::InitializeScan(const void* query_blob, s state->query_box = malloc(blob_size); memcpy(state->query_box, data, blob_size); + // No SRID normalisation: the query box keeps its natural SRID so + // it matches the index keys (which also keep theirs); && requires + // matching SRID, mirroring MobilityDB/PostGIS GiST. - if (bbox_meostype == T_STBOX) { - STBox *stbox = (STBox*)state->query_box; - int32_t query_srid = stbox_srid(stbox); - if (query_srid != 0) { - STBox *normalized_query = stbox_set_srid(stbox, 0); - if (normalized_query) { - free(state->query_box); - state->query_box = malloc(blob_size); - memcpy(state->query_box, normalized_query, blob_size); - free(normalized_query); - } - } - } - } else { throw InvalidInputException("Unsupported R-Tree operation: " + operation + " for bbox_type: " + std::to_string(bbox_meostype)); @@ -486,9 +460,18 @@ vector TRTreeIndex::Search(const void *query_box, RTreeSearchOp op) const if (count > 0) { results.reserve(count); + // Multi-entry (MEST) leaves return the same id once per + // overlapping per-segment box. Emit each row exactly once; + // duplicates would otherwise surface as duplicate rows in the + // index scan. Harmless no-op for single-box indexes. + unordered_set seen; + seen.reserve(count); for (int i = 0; i < count; i++) { int *id = static_cast(meos_array_get(ids, i)); - results.push_back(static_cast(*id)); + row_t rid = static_cast(*id); + if (seen.insert(rid).second) { + results.push_back(rid); + } } } } catch (...) { diff --git a/test/sql/parity/050_index_types.test b/test/sql/parity/050_index_types.test index 40150568..84495b5a 100644 --- a/test/sql/parity/050_index_types.test +++ b/test/sql/parity/050_index_types.test @@ -283,3 +283,80 @@ statement error CREATE INDEX i_unsupported ON idx_unsupported USING TRTREE (x); ---- TRTREE index supports + +# ============================================================================= +# Multi-entry (MEST) indexing on temporal columns. +# +# A temporal column is indexed as up to max_boxes tight per-segment +# bounding boxes per row. This must (a) never produce false negatives, +# (b) return each row exactly once even when many of its per-segment +# boxes overlap the query (dedup), and (c) behave identically for the +# default split, an explicit max_boxes, and the degenerate max_boxes = 1 +# single-box index. +# ============================================================================= + +statement ok +CREATE TABLE idx_mest(t tgeompoint); + +# trip 1: a wiggly zig-zag with a large X-extent (many tight segments); +# trip 2: a small trip far away. +statement ok +INSERT INTO idx_mest VALUES + ('[Point(0 0)@2000-01-01, Point(10 0)@2000-01-02, Point(0 0)@2000-01-03, Point(10 0)@2000-01-04, Point(0 0)@2000-01-05]'::tgeompoint), + ('[Point(1000 1000)@2000-01-01, Point(1001 1001)@2000-01-02]'::tgeompoint); + +statement ok +CREATE INDEX i_mest ON idx_mest USING TRTREE (t); + +# Query box covers the whole zig-zag and therefore overlaps many of +# trip 1's per-segment boxes: it must still come back exactly once. +query I +SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((-1,-1),(11,11))'::stbox; +---- +1 + +# Disjoint from both trips: a true negative for single-box and MEST alike. +query I +SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((100,100),(200,200))'::stbox; +---- +0 + +# Near trip 2 only. +query I +SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((999,999),(1002,1002))'::stbox; +---- +1 + +statement ok +DROP INDEX i_mest; + +# Explicit, tighter split: same correct results, no false negatives. +statement ok +CREATE INDEX i_mest ON idx_mest USING TRTREE (t) WITH (max_boxes = 16); + +query I +SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((-1,-1),(11,11))'::stbox; +---- +1 + +query I +SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((100,100),(200,200))'::stbox; +---- +0 + +statement ok +DROP INDEX i_mest; + +# Degenerate single-box index (pre-MEST behaviour) via the option. +statement ok +CREATE INDEX i_mest ON idx_mest USING TRTREE (t) WITH (max_boxes = 1); + +query I +SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((-1,-1),(11,11))'::stbox; +---- +1 + +query I +SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((999,999),(1002,1002))'::stbox; +---- +1 diff --git a/vcpkg_ports/meos/portfile.cmake b/vcpkg_ports/meos/portfile.cmake index 57fd92eb..4efec32e 100644 --- a/vcpkg_ports/meos/portfile.cmake +++ b/vcpkg_ports/meos/portfile.cmake @@ -1,8 +1,8 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO MobilityDB/MobilityDB - REF dd4ccd3c2812af88716a318764c837fa5354c5f4 - SHA512 4e2e7077a5bd3bce0681047448352b3c6a784aad15e89f2a65feb01b71a6329ffc926079c6ae2dccb8510b797d92b2c4c6a9e0e89ecf737997eabe1880e22a14 + REF 7a29e92e7eee37e8b6069e2b686f02d0773ca8c5 + SHA512 570d71dadcfebe90c89ebb074e1b7ed1a8d23eab3d4c1f1130e1d2cfef5a0d9a458b045858ac36b66b11499466cb801f8f037d3bc4c000ec63c8a1d06b604a64 ) vcpkg_replace_string(