diff --git a/arrowdriver/dense_bitset.go b/arrowdriver/dense_bitset.go new file mode 100644 index 0000000..bfad90a --- /dev/null +++ b/arrowdriver/dense_bitset.go @@ -0,0 +1,66 @@ +package arrowdriver + +import "math/bits" + +type denseBitset struct { + words []uint64 +} + +func newDenseBitset(size int) *denseBitset { + if size <= 0 { + return &denseBitset{} + } + return &denseBitset{words: make([]uint64, (size+63)>>6)} +} + +func (b *denseBitset) set(i uint32) { + idx := int(i >> 6) + if idx < 0 || idx >= len(b.words) { + return + } + b.words[idx] |= 1 << (i & 63) +} + +func (b *denseBitset) and(other *denseBitset) { + if b == nil || other == nil { + return + } + n := len(b.words) + if len(other.words) < n { + n = len(other.words) + } + for i := 0; i < n; i++ { + b.words[i] &= other.words[i] + } + for i := n; i < len(b.words); i++ { + b.words[i] = 0 + } +} + +func (b *denseBitset) any() bool { + if b == nil { + return false + } + for _, w := range b.words { + if w != 0 { + return true + } + } + return false +} + +func (b *denseBitset) indices() []uint32 { + if b == nil { + return nil + } + out := make([]uint32, 0, len(b.words)) + for wi, w := range b.words { + for w != 0 { + tz := bits.TrailingZeros64(w) + out = append(out, uint32((wi<<6)+tz)) + w &= w - 1 + } + } + return out +} + diff --git a/arrowdriver/driver.go b/arrowdriver/driver.go new file mode 100644 index 0000000..d393cc4 --- /dev/null +++ b/arrowdriver/driver.go @@ -0,0 +1,864 @@ +package arrowdriver + +import ( + "encoding/binary" + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/query" + "github.com/bmeg/grip/log" + "github.com/bytedance/sonic" + "go.etcd.io/bbolt" +) + +const driverMetaName = "driver.meta" +const bulkLoadBatchRows = 20000 +const postBulkCompactRowsPerSection = 200000 +const schemaGraphSuffix = "__schema__" + +var ( + bucketTablesByName = []byte("tables_by_name") + bucketNamesByID = []byte("names_by_id") 
+ keyNextTableID = []byte("next_table_id") +) + +type ArrowDriver struct { + base string + zoneDir string + + lock sync.RWMutex + tables map[string]*ArrowTable + tableIDs map[string]uint16 + idToTable map[uint16]string + fields map[string]map[string]struct{} + schemaHints map[string]tableWriteHints + schemaHintsLoaded bool + schemaHintsBuilding bool + metaDB *bbolt.DB +} + +type tableWriteHints struct { + keys []string + enc map[string]columnEncoding +} + +func NewArrowDriver(path string) (benchtop.TableDriver, error) { + zoneDir := filepath.Join(path, "ARROW_TABLES") + if err := os.MkdirAll(zoneDir, 0700); err != nil { + return nil, err + } + + metaPath := filepath.Join(zoneDir, driverMetaName) + metaDB, err := bbolt.Open(metaPath, 0600, nil) + if err != nil { + return nil, err + } + + d := &ArrowDriver{ + base: path, + zoneDir: zoneDir, + lock: sync.RWMutex{}, + tables: make(map[string]*ArrowTable), + tableIDs: make(map[string]uint16), + idToTable: make(map[uint16]string), + fields: make(map[string]map[string]struct{}), + schemaHints: make(map[string]tableWriteHints), + metaDB: metaDB, + } + + if err := d.initMeta(); err != nil { + metaDB.Close() + return nil, err + } + if err := d.discoverTables(); err != nil { + metaDB.Close() + return nil, err + } + return d, nil +} + +func (d *ArrowDriver) initMeta() error { + return d.metaDB.Update(func(tx *bbolt.Tx) error { + if _, err := tx.CreateBucketIfNotExists(bucketTablesByName); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists(bucketNamesByID); err != nil { + return err + } + meta, err := tx.CreateBucketIfNotExists([]byte("meta")) + if err != nil { + return err + } + if meta.Get(keyNextTableID) == nil { + v := make([]byte, 2) + binary.LittleEndian.PutUint16(v, 1) + if err := meta.Put(keyNextTableID, v); err != nil { + return err + } + } + return nil + }) +} + +func (d *ArrowDriver) reserveTableID() (uint16, error) { + var next uint16 + err := d.metaDB.Update(func(tx *bbolt.Tx) error { + meta := 
tx.Bucket([]byte("meta"))
+		if meta == nil {
+			return fmt.Errorf("missing driver meta bucket")
+		}
+		v := meta.Get(keyNextTableID)
+		if len(v) < 2 {
+			next = 1
+		} else {
+			next = binary.LittleEndian.Uint16(v)
+			if next == 0 {
+				next = 1
+			}
+		}
+		// next+1 would wrap to 0 here. The following call would then read 0,
+		// restart at 1, and hand out IDs that may already be bound to tables,
+		// so report exhaustion instead of persisting the wrapped counter.
+		if next == ^uint16(0) {
+			return fmt.Errorf("table ID space exhausted")
+		}
+		nv := make([]byte, 2)
+		binary.LittleEndian.PutUint16(nv, next+1)
+		return meta.Put(keyNextTableID, nv)
+	})
+	return next, err
+}
+
+// setTableMeta stores the name -> ID and ID -> name bindings for a table in
+// the two metadata buckets within a single bbolt update transaction. The ID
+// is written as two little-endian bytes. It returns an error when either
+// bucket is missing or a Put fails.
+func (d *ArrowDriver) setTableMeta(name string, tableID uint16) error {
+	return d.metaDB.Update(func(tx *bbolt.Tx) error {
+		byName := tx.Bucket(bucketTablesByName)
+		byID := tx.Bucket(bucketNamesByID)
+		if byName == nil || byID == nil {
+			return fmt.Errorf("missing table metadata buckets")
+		}
+		idBytes := make([]byte, 2)
+		binary.LittleEndian.PutUint16(idBytes, tableID)
+		if err := byName.Put([]byte(name), idBytes); err != nil {
+			return err
+		}
+		if err := byID.Put(idBytes, []byte(name)); err != nil {
+			return err
+		}
+		return nil
+	})
+}
+
+// loadTableMeta reads the ID stored for name in the tables-by-name bucket.
+// The bool result is false when the bucket is absent or holds fewer than two
+// bytes for name; the error is whatever the bbolt view transaction returns.
+func (d *ArrowDriver) loadTableMeta(name string) (uint16, bool, error) {
+	var (
+		tableID uint16
+		found   bool
+	)
+	err := d.metaDB.View(func(tx *bbolt.Tx) error {
+		b := tx.Bucket(bucketTablesByName)
+		if b == nil {
+			return nil
+		}
+		v := b.Get([]byte(name))
+		if len(v) >= 2 {
+			tableID = binary.LittleEndian.Uint16(v)
+			found = true
+		}
+		return nil
+	})
+	return tableID, found, err
+}
+
+func (d *ArrowDriver) discoverTables() error {
+	pattern := filepath.Join(d.zoneDir, "*"+indexFileExt)
+	files, err := filepath.Glob(pattern)
+	if err != nil {
+		return err
+	}
+
+	for _, p := range files {
+		name := strings.TrimSuffix(filepath.Base(p), indexFileExt)
+		if name == "" || name == driverMetaName {
+			continue
+		}
+		if name == strings.TrimSuffix(driverMetaName, indexFileExt) {
+			continue
+		}
+
+		tableID, found, err := d.loadTableMeta(name)
+		if err != nil {
+			return err
+		}
+		if !found {
+			t, err := loadArrowTable(d.zoneDir, name)
+			if err != nil {
+				continue
+			}
+			tableID = t.TableID()
+			if tableID == 0 {
+				tableID, err = d.reserveTableID()
+				if err 
!= nil { + t.Close() + return err + } + if err := t.SetTableID(tableID); err != nil { + t.Close() + return err + } + } + if err := d.setTableMeta(name, tableID); err != nil { + t.Close() + return err + } + t.Close() + } + + d.tableIDs[name] = tableID + d.idToTable[tableID] = name + if _, ok := d.fields[name]; !ok { + d.fields[name] = map[string]struct{}{} + } + } + return nil +} + +func (d *ArrowDriver) GetKV() any { + return d.base +} + +func (d *ArrowDriver) Close() { + d.lock.Lock() + defer d.lock.Unlock() + for _, t := range d.tables { + t.Close() + } + if d.metaDB != nil { + d.metaDB.Close() + } +} + +// resolveTableName returns the canonical name for a table, handling +// case-insensitive filesystem collisions (e.g. macOS HFS+/APFS). +// If a case-variant of name already exists in tableIDs, the existing +// name is returned so that we reuse the same bbolt file instead of +// trying to open it a second time (which would deadlock on flock). +func (d *ArrowDriver) resolveTableName(name string) string { + if _, ok := d.tableIDs[name]; ok { + return name + } + lower := strings.ToLower(name) + for existing := range d.tableIDs { + if strings.ToLower(existing) == lower { + return existing + } + } + return name +} + +func (d *ArrowDriver) New(name string, columns []benchtop.ColumnDef) (benchtop.TableStore, error) { + d.lock.Lock() + defer d.lock.Unlock() + + // Resolve case-variant names to prevent bbolt file lock deadlocks + // on case-insensitive filesystems. 
+ name = d.resolveTableName(name) + + if t, ok := d.tables[name]; ok { + log.Debugf("arrowdriver.New reuse_open_table name=%s", name) + return t, nil + } + if _, ok := d.tableIDs[name]; ok { + log.Debugf("arrowdriver.New load_existing_table name=%s", name) + return d.getOrLoadLocked(name) + } + + tableID, err := d.reserveTableID() + if err != nil { + return nil, err + } + t, err := newArrowTable(d.zoneDir, name, tableID, columns) + if err != nil { + return nil, err + } + if err := d.setTableMeta(name, tableID); err != nil { + t.Close() + return nil, err + } + d.tables[name] = t + d.applySchemaHintsLocked(name, t) + d.tableIDs[name] = tableID + d.idToTable[tableID] = name + if _, ok := d.fields[name]; !ok { + d.fields[name] = make(map[string]struct{}) + } + log.Infof("arrowdriver.New created_table name=%s tableID=%d columns=%d", name, tableID, len(columns)) + return t, nil +} + +func (d *ArrowDriver) getOrLoadLocked(name string) (*ArrowTable, error) { + if t, ok := d.tables[name]; ok { + return t, nil + } + start := time.Now() + t, err := loadArrowTable(d.zoneDir, name) + if err != nil { + return nil, err + } + if tableID, ok := d.tableIDs[name]; ok && tableID > 0 && t.TableID() != tableID { + if err := t.SetTableID(tableID); err != nil { + t.Close() + return nil, err + } + } + d.tables[name] = t + if t.TableID() > 0 { + d.tableIDs[name] = t.TableID() + d.idToTable[t.TableID()] = name + } + if _, ok := d.fields[name]; !ok { + d.fields[name] = make(map[string]struct{}) + } + for _, idxField := range t.IndexedFields() { + d.fields[name][idxField] = struct{}{} + } + d.applySchemaHintsLocked(name, t) + log.Debugf("arrowdriver.getOrLoad loaded_table name=%s tableID=%d indexedFields=%d elapsed=%s", name, t.TableID(), len(t.IndexedFields()), time.Since(start).Round(time.Millisecond)) + return t, nil +} + +func (d *ArrowDriver) applySchemaHintsLocked(tableName string, t *ArrowTable) { + if strings.HasSuffix(tableName, schemaGraphSuffix) || t == nil { + return + } + if 
!d.schemaHintsLoaded && !d.schemaHintsBuilding { + d.refreshSchemaHintsLocked() + } + if hint, ok := d.schemaHints[tableName]; ok && len(hint.keys) > 0 { + // Keep schema hints additive so structural/runtime fields + // (for example edge linkage fields) are not dropped. + t.SetWriteHints(hint.keys, hint.enc, false) + log.Infof("arrowdriver.schema_hints_applied table=%s hintedFields=%d", tableName, len(hint.keys)) + } +} + +func (d *ArrowDriver) refreshSchemaHintsLocked() { + if d.schemaHintsBuilding { + return + } + d.schemaHintsBuilding = true + defer func() { + d.schemaHintsBuilding = false + d.schemaHintsLoaded = true + }() + out := map[string]tableWriteHints{} + for tableName := range d.tableIDs { + if !strings.HasSuffix(tableName, schemaGraphSuffix) { + continue + } + store, err := d.getOrLoadLocked(tableName) + if err != nil { + continue + } + for row := range store.ScanDoc(nil) { + label, keys, enc, ok := extractWriteHintsFromSchemaRow(row) + if !ok { + continue + } + targets := []string{label, "v_" + label, "e_" + label} + for _, target := range targets { + merged := mergeWriteHints(out[target], keys, enc) + out[target] = merged + } + } + } + d.schemaHints = out +} + +func mergeWriteHints(cur tableWriteHints, keys []string, enc map[string]columnEncoding) tableWriteHints { + keySet := map[string]struct{}{} + for _, k := range cur.keys { + keySet[k] = struct{}{} + } + for _, k := range keys { + if k == "" || k == idColumn || k == dataColumn { + continue + } + if _, ok := keySet[k]; ok { + continue + } + keySet[k] = struct{}{} + cur.keys = append(cur.keys, k) + } + sort.Strings(cur.keys) + if cur.enc == nil { + cur.enc = map[string]columnEncoding{} + } + for k, v := range enc { + if _, ok := keySet[k]; ok { + cur.enc[k] = v + } + } + return cur +} + +func extractWriteHintsFromSchemaRow(row map[string]any) (string, []string, map[string]columnEncoding, bool) { + src := row + if v, ok := row["vertex"].(map[string]any); ok { + src = v + } else if v, ok := 
row["vertex"].(string); ok && v != "" { + tmp := map[string]any{} + if err := sonic.ConfigFastest.Unmarshal([]byte(v), &tmp); err == nil && len(tmp) > 0 { + src = tmp + } + } else if v, ok := row["vertex"].([]byte); ok && len(v) > 0 { + tmp := map[string]any{} + if err := sonic.ConfigFastest.Unmarshal(v, &tmp); err == nil && len(tmp) > 0 { + src = tmp + } + } + label := schemaLabelFromMap(src) + if label == "" { + label = schemaLabelFromMap(row) + } + if label == "" { + return "", nil, nil, false + } + + keys := []string{} + enc := map[string]columnEncoding{} + if props, ok := src["properties"].(map[string]any); ok { + for field, def := range props { + if field == "" || field == idColumn || field == dataColumn { + continue + } + keys = append(keys, field) + enc[field] = schemaTypeToEncoding(def) + } + } else { + for field, def := range src { + if field == "" || strings.HasPrefix(field, "_") || field == "id" || field == "$id" || field == "label" || field == "title" { + continue + } + keys = append(keys, field) + enc[field] = schemaTypeToEncoding(def) + } + } + if len(keys) == 0 { + return "", nil, nil, false + } + sort.Strings(keys) + return label, keys, enc, true +} + +func schemaLabelFromMap(m map[string]any) string { + for _, k := range []string{"_label", "label", "title", "name"} { + if s, ok := m[k].(string); ok && s != "" { + return normalizeSchemaLabel(s) + } + } + if s, ok := m["_id"].(string); ok && s != "" { + return normalizeSchemaLabel(s) + } + if s, ok := m["id"].(string); ok && s != "" { + return normalizeSchemaLabel(s) + } + return "" +} + +func normalizeSchemaLabel(s string) string { + if i := strings.LastIndex(s, "/"); i >= 0 && i+1 < len(s) { + s = s[i+1:] + } + if strings.HasPrefix(s, "v_") || strings.HasPrefix(s, "e_") { + return s[2:] + } + return s +} + +func schemaTypeToEncoding(v any) columnEncoding { + switch tv := v.(type) { + case string: + switch strings.ToLower(tv) { + case "string": + return encString + case "boolean", "bool": + return 
encBool + case "number", "integer", "float", "double", "long", "int": + return encFloat64 + default: + return encJSON + } + case map[string]any: + if t, ok := tv["type"]; ok { + return schemaTypeToEncoding(t) + } + return encJSON + case []any: + // JSON schema can expose union types in arrays, prefer scalar when present. + for _, e := range tv { + enc := schemaTypeToEncoding(e) + if enc != encJSON { + return enc + } + } + return encJSON + default: + return encJSON + } +} + +func (d *ArrowDriver) Get(tableID uint16) (benchtop.TableStore, error) { + d.lock.Lock() + defer d.lock.Unlock() + + name, ok := d.idToTable[tableID] + if !ok { + return nil, fmt.Errorf("table ID %d not found", tableID) + } + + return d.getOrLoadLocked(name) +} + +func (d *ArrowDriver) List() []string { + d.lock.RLock() + defer d.lock.RUnlock() + out := make([]string, 0, len(d.tableIDs)) + for name := range d.tableIDs { + out = append(out, name) + } + sort.Strings(out) + return out +} + +func (d *ArrowDriver) BulkLoad(tableID uint16, rows chan *benchtop.Row) error { + start := time.Now() + tableStore, err := d.Get(tableID) + if err != nil { + log.Errorf("BulkLoad Get error: %v", err) + return err + } + at, ok := tableStore.(*ArrowTable) + if !ok { + return fmt.Errorf("table ID %d is not ArrowTable", tableID) + } + log.Infof("arrowdriver.BulkLoad start table=%s tableID=%d batchSize=%d", at.name, tableID, bulkLoadBatchRows) + + batch := make([]benchtop.Row, 0, bulkLoadBatchRows) + var totalRows int + var flushes int + for row := range rows { + if row == nil { + continue + } + batch = append(batch, *row) + totalRows++ + if len(batch) >= bulkLoadBatchRows { + flushStart := time.Now() + if err := at.BulkLoad(batch); err != nil { + return err + } + flushes++ + log.Debugf("arrowdriver.BulkLoad flush table=%s tableID=%d rows=%d flush=%d elapsed=%s", at.name, tableID, len(batch), flushes, time.Since(flushStart).Round(time.Millisecond)) + batch = batch[:0] + } + } + if len(batch) > 0 { + flushStart := 
time.Now()
+		if err := at.BulkLoad(batch); err != nil {
+			return err
+		}
+		flushes++
+		log.Debugf("arrowdriver.BulkLoad flush table=%s tableID=%d rows=%d flush=%d elapsed=%s", at.name, tableID, len(batch), flushes, time.Since(flushStart).Round(time.Millisecond))
+	}
+	compactStart := time.Now()
+	if err := at.CompactSections(postBulkCompactRowsPerSection); err != nil {
+		log.Warningf("arrowdriver.BulkLoad compact_error table=%s tableID=%d err=%v", at.name, tableID, err)
+	} else {
+		log.Infof("arrowdriver.BulkLoad compact_done table=%s tableID=%d targetRowsPerSection=%d elapsed=%s", at.name, tableID, postBulkCompactRowsPerSection, time.Since(compactStart).Round(time.Millisecond))
+	}
+	log.Infof("arrowdriver.BulkLoad done table=%s tableID=%d rows=%d flushes=%d elapsed=%s", at.name, tableID, totalRows, flushes, time.Since(start).Round(time.Millisecond))
+	return nil
+}
+
+// RowIdsByHas streams, for every table returned by List (sorted by name), the
+// index entries that the table's RowIndexesByHas yields for (field, value, op).
+// Tables that cannot be resolved or opened, or that are not *ArrowTable, are
+// skipped. The work runs in a goroutine that closes the returned channel after
+// the last table; the channel is buffered (100), so a caller that stops
+// receiving early leaves that goroutine blocked on send.
+func (d *ArrowDriver) RowIdsByHas(field string, value any, op query.Condition) chan benchtop.Index {
+	out := make(chan benchtop.Index, 100)
+	go func() {
+		defer close(out)
+		start := time.Now()
+		total := 0
+		for _, name := range d.List() {
+			// Resolve the ID through LookupTableID so d.tableIDs is only read
+			// while d.lock is held; New, Get and Delete write to that map under
+			// the lock and an unlocked read here is a concurrent map access.
+			tableID, err := d.LookupTableID(name)
+			if err != nil {
+				continue
+			}
+			tableStore, err := d.Get(tableID)
+			if err != nil {
+				continue
+			}
+			table, ok := tableStore.(*ArrowTable)
+			if !ok {
+				continue
+			}
+			tableStart := time.Now()
+			matched := 0
+			for idx := range table.RowIndexesByHas(field, value, op) {
+				out <- idx
+				matched++
+				total++
+			}
+			log.Debugf("arrowdriver.RowIdsByHas table=%s field=%s op=%d matched=%d elapsed=%s", name, field, op, matched, time.Since(tableStart).Round(time.Millisecond))
+		}
+		log.Debugf("arrowdriver.RowIdsByHas done field=%s op=%d total=%d elapsed=%s", field, op, total, time.Since(start).Round(time.Millisecond))
+	}()
+	return out
+}
+
+func (d *ArrowDriver) ListTableKeys(tableID uint16) (chan benchtop.Index, error) {
+	d.lock.RLock()
+	name, ok := d.idToTable[tableID]
+	d.lock.RUnlock()
+	if !ok {
+		out := make(chan benchtop.Index)
+		close(out)
+		return out, nil 
+	}
+	store, err := d.Get(tableID) // Changed to use tableID
+	if err != nil {
+		return nil, err
+	}
+	t, ok := store.(*ArrowTable)
+	if !ok {
+		return nil, fmt.Errorf("table %q is not ArrowTable", name)
+	}
+	return t.ListTableKeys()
+}
+
+// GetAllColNames streams each distinct column key found in the column
+// definitions of the tables returned by List, in first-seen order. Tables
+// that cannot be resolved or opened are skipped. The producing goroutine
+// closes the channel when it has visited every table.
+func (d *ArrowDriver) GetAllColNames() chan string {
+	out := make(chan string, 10)
+	go func() {
+		defer close(out)
+		seen := make(map[string]struct{})
+		for _, name := range d.List() {
+			// Resolve the ID through LookupTableID so d.tableIDs is only read
+			// while d.lock is held; an unlocked read races with New/Get/Delete.
+			tableID, err := d.LookupTableID(name)
+			if err != nil {
+				continue
+			}
+			tableStore, err := d.Get(tableID)
+			if err != nil {
+				continue
+			}
+			for _, col := range tableStore.GetColumnDefs() {
+				if _, ok := seen[col.Key]; ok {
+					continue
+				}
+				seen[col.Key] = struct{}{}
+				out <- col.Key
+			}
+		}
+	}()
+	return out
+}
+
+// GetLabels streams table names from List. With edges=true only names that
+// start with "e_" are sent; with edges=false only the other names are sent.
+// When removePrefix is set, the "e_" (edges) or "v_" (non-edges) prefix is
+// removed from names that carry it and are longer than the prefix; names
+// without that prefix are sent unchanged rather than losing their first two
+// bytes.
+func (d *ArrowDriver) GetLabels(edges bool, removePrefix bool) chan string {
+	out := make(chan string, 10)
+	go func() {
+		defer close(out)
+		for _, label := range d.List() {
+			isEdge := strings.HasPrefix(label, "e_")
+			if edges != isEdge {
+				continue
+			}
+			prefix := "v_"
+			if isEdge {
+				prefix = "e_"
+			}
+			if removePrefix && len(label) > len(prefix) && strings.HasPrefix(label, prefix) {
+				out <- label[len(prefix):]
+			} else {
+				out <- label
+			}
+		}
+	}()
+	return out
+}
+
+// RowIdsByTableFieldValue streams the index entries that the table's
+// RowIndexesByHas yields for (field, value, op). The channel is closed
+// without any entries when the table cannot be opened or is not *ArrowTable.
+func (d *ArrowDriver) RowIdsByTableFieldValue(tableID uint16, field string, value any, op query.Condition) chan benchtop.Index {
+	out := make(chan benchtop.Index, 100)
+	go func() {
+		defer close(out)
+		store, err := d.Get(tableID)
+		if err != nil {
+			return
+		}
+		table, ok := store.(*ArrowTable)
+		if !ok {
+			return
+		}
+		for idx := range table.RowIndexesByHas(field, value, op) {
+			out <- idx
+		}
+	}()
+	return out
+}
+
+func (d *ArrowDriver) Delete(tableID uint16) error {
+	d.lock.Lock()
+	defer d.lock.Unlock()
+
+	name, ok := d.idToTable[tableID]
+	if !ok {
+		return fmt.Errorf("table ID %d not found", tableID)
+	}
+
+	if t, ok := d.tables[name]; ok {
+		t.Close()
+		delete(d.tables, name)
+	}
+	delete(d.tableIDs, name)
+	delete(d.fields, name)
+	delete(d.idToTable, tableID)
+
+	_ = d.metaDB.Update(func(tx *bbolt.Tx) error {
+		if b := tx.Bucket(bucketTablesByName); b != nil {
+			_ = 
b.Delete([]byte(name)) + } + if tableID > 0 { + idBytes := make([]byte, 2) + binary.LittleEndian.PutUint16(idBytes, tableID) + if b := tx.Bucket(bucketNamesByID); b != nil { + _ = b.Delete(idBytes) + } + } + return nil + }) + + _ = os.Remove(filepath.Join(d.zoneDir, name+indexFileExt)) + pattern := filepath.Join(d.zoneDir, fmt.Sprintf("%s_*%s", name, arrowFileExt)) + segments, _ := filepath.Glob(pattern) + for _, seg := range segments { + _ = os.Remove(seg) + } + return nil +} + +func (d *ArrowDriver) LookupTableID(name string) (uint16, error) { + d.lock.RLock() + defer d.lock.RUnlock() + name = d.resolveTableName(name) + if id, ok := d.tableIDs[name]; ok { + return id, nil + } + return 0, fmt.Errorf("table %q not found", name) +} + +func (d *ArrowDriver) ListTableIDs() []uint16 { + d.lock.RLock() + defer d.lock.RUnlock() + ids := make([]uint16, 0, len(d.idToTable)) + for id := range d.idToTable { + ids = append(ids, id) + } + return ids +} + +func (d *ArrowDriver) GetTableInfo(tableID uint16) (*benchtop.TableInfo, error) { + d.lock.RLock() + name, ok := d.idToTable[tableID] + d.lock.RUnlock() + if !ok { + return nil, fmt.Errorf("table ID %d not found", tableID) + } + return &benchtop.TableInfo{ + Name: name, + TableId: tableID, + }, nil +} + +func (d *ArrowDriver) AddField(tableID uint16, field string) error { + d.lock.Lock() + name, ok := d.idToTable[tableID] + if !ok { + d.lock.Unlock() + return fmt.Errorf("table ID %d not found", tableID) + } + t, err := d.getOrLoadLocked(name) + if err != nil { + d.lock.Unlock() + return err + } + if _, ok := d.fields[name]; !ok { + d.fields[name] = map[string]struct{}{} + } + d.fields[name][field] = struct{}{} + d.lock.Unlock() + start := time.Now() + log.Infof("arrowdriver.AddField ensure_index_start table=%s tableID=%d field=%s", name, tableID, field) + err = t.EnsureFieldIndex(field) + if err != nil { + log.Errorf("arrowdriver.AddField ensure_index_error table=%s tableID=%d field=%s err=%v", name, tableID, field, err) + 
return err + } + log.Infof("arrowdriver.AddField ensure_index_done table=%s tableID=%d field=%s elapsed=%s", name, tableID, field, time.Since(start).Round(time.Millisecond)) + return nil +} + +func (d *ArrowDriver) RemoveField(tableID uint16, field string) error { + d.lock.Lock() + name, ok := d.idToTable[tableID] + if !ok { + d.lock.Unlock() + return fmt.Errorf("table ID %d not found", tableID) + } + t, err := d.getOrLoadLocked(name) + if err != nil { + d.lock.Unlock() + return err + } + if fields, ok := d.fields[name]; ok { + delete(fields, field) + } + d.lock.Unlock() + start := time.Now() + log.Infof("arrowdriver.RemoveField remove_index_start table=%s tableID=%d field=%s", name, tableID, field) + err = t.RemoveFieldIndex(field) + if err != nil { + log.Errorf("arrowdriver.RemoveField remove_index_error table=%s tableID=%d field=%s err=%v", name, tableID, field, err) + return err + } + log.Infof("arrowdriver.RemoveField remove_index_done table=%s tableID=%d field=%s elapsed=%s", name, tableID, field, time.Since(start).Round(time.Millisecond)) + return nil +} + +func (d *ArrowDriver) ListFields() []benchtop.FieldInfo { + d.lock.RLock() + defer d.lock.RUnlock() + + out := make([]benchtop.FieldInfo, 0) + for label, fields := range d.fields { + for field := range fields { + out = append(out, benchtop.FieldInfo{Label: label, Field: field}) + } + } + sort.Slice(out, func(i, j int) bool { + if out[i].Label == out[j].Label { + return out[i].Field < out[j].Field + } + return out[i].Label < out[j].Label + }) + return out +} + +func (d *ArrowDriver) DeleteRowField(tableID uint16, field, rowID string) error { + // Arrow driver computes field filters from row payloads at query time. + // There is no separate field-index keyspace to mutate for one row. 
+ return nil +} + +func (d *ArrowDriver) InvalidateLoc(tableID uint16, rowID string) { + // Arrow driver does not currently use a table-aware location cache +} + +func (d *ArrowDriver) GetIDsForLabel(label string) chan string { + out := make(chan string, 100) + go func() { + defer close(out) + id, err := d.LookupTableID(label) + if err != nil { + return + } + store, err := d.Get(id) + if err != nil { + return + } + for id := range store.ScanId(nil) { + out <- id + } + }() + return out +} diff --git a/arrowdriver/driver_test.go b/arrowdriver/driver_test.go new file mode 100644 index 0000000..9b1db7f --- /dev/null +++ b/arrowdriver/driver_test.go @@ -0,0 +1,229 @@ +package arrowdriver + +import ( + "path/filepath" + "sort" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/query" +) + +func TestArrowDriverRoundTrip(t *testing.T) { + base := t.TempDir() + + drvRaw, err := NewArrowDriver(base) + if err != nil { + t.Fatalf("NewArrowDriver failed: %v", err) + } + drv := drvRaw.(*ArrowDriver) + defer drv.Close() + + storeRaw, err := drv.New("v_person", []benchtop.ColumnDef{{Key: "name"}, {Key: "age"}}) + if err != nil { + t.Fatalf("New table failed: %v", err) + } + store := storeRaw.(*ArrowTable) + + rows := []benchtop.Row{ + {Id: []byte("p1"), Data: map[string]any{"name": "alice", "age": 30.0}}, + {Id: []byte("p2"), Data: map[string]any{"name": "bob", "age": 41.0}}, + {Id: []byte("p3"), Data: map[string]any{"name": "cory", "age": 25.0}}, + } + locs, err := store.AddRows(rows) + if err != nil { + t.Fatalf("AddRows failed: %v", err) + } + if len(locs) != len(rows) { + t.Fatalf("unexpected loc count: got=%d want=%d", len(locs), len(rows)) + } + + loc, err := store.GetRowLoc("p2") + if err != nil { + t.Fatalf("GetRowLoc failed: %v", err) + } + + row, err := store.GetRow(loc) + if err != nil { + t.Fatalf("GetRow failed: %v", err) + } + if row["_id"] != "p2" { + t.Fatalf("unexpected id: %#v", row["_id"]) + } + + got := map[string]struct{}{} + for id := 
range store.RowIdsByHas("age", 30.0, query.GTE) { + got[id] = struct{}{} + } + if _, ok := got["p1"]; !ok { + t.Fatalf("expected p1 in gte filter") + } + if _, ok := got["p2"]; !ok { + t.Fatalf("expected p2 in gte filter") + } + if _, ok := got["p3"]; ok { + t.Fatalf("did not expect p3 in gte filter") + } + + keys, err := drv.ListTableKeys(store.TableID()) + if err != nil { + t.Fatalf("ListTableKeys failed: %v", err) + } + seen := []string{} + for k := range keys { + seen = append(seen, string(k.Key)) + if k.Loc == nil { + t.Fatalf("expected location in index entry") + } + } + sort.Strings(seen) + if len(seen) != 3 { + t.Fatalf("unexpected key count: %d", len(seen)) + } + + nestedRows := []benchtop.Row{ + { + Id: []byte("obs1"), + Data: map[string]any{ + "code": map[string]any{ + "coding": []any{ + map[string]any{"code": "81247-9"}, + }, + }, + }, + }, + { + Id: []byte("obs2"), + Data: map[string]any{ + "code": map[string]any{ + "coding": []any{ + map[string]any{"code": "81247-8"}, + }, + }, + }, + }, + } + if _, err := store.AddRows(nestedRows); err != nil { + t.Fatalf("AddRows nested failed: %v", err) + } + + nestedGot := map[string]struct{}{} + for id := range store.RowIdsByHas("code.coding.[0].code", "81247-8", query.EQ) { + nestedGot[id] = struct{}{} + } + if _, ok := nestedGot["obs2"]; !ok { + t.Fatalf("expected obs2 in nested eq filter") + } + if _, ok := nestedGot["obs1"]; ok { + t.Fatalf("did not expect obs1 in nested eq filter") + } +} + +func TestArrowDriverReloadPreservesTableIDAndData(t *testing.T) { + base := t.TempDir() + + drvRaw, err := NewArrowDriver(base) + if err != nil { + t.Fatalf("NewArrowDriver failed: %v", err) + } + drv := drvRaw.(*ArrowDriver) + + storeRaw, err := drv.New("e_knows", nil) + if err != nil { + t.Fatalf("New table failed: %v", err) + } + store := storeRaw.(*ArrowTable) + _, err = store.AddRows([]benchtop.Row{ + {Id: []byte("e1"), Data: map[string]any{"from": "p1", "to": "p2"}}, + }) + if err != nil { + t.Fatalf("AddRows 
failed: %v", err)
+	}
+	origID := store.TableID()
+	if origID == 0 {
+		t.Fatalf("table id should be non-zero")
+	}
+	drv.Close()
+
+	drvRaw2, err := NewArrowDriver(base)
+	if err != nil {
+		t.Fatalf("reload driver failed: %v", err)
+	}
+	drv2 := drvRaw2.(*ArrowDriver)
+	defer drv2.Close()
+
+	tid, err := drv2.LookupTableID("e_knows")
+	if err != nil {
+		t.Fatalf("LookupTableID failed: %v", err)
+	}
+	storeRaw2, err := drv2.Get(tid)
+	if err != nil {
+		t.Fatalf("Get after reload failed: %v", err)
+	}
+	store2 := storeRaw2.(*ArrowTable)
+	if store2.TableID() != origID {
+		t.Fatalf("table id changed across reload: got=%d want=%d", store2.TableID(), origID)
+	}
+	loc, err := store2.GetRowLoc("e1")
+	if err != nil {
+		t.Fatalf("GetRowLoc after reload failed: %v", err)
+	}
+	if loc.TableId != origID {
+		t.Fatalf("rowloc table id mismatch: got=%d want=%d", loc.TableId, origID)
+	}
+	row, err := store2.GetRow(loc)
+	if err != nil {
+		t.Fatalf("GetRow after reload failed: %v", err)
+	}
+	if row["_id"] != "e1" {
+		t.Fatalf("unexpected row id: %#v", row["_id"])
+	}
+
+	idxFile := filepath.Join(base, "ARROW_TABLES", "e_knows.idx")
+	// Glob returns an error only for a malformed pattern, never for a missing
+	// file, so existence has to be judged from the match list.
+	matches, err := filepath.Glob(idxFile)
+	if err != nil || len(matches) == 0 {
+		t.Fatalf("expected idx file %s to exist (glob err: %v)", idxFile, err)
+	}
+}
+
+func TestArrowDriverRowIdsByHasContainsOnListField(t *testing.T) {
+	base := t.TempDir()
+
+	drvRaw, err := NewArrowDriver(base)
+	if err != nil {
+		t.Fatalf("NewArrowDriver failed: %v", err)
+	}
+	drv := drvRaw.(*ArrowDriver)
+	defer drv.Close()
+
+	storeRaw, err := drv.New("v_species", []benchtop.ColumnDef{{Key: "eye_colors"}})
+	if err != nil {
+		t.Fatalf("New table failed: %v", err)
+	}
+	store := storeRaw.(*ArrowTable)
+
+	_, err = store.AddRows([]benchtop.Row{
+		{Id: []byte("s1"), Data: map[string]any{"eye_colors": []any{"blue", "yellow"}}},
+		{Id: []byte("s2"), Data: map[string]any{"eye_colors": []any{"yellow"}}},
+		{Id: []byte("s3"), Data: map[string]any{"eye_colors": []any{"red"}}},
+	})
+	if err != nil {
+		t.Fatalf("AddRows 
failed: %v", err) + } + + got := map[string]struct{}{} + for id := range store.RowIdsByHas("eye_colors", "yellow", query.CONTAINS) { + got[id] = struct{}{} + } + if len(got) != 2 { + t.Fatalf("expected 2 matches, got %d", len(got)) + } + if _, ok := got["s1"]; !ok { + t.Fatalf("expected s1 in contains result") + } + if _, ok := got["s2"]; !ok { + t.Fatalf("expected s2 in contains result") + } + if _, ok := got["s3"]; ok { + t.Fatalf("did not expect s3 in contains result") + } +} diff --git a/arrowdriver/table.go b/arrowdriver/table.go new file mode 100644 index 0000000..e372bc9 --- /dev/null +++ b/arrowdriver/table.go @@ -0,0 +1,78 @@ +package arrowdriver + +import ( + "sync" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/bmeg/benchtop" + "go.etcd.io/bbolt" +) + +const ( + arrowFileExt = ".arrow" + indexFileExt = ".idx" + idColumn = "_id" + dataColumn = "_data" + + idsBucket = "ids" + metaBucket = "meta" + metaTableIDKey = "table_id" + metaNextSecKey = "next_section" + metaColumnsKey = "columns" + metaIndexKey = "indexed_fields" + + fieldIndexBucket = "field_index" + reverseFieldIndexBucket = "reverse_field_index" +) + +const sectionWriteBatchRows = 16384 + +const defaultCompactRowsPerSection = 50000 + +type columnEncoding uint8 + +const ( + encString columnEncoding = iota + encFloat64 + encBool + encJSON +) + +type ArrowTable struct { + name string + baseDir string + tableID uint16 + columns []benchtop.ColumnDef + schema *arrow.Schema + + indexedFields map[string]struct{} + writeHintKeys []string + writeHintEnc map[string]columnEncoding + writeHintOnly bool + + indexPath string + indexDB *bbolt.DB + lock sync.RWMutex + + columnCacheLock sync.RWMutex + columnCache map[string][]any + columnCacheOrder []string + columnCacheCap int + + sectionRowCacheLock sync.RWMutex + sectionRowCache map[uint16][]map[string]any + sectionRowCacheOrder []uint16 + sectionRowCacheCap int + + valueIndexCacheLock sync.RWMutex + valueIndexCache map[string]map[string][]indexedRow 
+ + rowOrdinalCacheLock sync.RWMutex + rowOrdinalRows []indexedRow + rowOrdinalByID map[string]uint32 +} + +type indexedRow struct { + id string + loc *benchtop.RowLoc +} diff --git a/arrowdriver/table_filtering.go b/arrowdriver/table_filtering.go new file mode 100644 index 0000000..f1148c9 --- /dev/null +++ b/arrowdriver/table_filtering.go @@ -0,0 +1,552 @@ +package arrowdriver + +import ( + "reflect" + "runtime" + "sort" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/bmeg/benchtop" + bfilters "github.com/bmeg/benchtop/filters" + "github.com/bmeg/benchtop/jsontable/tpath" + "github.com/bmeg/benchtop/query" + "github.com/bmeg/benchtop/util" + "github.com/bmeg/grip/log" +) + +func isTopLevelField(field string) bool { + if field == "" { + return false + } + return !strings.Contains(field, ".") && !strings.Contains(field, "[") +} + +func tableLabelFromName(name string) string { + if len(name) > 2 && name[1] == '_' { + return name[2:] + } + return name +} + +func extractSimpleFieldFilters(filter benchtop.RowFilter) ([]query.FieldFilter, bool) { + if filter == nil || filter.IsNoOp() { + return nil, false + } + raw := filter.GetFilter() + if raw == nil { + return nil, false + } + switch f := raw.(type) { + case []query.FieldFilter: + if len(f) == 0 { + return nil, false + } + out := make([]query.FieldFilter, len(f)) + copy(out, f) + return out, true + } + + v := reflect.ValueOf(raw) + if v.Kind() != reflect.Slice { + return nil, false + } + out := make([]query.FieldFilter, 0, v.Len()) + for i := 0; i < v.Len(); i++ { + elem := v.Index(i) + if elem.Kind() == reflect.Interface && !elem.IsNil() { + elem = elem.Elem() + } + if !elem.IsValid() || elem.Kind() != reflect.Struct { + return nil, false + } + + fField := elem.FieldByName("Field") + fOp := elem.FieldByName("Operator") + fVal := elem.FieldByName("Value") + if !fField.IsValid() || !fOp.IsValid() || !fVal.IsValid() || fField.Kind() != reflect.String { + return nil, false + } + var op int64 + switch 
fOp.Kind() { + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + op = fOp.Int() + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: + op = int64(fOp.Uint()) + default: + return nil, false + } + out = append(out, query.FieldFilter{ + Field: fField.String(), + Operator: query.Condition(op), + Value: fVal.Interface(), + }) + } + if len(out) == 0 { + return nil, false + } + return out, true +} + +func canUseTopLevelFilter(filters []query.FieldFilter) bool { + for _, f := range filters { + switch f.Field { + case "_id", "_label": + continue + default: + if strings.Contains(f.Field, "*") { + return false + } + } + } + return true +} + +func isIndexedSetOp(op query.Condition) bool { + return op == query.EQ || op == query.WITHIN +} + +func groupIndexedRowsBySection(rows []indexedRow) map[uint16][]indexedRow { + out := map[uint16][]indexedRow{} + for _, r := range rows { + out[r.loc.Section] = append(out[r.loc.Section], r) + } + return out +} + +func (t *ArrowTable) indexedRowsByIDFilter(f query.FieldFilter) []indexedRow { + values := []any{f.Value} + if f.Operator == query.WITHIN { + values = util.SliceToAny(f.Value) + } + out := make([]indexedRow, 0, len(values)) + for _, v := range values { + id, ok := v.(string) + if !ok { + continue + } + loc, err := t.GetRowLoc(id) + if err != nil { + continue + } + out = append(out, indexedRow{id: id, loc: loc}) + } + return out +} + +// tryIndexedConjunction returns a fully-indexed top-level AND filter result. +// It only optimizes EQ/WITHIN conjunctions on indexed fields (plus _id/_label guards). 
+func (t *ArrowTable) tryIndexedConjunction(filters []query.FieldFilter) (map[uint16][]indexedRow, bool) { + label := tableLabelFromName(t.name) + allRows, byID, err := t.getOrLoadRowOrdinalCache() + if err != nil { + return nil, false + } + if len(allRows) == 0 { + return map[uint16][]indexedRow{}, true + } + + seeded := false + var acc *denseBitset + + for _, f := range filters { + if !isIndexedSetOp(f.Operator) { + return nil, false + } + switch f.Field { + case "_label": + if !bfilters.ApplyFilterCondition(label, &f) { + return map[uint16][]indexedRow{}, true + } + continue + case "_id": + rows := t.indexedRowsByIDFilter(f) + bits := newDenseBitset(len(allRows)) + for _, r := range rows { + if ord, ok := byID[r.id]; ok { + bits.set(ord) + } + } + if !seeded { + acc = bits + seeded = true + } else { + acc.and(bits) + } + continue + default: + t.lock.RLock() + _, indexed := t.indexedFields[f.Field] + t.lock.RUnlock() + if !indexed { + return nil, false + } + values := []any{f.Value} + if f.Operator == query.WITHIN { + values = util.SliceToAny(f.Value) + } + rows, _ := t.indexedMatches(f.Field, values) + bits := newDenseBitset(len(allRows)) + for _, r := range rows { + if ord, ok := byID[r.id]; ok { + bits.set(ord) + } + } + if !seeded { + acc = bits + seeded = true + } else { + acc.and(bits) + } + } + if seeded && !acc.any() { + return map[uint16][]indexedRow{}, true + } + } + if !seeded { + return nil, false + } + ordinals := acc.indices() + rows := make([]indexedRow, 0, len(ordinals)) + for _, ord := range ordinals { + i := int(ord) + if i >= 0 && i < len(allRows) { + rows = append(rows, allRows[i]) + } + } + return groupIndexedRowsBySection(rows), true +} + +func applyConditionsOnOffset(filters []query.FieldFilter, ids []string, cols map[string][]any, offset uint32, label string) bool { + i := int(offset) + if i < 0 || i >= len(ids) { + return false + } + for _, cond := range filters { + var v any + switch cond.Field { + case "_id": + v = ids[i] + case 
"_label": + v = label + default: + cv, ok := cols[cond.Field] + if !ok || i >= len(cv) { + v = nil + } else { + v = cv[i] + } + } + if !bfilters.ApplyFilterCondition(v, &cond) { + return false + } + } + return true +} + +func (t *ArrowTable) findRowsByTopLevelFilters(filters []query.FieldFilter) (map[uint16][]indexedRow, bool, error) { + if len(filters) == 0 || !canUseTopLevelFilter(filters) { + return nil, false, nil + } + start := time.Now() + if matchedBySection, ok := t.tryIndexedConjunction(filters); ok { + matchCount := 0 + for _, m := range matchedBySection { + matchCount += len(m) + } + log.Debugf("arrowtable.findRowsByTopLevelFilters indexed_conjunction table=%s filters=%d matches=%d sections=%d elapsed=%s", t.name, len(filters), matchCount, len(matchedBySection), time.Since(start).Round(time.Millisecond)) + return matchedBySection, true, nil + } + rows, err := t.listIndexRows() + if err != nil { + return nil, false, err + } + bySection := map[uint16][]indexedRow{} + for _, r := range rows { + bySection[r.loc.Section] = append(bySection[r.loc.Section], r) + } + label := tableLabelFromName(t.name) + out := map[uint16][]indexedRow{} + + required := make([]string, 0, len(filters)) + seen := map[string]struct{}{} + for _, f := range filters { + if f.Field == "_id" || f.Field == "_label" { + continue + } + if _, ok := seen[f.Field]; ok { + continue + } + seen[f.Field] = struct{}{} + required = append(required, f.Field) + } + + sections := make([]uint16, 0, len(bySection)) + for sec := range bySection { + sections = append(sections, sec) + } + sort.Slice(sections, func(i, j int) bool { return sections[i] < sections[j] }) + + workers := runtime.NumCPU() + if workers < 2 { + workers = 2 + } + if workers > 16 { + workers = 16 + } + secCh := make(chan uint16, len(sections)) + var wg sync.WaitGroup + var outMu sync.Mutex + var failed atomic.Bool + + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for sec := range secCh { + secRows := 
bySection[sec] + ids, cols, err := t.readSectionTopLevelColumns(sec, required) + if err != nil { + failed.Store(true) + continue + } + matches := make([]indexedRow, 0, len(secRows)) + for _, r := range secRows { + if applyConditionsOnOffset(filters, ids, cols, r.loc.Offset, label) { + matches = append(matches, r) + } + } + if len(matches) > 0 { + outMu.Lock() + out[sec] = matches + outMu.Unlock() + } + } + }() + } + for _, sec := range sections { + secCh <- sec + } + close(secCh) + wg.Wait() + if failed.Load() { + log.Debugf("arrowtable.findRowsByTopLevelFilters fallback table=%s filters=%d reason=column_read_error elapsed=%s", t.name, len(filters), time.Since(start).Round(time.Millisecond)) + return nil, false, nil + } + matchCount := 0 + for _, m := range out { + matchCount += len(m) + } + log.Debugf("arrowtable.findRowsByTopLevelFilters optimized table=%s filters=%d rows=%d matches=%d sections=%d elapsed=%s", t.name, len(filters), len(rows), matchCount, len(out), time.Since(start).Round(time.Millisecond)) + return out, true, nil +} + +func (t *ArrowTable) indexedMatches(field string, values []any) ([]indexedRow, int) { + seen := map[string]struct{}{} + out := make([]indexedRow, 0, 1024) + totalMissing := 0 + for _, v := range values { + valueBytes, ok := encodeIndexValue(v) + if !ok { + continue + } + rows, missing := t.getOrLoadValuePostings(field, valueBytes) + totalMissing += missing + for _, r := range rows { + if _, ok := seen[r.id]; ok { + continue + } + seen[r.id] = struct{}{} + out = append(out, r) + } + } + return out, totalMissing +} + +func (t *ArrowTable) RowIdsByHas(field string, value any, op query.Condition) chan string { + out := make(chan string, 100) + go func() { + defer close(out) + if field == "_label" { + label := tableLabelFromName(t.name) + rows, err := t.listIndexRows() + if err != nil { + return + } + cond := &query.FieldFilter{Field: field, Operator: op, Value: value} + for _, r := range rows { + if bfilters.ApplyFilterCondition(label, 
cond) { + out <- r.id + } + } + return + } + if op == query.EQ || op == query.WITHIN { + t.lock.RLock() + _, indexed := t.indexedFields[field] + t.lock.RUnlock() + if indexed { + vals := []any{value} + if op == query.WITHIN { + vals = util.SliceToAny(value) + } + matches, _ := t.indexedMatches(field, vals) + if len(matches) > 0 { + for _, m := range matches { + out <- m.id + } + return + } + } + } + if isTopLevelField(field) && !strings.Contains(field, "*") { + filters := []query.FieldFilter{{Field: field, Operator: op, Value: value}} + if matchedBySection, optimized, _ := t.findRowsByTopLevelFilters(filters); optimized { + matchCount := 0 + for _, rows := range matchedBySection { + matchCount += len(rows) + } + if matchCount == 0 { + // Defensive fallback: avoid false-zero regressions from + // top-level-only evaluation; run legacy path below. + } else { + sections := make([]int, 0, len(matchedBySection)) + for sec := range matchedBySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + for _, secInt := range sections { + sec := uint16(secInt) + for _, r := range matchedBySection[sec] { + out <- r.id + } + } + return + } + } + } + rows, err := t.listIndexRows() + if err != nil { + return + } + + bySection := map[uint16][]indexedRow{} + for _, r := range rows { + bySection[r.loc.Section] = append(bySection[r.loc.Section], r) + } + + sections := make([]int, 0, len(bySection)) + for sec := range bySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + + materializedField := !strings.Contains(field, "*") + + for _, secInt := range sections { + sec := uint16(secInt) + if materializedField { + colVals, found, err := t.readSectionTopLevelColumn(sec, field) + if err == nil && found { + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(colVals) { + continue + } + if bfilters.ApplyFilterCondition(colVals[int(r.loc.Offset)], &query.FieldFilter{Field: field, Operator: op, Value: value}) { + out <- r.id + } + } + 
continue + } + } + + secRows, _, err := t.readSectionRows(sec) + if err != nil { + continue + } + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(secRows) { + continue + } + row := secRows[int(r.loc.Offset)] + fieldVal := tpath.PathLookup(row, field) + if bfilters.ApplyFilterCondition(fieldVal, &query.FieldFilter{Field: field, Operator: op, Value: value}) { + out <- r.id + } + } + } + }() + return out +} + +func (t *ArrowTable) RowIndexesByHas(field string, value any, op query.Condition) chan benchtop.Index { + out := make(chan benchtop.Index, 100) + go func() { + defer close(out) + start := time.Now() + total := 0 + if op == query.EQ || op == query.WITHIN { + t.lock.RLock() + _, indexed := t.indexedFields[field] + t.lock.RUnlock() + vals := []any{value} + if op == query.WITHIN { + vals = util.SliceToAny(value) + } + if indexed { + matches, missing := t.indexedMatches(field, vals) + for _, m := range matches { + out <- benchtop.Index{Key: []byte(m.id), Loc: m.loc} + total++ + } + if total > 0 { + log.Debugf("arrowtable.RowIndexesByHas indexed_first table=%s field=%s op=%d results=%d missingLoc=%d elapsed=%s", t.name, field, op, total, missing, time.Since(start).Round(time.Millisecond)) + return + } + } + } + + if isTopLevelField(field) && !strings.Contains(field, "*") { + filters := []query.FieldFilter{{Field: field, Operator: op, Value: value}} + if matchedBySection, optimized, _ := t.findRowsByTopLevelFilters(filters); optimized { + matchCount := 0 + for _, rows := range matchedBySection { + matchCount += len(rows) + } + if matchCount == 0 { + // Defensive fallback: avoid false-zero regressions from + // top-level-only evaluation; run legacy path below. 
// makeFieldIndexPrefix builds "<field> 0x1f <valueBytes>", the leading part
// shared by all forward-index keys for one field/value pair.
// NOTE(review): there is no trailing separator, so this prefix also prefixes
// keys whose encoded value merely starts with valueBytes — confirm that
// prefix scans account for it.
func makeFieldIndexPrefix(field string, valueBytes []byte) []byte {
	parts := [][]byte{[]byte(field), valueBytes}
	return bytes.Join(parts, []byte{0x1f})
}

// makeFieldIndexKey builds the forward-index key
// "<field> 0x1f <valueBytes> 0x1f <rowID>".
func makeFieldIndexKey(field string, valueBytes []byte, rowID string) []byte {
	prefix := makeFieldIndexPrefix(field, valueBytes)
	key := make([]byte, 0, len(prefix)+1+len(rowID))
	key = append(key, prefix...)
	key = append(key, 0x1f)
	return append(key, rowID...)
}
:= toFloat64(v); ok { + out := make([]byte, 1+8) + out[0] = 'f' + binary.LittleEndian.PutUint64(out[1:], math.Float64bits(fv)) + return out, true + } + valueBytes, err := sonic.ConfigFastest.Marshal(v) + if err != nil { + return nil, false + } + out := make([]byte, 1+len(valueBytes)) + out[0] = 'j' + copy(out[1:], valueBytes) + return out, true +} + +func makeReverseFieldIndexKey(field, rowID string) []byte { + return bytes.Join([][]byte{[]byte(field), []byte(rowID)}, []byte{0x1f}) +} + +func indexedIDFromKey(key []byte) (string, bool) { + i := bytes.LastIndexByte(key, 0x1f) + if i < 0 || i+1 >= len(key) { + return "", false + } + return string(key[i+1:]), true +} + +func (t *ArrowTable) persistIndexedFieldsLocked(tx *bbolt.Tx) error { + mb := tx.Bucket([]byte(metaBucket)) + if mb == nil { + return fmt.Errorf("missing meta bucket") + } + idxFields := make([]string, 0, len(t.indexedFields)) + for f := range t.indexedFields { + idxFields = append(idxFields, f) + } + sort.Strings(idxFields) + b, err := sonic.ConfigFastest.Marshal(idxFields) + if err != nil { + return err + } + return mb.Put([]byte(metaIndexKey), b) +} + +func (t *ArrowTable) removeFieldIndexLocked(tx *bbolt.Tx, field string) error { + fwd := tx.Bucket([]byte(fieldIndexBucket)) + rev := tx.Bucket([]byte(reverseFieldIndexBucket)) + if fwd == nil || rev == nil { + return fmt.Errorf("missing field index buckets") + } + prefix := append([]byte(field), 0x1f) + for c, k := fwd.Cursor(), []byte(nil); ; { + if k == nil { + k, _ = c.Seek(prefix) + } else { + k, _ = c.Next() + } + if k == nil || !bytes.HasPrefix(k, prefix) { + break + } + if err := c.Delete(); err != nil { + return err + } + } + for c, k := rev.Cursor(), []byte(nil); ; { + if k == nil { + k, _ = c.Seek(prefix) + } else { + k, _ = c.Next() + } + if k == nil || !bytes.HasPrefix(k, prefix) { + break + } + if err := c.Delete(); err != nil { + return err + } + } + return nil +} + +func (t *ArrowTable) buildFieldIndexLocked(field string) error { + 
rows, err := t.listIndexRows() + if err != nil { + return err + } + bySection := map[uint16][]indexedRow{} + for _, r := range rows { + bySection[r.loc.Section] = append(bySection[r.loc.Section], r) + } + sections := make([]int, 0, len(bySection)) + for sec := range bySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + + for _, secInt := range sections { + sec := uint16(secInt) + secRows, _, err := t.readSectionRows(sec) + if err != nil { + continue + } + err = t.indexDB.Update(func(tx *bbolt.Tx) error { + fwd := tx.Bucket([]byte(fieldIndexBucket)) + rev := tx.Bucket([]byte(reverseFieldIndexBucket)) + if fwd == nil || rev == nil { + return fmt.Errorf("missing field index buckets") + } + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(secRows) { + continue + } + row := secRows[int(r.loc.Offset)] + fieldVal := tpath.PathLookup(row, field) + if fieldVal == nil { + continue + } + valueBytes, ok := encodeIndexValue(fieldVal) + if !ok { + continue + } + if err := fwd.Put(makeFieldIndexKey(field, valueBytes, r.id), encodeRowLoc(r.loc)); err != nil { + return err + } + if err := rev.Put(makeReverseFieldIndexKey(field, r.id), valueBytes); err != nil { + return err + } + } + return nil + }) + if err != nil { + return err + } + } + return nil +} + +func (t *ArrowTable) EnsureFieldIndex(field string) error { + t.lock.Lock() + defer t.lock.Unlock() + if t.indexedFields == nil { + t.indexedFields = map[string]struct{}{} + } + t.indexedFields[field] = struct{}{} + if err := t.indexDB.Update(func(tx *bbolt.Tx) error { + if err := t.persistIndexedFieldsLocked(tx); err != nil { + return err + } + return t.removeFieldIndexLocked(tx, field) + }); err != nil { + return err + } + if err := t.buildFieldIndexLocked(field); err != nil { + return err + } + t.invalidateExecutionCaches() + return nil +} + +func (t *ArrowTable) RemoveFieldIndex(field string) error { + t.lock.Lock() + defer t.lock.Unlock() + delete(t.indexedFields, field) + err := 
t.indexDB.Update(func(tx *bbolt.Tx) error { + if err := t.persistIndexedFieldsLocked(tx); err != nil { + return err + } + return t.removeFieldIndexLocked(tx, field) + }) + if err == nil { + t.invalidateExecutionCaches() + } + return err +} + +func (t *ArrowTable) IndexedFields() []string { + t.lock.RLock() + defer t.lock.RUnlock() + out := make([]string, 0, len(t.indexedFields)) + for f := range t.indexedFields { + out = append(out, f) + } + sort.Strings(out) + return out +} + +func encodeRowLoc(loc *benchtop.RowLoc) []byte { + out := make([]byte, 14) + binary.LittleEndian.PutUint16(out[0:2], loc.TableId) + binary.LittleEndian.PutUint16(out[2:4], loc.Section) + binary.LittleEndian.PutUint32(out[4:8], loc.Offset) + binary.LittleEndian.PutUint32(out[8:12], loc.Size) + binary.LittleEndian.PutUint16(out[12:14], loc.Index) + return out +} + +func decodeRowLoc(v []byte) (*benchtop.RowLoc, error) { + if len(v) < 14 { + return nil, fmt.Errorf("invalid row loc length: %d", len(v)) + } + return &benchtop.RowLoc{ + TableId: binary.LittleEndian.Uint16(v[0:2]), + Section: binary.LittleEndian.Uint16(v[2:4]), + Offset: binary.LittleEndian.Uint32(v[4:8]), + Size: binary.LittleEndian.Uint32(v[8:12]), + Index: binary.LittleEndian.Uint16(v[12:14]), + }, nil +} + +func (t *ArrowTable) listIndexRows() ([]indexedRow, error) { + out := []indexedRow{} + err := t.indexDB.View(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + if b == nil { + return nil + } + return b.ForEach(func(k, v []byte) error { + loc, err := decodeRowLoc(v) + if err != nil { + return err + } + out = append(out, indexedRow{id: string(k), loc: loc}) + return nil + }) + }) + if err != nil { + return nil, err + } + sort.Slice(out, func(i, j int) bool { + if out[i].loc.Section == out[j].loc.Section { + if out[i].loc.Offset == out[j].loc.Offset { + return out[i].id < out[j].id + } + return out[i].loc.Offset < out[j].loc.Offset + } + return out[i].loc.Section < out[j].loc.Section + }) + return out, nil +} + 
+func (t *ArrowTable) activeRowLocs() (map[string]*benchtop.RowLoc, error) { + out := map[string]*benchtop.RowLoc{} + err := t.indexDB.View(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + if b == nil { + return nil + } + return b.ForEach(func(k, v []byte) error { + loc, err := decodeRowLoc(v) + if err != nil { + return err + } + out[string(k)] = loc + return nil + }) + }) + return out, err +} + +func (t *ArrowTable) nextSection() (uint16, error) { + var next uint16 + err := t.indexDB.View(func(tx *bbolt.Tx) error { + mb := tx.Bucket([]byte(metaBucket)) + if mb == nil { + next = 0 + return nil + } + v := mb.Get([]byte(metaNextSecKey)) + if len(v) >= 2 { + next = binary.LittleEndian.Uint16(v) + } + return nil + }) + return next, err +} + +func (t *ArrowTable) ScanIndex(fn func(id string, loc *benchtop.RowLoc)) error { + return t.indexDB.View(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + if b == nil { + return nil + } + return b.ForEach(func(k, v []byte) error { + loc, err := decodeRowLoc(v) + if err != nil { + return err + } + fn(string(k), loc) + return nil + }) + }) +} diff --git a/arrowdriver/table_meta.go b/arrowdriver/table_meta.go new file mode 100644 index 0000000..0466077 --- /dev/null +++ b/arrowdriver/table_meta.go @@ -0,0 +1,500 @@ +package arrowdriver + +import ( + "encoding/binary" + "fmt" + "path/filepath" + "sort" + "strings" + "sync" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/bmeg/benchtop" + "github.com/bytedance/sonic" + "go.etcd.io/bbolt" +) + +func isIndexableValue(v any) bool { + if v == nil { + return false + } + switch v.(type) { + case string, bool: + return true + } + return isNumeric(v) +} + +func collectIndexableFlatValues(data map[string]any) map[string]any { + flat := flattenRowData(data) + out := make(map[string]any, len(flat)) + for k, v := range flat { + if strings.Contains(k, ".") || strings.Contains(k, "[") { + continue + } + if !isIndexableValue(v) { + continue + } + out[k] = v + } + 
// isNumeric reports whether v holds one of Go's built-in integer or
// floating-point types, which is exactly the set toFloat64 can convert.
func isNumeric(v any) bool {
	_, ok := toFloat64(v)
	return ok
}

// toFloat64 converts a built-in integer or floating-point value to float64.
// The second result is false, and the first is 0, for every other dynamic
// type, including nil and named types defined on top of numeric ones.
func toFloat64(v any) (float64, bool) {
	switch n := v.(type) {
	case float64:
		return n, true
	case float32:
		return float64(n), true
	case int:
		return float64(n), true
	case int64:
		return float64(n), true
	case int32:
		return float64(n), true
	case int16:
		return float64(n), true
	case int8:
		return float64(n), true
	case uint:
		return float64(n), true
	case uint64:
		return float64(n), true
	case uint32:
		return float64(n), true
	case uint16:
		return float64(n), true
	case uint8:
		return float64(n), true
	}
	return 0, false
}
onlyNumeric { + return encFloat64 + } + return encJSON +} + +func inferSectionColumns(rows []sectionRowData) ([]string, map[string]columnEncoding) { + keySet := map[string]struct{}{} + for _, row := range rows { + for k := range row.cols { + if k == idColumn || k == dataColumn { + continue + } + keySet[k] = struct{}{} + } + } + keys := make([]string, 0, len(keySet)) + for k := range keySet { + keys = append(keys, k) + } + sort.Strings(keys) + enc := make(map[string]columnEncoding, len(keys)) + for _, k := range keys { + enc[k] = detectEncoding(rows, k) + } + return keys, enc +} + +func (t *ArrowTable) SetWriteHints(keys []string, enc map[string]columnEncoding, strict bool) { + keySet := map[string]struct{}{} + outKeys := make([]string, 0, len(keys)) + for _, k := range keys { + if k == "" || k == idColumn || k == dataColumn { + continue + } + if _, ok := keySet[k]; ok { + continue + } + keySet[k] = struct{}{} + outKeys = append(outKeys, k) + } + sort.Strings(outKeys) + outEnc := map[string]columnEncoding{} + for k, v := range enc { + if _, ok := keySet[k]; ok { + outEnc[k] = v + } + } + + t.lock.Lock() + defer t.lock.Unlock() + t.writeHintKeys = outKeys + t.writeHintEnc = outEnc + t.writeHintOnly = strict +} + +func (t *ArrowTable) inferSectionColumns(rows []sectionRowData) ([]string, map[string]columnEncoding) { + hintKeys := append([]string(nil), t.writeHintKeys...) + hintEnc := map[string]columnEncoding{} + for k, v := range t.writeHintEnc { + hintEnc[k] = v + } + hintOnly := t.writeHintOnly + + if len(hintKeys) == 0 { + return inferSectionColumns(rows) + } + + keys := append([]string(nil), hintKeys...) 
+ if !hintOnly { + keySet := map[string]struct{}{} + for _, k := range keys { + keySet[k] = struct{}{} + } + for _, row := range rows { + for k := range row.cols { + if k == idColumn || k == dataColumn { + continue + } + if _, ok := keySet[k]; ok { + continue + } + keySet[k] = struct{}{} + keys = append(keys, k) + } + } + sort.Strings(keys) + } + + enc := make(map[string]columnEncoding, len(keys)) + for _, k := range keys { + if v, ok := hintEnc[k]; ok { + enc[k] = v + continue + } + enc[k] = detectEncoding(rows, k) + } + + // Structural fields are used heavily by graph traversals and should + // remain materialized even with schema hints. + need := []string{"_from", "_to", "_label"} + seen := map[string]struct{}{} + for _, k := range keys { + seen[k] = struct{}{} + } + for _, k := range need { + if _, ok := seen[k]; ok { + continue + } + keys = append(keys, k) + enc[k] = detectEncoding(rows, k) + } + sort.Strings(keys) + return keys, enc +} + +func newArrowTable(baseDir, name string, tableID uint16, columns []benchtop.ColumnDef) (*ArrowTable, error) { + indexPath := filepath.Join(baseDir, name+indexFileExt) + + db, err := bbolt.Open(indexPath, 0600, nil) + if err != nil { + return nil, err + } + + t := &ArrowTable{ + name: name, + baseDir: baseDir, + tableID: tableID, + columns: columns, + schema: arrow.NewSchema([]arrow.Field{{Name: idColumn, Type: arrow.BinaryTypes.String}}, nil), + indexedFields: map[string]struct{}{}, + indexPath: indexPath, + indexDB: db, + lock: sync.RWMutex{}, + columnCache: map[string][]any{}, + columnCacheCap: 256, + sectionRowCache: map[uint16][]map[string]any{}, + sectionRowCacheCap: 24, + valueIndexCache: map[string]map[string][]indexedRow{}, + rowOrdinalByID: map[string]uint32{}, + } + if len(columns) > 0 { + keys := make([]string, 0, len(columns)) + for _, c := range columns { + if c.Key == "" || c.Key == idColumn || c.Key == dataColumn { + continue + } + keys = append(keys, c.Key) + } + t.writeHintKeys = keys + t.writeHintEnc = 
map[string]columnEncoding{} + t.writeHintOnly = false + } + + if err := t.indexDB.Update(func(tx *bbolt.Tx) error { + if _, err := tx.CreateBucketIfNotExists([]byte(idsBucket)); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists([]byte(fieldIndexBucket)); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists([]byte(reverseFieldIndexBucket)); err != nil { + return err + } + mb, err := tx.CreateBucketIfNotExists([]byte(metaBucket)) + if err != nil { + return err + } + if mb.Get([]byte(metaTableIDKey)) == nil { + v := make([]byte, 2) + binary.LittleEndian.PutUint16(v, tableID) + if err := mb.Put([]byte(metaTableIDKey), v); err != nil { + return err + } + } + if mb.Get([]byte(metaNextSecKey)) == nil { + v := make([]byte, 2) + binary.LittleEndian.PutUint16(v, 0) + if err := mb.Put([]byte(metaNextSecKey), v); err != nil { + return err + } + } + colBytes, err := sonic.ConfigFastest.Marshal(columns) + if err != nil { + return err + } + if err := mb.Put([]byte(metaColumnsKey), colBytes); err != nil { + return err + } + if mb.Get([]byte(metaIndexKey)) == nil { + idxBytes, err := sonic.ConfigFastest.Marshal([]string{}) + if err != nil { + return err + } + if err := mb.Put([]byte(metaIndexKey), idxBytes); err != nil { + return err + } + } + return nil + }); err != nil { + db.Close() + return nil, err + } + + return t, nil +} + +func loadArrowTable(baseDir, name string) (*ArrowTable, error) { + indexPath := filepath.Join(baseDir, name+indexFileExt) + db, err := bbolt.Open(indexPath, 0600, nil) + if err != nil { + return nil, err + } + + t := &ArrowTable{ + name: name, + baseDir: baseDir, + schema: arrow.NewSchema([]arrow.Field{{Name: idColumn, Type: arrow.BinaryTypes.String}}, nil), + indexedFields: map[string]struct{}{}, + indexPath: indexPath, + indexDB: db, + lock: sync.RWMutex{}, + columnCache: map[string][]any{}, + columnCacheCap: 256, + sectionRowCache: map[uint16][]map[string]any{}, + sectionRowCacheCap: 24, + valueIndexCache: 
map[string]map[string][]indexedRow{}, + rowOrdinalByID: map[string]uint32{}, + } + + if err := t.indexDB.Update(func(tx *bbolt.Tx) error { + if _, err := tx.CreateBucketIfNotExists([]byte(idsBucket)); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists([]byte(fieldIndexBucket)); err != nil { + return err + } + if _, err := tx.CreateBucketIfNotExists([]byte(reverseFieldIndexBucket)); err != nil { + return err + } + mb, err := tx.CreateBucketIfNotExists([]byte(metaBucket)) + if err != nil { + return err + } + + tid := mb.Get([]byte(metaTableIDKey)) + if tid == nil { + v := make([]byte, 2) + binary.LittleEndian.PutUint16(v, 0) + if err := mb.Put([]byte(metaTableIDKey), v); err != nil { + return err + } + t.tableID = 0 + } else { + t.tableID = binary.LittleEndian.Uint16(tid) + } + + next := mb.Get([]byte(metaNextSecKey)) + if next == nil { + v := make([]byte, 2) + binary.LittleEndian.PutUint16(v, t.inferNextSectionUnsafe()) + if err := mb.Put([]byte(metaNextSecKey), v); err != nil { + return err + } + } + + colBytes := mb.Get([]byte(metaColumnsKey)) + if len(colBytes) > 0 { + var cols []benchtop.ColumnDef + if err := sonic.ConfigFastest.Unmarshal(colBytes, &cols); err == nil { + t.columns = cols + } + } + idxBytes := mb.Get([]byte(metaIndexKey)) + if len(idxBytes) > 0 { + var idxFields []string + if err := sonic.ConfigFastest.Unmarshal(idxBytes, &idxFields); err == nil { + for _, f := range idxFields { + t.indexedFields[f] = struct{}{} + } + } + } + return nil + }); err != nil { + db.Close() + return nil, err + } + + return t, nil +} + +func (t *ArrowTable) Close() error { + t.lock.Lock() + defer t.lock.Unlock() + if t.indexDB != nil { + return t.indexDB.Close() + } + return nil +} + +func (t *ArrowTable) sectionPath(section uint16) string { + return filepath.Join(t.baseDir, fmt.Sprintf("%s_%06d%s", t.name, section, arrowFileExt)) +} + +func (t *ArrowTable) inferNextSectionUnsafe() uint16 { + prefix := t.name + "_" + pattern := 
filepath.Join(t.baseDir, t.name+"_*"+arrowFileExt) + files, err := filepath.Glob(pattern) + if err != nil { + return 0 + } + maxSection := -1 + for _, p := range files { + base := filepath.Base(p) + if !strings.HasPrefix(base, prefix) || !strings.HasSuffix(base, arrowFileExt) { + continue + } + mid := strings.TrimSuffix(strings.TrimPrefix(base, prefix), arrowFileExt) + var sec int + if _, err := fmt.Sscanf(mid, "%d", &sec); err != nil { + continue + } + if sec > maxSection { + maxSection = sec + } + } + if maxSection < 0 { + return 0 + } + if maxSection >= int(^uint16(0)) { + return ^uint16(0) + } + return uint16(maxSection + 1) +} + +func (t *ArrowTable) reserveSection() (uint16, error) { + var section uint16 + err := t.indexDB.Update(func(tx *bbolt.Tx) error { + mb := tx.Bucket([]byte(metaBucket)) + if mb == nil { + return fmt.Errorf("missing meta bucket") + } + next := mb.Get([]byte(metaNextSecKey)) + if len(next) < 2 { + section = t.inferNextSectionUnsafe() + } else { + section = binary.LittleEndian.Uint16(next) + } + v := make([]byte, 2) + binary.LittleEndian.PutUint16(v, section+1) + return mb.Put([]byte(metaNextSecKey), v) + }) + return section, err +} diff --git a/arrowdriver/table_ops.go b/arrowdriver/table_ops.go new file mode 100644 index 0000000..6daded0 --- /dev/null +++ b/arrowdriver/table_ops.go @@ -0,0 +1,424 @@ +package arrowdriver + +import ( + "encoding/binary" + "fmt" + "os" + "runtime" + "sync" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/grip/log" + "go.etcd.io/bbolt" +) + +func (t *ArrowTable) GetColumnDefs() []benchtop.ColumnDef { + out := make([]benchtop.ColumnDef, len(t.columns)) + copy(out, t.columns) + return out +} + +func (t *ArrowTable) HasField(field string) bool { + if field == idColumn { + return true + } + for _, c := range t.columns { + if c.Key == field { + return true + } + } + return true +} + +func (t *ArrowTable) AddRow(elem benchtop.Row) (*benchtop.RowLoc, error) { + locs, err := 
t.AddRows([]benchtop.Row{elem}) + if err != nil { + return nil, err + } + if len(locs) == 0 { + return nil, fmt.Errorf("no location returned") + } + return locs[0], nil +} + +func (t *ArrowTable) AddRows(elems []benchtop.Row) ([]*benchtop.RowLoc, error) { + if len(elems) == 0 { + return []*benchtop.RowLoc{}, nil + } + start := time.Now() + t.lock.Lock() + defer t.lock.Unlock() + + section, err := t.reserveSection() + if err != nil { + return nil, err + } + sectionRows, err := buildSectionRows(elems) + if err != nil { + return nil, err + } + if err := t.writeSectionMaterialized(section, sectionRows); err != nil { + return nil, err + } + indexableRows := make([]map[string]any, len(elems)) + seenFields := map[string]struct{}{} + for i := range sectionRows { + vals := map[string]any{} + for f, v := range sectionRows[i].cols { + if !isIndexableValue(v) { + continue + } + vals[f] = v + } + indexableRows[i] = vals + for f := range vals { + seenFields[f] = struct{}{} + } + } + newFields := false + for f := range seenFields { + if _, ok := t.indexedFields[f]; !ok { + newFields = true + } + t.indexedFields[f] = struct{}{} + } + + locs := make([]*benchtop.RowLoc, len(elems)) + err = t.indexDB.Update(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + fwd := tx.Bucket([]byte(fieldIndexBucket)) + rev := tx.Bucket([]byte(reverseFieldIndexBucket)) + if b == nil { + return fmt.Errorf("missing ids bucket") + } + if newFields { + if err := t.persistIndexedFieldsLocked(tx); err != nil { + return err + } + } + for i, row := range elems { + loc := &benchtop.RowLoc{TableId: t.tableID, Section: section, Offset: uint32(i), Size: 0, Index: 0} + if err := b.Put(row.Id, encodeRowLoc(loc)); err != nil { + return err + } + if fwd != nil && rev != nil { + rowID := string(row.Id) + for field, fieldVal := range indexableRows[i] { + valueBytes, ok := encodeIndexValue(fieldVal) + if !ok { + continue + } + if err := fwd.Put(makeFieldIndexKey(field, valueBytes, rowID), 
encodeRowLoc(loc)); err != nil { + return err + } + if err := rev.Put(makeReverseFieldIndexKey(field, rowID), valueBytes); err != nil { + return err + } + } + } + locs[i] = loc + } + return nil + }) + if err != nil { + return nil, err + } + t.invalidateExecutionCaches() + + log.Debugf("arrowtable.AddRows table=%s tableID=%d section=%d rows=%d elapsed=%s", t.name, t.tableID, section, len(elems), time.Since(start).Round(time.Millisecond)) + return locs, nil +} + +func (t *ArrowTable) DeleteRow(loc *benchtop.RowLoc, id []byte) error { + t.lock.Lock() + defer t.lock.Unlock() + if len(id) == 0 { + return nil + } + idStr := string(id) + err := t.indexDB.Update(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + fwd := tx.Bucket([]byte(fieldIndexBucket)) + rev := tx.Bucket([]byte(reverseFieldIndexBucket)) + if b == nil { + return fmt.Errorf("missing ids bucket") + } + if fwd != nil && rev != nil { + for field := range t.indexedFields { + rKey := makeReverseFieldIndexKey(field, idStr) + val := rev.Get(rKey) + if val == nil { + continue + } + if err := fwd.Delete(makeFieldIndexKey(field, val, idStr)); err != nil { + return err + } + if err := rev.Delete(rKey); err != nil { + return err + } + } + } + return b.Delete(id) + }) + if err == nil { + t.invalidateExecutionCaches() + } + return err +} + +func (t *ArrowTable) MarkDeleteTable(loc *benchtop.RowLoc) error { + return nil +} + +func (t *ArrowTable) GetRowLoc(id string) (*benchtop.RowLoc, error) { + var loc *benchtop.RowLoc + err := t.indexDB.View(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + if b == nil { + return fmt.Errorf("missing ids bucket") + } + v := b.Get([]byte(id)) + if v == nil { + return fmt.Errorf("id %s not found", id) + } + decoded, err := decodeRowLoc(v) + if err != nil { + return err + } + loc = decoded + return nil + }) + return loc, err +} + +func (t *ArrowTable) GetRow(loc *benchtop.RowLoc) (map[string]any, error) { + t.lock.RLock() + defer t.lock.RUnlock() + + if 
cached, ok := t.getCachedSectionRows(loc.Section); ok { + if int(loc.Offset) < len(cached) { + return cloneRowMap(cached[loc.Offset]), nil + } + } + rows, _, err := t.readSectionRows(loc.Section) + if err != nil { + return nil, err + } + if int(loc.Offset) >= len(rows) { + return nil, fmt.Errorf("row not found at section=%d offset=%d", loc.Section, loc.Offset) + } + return cloneRowMap(rows[loc.Offset]), nil +} + +func (t *ArrowTable) GetRows(locs []*benchtop.RowLoc) ([]map[string]any, []error) { + results := make([]map[string]any, len(locs)) + errs := make([]error, len(locs)) + + bySection := map[uint16][]int{} + for i, loc := range locs { + if loc == nil { + errs[i] = fmt.Errorf("nil row location") + continue + } + bySection[loc.Section] = append(bySection[loc.Section], i) + } + + type sectionWork struct { + sec uint16 + idxs []int + } + workCh := make(chan sectionWork, len(bySection)) + var wg sync.WaitGroup + workers := runtime.NumCPU() + if workers < 2 { + workers = 2 + } + if workers > 16 { + workers = 16 + } + + for w := 0; w < workers; w++ { + wg.Add(1) + go func() { + defer wg.Done() + for work := range workCh { + offsets := map[uint32]struct{}{} + for _, i := range work.idxs { + offsets[locs[i].Offset] = struct{}{} + } + rowsByOffset, err := t.readSectionRowsByOffsets(work.sec, offsets) + if err != nil { + for _, i := range work.idxs { + errs[i] = err + } + continue + } + for _, i := range work.idxs { + loc := locs[i] + row, ok := rowsByOffset[loc.Offset] + if !ok { + errs[i] = fmt.Errorf("row not found at section=%d offset=%d", loc.Section, loc.Offset) + continue + } + results[i] = row + } + } + }() + } + for sec, idxs := range bySection { + workCh <- sectionWork{sec: sec, idxs: idxs} + } + close(workCh) + wg.Wait() + + return results, errs +} + +func (t *ArrowTable) ListTableKeys() (chan benchtop.Index, error) { + out := make(chan benchtop.Index, 100) + go func() { + defer close(out) + rows, err := t.listIndexRows() + if err != nil { + return + } + for 
_, r := range rows { + out <- benchtop.Index{Key: []byte(r.id), Loc: r.loc} + } + }() + return out, nil +} + +func (t *ArrowTable) TableID() uint16 { + return t.tableID +} + +// CompactSections rewrites scattered small section files into larger sections. +// This is intended to run after bulk load to reduce file count and scan overhead. +func (t *ArrowTable) CompactSections(maxRowsPerSection int) error { + if maxRowsPerSection <= 0 { + maxRowsPerSection = defaultCompactRowsPerSection + } + + t.lock.Lock() + defer t.lock.Unlock() + + indexed, err := t.listIndexRows() + if err != nil { + return err + } + if len(indexed) <= maxRowsPerSection { + return nil + } + + oldSectionsMap := map[uint16]struct{}{} + for _, r := range indexed { + oldSectionsMap[r.loc.Section] = struct{}{} + } + if len(oldSectionsMap) <= 1 { + return nil + } + + type stagedRow struct { + id string + data map[string]any + } + + staged := make([]stagedRow, 0, maxRowsPerSection) + var currSection uint16 = ^uint16(0) + var secRows []map[string]any + + flush := func() error { + if len(staged) == 0 { + return nil + } + + section, err := t.reserveSection() + if err != nil { + return err + } + + rows := make([]benchtop.Row, len(staged)) + for i, s := range staged { + clean := make(map[string]any, len(s.data)) + for k, v := range s.data { + if k == idColumn { + continue + } + clean[k] = v + } + rows[i] = benchtop.Row{Id: []byte(s.id), Data: clean} + } + + if err := t.writeSection(section, rows); err != nil { + return err + } + + err = t.indexDB.Update(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + if b == nil { + return fmt.Errorf("missing ids bucket") + } + for i, s := range staged { + loc := &benchtop.RowLoc{TableId: t.tableID, Section: section, Offset: uint32(i), Size: 0, Index: 0} + if err := b.Put([]byte(s.id), encodeRowLoc(loc)); err != nil { + return err + } + } + return nil + }) + if err != nil { + return err + } + + staged = staged[:0] + return nil + } + + for _, r := range 
indexed { + if r.loc.Section != currSection { + secRows, _, err = t.readSectionRows(r.loc.Section) + if err != nil { + return err + } + currSection = r.loc.Section + } + if int(r.loc.Offset) >= len(secRows) { + return fmt.Errorf("row not found at section=%d offset=%d", r.loc.Section, r.loc.Offset) + } + staged = append(staged, stagedRow{id: r.id, data: secRows[int(r.loc.Offset)]}) + if len(staged) >= maxRowsPerSection { + if err := flush(); err != nil { + return err + } + } + } + if err := flush(); err != nil { + return err + } + + for sec := range oldSectionsMap { + t.invalidateSectionCaches(sec) + _ = os.Remove(t.sectionPath(sec)) + } + t.clearAllSectionCaches() + t.invalidateExecutionCaches() + + return nil +} + +func (t *ArrowTable) SetTableID(id uint16) error { + t.lock.Lock() + defer t.lock.Unlock() + t.tableID = id + return t.indexDB.Update(func(tx *bbolt.Tx) error { + mb := tx.Bucket([]byte(metaBucket)) + if mb == nil { + return fmt.Errorf("missing meta bucket") + } + v := make([]byte, 2) + binary.LittleEndian.PutUint16(v, id) + return mb.Put([]byte(metaTableIDKey), v) + }) +} diff --git a/arrowdriver/table_row_ordinal_cache.go b/arrowdriver/table_row_ordinal_cache.go new file mode 100644 index 0000000..dfc8f0c --- /dev/null +++ b/arrowdriver/table_row_ordinal_cache.go @@ -0,0 +1,40 @@ +package arrowdriver + +func (t *ArrowTable) invalidateRowOrdinalCache() { + t.rowOrdinalCacheLock.Lock() + defer t.rowOrdinalCacheLock.Unlock() + t.rowOrdinalRows = nil + t.rowOrdinalByID = map[string]uint32{} +} + +func (t *ArrowTable) getOrLoadRowOrdinalCache() ([]indexedRow, map[string]uint32, error) { + t.rowOrdinalCacheLock.RLock() + if len(t.rowOrdinalRows) > 0 && len(t.rowOrdinalByID) > 0 { + rows := t.rowOrdinalRows + byID := t.rowOrdinalByID + t.rowOrdinalCacheLock.RUnlock() + return rows, byID, nil + } + t.rowOrdinalCacheLock.RUnlock() + + rows, err := t.listIndexRows() + if err != nil { + return nil, nil, err + } + byID := make(map[string]uint32, len(rows)) + for 
i, r := range rows { + byID[r.id] = uint32(i) + } + + t.rowOrdinalCacheLock.Lock() + t.rowOrdinalRows = rows + t.rowOrdinalByID = byID + t.rowOrdinalCacheLock.Unlock() + return rows, byID, nil +} + +func (t *ArrowTable) invalidateExecutionCaches() { + t.invalidateValueIndexCache() + t.invalidateRowOrdinalCache() +} + diff --git a/arrowdriver/table_scan.go b/arrowdriver/table_scan.go new file mode 100644 index 0000000..ac3537a --- /dev/null +++ b/arrowdriver/table_scan.go @@ -0,0 +1,528 @@ +package arrowdriver + +import ( + "runtime" + "sort" + "sync" + + "github.com/bmeg/benchtop" + "github.com/bytedance/sonic" +) + +func (t *ArrowTable) ScanDoc(filter benchtop.RowFilter) chan map[string]any { + out := make(chan map[string]any, 100) + go func() { + defer close(out) + filterActive := filter != nil && !filter.IsNoOp() + if !filterActive { + rows, err := t.listIndexRows() + if err != nil { + return + } + bySection := map[uint16][]indexedRow{} + for _, r := range rows { + bySection[r.loc.Section] = append(bySection[r.loc.Section], r) + } + sections := make([]int, 0, len(bySection)) + for sec := range bySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + type sectionDocResult struct { + sec uint16 + rows []map[string]any + } + secCh := make(chan uint16, len(sections)) + resCh := make(chan sectionDocResult, len(sections)) + workers := runtime.NumCPU() + if workers < 2 { + workers = 2 + } + if workers > 16 { + workers = 16 + } + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for sec := range secCh { + secRows, _, err := t.readSectionRows(sec) + if err != nil { + resCh <- sectionDocResult{sec: sec, rows: nil} + continue + } + matched := make([]map[string]any, 0, len(bySection[sec])) + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(secRows) { + continue + } + matched = append(matched, secRows[int(r.loc.Offset)]) + } + resCh <- sectionDocResult{sec: sec, rows: matched} + } + }() 
+ } + for _, secInt := range sections { + secCh <- uint16(secInt) + } + close(secCh) + go func() { + wg.Wait() + close(resCh) + }() + sectionRowsOut := map[uint16][]map[string]any{} + for res := range resCh { + sectionRowsOut[res.sec] = res.rows + } + for _, secInt := range sections { + sec := uint16(secInt) + for _, row := range sectionRowsOut[sec] { + out <- row + } + } + return + } + + if simpleFilters, ok := extractSimpleFieldFilters(filter); ok { + if matchedBySection, optimized, _ := t.findRowsByTopLevelFilters(simpleFilters); optimized { + sections := make([]int, 0, len(matchedBySection)) + for sec := range matchedBySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + for _, secInt := range sections { + sec := uint16(secInt) + needed := map[uint32]struct{}{} + for _, r := range matchedBySection[sec] { + needed[r.loc.Offset] = struct{}{} + } + rowsAtOffset, err := t.readSectionRowsByOffsets(sec, needed) + if err != nil { + continue + } + for _, r := range matchedBySection[sec] { + row, ok := rowsAtOffset[r.loc.Offset] + if !ok { + continue + } + out <- row + } + } + return + } + } + + rows, err := t.listIndexRows() + if err != nil { + return + } + bySection := map[uint16][]indexedRow{} + for _, r := range rows { + bySection[r.loc.Section] = append(bySection[r.loc.Section], r) + } + sections := make([]int, 0, len(bySection)) + for sec := range bySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + type sectionDocResult struct { + sec uint16 + rows []map[string]any + } + secCh := make(chan uint16, len(sections)) + resCh := make(chan sectionDocResult, len(sections)) + workers := runtime.NumCPU() + if workers < 2 { + workers = 2 + } + if workers > 16 { + workers = 16 + } + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for sec := range secCh { + secRows, _, err := t.readSectionRows(sec) + if err != nil { + resCh <- sectionDocResult{sec: sec, rows: nil} + 
continue + } + matched := make([]map[string]any, 0, len(bySection[sec])) + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(secRows) { + continue + } + row := secRows[int(r.loc.Offset)] + if filterActive { + payload, err := sonic.ConfigFastest.Marshal(row) + if err != nil { + continue + } + if !filter.Matches(payload, t.name) { + continue + } + } + matched = append(matched, row) + } + resCh <- sectionDocResult{sec: sec, rows: matched} + } + }() + } + for _, secInt := range sections { + secCh <- uint16(secInt) + } + close(secCh) + go func() { + wg.Wait() + close(resCh) + }() + sectionRowsOut := map[uint16][]map[string]any{} + for res := range resCh { + sectionRowsOut[res.sec] = res.rows + } + for _, secInt := range sections { + sec := uint16(secInt) + for _, row := range sectionRowsOut[sec] { + out <- row + } + } + }() + return out +} + +func (t *ArrowTable) ScanDocProjected(fields []string, filter benchtop.RowFilter) chan map[string]any { + out := make(chan map[string]any, 100) + go func() { + defer close(out) + + if len(fields) == 0 { + for row := range t.ScanDoc(filter) { + out <- row + } + return + } + + filterActive := filter != nil && !filter.IsNoOp() + needsJSON := filterActive + if filterActive { + if simpleFilters, ok := extractSimpleFieldFilters(filter); ok { + if matchedBySection, optimized, _ := t.findRowsByTopLevelFilters(simpleFilters); optimized { + sections := make([]int, 0, len(matchedBySection)) + for sec := range matchedBySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + for _, secInt := range sections { + sec := uint16(secInt) + needed := map[uint32]struct{}{} + for _, r := range matchedBySection[sec] { + needed[r.loc.Offset] = struct{}{} + } + + if !needsJSON { + projRows, err := t.readSectionProjectedRowsByOffsets(sec, fields, needed) + if err == nil { + for _, r := range matchedBySection[sec] { + row, ok := projRows[r.loc.Offset] + if !ok { + continue + } + out <- row + } + continue + } + } + + 
fullRows, err := t.readSectionRowsByOffsets(sec, needed) + if err != nil { + continue + } + for _, r := range matchedBySection[sec] { + full, ok := fullRows[r.loc.Offset] + if !ok { + continue + } + proj := map[string]any{"_id": full["_id"]} + for _, f := range fields { + if f == "_id" { + continue + } + if v, ok := full[f]; ok { + proj[f] = v + } + } + out <- proj + } + } + return + } + } + } + if !filterActive && len(fields) > 0 { + active, err := t.activeRowLocs() + if err != nil { + return + } + next, err := t.nextSection() + if err != nil { + return + } + for sec := uint16(0); sec < next; sec++ { + if !needsJSON { + projRows, err := t.readSectionProjectedRows(sec, fields) + if err == nil { + for i, row := range projRows { + id, _ := row["_id"].(string) + loc, ok := active[id] + if !ok { + continue + } + if loc.Section != sec || loc.Offset != uint32(i) { + continue + } + out <- row + } + continue + } + } + err := t.streamSectionRows(sec, func(id string, full map[string]any, offset uint32) bool { + loc, ok := active[id] + if !ok { + return true + } + if loc.Section != sec || loc.Offset != offset { + return true + } + proj := map[string]any{"_id": full["_id"]} + for _, f := range fields { + if f == "_id" { + continue + } + if v, ok := full[f]; ok { + proj[f] = v + } + } + out <- proj + return true + }) + if err != nil { + continue + } + } + return + } + rows, err := t.listIndexRows() + if err != nil { + return + } + bySection := map[uint16][]indexedRow{} + for _, r := range rows { + bySection[r.loc.Section] = append(bySection[r.loc.Section], r) + } + sections := make([]int, 0, len(bySection)) + for sec := range bySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + + for _, secInt := range sections { + sec := uint16(secInt) + if !needsJSON { + projRows, err := t.readSectionProjectedRows(sec, fields) + if err == nil { + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(projRows) { + continue + } + out <- 
projRows[int(r.loc.Offset)] + } + continue + } + } + + secRows, _, err := t.readSectionRows(sec) + if err != nil { + continue + } + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(secRows) { + continue + } + full := secRows[int(r.loc.Offset)] + if filterActive { + payload, err := sonic.ConfigFastest.Marshal(full) + if err != nil || !filter.Matches(payload, t.name) { + continue + } + } + proj := map[string]any{"_id": full["_id"]} + for _, f := range fields { + if f == "_id" { + continue + } + if v, ok := full[f]; ok { + proj[f] = v + } + } + out <- proj + } + } + }() + return out +} + +func (t *ArrowTable) ScanId(filter benchtop.RowFilter) chan string { + out := make(chan string, 100) + go func() { + defer close(out) + if filter == nil || filter.IsNoOp() { + // Fast path used by label scans: stream ids directly from index without + // reading/decompressing Arrow row payloads. + rows, err := t.listIndexRows() + if err != nil { + return + } + for _, r := range rows { + out <- r.id + } + return + } + if simpleFilters, ok := extractSimpleFieldFilters(filter); ok { + if matchedBySection, optimized, _ := t.findRowsByTopLevelFilters(simpleFilters); optimized { + sections := make([]int, 0, len(matchedBySection)) + for sec := range matchedBySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + for _, secInt := range sections { + sec := uint16(secInt) + for _, r := range matchedBySection[sec] { + out <- r.id + } + } + return + } + } + for row := range t.ScanDoc(filter) { + if id, ok := row[idColumn].(string); ok { + out <- id + } + } + }() + return out +} + +func (t *ArrowTable) ScanFull(filter benchtop.RowFilter) chan benchtop.RowLocData { + out := make(chan benchtop.RowLocData, 100) + go func() { + defer close(out) + filterActive := filter != nil && !filter.IsNoOp() + if filterActive { + if simpleFilters, ok := extractSimpleFieldFilters(filter); ok { + if matchedBySection, optimized, _ := t.findRowsByTopLevelFilters(simpleFilters); 
optimized { + sections := make([]int, 0, len(matchedBySection)) + for sec := range matchedBySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + for _, secInt := range sections { + sec := uint16(secInt) + needed := map[uint32]struct{}{} + for _, r := range matchedBySection[sec] { + needed[r.loc.Offset] = struct{}{} + } + rowsAtOffset, err := t.readSectionRowsByOffsets(sec, needed) + if err != nil { + continue + } + for _, r := range matchedBySection[sec] { + row, ok := rowsAtOffset[r.loc.Offset] + if !ok { + continue + } + out <- benchtop.RowLocData{DataMap: row, Loc: r.loc} + } + } + return + } + } + } + rows, err := t.listIndexRows() + if err != nil { + return + } + bySection := map[uint16][]indexedRow{} + for _, r := range rows { + bySection[r.loc.Section] = append(bySection[r.loc.Section], r) + } + sections := make([]int, 0, len(bySection)) + for sec := range bySection { + sections = append(sections, int(sec)) + } + sort.Ints(sections) + type sectionFullResult struct { + sec uint16 + rows []benchtop.RowLocData + } + secCh := make(chan uint16, len(sections)) + resCh := make(chan sectionFullResult, len(sections)) + workers := runtime.NumCPU() + if workers < 2 { + workers = 2 + } + if workers > 16 { + workers = 16 + } + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for sec := range secCh { + secRows, _, err := t.readSectionRows(sec) + if err != nil { + resCh <- sectionFullResult{sec: sec, rows: nil} + continue + } + matched := make([]benchtop.RowLocData, 0, len(bySection[sec])) + for _, r := range bySection[sec] { + if int(r.loc.Offset) >= len(secRows) { + continue + } + row := secRows[int(r.loc.Offset)] + if filterActive { + payload, err := sonic.ConfigFastest.Marshal(row) + if err != nil || !filter.Matches(payload, t.name) { + continue + } + } + matched = append(matched, benchtop.RowLocData{DataMap: row, Loc: r.loc}) + } + resCh <- sectionFullResult{sec: sec, rows: matched} + } + }() 
+ } + for _, secInt := range sections { + secCh <- uint16(secInt) + } + close(secCh) + go func() { + wg.Wait() + close(resCh) + }() + sectionRowsOut := map[uint16][]benchtop.RowLocData{} + for res := range resCh { + sectionRowsOut[res.sec] = res.rows + } + for _, secInt := range sections { + sec := uint16(secInt) + for _, row := range sectionRowsOut[sec] { + out <- row + } + } + }() + return out +} diff --git a/arrowdriver/table_sections.go b/arrowdriver/table_sections.go new file mode 100644 index 0000000..62b4cf1 --- /dev/null +++ b/arrowdriver/table_sections.go @@ -0,0 +1,1039 @@ +package arrowdriver + +import ( + "fmt" + "os" + "strings" + "time" + + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/ipc" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/bmeg/benchtop" + "github.com/bmeg/grip/log" + "github.com/bytedance/sonic" + "go.etcd.io/bbolt" +) + +func decodeJSONTextCell(v string) any { + if len(v) == 0 { + return v + } + // Complex values are stored as JSON text in Arrow string columns. + // Decode array/object payloads so filter semantics (e.g. CONTAINS) match JSON driver behavior. 
// cloneRowMap returns a shallow copy of in: a new map holding the same keys
// and the same value references. A nil or empty input yields an empty,
// non-nil map.
func cloneRowMap(in map[string]any) map[string]any {
	dup := make(map[string]any, len(in))
	for key := range in {
		dup[key] = in[key]
	}
	return dup
}
>= t.sectionRowCacheCap { + evict := t.sectionRowCacheOrder[0] + t.sectionRowCacheOrder = t.sectionRowCacheOrder[1:] + delete(t.sectionRowCache, evict) + } + t.sectionRowCacheOrder = append(t.sectionRowCacheOrder, section) + } + t.sectionRowCache[section] = rows +} + +func (t *ArrowTable) invalidateSectionCaches(section uint16) { + t.columnCacheLock.Lock() + for k := range t.columnCache { + if strings.HasPrefix(k, fmt.Sprintf("%d|", section)) { + delete(t.columnCache, k) + } + } + t.columnCacheOrder = t.columnCacheOrder[:0] + for k := range t.columnCache { + t.columnCacheOrder = append(t.columnCacheOrder, k) + } + t.columnCacheLock.Unlock() + + t.sectionRowCacheLock.Lock() + delete(t.sectionRowCache, section) + t.sectionRowCacheOrder = t.sectionRowCacheOrder[:0] + for k := range t.sectionRowCache { + t.sectionRowCacheOrder = append(t.sectionRowCacheOrder, k) + } + t.sectionRowCacheLock.Unlock() +} + +func (t *ArrowTable) clearAllSectionCaches() { + t.columnCacheLock.Lock() + t.columnCache = map[string][]any{} + t.columnCacheOrder = nil + t.columnCacheLock.Unlock() + + t.sectionRowCacheLock.Lock() + t.sectionRowCache = map[uint16][]map[string]any{} + t.sectionRowCacheOrder = nil + t.sectionRowCacheLock.Unlock() +} + +func isTopLevelMaterializedField(field string) bool { + if field == "" || field == idColumn || field == dataColumn { + return false + } + return !strings.Contains(field, ".") && !strings.Contains(field, "[") +} + +func decodeRecordValue(col arrow.Array, i int) any { + switch arr := col.(type) { + case *array.String: + if arr.IsNull(i) { + return nil + } + return decodeJSONTextCell(arr.Value(i)) + case *array.Float64: + if arr.IsNull(i) { + return nil + } + return arr.Value(i) + case *array.Boolean: + if arr.IsNull(i) { + return nil + } + return arr.Value(i) + default: + return nil + } +} + +func (t *ArrowTable) writeSectionMaterialized(section uint16, sectionRows []sectionRowData) error { + t.invalidateSectionCaches(section) + f, err := 
os.Create(t.sectionPath(section)) + if err != nil { + return err + } + defer f.Close() + + keys, enc := t.inferSectionColumns(sectionRows) + fields := make([]arrow.Field, 0, 2+len(keys)) + fields = append(fields, + arrow.Field{Name: idColumn, Type: arrow.BinaryTypes.String}, + arrow.Field{Name: dataColumn, Type: arrow.BinaryTypes.String}, + ) + for _, k := range keys { + switch enc[k] { + case encFloat64: + fields = append(fields, arrow.Field{Name: k, Type: arrow.PrimitiveTypes.Float64, Nullable: true}) + case encBool: + fields = append(fields, arrow.Field{Name: k, Type: arrow.FixedWidthTypes.Boolean, Nullable: true}) + default: + // encString and encJSON are persisted as UTF-8 string columns. + fields = append(fields, arrow.Field{Name: k, Type: arrow.BinaryTypes.String, Nullable: true}) + } + } + schema := arrow.NewSchema(fields, nil) + + mem := memory.NewGoAllocator() + writer := ipc.NewWriter(f, ipc.WithSchema(schema), ipc.WithAllocator(mem)) + defer writer.Close() + + for start := 0; start < len(sectionRows); start += sectionWriteBatchRows { + end := start + sectionWriteBatchRows + if end > len(sectionRows) { + end = len(sectionRows) + } + + idb := array.NewStringBuilder(mem) + db := array.NewStringBuilder(mem) + strBuilders := map[string]*array.StringBuilder{} + floatBuilders := map[string]*array.Float64Builder{} + boolBuilders := map[string]*array.BooleanBuilder{} + for _, k := range keys { + switch enc[k] { + case encFloat64: + floatBuilders[k] = array.NewFloat64Builder(mem) + case encBool: + boolBuilders[k] = array.NewBooleanBuilder(mem) + default: + strBuilders[k] = array.NewStringBuilder(mem) + } + } + + for _, row := range sectionRows[start:end] { + idb.Append(row.id) + db.Append(row.payload) + + for _, k := range keys { + v, ok := row.cols[k] + if !ok || v == nil { + if b := strBuilders[k]; b != nil { + b.AppendNull() + } + if b := floatBuilders[k]; b != nil { + b.AppendNull() + } + if b := boolBuilders[k]; b != nil { + b.AppendNull() + } + continue + } 
+ + if b := floatBuilders[k]; b != nil { + if fv, ok := toFloat64(v); ok { + b.Append(fv) + } else { + b.AppendNull() + } + continue + } + if b := boolBuilders[k]; b != nil { + if bv, ok := v.(bool); ok { + b.Append(bv) + } else { + b.AppendNull() + } + continue + } + if b := strBuilders[k]; b != nil { + if enc[k] == encString { + if sv, ok := v.(string); ok { + b.Append(sv) + } else { + b.AppendNull() + } + continue + } + // Complex value encoded as JSON text. + jv, err := sonic.ConfigFastest.Marshal(v) + if err != nil { + b.AppendNull() + continue + } + b.Append(string(jv)) + } + } + } + + arrays := make([]arrow.Array, 0, 2+len(keys)) + ids := idb.NewArray() + data := db.NewArray() + arrays = append(arrays, ids, data) + for _, k := range keys { + if b := floatBuilders[k]; b != nil { + arrays = append(arrays, b.NewArray()) + continue + } + if b := boolBuilders[k]; b != nil { + arrays = append(arrays, b.NewArray()) + continue + } + arrays = append(arrays, strBuilders[k].NewArray()) + } + + rec := array.NewRecord(schema, arrays, int64(end-start)) + err := writer.Write(rec) + rec.Release() + for _, a := range arrays { + a.Release() + } + idb.Release() + db.Release() + for _, b := range strBuilders { + b.Release() + } + for _, b := range floatBuilders { + b.Release() + } + for _, b := range boolBuilders { + b.Release() + } + if err != nil { + return err + } + } + + return nil +} + +func (t *ArrowTable) writeSection(section uint16, rows []benchtop.Row) error { + sectionRows, err := buildSectionRows(rows) + if err != nil { + return err + } + return t.writeSectionMaterialized(section, sectionRows) +} + +func (t *ArrowTable) BulkLoad(rows []benchtop.Row) error { + if len(rows) == 0 { + return nil + } + start := time.Now() + t.lock.Lock() + defer t.lock.Unlock() + + section, err := t.reserveSection() + if err != nil { + return err + } + + sectionRows, err := buildSectionRows(rows) + if err != nil { + return err + } + + writeStart := time.Now() + if err := 
t.writeSectionMaterialized(section, sectionRows); err != nil { + return err + } + writeElapsed := time.Since(writeStart).Round(time.Millisecond) + + indexableRows := make([]map[string]any, len(rows)) + seenFields := map[string]struct{}{} + for i := range sectionRows { + vals := map[string]any{} + for f, v := range sectionRows[i].cols { + if !isIndexableValue(v) { + continue + } + vals[f] = v + } + indexableRows[i] = vals + for f := range vals { + seenFields[f] = struct{}{} + } + } + newFields := false + for f := range seenFields { + if _, ok := t.indexedFields[f]; !ok { + newFields = true + } + t.indexedFields[f] = struct{}{} + } + log.Debugf("arrowtable.BulkLoad section_written table=%s tableID=%d section=%d rows=%d indexedFields=%d writeElapsed=%s", t.name, t.tableID, section, len(rows), len(seenFields), writeElapsed) + + indexStart := time.Now() + indexWrites := 0 + err = t.indexDB.Update(func(tx *bbolt.Tx) error { + b := tx.Bucket([]byte(idsBucket)) + fwd := tx.Bucket([]byte(fieldIndexBucket)) + rev := tx.Bucket([]byte(reverseFieldIndexBucket)) + if b == nil { + return fmt.Errorf("missing ids bucket") + } + if newFields { + if err := t.persistIndexedFieldsLocked(tx); err != nil { + return err + } + } + for i, row := range rows { + loc := &benchtop.RowLoc{TableId: t.tableID, Section: section, Offset: uint32(i), Size: 0, Index: 0} + if err := b.Put(row.Id, encodeRowLoc(loc)); err != nil { + return err + } + if fwd != nil && rev != nil { + rowID := string(row.Id) + for field, fieldVal := range indexableRows[i] { + valueBytes, ok := encodeIndexValue(fieldVal) + if !ok { + continue + } + if err := fwd.Put(makeFieldIndexKey(field, valueBytes, rowID), encodeRowLoc(loc)); err != nil { + return err + } + if err := rev.Put(makeReverseFieldIndexKey(field, rowID), valueBytes); err != nil { + return err + } + indexWrites++ + } + } + } + log.Debugf("arrowtable.BulkLoad section_indexed table=%s tableID=%d section=%d rows=%d indexWrites=%d indexElapsed=%s totalElapsed=%s", 
t.name, t.tableID, section, len(rows), indexWrites, time.Since(indexStart).Round(time.Millisecond), time.Since(start).Round(time.Millisecond)) + return nil + }) + if err == nil { + t.invalidateExecutionCaches() + } + return err +} + +func (t *ArrowTable) streamSectionRows(section uint16, fn func(id string, data map[string]any, offset uint32) bool) error { + f, err := os.Open(t.sectionPath(section)) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + reader, err := ipc.NewReader(f) + if err != nil { + return err + } + defer reader.Release() + + var offset uint32 = 0 + for reader.Next() { + rec := reader.Record() + schema := rec.Schema() + idIdx := schema.FieldIndices(idColumn) + if len(idIdx) == 0 { + return fmt.Errorf("missing _id column") + } + idArr, ok := rec.Column(idIdx[0]).(*array.String) + if !ok { + return fmt.Errorf("id column is not string") + } + topFields := make([]string, 0, len(schema.Fields())) + topCols := make([]arrow.Array, 0, len(schema.Fields())) + for idx, f := range schema.Fields() { + if !isTopLevelMaterializedField(f.Name) { + continue + } + topFields = append(topFields, f.Name) + topCols = append(topCols, rec.Column(idx)) + } + var dataArr *array.String + if dataIdx := schema.FieldIndices(dataColumn); len(dataIdx) > 0 { + if arr, ok := rec.Column(dataIdx[0]).(*array.String); ok { + dataArr = arr + } + } + for i := 0; i < int(rec.NumRows()); i++ { + id := idArr.Value(i) + var data map[string]any + if dataArr != nil && !dataArr.IsNull(i) { + _ = sonic.UnmarshalString(dataArr.Value(i), &data) + if data == nil { + data = map[string]any{} + } + data[idColumn] = id + } else { + data = map[string]any{idColumn: id} + for cIdx, field := range topFields { + if v := decodeRecordValue(topCols[cIdx], i); v != nil { + data[field] = v + } + } + } + if !fn(id, data, offset) { + return nil + } + offset++ + } + } + return nil +} + +// ScanIndex iterates the bbolt index and emits (id, RowLoc) pairs without +// 
reading any row data. Used for efficient cache preloading. +func (t *ArrowTable) readSectionRows(section uint16) ([]map[string]any, []string, error) { + if cached, ok := t.getCachedSectionRows(section); ok { + rows := make([]map[string]any, len(cached)) + ids := make([]string, len(cached)) + for i, r := range cached { + rows[i] = cloneRowMap(r) + if id, ok := r[idColumn].(string); ok { + ids[i] = id + } + } + return rows, ids, nil + } + + f, err := os.Open(t.sectionPath(section)) + if err != nil { + return nil, nil, err + } + defer f.Close() + + reader, err := ipc.NewReader(f) + if err != nil { + return nil, nil, err + } + defer reader.Release() + + rows := []map[string]any{} + ids := []string{} + for reader.Next() { + rec := reader.Record() + schema := rec.Schema() + idIdx := schema.FieldIndices(idColumn) + if len(idIdx) == 0 { + return nil, nil, fmt.Errorf("missing _id column") + } + idArr, ok := rec.Column(idIdx[0]).(*array.String) + if !ok { + return nil, nil, fmt.Errorf("id column is not string") + } + topFields := make([]string, 0, len(schema.Fields())) + topCols := make([]arrow.Array, 0, len(schema.Fields())) + for idx, f := range schema.Fields() { + if !isTopLevelMaterializedField(f.Name) { + continue + } + topFields = append(topFields, f.Name) + topCols = append(topCols, rec.Column(idx)) + } + var dataArr *array.String + if dataIdx := schema.FieldIndices(dataColumn); len(dataIdx) > 0 { + if arr, ok := rec.Column(dataIdx[0]).(*array.String); ok { + dataArr = arr + } + } + for i := 0; i < int(rec.NumRows()); i++ { + id := idArr.Value(i) + var data map[string]any + if dataArr != nil && !dataArr.IsNull(i) { + if err := sonic.UnmarshalString(dataArr.Value(i), &data); err != nil { + return nil, nil, err + } + if data == nil { + data = map[string]any{} + } + data[idColumn] = id + } else { + data = map[string]any{idColumn: id} + for cIdx, field := range topFields { + if v := decodeRecordValue(topCols[cIdx], i); v != nil { + data[field] = v + } + } + } + ids = 
append(ids, id) + rows = append(rows, data) + } + } + t.setCachedSectionRows(section, rows) + return rows, ids, nil +} + +func (t *ArrowTable) readSectionTopLevelColumn(section uint16, field string) ([]any, bool, error) { + if cached, ok := t.getCachedSectionColumn(section, field); ok { + return cached, true, nil + } + + f, err := os.Open(t.sectionPath(section)) + if err != nil { + return nil, false, err + } + defer f.Close() + + reader, err := ipc.NewReader(f) + if err != nil { + return nil, false, err + } + defer reader.Release() + + out := []any{} + found := false + for reader.Next() { + rec := reader.Record() + idx := rec.Schema().FieldIndices(field) + if len(idx) == 0 { + return nil, false, nil + } + found = true + col := rec.Column(idx[0]) + switch arr := col.(type) { + case *array.String: + for i := 0; i < int(rec.NumRows()); i++ { + if arr.IsNull(i) { + out = append(out, nil) + } else { + out = append(out, decodeJSONTextCell(arr.Value(i))) + } + } + case *array.Float64: + for i := 0; i < int(rec.NumRows()); i++ { + if arr.IsNull(i) { + out = append(out, nil) + } else { + out = append(out, arr.Value(i)) + } + } + case *array.Boolean: + for i := 0; i < int(rec.NumRows()); i++ { + if arr.IsNull(i) { + out = append(out, nil) + } else { + out = append(out, arr.Value(i)) + } + } + default: + return nil, false, nil + } + } + if found { + t.setCachedSectionColumn(section, field, out) + } + return out, found, nil +} + +func (t *ArrowTable) readSectionProjectedRows(section uint16, fields []string) ([]map[string]any, error) { + f, err := os.Open(t.sectionPath(section)) + if err != nil { + return nil, err + } + defer f.Close() + + reader, err := ipc.NewReader(f) + if err != nil { + return nil, err + } + defer reader.Release() + + out := []map[string]any{} + for reader.Next() { + rec := reader.Record() + schema := rec.Schema() + idIdx := schema.FieldIndices(idColumn) + if len(idIdx) == 0 { + return nil, fmt.Errorf("missing _id column") + } + idArr, ok := 
rec.Column(idIdx[0]).(*array.String) + if !ok { + return nil, fmt.Errorf("_id column is not string") + } + + fieldIdx := map[string]int{} + for _, field := range fields { + if field == idColumn { + continue + } + idx := schema.FieldIndices(field) + if len(idx) == 0 { + fieldIdx[field] = -1 + } else { + fieldIdx[field] = idx[0] + } + } + + for i := 0; i < int(rec.NumRows()); i++ { + row := map[string]any{"_id": idArr.Value(i)} + for _, field := range fields { + if field == idColumn { + continue + } + idx := fieldIdx[field] + if idx < 0 { + continue + } + col := rec.Column(idx) + switch arr := col.(type) { + case *array.String: + if arr.IsNull(i) { + continue + } + row[field] = arr.Value(i) + case *array.Float64: + if arr.IsNull(i) { + continue + } + row[field] = arr.Value(i) + case *array.Boolean: + if arr.IsNull(i) { + continue + } + row[field] = arr.Value(i) + } + } + out = append(out, row) + } + } + return out, nil +} +func (t *ArrowTable) readSectionTopLevelColumns(section uint16, fields []string) ([]string, map[string][]any, error) { + requiredFields := []string{} + for _, field := range fields { + if field == "" || field == "_label" { + continue + } + requiredFields = append(requiredFields, field) + } + + cachedIDsAny, idCached := t.getCachedSectionColumn(section, idColumn) + cachedCols := map[string][]any{} + missingFields := []string{} + for _, field := range requiredFields { + if field == idColumn { + continue + } + if v, ok := t.getCachedSectionColumn(section, field); ok { + cachedCols[field] = v + } else { + missingFields = append(missingFields, field) + } + } + + if idCached && len(missingFields) == 0 { + outIDs := make([]string, 0, len(cachedIDsAny)) + for _, v := range cachedIDsAny { + if s, ok := v.(string); ok { + outIDs = append(outIDs, s) + } else { + outIDs = append(outIDs, "") + } + } + return outIDs, cachedCols, nil + } + + f, err := os.Open(t.sectionPath(section)) + if err != nil { + return nil, nil, err + } + defer f.Close() + + reader, err := 
ipc.NewReader(f) + if err != nil { + return nil, nil, err + } + defer reader.Release() + + outIDsAny := []any{} + outCols := map[string][]any{} + for k, v := range cachedCols { + outCols[k] = v + } + missingSet := map[string]struct{}{} + for _, field := range missingFields { + missingSet[field] = struct{}{} + outCols[field] = []any{} + } + + for reader.Next() { + rec := reader.Record() + schema := rec.Schema() + idIdx := schema.FieldIndices(idColumn) + if len(idIdx) == 0 { + return nil, nil, fmt.Errorf("missing _id column") + } + idArr, ok := rec.Column(idIdx[0]).(*array.String) + if !ok { + return nil, nil, fmt.Errorf("id column is not string") + } + n := int(rec.NumRows()) + if !idCached { + for i := 0; i < n; i++ { + outIDsAny = append(outIDsAny, idArr.Value(i)) + } + } + for field := range missingSet { + idx := schema.FieldIndices(field) + if len(idx) == 0 { + for i := 0; i < n; i++ { + outCols[field] = append(outCols[field], nil) + } + continue + } + col := rec.Column(idx[0]) + switch arr := col.(type) { + case *array.String: + for i := 0; i < n; i++ { + if arr.IsNull(i) { + outCols[field] = append(outCols[field], nil) + } else { + outCols[field] = append(outCols[field], decodeJSONTextCell(arr.Value(i))) + } + } + case *array.Float64: + for i := 0; i < n; i++ { + if arr.IsNull(i) { + outCols[field] = append(outCols[field], nil) + } else { + outCols[field] = append(outCols[field], arr.Value(i)) + } + } + case *array.Boolean: + for i := 0; i < n; i++ { + if arr.IsNull(i) { + outCols[field] = append(outCols[field], nil) + } else { + outCols[field] = append(outCols[field], arr.Value(i)) + } + } + default: + for i := 0; i < n; i++ { + outCols[field] = append(outCols[field], nil) + } + } + } + } + + if !idCached { + t.setCachedSectionColumn(section, idColumn, outIDsAny) + cachedIDsAny = outIDsAny + } + for field := range missingSet { + t.setCachedSectionColumn(section, field, outCols[field]) + } + + outIDs := make([]string, 0, len(cachedIDsAny)) + for _, v := range 
cachedIDsAny { + if s, ok := v.(string); ok { + outIDs = append(outIDs, s) + } else { + outIDs = append(outIDs, "") + } + } + + return outIDs, outCols, nil +} + +func (t *ArrowTable) readSectionRowsByOffsets(section uint16, offsets map[uint32]struct{}) (map[uint32]map[string]any, error) { + if len(offsets) == 0 { + return map[uint32]map[string]any{}, nil + } + if cached, ok := t.getCachedSectionRows(section); ok { + out := map[uint32]map[string]any{} + for off := range offsets { + if int(off) < len(cached) { + out[off] = cloneRowMap(cached[off]) + } + } + return out, nil + } + if len(offsets) >= 256 { + rows, _, err := t.readSectionRows(section) + if err == nil { + out := map[uint32]map[string]any{} + for off := range offsets { + if int(off) < len(rows) { + out[off] = cloneRowMap(rows[off]) + } + } + return out, nil + } + } + maxOffset := uint32(0) + remaining := len(offsets) + for off := range offsets { + if off > maxOffset { + maxOffset = off + } + } + + f, err := os.Open(t.sectionPath(section)) + if err != nil { + return nil, err + } + defer f.Close() + + reader, err := ipc.NewReader(f) + if err != nil { + return nil, err + } + defer reader.Release() + + out := map[uint32]map[string]any{} + var offset uint32 + for reader.Next() { + rec := reader.Record() + schema := rec.Schema() + idIdx := schema.FieldIndices(idColumn) + if len(idIdx) == 0 { + return nil, fmt.Errorf("missing _id column") + } + idArr, ok := rec.Column(idIdx[0]).(*array.String) + if !ok { + return nil, fmt.Errorf("id column is not string") + } + topFields := make([]string, 0, len(schema.Fields())) + topCols := make([]arrow.Array, 0, len(schema.Fields())) + for idx, f := range schema.Fields() { + if !isTopLevelMaterializedField(f.Name) { + continue + } + topFields = append(topFields, f.Name) + topCols = append(topCols, rec.Column(idx)) + } + var dataArr *array.String + if dataIdx := schema.FieldIndices(dataColumn); len(dataIdx) > 0 { + if arr, ok := rec.Column(dataIdx[0]).(*array.String); ok { + 
dataArr = arr + } + } + for i := 0; i < int(rec.NumRows()); i++ { + if offset > maxOffset || remaining == 0 { + return out, nil + } + if _, ok := offsets[offset]; !ok { + offset++ + continue + } + var row map[string]any + if dataArr != nil && !dataArr.IsNull(i) { + if err := sonic.UnmarshalString(dataArr.Value(i), &row); err != nil { + offset++ + continue + } + if row == nil { + row = map[string]any{} + } + row[idColumn] = idArr.Value(i) + } else { + row = map[string]any{idColumn: idArr.Value(i)} + for cIdx, field := range topFields { + if v := decodeRecordValue(topCols[cIdx], i); v != nil { + row[field] = v + } + } + } + out[offset] = row + remaining-- + offset++ + } + } + return out, nil +} + +func (t *ArrowTable) readSectionProjectedRowsByOffsets(section uint16, fields []string, offsets map[uint32]struct{}) (map[uint32]map[string]any, error) { + if len(offsets) == 0 { + return map[uint32]map[string]any{}, nil + } + maxOffset := uint32(0) + remaining := len(offsets) + for off := range offsets { + if off > maxOffset { + maxOffset = off + } + } + + f, err := os.Open(t.sectionPath(section)) + if err != nil { + return nil, err + } + defer f.Close() + + reader, err := ipc.NewReader(f) + if err != nil { + return nil, err + } + defer reader.Release() + + out := map[uint32]map[string]any{} + var offset uint32 + for reader.Next() { + rec := reader.Record() + schema := rec.Schema() + idIdx := schema.FieldIndices(idColumn) + if len(idIdx) == 0 { + return nil, fmt.Errorf("missing _id column") + } + idArr, ok := rec.Column(idIdx[0]).(*array.String) + if !ok { + return nil, fmt.Errorf("_id column is not string") + } + + fieldIdx := map[string]int{} + for _, field := range fields { + if field == idColumn { + continue + } + idx := schema.FieldIndices(field) + if len(idx) == 0 { + fieldIdx[field] = -1 + } else { + fieldIdx[field] = idx[0] + } + } + + for i := 0; i < int(rec.NumRows()); i++ { + if offset > maxOffset || remaining == 0 { + return out, nil + } + if _, ok := 
offsets[offset]; !ok { + offset++ + continue + } + row := map[string]any{"_id": idArr.Value(i)} + for _, field := range fields { + if field == idColumn { + continue + } + idx := fieldIdx[field] + if idx < 0 { + continue + } + col := rec.Column(idx) + switch arr := col.(type) { + case *array.String: + if !arr.IsNull(i) { + row[field] = arr.Value(i) + } + case *array.Float64: + if !arr.IsNull(i) { + row[field] = arr.Value(i) + } + case *array.Boolean: + if !arr.IsNull(i) { + row[field] = arr.Value(i) + } + } + } + out[offset] = row + remaining-- + offset++ + } + } + return out, nil +} diff --git a/arrowdriver/table_value_cache.go b/arrowdriver/table_value_cache.go new file mode 100644 index 0000000..eefd3c3 --- /dev/null +++ b/arrowdriver/table_value_cache.go @@ -0,0 +1,104 @@ +package arrowdriver + +import ( + "bytes" + "sort" + + "go.etcd.io/bbolt" +) + +func (t *ArrowTable) invalidateValueIndexCache() { + t.valueIndexCacheLock.Lock() + defer t.valueIndexCacheLock.Unlock() + t.valueIndexCache = map[string]map[string][]indexedRow{} +} + +func cloneIndexedRows(in []indexedRow) []indexedRow { + out := make([]indexedRow, len(in)) + copy(out, in) + return out +} + +func (t *ArrowTable) getCachedValuePostings(field string, valueBytes []byte) ([]indexedRow, bool) { + key := string(valueBytes) + t.valueIndexCacheLock.RLock() + defer t.valueIndexCacheLock.RUnlock() + fm, ok := t.valueIndexCache[field] + if !ok { + return nil, false + } + rows, ok := fm[key] + if !ok { + return nil, false + } + return cloneIndexedRows(rows), true +} + +func (t *ArrowTable) setCachedValuePostings(field string, valueBytes []byte, rows []indexedRow) { + key := string(valueBytes) + cloned := cloneIndexedRows(rows) + t.valueIndexCacheLock.Lock() + defer t.valueIndexCacheLock.Unlock() + fm, ok := t.valueIndexCache[field] + if !ok { + fm = map[string][]indexedRow{} + t.valueIndexCache[field] = fm + } + fm[key] = cloned +} + +func (t *ArrowTable) loadValuePostings(field string, valueBytes []byte) 
([]indexedRow, int) { + out := make([]indexedRow, 0, 256) + missingLoc := []string{} + _ = t.indexDB.View(func(tx *bbolt.Tx) error { + fwd := tx.Bucket([]byte(fieldIndexBucket)) + if fwd == nil { + return nil + } + c := fwd.Cursor() + prefix := append(makeFieldIndexPrefix(field, valueBytes), 0x1f) + for k, rawLoc := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix); k, rawLoc = c.Next() { + id, ok := indexedIDFromKey(k) + if !ok { + continue + } + loc, err := decodeRowLoc(rawLoc) + if err != nil || loc == nil { + missingLoc = append(missingLoc, id) + continue + } + out = append(out, indexedRow{id: id, loc: loc}) + } + return nil + }) + + for _, id := range missingLoc { + loc, err := t.GetRowLoc(id) + if err != nil { + continue + } + out = append(out, indexedRow{id: id, loc: loc}) + } + + // Keep section/offset order for downstream read locality. + sort.Slice(out, func(i, j int) bool { + if out[i].loc.Section == out[j].loc.Section { + if out[i].loc.Offset == out[j].loc.Offset { + return out[i].id < out[j].id + } + return out[i].loc.Offset < out[j].loc.Offset + } + return out[i].loc.Section < out[j].loc.Section + }) + return out, len(missingLoc) +} + +func (t *ArrowTable) getOrLoadValuePostings(field string, valueBytes []byte) ([]indexedRow, int) { + if rows, ok := t.getCachedValuePostings(field, valueBytes); ok { + return rows, 0 + } + rows, missing := t.loadValuePostings(field, valueBytes) + t.setCachedValuePostings(field, valueBytes, rows) + return rows, missing +} + diff --git a/cache/cache.go b/cache/cache.go new file mode 100644 index 0000000..8dbb3df --- /dev/null +++ b/cache/cache.go @@ -0,0 +1,237 @@ +package cache + +import ( + "bytes" + "context" + "fmt" + "strconv" + "strings" + "sync" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/grip/log" + "github.com/bytedance/sonic" + "github.com/maypok86/otter/v2" +) + +// Cache defines the interface for a location lookup cache. 
+type Cache interface { + Get(ctx context.Context, key string) (*benchtop.RowLoc, error) + GetBatch(ctx context.Context, keys []string) (map[string]*benchtop.RowLoc, error) + Set(key string, value *benchtop.RowLoc) (*benchtop.RowLoc, bool) + Invalidate(key string) (*benchtop.RowLoc, bool) + PreloadCache() error +} + +// TableLookup is a function that, given a row ID, searches all loaded +// tables and returns the matching RowLoc. +type TableLookup func(id string) (*benchtop.RowLoc, error) + +// TableScanner is a function that iterates ALL row locations across all +// loaded tables, calling fn for each one. Used by PreloadCache. +type TableScanner func(fn func(id string, loc *benchtop.RowLoc)) error + +type StandardCache struct { + pageCache *otter.Cache[string, *benchtop.RowLoc] + pageLoader otter.LoaderFunc[string, *benchtop.RowLoc] + bulkPageLoader otter.BulkLoaderFunc[string, *benchtop.RowLoc] + tableLookup TableLookup + tableScanner TableScanner +} + +// Get retrieves an item from the cache. If the item is not present, +// it is automatically loaded from the underlying table index. +func (ca *StandardCache) Get(ctx context.Context, key string) (*benchtop.RowLoc, error) { + return ca.pageCache.Get(ctx, key, ca.pageLoader) +} + +// GetBatch retrieves multiple items from the cache. 
+func (ca *StandardCache) GetBatch(ctx context.Context, keys []string) (map[string]*benchtop.RowLoc, error) { + result := make(map[string]*benchtop.RowLoc, len(keys)) + var missing []string + dummyLoader := otter.LoaderFunc[string, *benchtop.RowLoc](func(ctx context.Context, key string) (*benchtop.RowLoc, error) { + return nil, fmt.Errorf("miss") + }) + + for _, k := range keys { + if loc, err := ca.pageCache.Get(ctx, k, dummyLoader); err == nil { + result[k] = loc + } else { + missing = append(missing, k) + } + } + if len(missing) > 0 { + missed, err := ca.bulkPageLoader(ctx, missing) + if err != nil { + return result, err + } + for k, loc := range missed { + ca.pageCache.Set(k, loc) + result[k] = loc + } + } + return result, nil +} + +// Set adds or updates an item in the cache. +func (ca *StandardCache) Set(key string, value *benchtop.RowLoc) (*benchtop.RowLoc, bool) { + return ca.pageCache.Set(key, value) +} + +// Invalidate removes an item from the cache. +func (ca *StandardCache) Invalidate(key string) (*benchtop.RowLoc, bool) { + return ca.pageCache.Invalidate(key) +} + +// NewStandardCache creates a Cache that uses provided lookup and scanner functions. 
+func NewStandardCache(lookup TableLookup, scanner TableScanner) Cache { + c := &StandardCache{ + tableLookup: lookup, + tableScanner: scanner, + pageCache: otter.Must(&otter.Options[string, *benchtop.RowLoc]{ + MaximumSize: 10_000_000, + }), + } + c.pageLoader = otter.LoaderFunc[string, *benchtop.RowLoc](func(ctx context.Context, key string) (*benchtop.RowLoc, error) { + loc, err := lookup(key) + if err != nil { + return &benchtop.RowLoc{}, err + } + return loc, nil + }) + + c.bulkPageLoader = otter.BulkLoaderFunc[string, *benchtop.RowLoc](func(ctx context.Context, keys []string) (map[string]*benchtop.RowLoc, error) { + result := make(map[string]*benchtop.RowLoc, len(keys)) + for _, key := range keys { + loc, err := lookup(key) + if err != nil { + continue + } + if loc != nil { + result[key] = loc + } + } + return result, nil + }) + return c +} + +// NewKVCache creates a Cache that uses a Pebble KVStore for lookups. +// This is for backward compatibility with the original JSONDriver. +func NewKVCache(kv pebblebulk.KVStore) Cache { + var tids []uint16 + var tidsMu sync.Mutex + + lookup := TableLookup(func(key string) (*benchtop.RowLoc, error) { + var tableID uint16 + var entryID []byte + + parts := strings.Split(key, ":") + if len(parts) == 2 { + tid, _ := strconv.Atoi(parts[0]) + tableID = uint16(tid) + entryID = []byte(parts[1]) + } else { + // If no table prefix, we must treat it as a potential global search + entryID = []byte(key) + } + + if tableID > 0 { + // Direct lookup if table ID is known + posKey := benchtop.NewPosKey(tableID, entryID) + val, closer, err := kv.Get(posKey) + if err == nil { + defer closer.Close() + loc := benchtop.DecodeRowLoc(val) + // Verification: Ensure the returned loc matches requested tableID + if loc != nil { + if loc.TableId == 0 { + loc.TableId = tableID + } + if loc.TableId == tableID { + return loc, nil + } + } + } + } + + // Fallback: search across all active tables + tidsMu.Lock() + if len(tids) == 0 { + prefix := 
[]byte{benchtop.TablePrefix} + _ = kv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + val, _ := it.Value() + var tinfo benchtop.TableInfo + if err := sonic.ConfigFastest.Unmarshal(val, &tinfo); err == nil { + tids = append(tids, tinfo.TableId) + } + } + return nil + }) + } + currentTIDs := make([]uint16, len(tids)) + copy(currentTIDs, tids) + tidsMu.Unlock() + + for _, tid := range currentTIDs { + if tid == tableID { + continue // Already checked + } + pk := benchtop.NewPosKey(tid, entryID) + val, closer, err := kv.Get(pk) + if err == nil { + defer closer.Close() + loc := benchtop.DecodeRowLoc(val) + if loc != nil { + if loc.TableId == 0 { + loc.TableId = tid + } + if loc.TableId == tid { + return loc, nil + } + } + } + } + + return nil, fmt.Errorf("id %s not found in any table", key) + }) + scanner := TableScanner(func(fn func(key string, loc *benchtop.RowLoc)) error { + prefix := []byte{benchtop.PosPrefix} + return kv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + tid, id := benchtop.ParsePosKey(it.Key()) + val, err := it.Value() + if err != nil { + continue + } + loc := benchtop.DecodeRowLoc(val) + if loc != nil && loc.TableId == tid { + // Use table-aware key for cache population + cacheKey := strconv.FormatUint(uint64(tid), 10) + ":" + string(id) + fn(cacheKey, loc) + } + } + return nil + }) + }) + return NewStandardCache(lookup, scanner) +} + +// PreloadCache iterates the table scanner and populates the in-memory cache. 
+func (ca *StandardCache) PreloadCache() error { + L_Start := time.Now() + count := 0 + err := ca.tableScanner(func(id string, loc *benchtop.RowLoc) { + if loc != nil { + ca.pageCache.Set(id, loc) + count++ + } + }) + if err == nil { + log.Debugf("Successfully preloaded %d keys in RowLoc cache in %v", count, time.Since(L_Start)) + } + return err +} diff --git a/cmdline/benchtop/cmds/get/main.go b/cmdline/benchtop/cmds/get/main.go index 83d7f1b..bb4a6d9 100644 --- a/cmdline/benchtop/cmds/get/main.go +++ b/cmdline/benchtop/cmds/get/main.go @@ -27,7 +27,12 @@ var Cmd = &cobra.Command{ return err } - table, err := driver.Get(tableName) + tid, err := driver.LookupTableID(tableName) + if err != nil { + return err + } + + table, err := driver.Get(tid) if err != nil { return err } diff --git a/cmdline/benchtop/cmds/keys/main.go b/cmdline/benchtop/cmds/keys/main.go index 9c03902..6be8d8c 100644 --- a/cmdline/benchtop/cmds/keys/main.go +++ b/cmdline/benchtop/cmds/keys/main.go @@ -4,7 +4,6 @@ import ( "fmt" "github.com/bmeg/benchtop/jsontable" - jTable "github.com/bmeg/benchtop/jsontable/table" "github.com/spf13/cobra" ) @@ -24,14 +23,18 @@ var Cmd = &cobra.Command{ return err } - table, err := driver.Get(tableName) + tid, err := driver.LookupTableID(tableName) if err != nil { return err } - jT, _ := table.(*jTable.JSONTable) + // ListTableKeys is not part of TableDriver interface, need to cast + jd, ok := driver.(*jsontable.JSONDriver) + if !ok { + return fmt.Errorf("driver is not a JSONDriver") + } - keys, err := driver.ListTableKeys(jT.TableId) + keys, err := jd.ListTableKeys(tid) if err != nil { return err } diff --git a/filters/filters.go b/filters/filters.go index 09d4aa3..a5e6fa4 100644 --- a/filters/filters.go +++ b/filters/filters.go @@ -5,30 +5,25 @@ import ( "reflect" "strconv" - "github.com/bmeg/grip/gripql" - "github.com/bmeg/grip/log" + "github.com/bmeg/benchtop/query" "github.com/spf13/cast" ) -type FieldFilter struct { - Field string - Operator 
gripql.Condition - Value any -} +type FieldFilter = query.FieldFilter func ApplyFilterCondition(val any, cond *FieldFilter) bool { condVal := cond.Value if (val == nil || condVal == nil) && - cond.Operator != gripql.Condition_EQ && - cond.Operator != gripql.Condition_NEQ && - cond.Operator != gripql.Condition_WITHIN && - cond.Operator != gripql.Condition_WITHOUT && - cond.Operator != gripql.Condition_CONTAINS { + cond.Operator != query.EQ && + cond.Operator != query.NEQ && + cond.Operator != query.WITHIN && + cond.Operator != query.WITHOUT && + cond.Operator != query.CONTAINS { return false } switch cond.Operator { - case gripql.Condition_EQ: + case query.EQ: switch v := val.(type) { case string: condS, ok := condVal.(string) @@ -48,7 +43,7 @@ func ApplyFilterCondition(val any, cond *FieldFilter) bool { return reflect.DeepEqual(val, condVal) } - case gripql.Condition_NEQ: + case query.NEQ: switch v := val.(type) { case string: condS, ok := condVal.(string) @@ -68,7 +63,7 @@ func ApplyFilterCondition(val any, cond *FieldFilter) bool { return !reflect.DeepEqual(val, condVal) } - case gripql.Condition_GT, gripql.Condition_GTE, gripql.Condition_LT, gripql.Condition_LTE: + case query.GT, query.GTE, query.LT, query.LTE: valN, err := getFloat64(val) // Use optimized getter if err != nil { return false @@ -78,99 +73,88 @@ func ApplyFilterCondition(val any, cond *FieldFilter) bool { return false } - if cond.Operator == gripql.Condition_GT { + if cond.Operator == query.GT { return valN > condN } - if cond.Operator == gripql.Condition_GTE { + if cond.Operator == query.GTE { return valN >= condN } - if cond.Operator == gripql.Condition_LT { + if cond.Operator == query.LT { return valN < condN } - if cond.Operator == gripql.Condition_LTE { + if cond.Operator == query.LTE { return valN <= condN } return false // Should not be reached - case gripql.Condition_INSIDE, gripql.Condition_OUTSIDE, gripql.Condition_BETWEEN: + case query.INSIDE, query.OUTSIDE, query.BETWEEN: // Still 
requires slice check, but we can use the optimized getFloat64 inside vals, err := cast.ToSliceE(condVal) if err != nil || len(vals) != 2 { return false } - - lower, err := getFloat64(vals[0]) + valN, err := getFloat64(val) if err != nil { return false } - upper, err := getFloat64(vals[1]) + lower, err := getFloat64(vals[0]) if err != nil { return false } - valF, err := getFloat64(val) + upper, err := getFloat64(vals[1]) if err != nil { return false } - if cond.Operator == gripql.Condition_INSIDE { - return valF > lower && valF < upper + if cond.Operator == query.INSIDE { + return valN > lower && valN < upper } - if cond.Operator == gripql.Condition_OUTSIDE { - return valF < lower || valF > upper + if cond.Operator == query.BETWEEN { + return valN >= lower && valN <= upper } - if cond.Operator == gripql.Condition_BETWEEN { - return valF >= lower && valF < upper + if cond.Operator == query.OUTSIDE { + return valN < lower || valN > upper } return false - case gripql.Condition_WITHIN: - // val is the single document value. condVal is the slice of allowed values. - // Check if val is EQ to any element in condVal slice. 
- condSlice, ok := condVal.([]any) - if !ok { - log.Debugf("UserError: expected slice not %T for WITHIN condition value", condVal) + case query.WITHIN: + vals, err := cast.ToSliceE(condVal) + if err != nil { return false } - for _, v := range condSlice { - if ApplyFilterCondition(val, &FieldFilter{Operator: gripql.Condition_EQ, Value: v}) { - return true // Found a match + for _, v := range vals { + if reflect.DeepEqual(val, v) { + return true } } return false - case gripql.Condition_WITHOUT: - condSlice, ok := condVal.([]any) - if !ok { - log.Debugf("UserError: expected slice not %T for WITHIN condition value", condVal) - return true - } - for _, v := range condSlice { - if ApplyFilterCondition(val, &FieldFilter{Operator: gripql.Condition_EQ, Value: v}) { + case query.WITHOUT: + vals, err := cast.ToSliceE(condVal) + if err != nil { + return false + } + for _, v := range vals { + if reflect.DeepEqual(val, v) { return false } } return true - case gripql.Condition_CONTAINS: - // val is the slice from the document. condVal is the single target element. - // Check if any element in val slice is EQ to condVal. - valSlice, ok := val.([]any) - if !ok { - log.Debugf("UserError: expected slice not %T for CONTAINS condition value", val) + case query.CONTAINS: + vals, err := cast.ToSliceE(val) + if err != nil { return false } - for _, v := range valSlice { - // Use the optimized EQ check recursively instead of reflect.DeepEqual(v, condVal) - // Note: Arguments are v (slice element) and condVal (target). 
- if ApplyFilterCondition(v, &FieldFilter{Operator: gripql.Condition_EQ, Value: condVal}) { - return true // Found a match + for _, v := range vals { + if reflect.DeepEqual(v, condVal) { + return true } } return false - - default: - return false } + + return false } // getFloat64 is a highly optimized helper to convert 'any' value to float64, diff --git a/go.mod b/go.mod index b4851b5..1f8263e 100644 --- a/go.mod +++ b/go.mod @@ -18,9 +18,12 @@ require ( github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect github.com/cockroachdb/redact v1.1.5 // indirect github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect github.com/getsentry/sentry-go v0.28.1 // indirect + github.com/goccy/go-json v0.10.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e // indirect + github.com/google/flatbuffers v24.3.25+incompatible // indirect github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect @@ -32,32 +35,43 @@ require ( github.com/logrusorgru/aurora v2.0.3+incompatible // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_golang v1.19.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/rogpeppe/go-internal v1.12.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/pflag v1.0.6 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect + github.com/zeebo/xxh3 v1.0.2 // indirect golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect + 
golang.org/x/mod v0.20.0 // indirect golang.org/x/net v0.37.0 // indirect + golang.org/x/sync v0.12.0 // indirect golang.org/x/sys v0.33.0 // indirect golang.org/x/term v0.30.0 // indirect golang.org/x/text v0.23.0 // indirect + golang.org/x/tools v0.24.0 // indirect + golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250811230008-5f3141c8851a // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250811230008-5f3141c8851a // indirect google.golang.org/grpc v1.71.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) require ( github.com/DataDog/zstd v1.5.7 + github.com/apache/arrow/go/v18 v18.0.0-20241007013041-ab95a4d25142 github.com/bmeg/jsonpath v0.0.0-20210207014051-cca5355553ad github.com/bytedance/sonic v1.14.0 github.com/cockroachdb/pebble v1.1.5 github.com/edsrzf/mmap-go v1.2.0 github.com/hashicorp/go-multierror v1.1.1 github.com/maypok86/otter/v2 v2.1.0 + github.com/pierrec/lz4/v4 v4.1.22 github.com/spf13/cast v1.9.2 + github.com/stretchr/testify v1.10.0 + go.etcd.io/bbolt v1.4.3 google.golang.org/protobuf v1.36.7 ) diff --git a/go.sum b/go.sum index 60ad324..09043c2 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,12 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/DataDog/zstd v1.5.7 h1:ybO8RBeh29qrxIhCA9E8gKY6xfONU9T6G6aP9DTKfLE= github.com/DataDog/zstd v1.5.7/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/apache/arrow/go/v18 v18.0.0-20241007013041-ab95a4d25142 h1:6EtsUpu9/vLtVl6oVpFiZe9GRax7STd2bG55VNwsRdI= +github.com/apache/arrow/go/v18 v18.0.0-20241007013041-ab95a4d25142/go.mod h1:GjCnS5QddrJzyqrdYqCUvwlND7SfAw4WH/722M2U2NM= +github.com/apache/thrift 
v0.20.0 h1:631+KvYbsBZxmuJjYwhezVsrfc/TbqtZV4QcxOX1fOI= +github.com/apache/thrift v0.20.0/go.mod h1:hOk1BQqcp2OLzGsyVXdfMk7YFlMxK3aoEVhjD06QhB8= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -60,6 +66,8 @@ github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ4 github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA= +github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= @@ -71,6 +79,8 @@ github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e h1:4bw4WeyTYPp0smaXiJZCNnLrvVBqirQVreixayXezGc= github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI= +github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= @@ -89,6 +99,8 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= +github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= @@ -107,9 +119,15 @@ github.com/logrusorgru/aurora v2.0.3+incompatible h1:tOpm7WcpBTn4fjmVfgpQq0EfczG github.com/logrusorgru/aurora v2.0.3+incompatible/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/maypok86/otter/v2 v2.1.0 h1:H+FO9NtLuSWYUlIUQ/kT6VNEpWSIF4w4GZJRDhxYb7k= github.com/maypok86/otter/v2 v2.1.0/go.mod h1:jX2xEKz9PrNVbDqnk8JUuOt5kURK8h7jd1kDYI5QsZk= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= 
github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= +github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= @@ -138,8 +156,9 @@ github.com/spf13/cast v1.9.2 h1:SsGfm7M8QOFtEzumm7UZrZdLLquNdzFYfIbEXntcFbE= github.com/spf13/cast v1.9.2/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= @@ -157,6 +176,12 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod 
h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= +go.etcd.io/bbolt v1.4.3 h1:dEadXpI6G79deX5prL3QRNP6JB8UxVkqo4UPnHaNXJo= +go.etcd.io/bbolt v1.4.3/go.mod h1:tKQlpPaYCVFctUIgFKFnAlvbmB3tpy1vkTnDWohtc0E= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= @@ -187,6 +212,8 @@ golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -230,10 +257,16 @@ golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools 
v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= +golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= +gonum.org/v1/gonum v0.15.0 h1:2lYxjRbTYyxkJxlhC+LvJIx3SsANPdRybu1tGj9/OrQ= +gonum.org/v1/gonum v0.15.0/go.mod h1:xzZVBJBtS+Mz4q0Yl2LJTk+OxOg4jiXZ7qBoM0uISGo= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -254,6 +287,8 @@ google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2 google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 
v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/interface.go b/interface.go index ae9d48b..53dfba1 100644 --- a/interface.go +++ b/interface.go @@ -1,5 +1,9 @@ package benchtop +import ( + "github.com/bmeg/benchtop/query" +) + type TableInfo struct { FileName string `json:"fileName"` Columns []ColumnDef `json:"columns"` @@ -13,35 +17,29 @@ type ColumnDef struct { // Type FieldType `json:"type"` Remove this for now since not using bson anymore } -/* - Keep this code as a reminder for what the table field type architecture when bson was used - type FieldType bsontype.Type - - const ( - Double FieldType = FieldType(bson.TypeDouble) - Int64 FieldType = FieldType(bson.TypeInt64) - String FieldType = FieldType(bson.TypeString) - Bytes FieldType = FieldType(bson.TypeBinary) - VectorArray FieldType = FieldType(bson.TypeArray) - ) -*/ - type Row struct { - Id []byte - TableName string - Data map[string]any + Id []byte + TableID uint16 + Data map[string]any } type Index struct { Key []byte - Loc RowLoc + Loc *RowLoc +} + +type RowLocData struct { + Data []byte + DataMap map[string]any + Loc *RowLoc } type RowLoc struct { TableId uint16 Section uint16 // Sectioning allows for smaller Offset, Size Offset uint32 // Max offset, size is 4GB - Size uint32 + Size uint32 // Compressed size + Index uint16 // Index within the block } type RowFilter interface { @@ -53,24 +51,56 @@ type RowFilter interface { type TableDriver interface { New(name string, columns []ColumnDef) (TableStore, error) - Get(name string) (TableStore, error) - ListTableKeys(tableId uint16) (chan Index, error) + Get(tableID uint16) (TableStore, error) + Delete(tableID uint16) error + Close() + InvalidateLoc(tableId uint16, rowId string) + BulkLoad(tableID uint16, rows chan *Row) error + + // Discovery and Metadata + LookupTableID(name string) (uint16, error) + ListTableIDs() []uint16 + GetTableInfo(tableID uint16) (*TableInfo, error) + GetAllColNames() chan 
string GetLabels(edges bool, removePrefix bool) chan string + RowIdsByHas(field string, value any, op query.Condition) chan Index + RowIdsByTableFieldValue(tableID uint16, field string, value any, op query.Condition) chan Index List() []string - Delete(name string) error - Close() + GetKV() any // Returns the underlying KV store } type TableStore interface { GetColumnDefs() []ColumnDef + HasField(field string) bool AddRow(elem Row) (*RowLoc, error) + AddRows(elems []Row) ([]*RowLoc, error) GetRow(loc *RowLoc) (map[string]any, error) + GetRowLoc(id string) (*RowLoc, error) + GetRows(locs []*RowLoc) ([]map[string]any, []error) DeleteRow(loc *RowLoc, id []byte) error + MarkDeleteTable(loc *RowLoc) error ScanDoc(filter RowFilter) chan map[string]any + ScanDocProjected(fields []string, filter RowFilter) chan map[string]any ScanId(filter RowFilter) chan string + ScanFull(filter RowFilter) chan RowLocData //Compact() error Close() error } + +// FieldInfo describes an indexed/searchable field for a label. +type FieldInfo struct { + Label string + Field string +} + +// FieldDriver exposes field-index lifecycle operations. 
+type FieldDriver interface { + AddField(tableID uint16, field string) error + RemoveField(tableID uint16, field string) error + ListFields() []FieldInfo + DeleteRowField(tableID uint16, field, rowID string) error + GetIDsForTable(tableID uint16) chan string +} diff --git a/jsontable/bLoad.go b/jsontable/bLoad.go index d31b752..cd632cf 100644 --- a/jsontable/bLoad.go +++ b/jsontable/bLoad.go @@ -2,109 +2,140 @@ package jsontable import ( "fmt" - "sync" "github.com/bmeg/benchtop" - jTable "github.com/bmeg/benchtop/jsontable/table" + "github.com/bmeg/benchtop/jsontable/table" + "github.com/bmeg/benchtop/jsontable/tpath" "github.com/bmeg/benchtop/pebblebulk" "github.com/bytedance/sonic" + "github.com/cockroachdb/pebble" "github.com/hashicorp/go-multierror" ) -func (dr *JSONDriver) BulkLoad(inputs chan *benchtop.Row, tx *pebblebulk.PebbleBulk) error { +func (dr *JSONDriver) BulkLoad(id uint16, rows chan *benchtop.Row) error { + return dr.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + return dr.BulkLoadInternal(id, rows, tx) + }) +} + +func (dr *JSONDriver) BulkLoadInternal(targetID uint16, inputs chan *benchtop.Row, tx *pebblebulk.PebbleBulk) error { if dr.Pkv == nil || dr.Pkv.Db == nil { return fmt.Errorf("pebble database instance is nil") } - var wg sync.WaitGroup - tableChannels := make(map[string]chan *benchtop.Row) - - metadataChan := make(chan *jTable.KitchenSink, 100) + if tx == nil { + return fmt.Errorf("passed pebble bulk transaction is nil") + } - snapshot := dr.Pkv.Db.NewSnapshot() - defer snapshot.Close() + const batchSize = 1000 + batch := make([]*benchtop.Row, 0, batchSize) for row := range inputs { - if _, exists := tableChannels[row.TableName]; !exists { - dr.Lock.RLock() - table, exists := dr.Tables[row.TableName] - dr.Lock.RUnlock() - if !exists { - var localErr *multierror.Error - newTable, err := dr.New(row.TableName, nil) - if err != nil { - localErr = multierror.Append(localErr, fmt.Errorf("failed to create table %s: %v", row.TableName, 
err)) - metadataChan <- &jTable.KitchenSink{ - FieldIndexKeyElements: nil, - Metadata: nil, - Err: localErr.ErrorOrNil(), - } - continue - } - table = newTable.(*jTable.JSONTable) - dr.Lock.Lock() - dr.Tables[row.TableName] = table - dr.Lock.Unlock() + if row == nil { + continue + } + batch = append(batch, row) + if len(batch) >= batchSize { + if err := dr.processBatch(tx, batch); err != nil { + return err } - inputChan := table.StartTableGoroutine(&wg, metadataChan, snapshot, BATCH_SIZE) - tableChannels[row.TableName] = inputChan + batch = batch[:0] } - tableChannels[row.TableName] <- row } - for _, ch := range tableChannels { - close(ch) + return dr.processBatch(tx, batch) +} + +func (dr *JSONDriver) processBatch(tx *pebblebulk.PebbleBulk, entries []*benchtop.Row) error { + if len(entries) == 0 { + return nil + } + + // Group rows by TableID + byTable := make(map[uint16][]*benchtop.Row) + for _, row := range entries { + byTable[row.TableID] = append(byTable[row.TableID], row) } var errs *multierror.Error - done := make(chan struct{}) - go func() { - defer close(done) - writeFunc := func(tx *pebblebulk.PebbleBulk) error { - for meta := range metadataChan { - if meta.Err != nil { - errs = multierror.Append(errs, meta.Err) - continue - } - if meta.Metadata == nil { - continue - } - for _, keyElements := range meta.FieldIndexKeyElements { - forwardKey := benchtop.FieldKey(keyElements.Field, keyElements.TableName, keyElements.Val, []byte(keyElements.RowId)) - if err := tx.Set(forwardKey, []byte{}, nil); err != nil { - errs = multierror.Append(errs, err) - } - BVal, err := sonic.ConfigFastest.Marshal(keyElements.Val) - if err != nil { - errs = multierror.Append(errs, err) - continue - } - if err := tx.Set(benchtop.RFieldKey(keyElements.TableName, keyElements.Field, keyElements.RowId), BVal, nil); err != nil { - errs = multierror.Append(errs, err) - } - } - // Write row location entries. 
- for id, m := range meta.Metadata { - dr.LocCache.Set(id, m) - dr.AddTableEntryInfo(tx, []byte(id), m) - } + for tid, rows := range byTable { + dr.Lock.RLock() + tbl, ok := dr.Tables[tid] + dr.Lock.RUnlock() + + if !ok { + t, err := dr.Get(tid) + if err != nil { + errs = multierror.Append(errs, fmt.Errorf("BulkLoad: table ID %d not found: %v", tid, err)) + continue } - return nil + tbl = t.(*table.JSONTable) } - if tx == nil { - errs = multierror.Append(errs, fmt.Errorf("pebble bulk instance passed into BulkLoad function is nil")) - } else { - dr.PebbleLock.Lock() - if err := writeFunc(tx); err != nil { + // uniqueRows will hold only rows that don't exist in the DB or this batch + uniqueRows := make([]benchtop.Row, 0, len(rows)) + + for _, r := range rows { + // Persistent Existence Check + // Uses NewPosKey (P | TableID | rowID) to check the Primary Index + // tx.Get is now batch-aware, so it sees both the DB and previous writes in this session. + pKey := benchtop.NewPosKey(tid, r.Id) + _, closer, err := tx.Get(pKey) + if err == nil { + closer.Close() + continue // Row already exists, skip + } + + // If the error is anything other than NotFound, we have a DB issue + if err != pebble.ErrNotFound { errs = multierror.Append(errs, err) + continue } - dr.PebbleLock.Unlock() + + uniqueRows = append(uniqueRows, *r) } - }() - wg.Wait() - close(metadataChan) - <-done + // If all rows in this batch were duplicates, skip to next table + if len(uniqueRows) == 0 { + continue + } + + // 4. Bulk add ONLY the truly unique rows to the physical storage + locs, err := tbl.AddRows(uniqueRows) + if err != nil { + errs = multierror.Append(errs, err) + continue + } + + // 5. 
Update Pebble Indices and Metadata + for i, row := range uniqueRows { + rowLoc := locs[i] + + // Primary Index: Maps RowID to Section/Offset + if err := dr.AddTableEntryInfo(tx, row.Id, rowLoc); err != nil { + errs = multierror.Append(errs, err) + } + + // Secondary Indices (Field Index and Reverse Field Index) + for field := range tbl.Fields { + if val := tpath.PathLookup(row.Data, field); val != nil { + // F | field | value | tableID | rowID + fKey := benchtop.FieldKey(field, tid, val, row.Id) + if err := tx.Set(fKey, []byte{}, nil); err != nil { + errs = multierror.Append(errs, err) + } + + // R | TableID | field | rowId + rKey := benchtop.RFieldKey(tid, field, string(row.Id)) + bVal, err := sonic.ConfigFastest.Marshal(val) + if err == nil { + if err := tx.Set(rKey, bVal, nil); err != nil { + errs = multierror.Append(errs, err) + } + } + } + } + } + } return errs.ErrorOrNil() } diff --git a/jsontable/block/block.go b/jsontable/block/block.go new file mode 100644 index 0000000..ec3f00c --- /dev/null +++ b/jsontable/block/block.go @@ -0,0 +1,327 @@ +package block + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "sync" + + "github.com/DataDog/zstd" + "github.com/pierrec/lz4/v4" +) + +// Block represents a collection of rows that are compressed together. +type Block struct { + Rows [][]byte +} + +func NewBlock(capacity int) *Block { + return &Block{ + Rows: make([][]byte, 0, capacity), + } +} + +func (b *Block) Add(row []byte) { + b.Rows = append(b.Rows, row) +} + +func (b *Block) Count() int { + return len(b.Rows) +} + +const ( + ZSTD_MAGIC = 0xFD2FB528 +) + +// Serialize packs the block into bytes and compresses it. +// Format: [Count uint16][Offsets uint32...][Data...] 
+func (b *Block) Serialize(pool *sync.Pool) ([]byte, error) { + count := len(b.Rows) + if count == 0 { + return nil, nil + } + + headerSize := 2 + 4*count + totalDataSize := 0 + for _, r := range b.Rows { + totalDataSize += len(r) + } + totalSize := headerSize + totalDataSize + + rawBuf := make([]byte, totalSize) + + // Write Count + binary.LittleEndian.PutUint16(rawBuf[0:], uint16(count)) + + // Write Offsets and Data + currentOffset := uint32(0) + dataStart := rawBuf[headerSize:] + offsetPtr := 2 + + for _, r := range b.Rows { + // Write Offset + binary.LittleEndian.PutUint32(rawBuf[offsetPtr:], currentOffset) + offsetPtr += 4 + + // Write Data + copy(dataStart[currentOffset:], r) + currentOffset += uint32(len(r)) + } + + // Compress + cBufObj := pool.Get() + var cBuf []byte + if cBufObj != nil { + cBuf = cBufObj.([]byte) + } + + // zstd.Compress might return a new slice if cBuf is too small + compressed, err := zstd.Compress(cBuf[:0], rawBuf) + if err != nil { + if cBuf != nil { + pool.Put(cBuf[:0]) + } + return nil, fmt.Errorf("compression failed: %w", err) + } + + // Verification: Zstd magic number should be present + if len(compressed) >= 4 { + magic := binary.LittleEndian.Uint32(compressed[0:4]) + if magic != ZSTD_MAGIC { + if cBuf != nil { + pool.Put(cBuf[:0]) + } + return nil, fmt.Errorf("zstd compression produced invalid magic: %08x (expected %08x)", magic, ZSTD_MAGIC) + } + } else if len(compressed) > 0 { + if cBuf != nil { + pool.Put(cBuf[:0]) + } + return nil, fmt.Errorf("zstd compression produced too short output: %d bytes", len(compressed)) + } + + // Make a copy since buffer is reused or might be part of pool buffer + out := make([]byte, len(compressed)) + copy(out, compressed) + + if cBuf != nil { + pool.Put(cBuf[:0]) + } + + return out, nil +} + +// ExtractRow decompresses the block and returns the specific row at index. 
+func ExtractRow(compressed []byte, index uint16, pool *sync.Pool) ([]byte, error) { + outBuf := pool.Get().([]byte) + defer pool.Put(outBuf[:0]) + + decompressed, err := decompressCompat(outBuf[:0], compressed) + if err != nil { + return nil, fmt.Errorf("decompress failed: %w", err) + } + + if len(decompressed) < 2 { + return nil, fmt.Errorf("invalid block: too short") + } + + count := binary.LittleEndian.Uint16(decompressed[0:]) + if int(index) >= int(count) { + return nil, fmt.Errorf("index %d out of bounds (count %d)", index, count) + } + + offsetIdx := 2 + int(index)*4 + if offsetIdx+4 > len(decompressed) { + return nil, fmt.Errorf("corrupt block header") + } + offset := binary.LittleEndian.Uint32(decompressed[offsetIdx:]) + + var nextOffset uint32 + if int(index)+1 < int(count) { + nextOffset = binary.LittleEndian.Uint32(decompressed[offsetIdx+4:]) + } else { + nextOffset = uint32(len(decompressed) - (2 + 4*int(count))) + } + + dataStart := 2 + 4*int(count) + start := dataStart + int(offset) + end := dataStart + int(nextOffset) + + if start > len(decompressed) || end > len(decompressed) { + return nil, fmt.Errorf("block data out of bounds") + } + + // Copy result to return safe byte slice + result := make([]byte, end-start) + copy(result, decompressed[start:end]) + return result, nil +} + +// IterateBlock decompresses the block and calls the callback for each row. +// Returns error if decompression fails. 
+func IterateBlock(compressed []byte, pool *sync.Pool, callback func([]byte) bool) error { + outBuf := pool.Get().([]byte) + defer pool.Put(outBuf[:0]) + + decompressed, err := decompressCompat(outBuf[:0], compressed) + if err != nil { + return fmt.Errorf("decompress failed: %w", err) + } + + if len(decompressed) < 2 { + return fmt.Errorf("invalid block: too short") + } + + count := binary.LittleEndian.Uint16(decompressed[0:]) + dataStart := 2 + 4*int(count) + + for i := 0; i < int(count); i++ { + offsetIdx := 2 + i*4 + if offsetIdx+4 > len(decompressed) { + return fmt.Errorf("corrupt block header") + } + offset := binary.LittleEndian.Uint32(decompressed[offsetIdx:]) + + var nextOffset uint32 + if i+1 < int(count) { + nextOffset = binary.LittleEndian.Uint32(decompressed[offsetIdx+4:]) + } else { + nextOffset = uint32(len(decompressed) - dataStart) + } + + start := dataStart + int(offset) + end := dataStart + int(nextOffset) + + if start > len(decompressed) || end > len(decompressed) { + return fmt.Errorf("block key out of bouds") + } + + if !callback(decompressed[start:end]) { + return nil + } + } + return nil +} + +// DecompressBlock decompresses the block and returns the raw bytes. +// It allocates a new slice for the result which is suitable for long-term caching. +func DecompressBlock(compressed []byte) ([]byte, error) { + // We do not use the pool here because we want the result to persist in the cache. 
+ decompressed, err := decompressCompat(nil, compressed) + if err != nil { + hexStr := "" + for i := 0; i < len(compressed) && i < 32; i++ { + hexStr += fmt.Sprintf("%02x ", compressed[i]) + } + return nil, fmt.Errorf("decompress failed (len=%d header=[%s]): %w", len(compressed), hexStr, err) + } + return decompressed, nil +} + +func decompressCompat(dst []byte, payload []byte) ([]byte, error) { + // Current format: zstd compressed block + if out, err := zstd.Decompress(dst, payload); err == nil { + return out, nil + } + + // Legacy format support: lz4 frame compressed block + lz4r := lz4.NewReader(bytes.NewReader(payload)) + if out, err := io.ReadAll(lz4r); err == nil { + return out, nil + } + + // Already-uncompressed block + if isValidBlockLayout(payload) { + out := make([]byte, len(payload)) + copy(out, payload) + return out, nil + } + + // Legacy single-row payload (raw JSON row bytes) + if len(payload) > 0 { + i := 0 + for i < len(payload) && (payload[i] == ' ' || payload[i] == '\n' || payload[i] == '\t' || payload[i] == '\r') { + i++ + } + if i < len(payload) && (payload[i] == '{' || payload[i] == '[') { + return packSingleRowBlock(payload), nil + } + } + + return nil, fmt.Errorf("unknown block payload format") +} + +func packSingleRowBlock(row []byte) []byte { + out := make([]byte, 2+4+len(row)) + binary.LittleEndian.PutUint16(out[0:], 1) + binary.LittleEndian.PutUint32(out[2:], 0) + copy(out[6:], row) + return out +} + +func isValidBlockLayout(buf []byte) bool { + if len(buf) < 6 { + return false + } + count := int(binary.LittleEndian.Uint16(buf[0:2])) + if count <= 0 { + return false + } + headerSize := 2 + 4*count + if headerSize > len(buf) { + return false + } + prev := uint32(0) + dataLen := uint32(len(buf) - headerSize) + for i := 0; i < count; i++ { + off := binary.LittleEndian.Uint32(buf[2+i*4 : 2+i*4+4]) + if i > 0 && off < prev { + return false + } + if off > dataLen { + return false + } + prev = off + } + return true +} + +// 
ExtractRowFromDecompressed returns the specific row at index from an already decompressed block. +// It returns a copy of the data to ensure safety (so modification doesn't affect cache). +func ExtractRowFromDecompressed(decompressed []byte, index uint16) ([]byte, error) { + if len(decompressed) < 2 { + return nil, fmt.Errorf("invalid block: too short") + } + + count := binary.LittleEndian.Uint16(decompressed[0:]) + if int(index) >= int(count) { + return nil, fmt.Errorf("index %d out of bounds (count %d)", index, count) + } + + offsetIdx := 2 + int(index)*4 + if offsetIdx+4 > len(decompressed) { + return nil, fmt.Errorf("corrupt block header") + } + offset := binary.LittleEndian.Uint32(decompressed[offsetIdx:]) + + var nextOffset uint32 + if int(index)+1 < int(count) { + nextOffset = binary.LittleEndian.Uint32(decompressed[offsetIdx+4:]) + } else { + nextOffset = uint32(len(decompressed) - (2 + 4*int(count))) + } + + dataStart := 2 + 4*int(count) + start := dataStart + int(offset) + end := dataStart + int(nextOffset) + + if start > len(decompressed) || end > len(decompressed) { + return nil, fmt.Errorf("block data out of bounds") + } + + // Copy result to return safe byte slice + result := make([]byte, end-start) + copy(result, decompressed[start:end]) + return result, nil +} diff --git a/jsontable/bug_fix_test.go b/jsontable/bug_fix_test.go new file mode 100644 index 0000000..0e1175f --- /dev/null +++ b/jsontable/bug_fix_test.go @@ -0,0 +1,38 @@ +package jsontable + +import ( + "os" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/util" + "github.com/stretchr/testify/assert" +) + +func TestLookupTableIDCaseInsensitive(t *testing.T) { + dbPath := "test_case_db_" + util.RandomString(5) + defer os.RemoveAll(dbPath) + + driver, err := NewJSONDriver(dbPath) + assert.NoError(t, err) + defer driver.Close() + + tableName := "TestTable" + columns := []benchtop.ColumnDef{{Key: "data"}} + _, err = driver.New(tableName, columns) + assert.NoError(t, err) 
+ + // Direct match + tid1, err := driver.LookupTableID("TestTable") + assert.NoError(t, err) + assert.NotZero(t, tid1) + + // Different casing + tid2, err := driver.LookupTableID("testtable") + assert.NoError(t, err) + assert.Equal(t, tid1, tid2) + + tid3, err := driver.LookupTableID("TESTTABLE") + assert.NoError(t, err) + assert.Equal(t, tid1, tid3) +} diff --git a/jsontable/cache/cache.go b/jsontable/cache/cache.go deleted file mode 100644 index 74fe879..0000000 --- a/jsontable/cache/cache.go +++ /dev/null @@ -1,100 +0,0 @@ -package cache - -import ( - "bytes" - "context" - "time" - - "github.com/bmeg/benchtop" - "github.com/bmeg/benchtop/pebblebulk" - "github.com/bmeg/grip/log" - "github.com/maypok86/otter/v2" -) - -type JSONCache struct { - pageCache *otter.Cache[string, *benchtop.RowLoc] - pageLoader otter.LoaderFunc[string, *benchtop.RowLoc] - bulkPageLoader otter.BulkLoaderFunc[string, *benchtop.RowLoc] - kv pebblebulk.KVStore -} - -// Get retrieves an item from the cache. If the item is not present, -// it is automatically loaded from the underlying KV store. -func (ca *JSONCache) Get(ctx context.Context, key string) (*benchtop.RowLoc, error) { - return ca.pageCache.Get(ctx, key, ca.pageLoader) -} - -// Set adds or updates an item in the cache. -func (ca *JSONCache) Set(key string, value *benchtop.RowLoc) (*benchtop.RowLoc, bool) { - return ca.pageCache.Set(key, value) -} - -// Delete removes an item from the cache. 
-func (ca *JSONCache) Invalidate(key string) (*benchtop.RowLoc, bool) { - return ca.pageCache.Invalidate(key) -} - -func NewJSONCache(kv pebblebulk.KVStore) *JSONCache { - cache := &JSONCache{ - kv: kv, - pageCache: otter.Must(&otter.Options[string, *benchtop.RowLoc]{ - MaximumSize: 10_000_000, - }), - } - cache.pageLoader = otter.LoaderFunc[string, *benchtop.RowLoc](func(ctx context.Context, key string) (*benchtop.RowLoc, error) { - log.Debugln("Cache miss, loading from kv: ", key) - val, closer, err := kv.Get([]byte(key)) - if err != nil { - if err.Error() != "pebble: not found" { // Handle Pebble-specific error generically - log.Errorf("Err on kv.Get for key %s in CacheLoader: %v", key, err) - } - return &benchtop.RowLoc{}, err - } - closer.Close() - return benchtop.DecodeRowLoc(val), nil - }) - - cache.bulkPageLoader = otter.BulkLoaderFunc[string, *benchtop.RowLoc](func(ctx context.Context, keys []string) (map[string]*benchtop.RowLoc, error) { - prefix := []byte{benchtop.PosPrefix} - result := make(map[string]*benchtop.RowLoc, len(keys)) - err := kv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { - _, id := benchtop.ParsePosKey(it.Key()) - val, err := it.Value() - if err != nil { - log.Errorf("Err on it.Value() in bulkLoader: %v", err) - continue - } - loc := benchtop.DecodeRowLoc(val) - result[string(id)] = loc - } - return nil - }) - if err != nil { - return nil, err - } - return result, nil - }) - return cache -} - -func (ca *JSONCache) PreloadCache() error { - var keys []string - prefix := []byte{benchtop.PosPrefix} - L_Start := time.Now() - err := ca.kv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { - _, id := benchtop.ParsePosKey(it.Key()) - keys = append(keys, string(id)) - } - return nil - }) - if err != nil { - return err - } - _, err = ca.pageCache.BulkGet(context.Background(), keys, 
ca.bulkPageLoader) - if err == nil { - log.Debugf("Successfully loaded %d keys in RowLoc cache in %s", len(keys), time.Since(L_Start).String()) - } - return err -} diff --git a/jsontable/consistency_test.go b/jsontable/consistency_test.go new file mode 100644 index 0000000..d8a7cf2 --- /dev/null +++ b/jsontable/consistency_test.go @@ -0,0 +1,101 @@ +package jsontable + +import ( + "os" + "testing" + "time" + + "github.com/bmeg/benchtop/jsontable/table" +) + +func TestIndexConsistency(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "jsontable_consist") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tmpDir) + + tableName := "v_Person" + rowCount := 1000 + + // 1. Load Data + { + driver, err := NewJSONDriver(tmpDir) + if err != nil { + t.Fatal(err) + } + tblStore, err := driver.New(tableName, nil) + if err != nil { + t.Fatal(err) + } + tableID := tblStore.(*table.JSONTable).TableId + + err = driver.BulkLoad(tableID, loadRowsHelper(tableID, rowCount)) + if err != nil { + t.Fatal(err) + } + time.Sleep(100 * time.Millisecond) + driver.Close() + } + + // 2. 
Restart and Check Consistency + { + driver, err := NewJSONDriver(tmpDir) + if err != nil { + t.Fatal(err) + } + defer driver.Close() + + tableID, err := driver.LookupTableID(tableName) + if err != nil { + t.Fatalf("Failed to lookup table ID after restart: %v", err) + } + ts, err := driver.Get(tableID) + if err != nil { + t.Fatal(err) + } + tbl := ts.(*table.JSONTable) + + // Iterate ScanFull (source of truth from disk) + scannedCount := 0 + for row := range tbl.ScanDoc(nil) { + scannedCount++ + + id, ok := row["_id"].(string) + if !ok { + t.Errorf("Row missing _id") + continue + } + + // Lookup in Index + locFromIndex, err := tbl.LocLookup(id) + if err != nil { + t.Errorf("LocLookup failed for %s: %v", id, err) + continue + } + if locFromIndex == nil { + t.Errorf("LocLookup returned nil for %s", id) + continue + } + + // Verify data accessibility via Index Loc + dataFromIndexLoc, err := tbl.Storage.Get(locFromIndex) + if err != nil { + t.Errorf("Storage.Get failed using Index Loc for %s: %v. Loc: %+v", id, err, locFromIndex) + } else { + // Verify content matches (decompressed)? + // Storage.Get returns compressed block usually? + // Wait, Storage.Get returns body (without header). + // We need to decompress to compare with 'row' map? + // Just checking if it errors is good enough for "decompress failed" check. 
+ if len(dataFromIndexLoc) == 0 { + t.Errorf("Storage.Get return empty data") + } + } + } + + if scannedCount != rowCount { + t.Errorf("ScanDoc found %d rows, expected %d", scannedCount, rowCount) + } + } +} diff --git a/jsontable/driver.go b/jsontable/driver.go index dad9dae..105da16 100644 --- a/jsontable/driver.go +++ b/jsontable/driver.go @@ -2,24 +2,31 @@ package jsontable import ( "bytes" + "context" "fmt" "os" "path/filepath" "sync" "time" + "strconv" + "strings" + "github.com/bmeg/benchtop" - "github.com/bmeg/benchtop/jsontable/cache" - "github.com/bmeg/benchtop/jsontable/section" - jTable "github.com/bmeg/benchtop/jsontable/table" + "github.com/bmeg/benchtop/cache" + "github.com/bmeg/benchtop/jsontable/block" + "github.com/bmeg/benchtop/jsontable/storage" + "github.com/bmeg/benchtop/jsontable/table" "github.com/bmeg/benchtop/pebblebulk" "github.com/bmeg/benchtop/util" "github.com/bmeg/grip/log" "github.com/bytedance/sonic" + "github.com/cockroachdb/pebble" + "github.com/maypok86/otter/v2" ) const ( - BATCH_SIZE int = 1000 + BATCH_SIZE int = 5000 ) type JSONDriver struct { @@ -27,14 +34,16 @@ type JSONDriver struct { Lock sync.RWMutex PebbleLock sync.RWMutex Pkv *pebblebulk.PebbleKV - LocCache *cache.JSONCache + LocCache cache.Cache - Tables map[string]*jTable.JSONTable - LabelLookup map[uint16]string + Tables map[uint16]*table.JSONTable + idToName map[uint16]string + nameToId map[string]uint16 + ZoneManager storage.ZoneManager } func NewJSONDriver(path string) (benchtop.TableDriver, error) { - Pkv, err := pebblebulk.NewPebbleKV(path) + pKv, err := pebblebulk.NewPebbleKV(path) if err != nil { return nil, err } @@ -45,125 +54,103 @@ func NewJSONDriver(path string) (benchtop.TableDriver, error) { } if !exist { if err := os.Mkdir(tableDir, 0700); err != nil { - Pkv.Db.Close() + pKv.Db.Close() return nil, fmt.Errorf("failed to create TABLES directory: %v", err) } } driver := &JSONDriver{ - base: path, - Tables: map[string]*jTable.JSONTable{}, - Pkv: 
&pebblebulk.PebbleKV{ - Db: Pkv.Db, - InsertCount: 0, - CompactLimit: uint32(1000), - }, - LocCache: cache.NewJSONCache(Pkv), - Lock: sync.RWMutex{}, - PebbleLock: sync.RWMutex{}, - LabelLookup: map[uint16]string{}, - } - - return driver, nil -} - -// Update LoadJSONDriver to use DirExists -func LoadJSONDriver(path string) (benchtop.TableDriver, error) { - pKv, err := pebblebulk.NewPebbleKV(path) - if err != nil { - return nil, fmt.Errorf("failed to open database: %v", err) - } - - tableDir := filepath.Join(path, "TABLES") - exist, err := util.DirExists(tableDir) - if err != nil { - pKv.Close() - return nil, err - } - if !exist { - pKv.Close() - return nil, fmt.Errorf("TABLES directory not found at %s", tableDir) - } - - driver := &JSONDriver{ - base: path, - Tables: map[string]*jTable.JSONTable{}, - Pkv: &pebblebulk.PebbleKV{ - Db: pKv.Db, - InsertCount: 0, - CompactLimit: uint32(1000), - }, - LocCache: cache.NewJSONCache(pKv), + base: path, + Tables: map[uint16]*table.JSONTable{}, + Pkv: pKv, + LocCache: cache.NewKVCache(pKv), Lock: sync.RWMutex{}, PebbleLock: sync.RWMutex{}, - LabelLookup: map[uint16]string{}, + idToName: map[uint16]string{}, + nameToId: map[string]uint16{}, + ZoneManager: storage.NewZoneManager(tableDir), } + // Load existing tables from disk for _, tableName := range driver.List() { - table, err := driver.Get(tableName) + tinfo, err := driver.getTableInfo(tableName) if err != nil { driver.Close() return nil, fmt.Errorf("failed to load table %s: %v", tableName, err) } - jsonTable, ok := table.(*jTable.JSONTable) - if !ok { + driver.nameToId[tableName] = tinfo.TableId + driver.idToName[tinfo.TableId] = tableName + + _, err = driver.Get(tinfo.TableId) + if err != nil { driver.Close() - return nil, fmt.Errorf("invalid table type for %s", tableName) + return nil, fmt.Errorf("failed to open table %s (ID %d): %v", tableName, tinfo.TableId, err) } - - driver.Lock.Lock() - driver.LabelLookup[jsonTable.TableId] = tableName[2:] - driver.Tables[tableName] 
= jsonTable - driver.Lock.Unlock() } - err = driver.LoadFields() - if err != nil { - pKv.Close() - return nil, err + // Load Fields + if err := driver.LoadFields(); err != nil { + driver.Close() + return nil, fmt.Errorf("failed to load fields: %v", err) } + // Preload Cache + // Note: cache.NewKVCache already handles table-aware scanning if we updated it. driver.Lock.RLock() err = driver.LocCache.PreloadCache() driver.Lock.RUnlock() if err != nil { - return nil, err + log.Errorf("Cache preload failed: %v", err) } return driver, nil } -func (dr *JSONDriver) New(name string, columns []benchtop.ColumnDef) (benchtop.TableStore, error) { - dr.Lock.RLock() - if p, ok := dr.Tables[name]; ok { - dr.Lock.RUnlock() - return p, nil - } - dr.Lock.RUnlock() +// makeLocCacheKey creates a unique key for the location cache including tableId +func makeLocCacheKey(tableId uint16, id string) string { + return strconv.FormatUint(uint64(tableId), 10) + ":" + id +} + +// LoadJSONDriver is deprecated and just calls NewJSONDriver which now handles loading. 
+func LoadJSONDriver(path string) (benchtop.TableDriver, error) { + return NewJSONDriver(path) +} +func (dr *JSONDriver) New(name string, columns []benchtop.ColumnDef) (benchtop.TableStore, error) { dr.Lock.Lock() defer dr.Lock.Unlock() + + if id, ok := dr.nameToId[name]; ok { + if p, ok := dr.Tables[id]; ok { + return p, nil + } + // Attempt to load if we know the ID but it's not in Tables map + // Release lock before calling Get to avoid deadlock (Get acquires ReadLock/Lock) + dr.Lock.Unlock() + tbl, err := dr.Get(id) + dr.Lock.Lock() // Re-acquire lock + if err == nil { + return tbl, nil + } + } + + // Case-insensitive lookup for existing tables on startup + lowerName := strings.ToLower(name) + for existingName, id := range dr.nameToId { + if strings.ToLower(existingName) == lowerName { + if p, ok := dr.Tables[id]; ok { + return p, nil + } + } + } + newId := dr.getMaxTablePrefix() formattedName := util.PadToSixDigits(int(newId)) tPath := filepath.Join(dr.base, "TABLES", formattedName) - out := &jTable.JSONTable{ - Columns: columns, - ColumnMap: map[string]int{}, - Path: tPath, - Name: name, - FileName: tPath, // Base name for partition/section files - TableId: newId, - Fields: map[string]struct{}{}, - ActiveSections: map[uint8]*section.Section{}, - FlushCounter: map[uint8]int{}, - SectionLock: sync.Mutex{}, - MaxConcurrentSections: 10, - PartitionMap: map[uint8][]uint16{}, - Sections: map[uint16]*section.Section{}, - } - for n, d := range columns { - out.ColumnMap[d.Key] = n + out, err := dr.newJSONTable(name, columns, formattedName, newId) + if err != nil { + return nil, err } // Create TableInfo for serialization @@ -186,14 +173,14 @@ func (dr *JSONDriver) New(name string, columns []benchtop.ColumnDef) (benchtop.T } if err := out.Init(10); err != nil { - log.Errorf("TABLE INIT ERR: %v", err) - return nil, fmt.Errorf("failed to init table %s: %v", name, err) + // Init might be no-op now } - dr.Tables[name] = out - dr.LabelLookup[newId] = name[2:] + 
dr.Tables[newId] = out + dr.nameToId[name] = newId + dr.idToName[newId] = name - log.Debugf("Created table %s", name) + log.Debugf("Created table %s with ID %d", name, newId) return out, nil } @@ -203,7 +190,7 @@ func (dr *JSONDriver) SetIndices(inputs chan benchtop.Index) { dr.AddTableEntryInfo( tx, index.Key, - &index.Loc, + index.Loc, ) } return nil @@ -239,17 +226,59 @@ func (dr *JSONDriver) List() []string { return out } +func (dr *JSONDriver) GetLabels(edges bool, removePrefix bool) chan string { + out := make(chan string, 10) + go func() { + defer close(out) + dr.Lock.RLock() + defer dr.Lock.RUnlock() + for _, name := range dr.idToName { + isEdge := strings.HasPrefix(name, "e_") + if (edges && isEdge) || (!edges && !isEdge) { + if removePrefix && len(name) > 2 { + out <- name[2:] + } else { + out <- name + } + } + } + }() + return out +} + +func (dr *JSONDriver) GetAllColNames() chan string { + out := make(chan string, 10) + go func() { + defer close(out) + dr.Lock.RLock() + defer dr.Lock.RUnlock() + for _, tbl := range dr.Tables { + for _, col := range tbl.GetColumnDefs() { + out <- col.Key + } + } + }() + return out +} + +// BulkLoad implementation is in bLoad.go + +func (dr *JSONDriver) GetKV() any { + return dr.Pkv +} + func (dr *JSONDriver) Close() { dr.Lock.Lock() defer dr.Lock.Unlock() log.Infoln("Closing JSONDriver...") - for tableName, table := range dr.Tables { + for id, table := range dr.Tables { table.Close() // Closes all section handles and file pools - log.Debugf("Closed table %s", tableName) + log.Debugf("Closed table ID %d (%s)", id, table.Name) } - dr.Tables = make(map[string]*jTable.JSONTable) - if dr.Pkv.Db != nil { + dr.Tables = make(map[uint16]*table.JSONTable) + dr.nameToId = make(map[string]uint16) + if dr.Pkv != nil && dr.Pkv.Db != nil { if closeErr := dr.Pkv.Db.Close(); closeErr != nil { log.Errorf("Error closing Pebble database: %v", closeErr) } @@ -260,9 +289,13 @@ func (dr *JSONDriver) Close() { log.Infof("Successfully closed 
JSONDriver for path %s", dr.base) } -func (dr *JSONDriver) Get(name string) (benchtop.TableStore, error) { +func (dr *JSONDriver) InvalidateLoc(tableId uint16, rowId string) { + dr.LocCache.Invalidate(makeLocCacheKey(tableId, rowId)) +} + +func (dr *JSONDriver) Get(id uint16) (benchtop.TableStore, error) { dr.Lock.RLock() - if x, ok := dr.Tables[name]; ok { + if x, ok := dr.Tables[id]; ok { dr.Lock.RUnlock() return x, nil } @@ -271,68 +304,252 @@ func (dr *JSONDriver) Get(name string) (benchtop.TableStore, error) { dr.Lock.Lock() defer dr.Lock.Unlock() - if x, ok := dr.Tables[name]; ok { + if x, ok := dr.Tables[id]; ok { return x, nil } - nkey := benchtop.NewTableKey([]byte(name)) - value, closer, err := dr.Pkv.Db.Get(nkey) + // Find the name for this ID in idToName + name, ok := dr.idToName[id] + if !ok { + return nil, fmt.Errorf("table ID %d not found", id) + } + + tinfo, err := dr.getTableInfo(name) if err != nil { - log.Errorln("JSONDriver Get: ", err) + log.Errorf("JSONDriver Get(ID %d): could not find info for name '%s': %v", id, name, err) return nil, err } - defer closer.Close() - tinfo := benchtop.TableInfo{} - if err := sonic.ConfigFastest.Unmarshal(value, &tinfo); err != nil { - return nil, fmt.Errorf("failed to unmarshal table info: %v", err) - } - - log.Debugf("Opening Table: %#v\n", tinfo) - tPath := filepath.Join(dr.base, "TABLES", string(tinfo.FileName)) - out := &jTable.JSONTable{ - Columns: tinfo.Columns, - ColumnMap: map[string]int{}, - TableId: tinfo.TableId, - Path: tPath, - FileName: tPath, - Name: name, - Fields: map[string]struct{}{}, - ActiveSections: map[uint8]*section.Section{}, - FlushCounter: map[uint8]int{}, - MaxConcurrentSections: 10, - Sections: map[uint16]*section.Section{}, - PartitionMap: map[uint8][]uint16{}, - SectionLock: sync.Mutex{}, - } - for n, d := range out.Columns { - out.ColumnMap[d.Key] = n + + log.Debugf("Opening Table ID %d: %#v\n", id, tinfo) + + out, err := dr.newJSONTable(name, tinfo.Columns, 
string(tinfo.FileName), tinfo.TableId) + if err != nil { + return nil, err } if err := out.Init(10); err != nil { return nil, fmt.Errorf("failed to init table %s: %v", name, err) } - dr.Tables[name] = out + + dr.Tables[id] = out + dr.nameToId[name] = id + return out, nil } -func (dr *JSONDriver) Delete(name string) error { +func (dr *JSONDriver) Delete(id uint16) error { dr.Lock.Lock() defer dr.Lock.Unlock() - table, exists := dr.Tables[name] + tableVar, exists := dr.Tables[id] if !exists { - return fmt.Errorf("table %s does not exist", name) + // Attempt to load it first to ensure we can close and delete it + tbl, err := dr.Get(id) + if err != nil { + return fmt.Errorf("table ID %d does not exist and could not be loaded", id) + } + tableVar = tbl.(*table.JSONTable) } - table.Close() // Close all section files + tableVar.Close() // Close all section files + name := tableVar.Name + + // Delete the entire storage zone (O(1) bulk delete) + if err := dr.ZoneManager.DeleteZone(tableVar.FileName); err != nil { + log.Errorf("Failed to delete storage zone for %s: %v", name, err) + } + + // Iterate over keys to invalidate cache and delete from KV + prefix := benchtop.NewPosKeyPrefix(tableVar.TableId) + var keysToDelete [][]byte + dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + _, rowBytes := benchtop.ParsePosKey(it.Key()) + rowId := string(rowBytes) + dr.LocCache.Invalidate(makeLocCacheKey(tableVar.TableId, rowId)) + // Make a copy of the key bytes because pebble reuses them + keyCopy := make([]byte, len(it.Key())) + copy(keyCopy, it.Key()) + keysToDelete = append(keysToDelete, keyCopy) + } + return nil + }) + + if len(keysToDelete) > 0 { + err := dr.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + for _, k := range keysToDelete { + if err := tx.Delete(k, nil); err != nil { + return err + } + } + return nil + }) + if err != nil { + log.Errorf("Failed to delete keys for table 
%s: %v", name, err) + } + } - // Delete all section files for the table - for _, sec := range table.Sections { - if err := os.Remove(sec.Path); err != nil { - log.Errorf("Failed to delete section file %s: %v", sec.Path, err) + // Clean up field indexes + for field := range tableVar.Fields { + if err := dr.RemoveField(id, field); err != nil { + log.Errorf("Failed to remove field %s for table ID %d: %v", field, id, err) } } - delete(dr.Tables, name) + tableVar.Fields = nil + + delete(dr.Tables, id) + delete(dr.nameToId, name) + delete(dr.idToName, id) dr.dropTable(name) return nil } + +func (dr *JSONDriver) LookupTableID(name string) (uint16, error) { + dr.Lock.RLock() + if id, ok := dr.nameToId[name]; ok { + dr.Lock.RUnlock() + return id, nil + } + // Case-insensitive fallback for existing tables + lower := strings.ToLower(name) + for existing, id := range dr.nameToId { + if strings.ToLower(existing) == lower { + dr.Lock.RUnlock() + return id, nil + } + } + dr.Lock.RUnlock() + + tinfo, err := dr.getTableInfo(name) + if err != nil { + return 0, err + } + dr.Lock.Lock() + dr.nameToId[name] = tinfo.TableId + dr.idToName[tinfo.TableId] = name + dr.Lock.Unlock() + return tinfo.TableId, nil +} + +func (dr *JSONDriver) ListTableIDs() []uint16 { + dr.Lock.RLock() + defer dr.Lock.RUnlock() + ids := make([]uint16, 0, len(dr.Tables)) + for id := range dr.Tables { + ids = append(ids, id) + } + return ids +} + +func (dr *JSONDriver) GetTableInfo(tableID uint16) (*benchtop.TableInfo, error) { + prefix := []byte{benchtop.TablePrefix} + var found *benchtop.TableInfo + err := dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + val, err := it.Value() + if err != nil { + continue + } + var tinfo benchtop.TableInfo + if err := sonic.ConfigFastest.Unmarshal(val, &tinfo); err == nil { + if tinfo.TableId == tableID { + found = &tinfo + return nil + } + } + } + return nil + }) + if err != nil { + 
return nil, err + } + if found == nil { + return nil, pebble.ErrNotFound + } + return found, nil +} + +func (dr *JSONDriver) newJSONTable(name string, columns []benchtop.ColumnDef, fileName string, tableID uint16) (*table.JSONTable, error) { + store, err := dr.ZoneManager.CreateZone(fileName) + if err != nil { + return nil, fmt.Errorf("failed to init storage: %w", err) + } + + out := &table.JSONTable{ + Columns: columns, + ColumnMap: make(map[string]int), + TableId: tableID, + FileName: fileName, + Name: name, + Storage: store, + LocLookup: func(id string) (*benchtop.RowLoc, error) { + val, closer, err := dr.Pkv.Get(benchtop.NewPosKey(tableID, []byte(id))) + if err != nil { + return nil, err + } + defer closer.Close() + loc := benchtop.DecodeRowLoc(val) + if loc == nil { + return nil, fmt.Errorf("invalid row location for id %s", id) + } + return loc, nil + }, + BufferPool: sync.Pool{ + New: func() any { + return make([]byte, 0, 4096) + }, + }, + BlockCache: otter.Must(&otter.Options[string, []byte]{ + MaximumSize: 5000, + }), + } + + // Define Loader + out.BlockLoader = func(ctx context.Context, key string) ([]byte, error) { + // Key format: "TableId:Section:Offset:Size" + parts := strings.Split(key, ":") + if len(parts) != 4 { + return nil, fmt.Errorf("invalid cache key: %s (expected 4 parts)", key) + } + + // Parts: TableId (0), Section (1), Offset (2), Size (3) + sec, _ := strconv.Atoi(parts[1]) + off, _ := strconv.Atoi(parts[2]) + sz, _ := strconv.Atoi(parts[3]) + + secId := uint16(sec) + off32 := uint32(off) + sz32 := uint32(sz) + + var blockData []byte + var lastErr error + + // Retry loop for coherence gaps + for i := 0; i < 10; i++ { + compressed, err := out.Storage.Get(&benchtop.RowLoc{ + Section: secId, + Offset: off32, + Size: sz32, + }) + if err != nil { + lastErr = err + time.Sleep(10 * time.Millisecond) + continue + } + + blockData, err = block.DecompressBlock(compressed) + if err == nil { + return blockData, nil + } + lastErr = err + time.Sleep(20 * 
time.Millisecond) + } + + return nil, fmt.Errorf("decompress failed for section %d offset %d size %d after retries: %w", sec, off, sz, lastErr) + } + + for i, col := range columns { + out.ColumnMap[col.Key] = i + } + return out, nil +} diff --git a/jsontable/duplicate_test.go b/jsontable/duplicate_test.go new file mode 100644 index 0000000..895c7e4 --- /dev/null +++ b/jsontable/duplicate_test.go @@ -0,0 +1,74 @@ +package jsontable + +import ( + "fmt" + "os" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/util" + "github.com/stretchr/testify/assert" +) + +func TestBulkLoadDuplicatePrevention(t *testing.T) { + dbPath := "test_duplicate_db_" + util.RandomString(5) + defer os.RemoveAll(dbPath) + + driver, err := NewJSONDriver(dbPath) + assert.NoError(t, err) + defer driver.Close() + + tableName := "v_Person" + columns := []benchtop.ColumnDef{{Key: "name"}} + tStore, err := driver.New(tableName, columns) + assert.NoError(t, err) + + tid, _ := driver.LookupTableID(tableName) + + // 1. First bulk load + rowCount := 10 + ch1 := make(chan *benchtop.Row, rowCount) + go func() { + defer close(ch1) + for i := 0; i < rowCount; i++ { + row := benchtop.Row{ + Id: []byte(fmt.Sprintf("person_%d", i)), + TableID: tid, + Data: map[string]any{"name": fmt.Sprintf("Person %d", i)}, + } + ch1 <- &row + } + }() + err = driver.BulkLoad(tid, ch1) + assert.NoError(t, err) + + // Verify count + count := 0 + for range tStore.ScanDoc(nil) { + count++ + } + assert.Equal(t, rowCount, count) + + // 2. 
Second bulk load with same IDs + ch2 := make(chan *benchtop.Row, rowCount) + go func() { + defer close(ch2) + for i := 0; i < rowCount; i++ { + row := benchtop.Row{ + Id: []byte(fmt.Sprintf("person_%d", i)), + TableID: tid, + Data: map[string]any{"name": fmt.Sprintf("Person %d Duplicate", i)}, + } + ch2 <- &row + } + }() + err = driver.BulkLoad(tid, ch2) + assert.NoError(t, err) + + // Verify count is STILL rowCount (no duplicates) + count2 := 0 + for range tStore.ScanDoc(nil) { + count2++ + } + assert.Equal(t, rowCount, count2, "Should not have added duplicate person entries") +} diff --git a/jsontable/fields.go b/jsontable/fields.go index f71c7ab..384d10b 100644 --- a/jsontable/fields.go +++ b/jsontable/fields.go @@ -2,122 +2,167 @@ package jsontable import ( "bytes" + "encoding/binary" + "errors" "fmt" "github.com/bmeg/benchtop" "github.com/bmeg/grip/log" "github.com/bytedance/sonic" + "github.com/cockroachdb/pebble" "github.com/bmeg/benchtop/filters" + "github.com/bmeg/benchtop/jsontable/table" "github.com/bmeg/benchtop/jsontable/tpath" - "github.com/bmeg/benchtop/pebblebulk" - "github.com/bmeg/grip/gripql" + "github.com/bmeg/benchtop/query" ) -func (dr *JSONDriver) AddField(label, field string) error { - foundTable, ok := dr.Tables[label] - if !ok { - _, err := dr.New(label, nil) - if err != nil { - return err +func (dr *JSONDriver) lookupPosLoc(tableID uint16, rowID []byte) *benchtop.RowLoc { + dr.Lock.RLock() + tbl, ok := dr.Tables[tableID] + dr.Lock.RUnlock() + if !ok || tbl == nil { + return nil + } + val, closer, err := dr.Pkv.Get(benchtop.NewPosKey(tbl.TableId, rowID)) + if err != nil { + if !errors.Is(err, pebble.ErrNotFound) { + log.Errorf("lookupPosLoc(%s,%s): %v", tbl.Name, string(rowID), err) } + return nil + } + defer closer.Close() + loc := benchtop.DecodeRowLoc(val) + if loc == nil { + return nil + } + return loc +} - dr.Lock.Lock() - defer dr.Lock.Unlock() +func (dr *JSONDriver) AddField(tableID uint16, field string) error { + dr.Lock.Lock() - 
log.Debugf("Creating index '%s' for table '%s' that has not been written yet", field, label) - // If the table doesn't yet exist, write the index Key stub. - err = dr.Pkv.Set( - benchtop.FieldKey(field, label, nil, nil), - []byte{}, - nil, - ) - if err != nil { - log.Errorf("Err attempting to add field %v", err) - return err - } - err = dr.Pkv.Set( - bytes.Join([][]byte{ - benchtop.RFieldPrefix, - []byte(label), - []byte(field), - }, benchtop.FieldSep), - []byte{}, - nil, - ) - if err != nil { - log.Errorf("Err attempting to add field %v", err) - return err - } + tbl, ok := dr.Tables[tableID] + if !ok { + dr.Lock.Unlock() + return fmt.Errorf("table ID %d not found", tableID) + } + label := tbl.Name + + log.Debugf("Creating index '%s' for table '%s' that has not been written yet", field, label) + // If the table doesn't yet exist, write the index Key stub. + err := dr.Pkv.Set( + benchtop.FieldKey(field, tableID, nil, nil), + []byte{}, + nil, + ) + if err != nil { + dr.Lock.Unlock() + log.Errorf("Err attempting to add field %v", err) + return err + } + err = dr.Pkv.Set( + bytes.Join([][]byte{ + benchtop.RFieldPrefix, + binary.LittleEndian.AppendUint16(nil, tableID), + []byte(field), + }, benchtop.FieldSep), + []byte{}, + nil, + ) + if err != nil { + dr.Lock.Unlock() + log.Errorf("Err attempting to add field %v", err) + return err + } + log.Infof("Found table %s (ID %d); clearing and rebuilding indices for field %s", label, tbl.TableId, field) + // Clean existing index for this field/label first to avoid stale entries + // Use internal non-locking methods to avoid deadlock with AddField's own lock + if err := dr.removeFieldIndexes(tableID, field); err != nil { + log.Errorf("Failed to clear index for ID %d:%s before rebuild: %v", tableID, field, err) } else { - dr.Lock.Lock() - defer dr.Lock.Unlock() - - log.Debugf("Found table %s writing indices for field %s", label, field) - err := dr.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - var filter 
benchtop.RowFilter = nil - for r := range foundTable.ScanDoc(filter) { - fieldValue := tpath.PathLookup(r, field) - rowId, ok := r["_id"].(string) - if !ok { - return fmt.Errorf("_id field not found or is not string in map %s", r) - } - err := tx.Set( - benchtop.FieldKey( - field, - label, - fieldValue, - []byte(rowId), - ), - []byte{}, - nil, - ) + log.Debugf("Successfully cleared stale indices for %s:%s", label, field) + } + // Release lock while scanning to avoid blocking other operations + dr.Lock.Unlock() + // Table exists, perform rebuild + errRebuild := dr.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { + var filter benchtop.RowFilter = nil + for r := range tbl.ScanFull(filter) { + fieldValue := tpath.PathLookup(r.DataMap, field) + rowId, ok := r.DataMap["_id"].(string) + if !ok { + return fmt.Errorf("_id field not found or is not string in map %s", r.DataMap) + } + err := tx.Set( + benchtop.FieldKey( + field, + tableID, + fieldValue, + []byte(rowId), + ), + benchtop.EncodeRowLoc(r.Loc), + nil, + ) + if err != nil { + return err + } + if fieldValue != nil { + byteFV, err := sonic.ConfigFastest.Marshal(fieldValue) if err != nil { return err } - if fieldValue != nil { - byteFV, err := sonic.ConfigFastest.Marshal(fieldValue) - if err != nil { - return err - } - err = tx.Set(benchtop.RFieldKey(label, field, rowId), byteFV, nil) - if err != nil { - return err - } + err = tx.Set(benchtop.RFieldKey(tableID, field, rowId), byteFV, nil) + if err != nil { + return err } } - return nil - }) - if err != nil { - return err } + return nil + }) + if errRebuild != nil { + return errRebuild } + dr.Lock.Lock() - if dr.Tables[label].Fields == nil { - dr.Tables[label].Fields = map[string]struct{}{} + if tbl.Fields == nil { + tbl.Fields = map[string]struct{}{} } - if _, existsField := dr.Tables[label].Fields[field]; existsField { + if _, existsField := tbl.Fields[field]; existsField { + dr.Lock.Unlock() return fmt.Errorf("index label '%s' field '%s' already exists", label, 
field) } - dr.Tables[label].Fields[field] = struct{}{} - log.Debugln("List Fields: ", dr.Tables[label].Fields) + tbl.Fields[field] = struct{}{} + dr.Lock.Unlock() + log.Debugln("List Fields: ", tbl.Fields) return nil } -func (dr *JSONDriver) RemoveField(label string, field string) error { +func (dr *JSONDriver) RemoveField(tableID uint16, field string) error { dr.Lock.Lock() defer dr.Lock.Unlock() + return dr.removeFieldLocked(tableID, field) +} - if table, ok := dr.Tables[label]; ok { - delete(table.Fields, field) +func (dr *JSONDriver) removeFieldLocked(tableID uint16, field string) error { + tbl, ok := dr.Tables[tableID] + if ok { + delete(tbl.Fields, field) + } else { + return fmt.Errorf("table ID %d not found", tableID) } - FieldPrefix := benchtop.FieldLabelKey(field, label) + return dr.removeFieldIndexes(tableID, field) +} + +func (dr *JSONDriver) removeFieldIndexes(tableID uint16, field string) error { + FieldPrefix := benchtop.FieldLabelKey(field, tableID) + RFieldKeyPrefix := bytes.Join([][]byte{ benchtop.RFieldPrefix, - []byte(label), + binary.LittleEndian.AppendUint16(nil, tableID), []byte(field), }, benchtop.FieldSep) @@ -131,10 +176,7 @@ func (dr *JSONDriver) RemoveField(label string, field string) error { } return nil }) - if err != nil { - return err - } - return nil + return err } func (dr *JSONDriver) LoadFields() error { @@ -145,21 +187,35 @@ func (dr *JSONDriver) LoadFields() error { count := 0 err := dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { for it.Seek(fPrefix); it.Valid() && bytes.HasPrefix(it.Key(), fPrefix); it.Next() { - field, label, _, _ := benchtop.FieldKeyParse(it.Key()) - if _, exists := dr.Tables[label]; !exists { - _, err := dr.New(label, nil) + field, tableID, _, _ := benchtop.FieldKeyParse(it.Key()) + if field == "" { + log.Infof("LoadFields: skipping malformed field key: %x", it.Key()) + continue + } + + dr.Lock.RLock() + tbl, exists := dr.Tables[tableID] + dr.Lock.RUnlock() + + if !exists { + // Attempt to load by 
ID + tblStore, err := dr.Get(tableID) if err != nil { - return err + log.Errorf("LoadFields: could not load table ID %d: %v", tableID, err) + continue } - + tbl = tblStore.(*table.JSONTable) } - if dr.Tables[label].Fields == nil { - dr.Tables[label].Fields = make(map[string]struct{}) + + dr.Lock.Lock() + if tbl.Fields == nil { + tbl.Fields = make(map[string]struct{}) } - if _, exists := dr.Tables[label].Fields[field]; !exists { - dr.Tables[label].Fields[field] = struct{}{} + if _, exists := tbl.Fields[field]; !exists { + tbl.Fields[field] = struct{}{} count++ } + dr.Lock.Unlock() } log.Debugf("Loaded %d indices", count) return nil @@ -171,26 +227,21 @@ func (dr *JSONDriver) LoadFields() error { return nil } -type FieldInfo struct { - Label string - Field string -} - -func (dr *JSONDriver) ListFields() []FieldInfo { +func (dr *JSONDriver) ListFields() []benchtop.FieldInfo { /* Lists loaded fields. * Since fields on disk are loaded on startup this should be all that is needed */ dr.Lock.RLock() defer dr.Lock.RUnlock() - var out []FieldInfo - for _, table := range dr.Tables { - if table.Fields != nil { - for fieldName, _ := range table.Fields { - if table.Name[:2] == "v_" { - out = append(out, FieldInfo{Label: table.Name[2:], Field: fieldName}) + var out []benchtop.FieldInfo + for _, tbl := range dr.Tables { + if tbl.Fields != nil { + for fieldName := range tbl.Fields { + if len(tbl.Name) > 2 && tbl.Name[:2] == "v_" { + out = append(out, benchtop.FieldInfo{Label: tbl.Name[2:], Field: fieldName}) } else { - out = append(out, FieldInfo{Label: table.Name, Field: fieldName}) + out = append(out, benchtop.FieldInfo{Label: tbl.Name, Field: fieldName}) } } @@ -199,33 +250,30 @@ func (dr *JSONDriver) ListFields() []FieldInfo { return out } -func (dr *JSONDriver) DeleteRowField(label, field, rowID string) error { +func (dr *JSONDriver) DeleteRowField(tableID uint16, field, rowID string) error { /* Deletes a singular row index field */ dr.Lock.Lock() defer dr.Lock.Unlock() // 
Check if the table exists - _, ok := dr.Tables[label] + tbl, ok := dr.Tables[tableID] if !ok { - _, err := dr.New(label, nil) - if err != nil { - return err - } - + return fmt.Errorf("table ID %d not found", tableID) } + label := tbl.Name - if len(dr.Tables[label].Fields) <= 0 { - log.Errorf("No fields defined for table '%s'", label) + if len(tbl.Fields) <= 0 { + log.Errorf("No fields defined for table ID %d (%s)", tableID, label) return fmt.Errorf("no fields defined for table '%s'", label) } - if _, existsField := dr.Tables[label].Fields[field]; !existsField { - log.Errorf("Field '%s' does not exist in table '%s'", field, label) + if _, existsField := tbl.Fields[field]; !existsField { + log.Errorf("Field '%s' does not exist in table ID %d (%s)", field, tableID, label) return fmt.Errorf("field '%s' does not exist in table '%s'", field, label) } // Get the field value from the reverse index - rowIndexKey := benchtop.RFieldKey(label, field, rowID) + rowIndexKey := benchtop.RFieldKey(tableID, field, rowID) var fieldValueBytes []byte err := dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { var err error @@ -256,7 +304,7 @@ func (dr *JSONDriver) DeleteRowField(label, field, rowID string) error { // Delete both the forward and reverse index entries err = dr.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - if err := tx.Delete(benchtop.FieldKey(field, label, fieldValue, []byte(rowID)), nil); err != nil { + if err := tx.Delete(benchtop.FieldKey(field, tableID, fieldValue, []byte(rowID)), nil); err != nil { return err } if err := tx.Delete(rowIndexKey, nil); err != nil { @@ -272,70 +320,150 @@ func (dr *JSONDriver) DeleteRowField(label, field, rowID string) error { return nil } -func (dr *JSONDriver) RowIdsByHas(fltField string, fltValue any, fltOp gripql.Condition) chan string { - dr.Lock.RLock() - defer dr.Lock.RUnlock() +func (dr *JSONDriver) RowIdsByHas(fltField string, fltValue any, fltOp query.Condition) chan benchtop.Index { + 
log.WithFields(log.Fields{"field": fltField, "value": fltValue, "op": fltOp}).Debug("Running RowIdsByHas") + // Uses indices for EQ queries if Available, otherwise falls back to scanRowsByField. + return dr.scanRowsByField(0, fltField, fltValue, fltOp) +} - prefix := bytes.Join([][]byte{ - benchtop.FieldPrefix, - []byte(fltField), - }, benchtop.FieldSep) +func (dr *JSONDriver) RowIdsByTableFieldValue(tableID uint16, fltField string, fltValue any, fltOp query.Condition) chan benchtop.Index { + log.WithFields(log.Fields{"tableID": tableID, "field": fltField, "value": fltValue, "op": fltOp}).Debug("Running RowIdsByTableFieldValue") - out := make(chan string, 100) + // Uses fast index lookup for EQ if available, otherwise scans the table. + return dr.scanRowsByField(tableID, fltField, fltValue, fltOp) +} + +func (dr *JSONDriver) scanRowsByField(tableID uint16, field string, value any, op query.Condition) chan benchtop.Index { + out := make(chan benchtop.Index, 100) go func() { defer close(out) - err := dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { - _, _, value, rowID := benchtop.FieldKeyParse(it.Key()) - if filters.ApplyFilterCondition( - value, - &filters.FieldFilter{ - Field: fltField, Value: fltValue, Operator: fltOp, - }, - ) { - out <- string(rowID) + + dr.Lock.RLock() + var targetTables []*table.JSONTable + if tableID != 0 { + if tbl, ok := dr.Tables[tableID]; ok { + targetTables = append(targetTables, tbl) + } + } else { + for _, tbl := range dr.Tables { + targetTables = append(targetTables, tbl) + } + } + dr.Lock.RUnlock() + + if len(targetTables) == 0 { + return + } + + // FAST PATH: If operator is EQ, use Pebble. 
+ if op == query.EQ { + // For specific tables, check if they are indexed + allIndexed := true + for _, tbl := range targetTables { + if len(tbl.Fields) == 0 { + allIndexed = false + break + } + if _, ok := tbl.Fields[field]; !ok { + allIndexed = false + break } } - return nil - }) - if err != nil { - log.Errorf("Error in View for field %s: %s", fltField, err) + + if allIndexed { + if tableID == 0 { + dr.scanGlobalIndex(field, value, out) + } else { + dr.scanTableIndex(targetTables, field, value, out) + } + return + } } + + // SLOW PATH: Sequential scan. + dr.scanSlow(targetTables, field, value, op, out) }() return out } -func (dr *JSONDriver) RowIdsByLabelFieldValue(fltLabel string, fltField string, fltValue any, fltOp gripql.Condition) chan string { - log.WithFields(log.Fields{"label": fltLabel, "field": fltField, "value": fltValue}).Debug("Running RowIdsByLabelFieldValue") - dr.Lock.RLock() - defer dr.Lock.RUnlock() +func (dr *JSONDriver) scanGlobalIndex(field string, value any, out chan<- benchtop.Index) { + prefix := benchtop.FieldValueKey(field, value) + if prefix == nil { + return + } + prefix = append(prefix, benchtop.FieldSep...) 
+ + _ = dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + count := 0 + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + val, err := it.Value() + if err != nil { + continue + } + loc := benchtop.DecodeRowLoc(val) + if loc == nil { + continue + } + _, _, _, rowID := benchtop.FieldKeyParse(it.Key()) + safeID := make([]byte, len(rowID)) + copy(safeID, rowID) + out <- benchtop.Index{Key: safeID, Loc: loc} + count++ + if count%1000 == 0 { + log.Debugf("scanGlobalIndex: processed %d items", count) + } + } + return nil + }) +} + +func (dr *JSONDriver) scanTableIndex(targetTables []*table.JSONTable, field string, value any, out chan<- benchtop.Index) { + _ = dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for _, tbl := range targetTables { + prefix := benchtop.FieldValueKey(field, value) + prefix = append(prefix, benchtop.FieldSep...) + idBytes := make([]byte, 2) + binary.LittleEndian.PutUint16(idBytes, tbl.TableId) + prefix = append(prefix, idBytes...) + prefix = append(prefix, benchtop.FieldSep...) 
- prefix := benchtop.FieldLabelKey(fltField, fltLabel) - out := make(chan string, 100) - go func() { - defer close(out) - err := dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { - _, _, value, rowID := benchtop.FieldKeyParse(it.Key()) - if filters.ApplyFilterCondition( - value, - &filters.FieldFilter{ - Field: fltField, Value: fltValue, Operator: fltOp, - }, - ) { - out <- string(rowID) + val, err := it.Value() + if err != nil { + continue } + loc := benchtop.DecodeRowLoc(val) + if loc == nil { + continue + } + _, _, _, rowID := benchtop.FieldKeyParse(it.Key()) + safeID := make([]byte, len(rowID)) + copy(safeID, rowID) + out <- benchtop.Index{Key: safeID, Loc: loc} } - return nil - }) - if err != nil { - log.Errorf("Error in View for field %s: %s", fltField, err) } - }() - return out + return nil + }) +} + +func (dr *JSONDriver) scanSlow(targetTables []*table.JSONTable, field string, value any, op query.Condition, out chan<- benchtop.Index) { + cond := &filters.FieldFilter{Field: field, Value: value, Operator: op} + for _, tbl := range targetTables { + for row := range tbl.ScanFull(nil) { + fieldVal := tpath.PathLookup(row.DataMap, field) + if !filters.ApplyFilterCondition(fieldVal, cond) { + continue + } + rowID, ok := row.DataMap["_id"].(string) + if !ok || rowID == "" { + continue + } + out <- benchtop.Index{Key: []byte(rowID), Loc: row.Loc} + } + } } -func (dr *JSONDriver) GetIDsForLabel(label string) chan string { +func (dr *JSONDriver) GetIDsForTable(tableID uint16) chan string { dr.Lock.RLock() defer dr.Lock.RUnlock() @@ -343,14 +471,14 @@ func (dr *JSONDriver) GetIDsForLabel(label string) chan string { go func() { defer close(out) - table, err := dr.Get(label) + tbl, err := dr.Get(tableID) if err != nil { - log.Errorf("GetIdsForLabel: %s on table: %s", err, label) + log.Errorf("GetIdsForTable: %s on ID: %d", err, tableID) return } var filter benchtop.RowFilter = nil - 
for id := range table.ScanId(filter) { + for id := range tbl.ScanId(filter) { out <- id } }() diff --git a/jsontable/helpers.go b/jsontable/helpers.go index 286c9b8..d50293e 100644 --- a/jsontable/helpers.go +++ b/jsontable/helpers.go @@ -2,6 +2,8 @@ package jsontable import ( "bytes" + "encoding/binary" + "strings" "github.com/bmeg/benchtop" "github.com/bmeg/benchtop/pebblebulk" @@ -12,27 +14,64 @@ import ( // Specify a table type prefix to differentiate between edge tables and vertex tables func (dr *JSONDriver) getMaxTablePrefix() uint16 { - // get the max table uint32. Useful for fetching keys. - prefix := []byte{benchtop.TablePrefix} + // Note: Caller must hold dr.Lock + + // 1. Try to load from persistent system counter + val, closer, err := dr.Pkv.Get(benchtop.MaxTableIDKey) + if err == nil { + defer closer.Close() + if len(val) >= 2 { + max := binary.LittleEndian.Uint16(val) + newId := max + 1 + + // Update counter + newVal := make([]byte, 2) + binary.LittleEndian.PutUint16(newVal, newId) + dr.Pkv.Set(benchtop.MaxTableIDKey, newVal, nil) + + log.Debugf("Assigned new TableId %d from persistent counter", newId) + return newId + } + } - maxID := uint16(0) + // 2. 
Fallback: Scan existing tables to find max (Recovery/First run) + // Start with ID 1 to avoid sentinel issues with ID 0 + prefix := []byte{benchtop.TablePrefix} + maxID := uint16(1) dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { - // fishing for edge cases - if maxID == ^uint16(0) { - log.Errorf("getMaxTablePrefix( maxID exceeds uint16 max value") + val, err := it.Value() + if err != nil { + continue + } + var tinfo benchtop.TableInfo + if err := sonic.ConfigFastest.Unmarshal(val, &tinfo); err == nil { + if tinfo.TableId >= maxID { + maxID = tinfo.TableId + 1 + } + log.Debugf("Found existing table %s with ID %d", tinfo.Name, tinfo.TableId) } - maxID++ } return nil }) + // Save the found max for next time + newVal := make([]byte, 2) + binary.LittleEndian.PutUint16(newVal, maxID) + dr.Pkv.Set(benchtop.MaxTableIDKey, newVal, nil) + + log.Infof("Initialized persistent TableId counter starting at %d", maxID) return maxID } func (dr *JSONDriver) addTable(Name string, TinfoMarshal []byte) error { + log.Debugf("addTable: %s", Name) nkey := benchtop.NewTableKey([]byte(Name)) - return dr.Pkv.Set(nkey, TinfoMarshal, nil) + err := dr.Pkv.Set(nkey, TinfoMarshal, nil) + if err != nil { + log.Errorf("addTable failed for %s: %v", Name, err) + } + return err } func (dr *JSONDriver) dropTable(name string) error { @@ -42,14 +81,47 @@ func (dr *JSONDriver) dropTable(name string) error { } func (dr *JSONDriver) getTableInfo(name string) (benchtop.TableInfo, error) { - value, closer, err := dr.Pkv.Get([]byte(name)) + log.Debugf("getTableInfo: searching for %s", name) + nkey := benchtop.NewTableKey([]byte(name)) + value, closer, err := dr.Pkv.Get(nkey) + if err == nil { + defer closer.Close() + var tinfo benchtop.TableInfo + if err := sonic.ConfigFastest.Unmarshal(value, &tinfo); err == nil { + return tinfo, nil + } + log.Errorf("getTableInfo: failed to unmarshal %s: %v", name, err) + } + + // 
Direct lookup failed or corrupt, try case-insensitive scan fallback + log.Debugf("getTableInfo: direct lookup failed for %s, trying scan fallback", name) + prefix := []byte{benchtop.TablePrefix} + var found *benchtop.TableInfo + _ = dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { + for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { + val, err := it.Value() + if err != nil { + continue + } + var tinfo benchtop.TableInfo + if err := sonic.ConfigFastest.Unmarshal(val, &tinfo); err == nil { + if strings.EqualFold(tinfo.Name, name) { + found = &tinfo + return nil + } + } + } + return nil + }) + if found != nil { + log.Warningf("Found table %s using scan fallback, primary lookup failed (possible case mismatch)", name) + return *found, nil + } + if err != nil { return benchtop.TableInfo{}, err } - tinfo := benchtop.TableInfo{} - sonic.ConfigFastest.Unmarshal(value, &tinfo) - closer.Close() - return tinfo, nil + return benchtop.TableInfo{}, pebble.ErrNotFound } func (dr *JSONDriver) AddTableEntryInfo(tx *pebblebulk.PebbleBulk, rowId []byte, rowLoc *benchtop.RowLoc) error { @@ -69,8 +141,8 @@ func (dr *JSONDriver) AddTableEntryInfo(tx *pebblebulk.PebbleBulk, rowId []byte, return nil } -func (dr *JSONDriver) GetLocFromTableKey(id []byte) (loc *benchtop.RowLoc, err error) { - val, closer, err := dr.Pkv.Get(benchtop.NewPosKey(loc.TableId, id)) +func (dr *JSONDriver) GetLocFromTableKey(tableId uint16, id []byte) (loc *benchtop.RowLoc, err error) { + val, closer, err := dr.Pkv.Get(benchtop.NewPosKey(tableId, id)) if err != nil { if err != pebble.ErrNotFound { log.Errorln("GetLocFromTableKey Err: ", err) diff --git a/jsontable/labels.go b/jsontable/labels.go deleted file mode 100644 index bc68b54..0000000 --- a/jsontable/labels.go +++ /dev/null @@ -1,60 +0,0 @@ -package jsontable - -import ( - "bytes" - - "github.com/bmeg/benchtop" - "github.com/bmeg/benchtop/pebblebulk" -) - -const bufferSize = 100 - -// List all unique col names held by 
all tables -func (dr *JSONDriver) GetAllColNames() chan string { - dr.Lock.RLock() - defer dr.Lock.RUnlock() - - out := make(chan string, bufferSize) - go func() { - defer close(out) - prefix := []byte{benchtop.TablePrefix} - dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { - info, err := dr.getTableInfo(string(it.Key())) - if err != nil { - continue - } - for _, col := range info.Columns { - out <- col.Key - } - } - return nil - }) - }() - return out -} - -func (dr *JSONDriver) GetLabels(edges bool, removePrefix bool) chan string { - dr.Lock.RLock() - defer dr.Lock.RUnlock() - - out := make(chan string, bufferSize) - go func() { - defer close(out) - prefix := []byte{benchtop.TablePrefix} - dr.Pkv.View(func(it *pebblebulk.PebbleIterator) error { - for it.Seek(prefix); it.Valid() && bytes.HasPrefix(it.Key(), prefix); it.Next() { - strKey := string(benchtop.ParseTableKey(it.Key())) - if (edges && strKey[:2] == "e_") || (!edges && strKey[:2] == "v_") { - if removePrefix { - out <- strKey[2:] - } else { - out <- strKey - } - } - } - return nil - }) - }() - return out -} diff --git a/jsontable/reproduce_test.go b/jsontable/reproduce_test.go new file mode 100644 index 0000000..77b530a --- /dev/null +++ b/jsontable/reproduce_test.go @@ -0,0 +1,292 @@ +package jsontable + +import ( + "fmt" + "os" + "strconv" + "strings" + "testing" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/jsontable/table" + "github.com/bmeg/benchtop/query" + "github.com/bytedance/sonic" + "github.com/bytedance/sonic/ast" +) + +// Helper copied from filter_debug_test.go/filter.go +func parseDirectPath(path string) ([]any, bool) { + path = strings.TrimSpace(path) + path = strings.TrimPrefix(path, "$") + path = strings.TrimPrefix(path, ".") + if path == "" { + return nil, false + } + + parts := []any{} + var token strings.Builder + flushToken := func() { + if token.Len() > 0 { + parts = 
append(parts, token.String()) + token.Reset() + } + } + + for i := 0; i < len(path); i++ { + ch := path[i] + switch ch { + case '.': + flushToken() + case '[': + flushToken() + j := i + 1 + for j < len(path) && path[j] != ']' { + j++ + } + if j >= len(path) || j == i+1 { + return nil, false + } + idx, err := strconv.Atoi(path[i+1 : j]) + if err != nil { + return nil, false + } + parts = append(parts, idx) + i = j + default: + token.WriteByte(ch) + } + } + flushToken() + + if len(parts) == 0 { + return nil, false + } + return parts, true +} + +func sonicLookup(row []byte, condKey string) any { + if path, ok := parseDirectPath(condKey); ok { + node, err := sonic.Get(row, path...) + if err == nil { + v, ierr := node.Interface() + if ierr == nil { + return v + } + } + } + + // Legacy packed-row fallback + pathArr, err := table.ConvertJSONPathToArray(condKey) + if err != nil { + return nil + } + node, err := sonic.Get(row, pathArr...) + if err != nil { + if err != ast.ErrNotExist { + // log.Debugf("Sonic fetch error: %v", err) + } + return nil + } + v, ierr := node.Interface() + if ierr != nil { + return nil + } + return v +} + +// MockFilter implements benchtop.RowFilter using REAL sonic lookup logic +type MockFilter struct { + NoOp bool + Key string + Value string +} + +func (m *MockFilter) Matches(row []byte, tableName string) bool { + // Simulate GripQLFilter.Matches logic + val := sonicLookup(row, m.Key) + if s, ok := val.(string); ok && s == m.Value { + return true + } + return false +} + +func (m *MockFilter) IsNoOp() bool { + return m.NoOp +} + +func (m *MockFilter) GetFilter() any { + return nil +} + +func (m *MockFilter) RequiredFields() []string { + if m.Key != "" { + return []string{m.Key} + } + return nil +} + +func TestReproduceQueryIssues(t *testing.T) { + // Setup temp dir + tmpDir, err := os.MkdirTemp("", "jsontable_repro") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tmpDir) + + // Initialize Driver + driver, err := NewJSONDriver(tmpDir) + 
if err != nil { + t.Fatal(err) + } + defer driver.Close() + + tableName := "v_Observation" + tblStore, err := driver.New(tableName, nil) + if err != nil { + t.Fatal(err) + } + tableID := tblStore.(*table.JSONTable).TableId + + err = driver.BulkLoad(tableID, loadRows(tableID, 100)) + if err != nil { + t.Fatal(err) + } + + time.Sleep(100 * time.Millisecond) + + // Get the table + ts, err := driver.Get(tableID) + if err != nil { + t.Fatal(err) + } + tbl := ts.(*table.JSONTable) + + // 1. Test ScanDoc (Simulates Query 3b) + t.Run("ScanDoc", func(t *testing.T) { + count := 0 + for _ = range tbl.ScanDoc(nil) { + count++ + } + if count != 100 { + t.Errorf("ScanDoc expected 100 rows, got %d", count) + } + }) + + // 2. Test ScanId with nil filter (Should work) + t.Run("ScanId_NilFilter", func(t *testing.T) { + count := 0 + for _ = range tbl.ScanId(nil) { + count++ + } + if count != 100 { + t.Errorf("ScanId_NilFilter expected 100 rows, got %d", count) + } + }) + + // 3. Test ScanId with NoOp Filter (Simulates Query 3) + t.Run("ScanId_NoOpFilter", func(t *testing.T) { + f := &MockFilter{NoOp: true} + count := 0 + for _ = range tbl.ScanId(f) { + count++ + } + if count != 100 { + t.Errorf("ScanId_NoOpFilter expected 100 rows, got %d", count) + } + }) + + // 4. Test ScanDoc with Nested Filter (Simulates Query 5) + t.Run("ScanDoc_NestedFilter", func(t *testing.T) { + f := &MockFilter{ + NoOp: false, + Key: "component.[0].valueString", + Value: "Post-treatment", + } + + count := 0 + for _ = range tbl.ScanDoc(f) { + count++ + } + // expect 50 rows + if count != 50 { + t.Errorf("ScanDoc_NestedFilter expected 50 rows, got %d", count) + } + }) + + // 5. Test ScanFull directly + t.Run("ScanFull_NoOp", func(t *testing.T) { + f := &MockFilter{NoOp: true} + count := 0 + for res := range tbl.ScanFull(f) { + if res.DataMap["_id"] == "" { + t.Error("ScanFull returned empty _id") + } + count++ + } + if count != 100 { + t.Errorf("ScanFull expected 100 rows, got %d", count) + } + }) + + // 6. 
Test RowIdsByHas (Simulates V().Has(...)) + t.Run("RowIdsByHas_Eq", func(t *testing.T) { + count := 0 + // Look for "Post-treatment" in "component.[0].valueString" + // Note: Field name stored in index includes path? + // In loadRows: "component" is array of map. + // Does benchmark store complex paths in fields? + // fields.go scans: + // fieldValue := tpath.PathLookup(r.DataMap, field) + // So if we ask for field "component.[0].valueString", it should work if we index it via AddField. + // But here we are just scanning. RowIdsByHas defaults to scan if index missing. + + for range driver.RowIdsByHas("component.[0].valueString", "Post-treatment", query.EQ) { + count++ + } + if count != 50 { + t.Errorf("RowIdsByHas_Eq expected 50 rows, got %d", count) + } + }) + + // 7. Test RowIdsByLabelFieldValue + t.Run("RowIdsByLabelFieldValue_Eq", func(t *testing.T) { + count := 0 + for range driver.RowIdsByTableFieldValue(tableID, "component.[0].valueString", "Post-treatment", query.EQ) { + count++ + } + if count != 50 { + t.Errorf("RowIdsByLabelFieldValue_Eq expected 50 rows, got %d", count) + } + }) +} + +func loadRows(tableID uint16, count int) chan *benchtop.Row { + out := make(chan *benchtop.Row) + go func() { + defer close(out) + for i := 0; i < count; i++ { + id := fmt.Sprintf("row_%d", i) + + compVal := "Pre-treatment" + if i%2 == 0 { + compVal = "Post-treatment" + } + + data := map[string]any{ + "data": fmt.Sprintf("value_%d", i), + "component": []any{ + map[string]any{ + "valueString": compVal, + }, + }, + } + out <- &benchtop.Row{ + Id: []byte(id), + TableID: tableID, + Data: data, + } + } + }() + return out +} diff --git a/jsontable/restart_test.go b/jsontable/restart_test.go new file mode 100644 index 0000000..36c1d08 --- /dev/null +++ b/jsontable/restart_test.go @@ -0,0 +1,133 @@ +package jsontable + +import ( + "fmt" + "os" + "testing" + "time" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/jsontable/table" +) + +func TestRestartPersistence(t 
*testing.T) { + // Setup temp dir + tmpDir, err := os.MkdirTemp("", "jsontable_restart") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tmpDir) + + tableName := "v_Person" + rowCount := 50000 + + // 1. Initialize Driver and Load Data + { + driver, err := NewJSONDriver(tmpDir) + if err != nil { + t.Fatal(err) + } + + tblStore, err := driver.New(tableName, nil) + if err != nil { + t.Fatal(err) + } + tableID := tblStore.(*table.JSONTable).TableId + + err = driver.BulkLoad(tableID, loadRowsHelper(tableID, rowCount)) + if err != nil { + t.Fatal(err) + } + // Allow async writes to flush? BulkLoad should handle it but let's be safe + time.Sleep(100 * time.Millisecond) + + // Verify initial count + ts, err := driver.Get(tableID) + if err != nil { + t.Fatal(err) + } + tbl := ts.(*table.JSONTable) + + count := 0 + for _ = range tbl.ScanDoc(nil) { + count++ + } + if count != rowCount { + t.Errorf("Initial scan expected %d rows, got %d", rowCount, count) + } + + driver.Close() + } + + // 2. Restart Driver and Verify Data + { + driver, err := NewJSONDriver(tmpDir) + if err != nil { + t.Fatal(err) + } + defer driver.Close() + + // Verify table exists + tableID, err := driver.LookupTableID(tableName) + if err != nil { + t.Fatalf("Failed to lookup table ID after restart: %v", err) + } + ts, err := driver.Get(tableID) + if err != nil { + t.Fatalf("Failed to get table after restart: %v", err) + } + tbl := ts.(*table.JSONTable) + + count := 0 + for _ = range tbl.ScanDoc(nil) { + count++ + } + if count != rowCount { + t.Errorf("Post-restart scan expected %d rows, got %d", rowCount, count) + } + + // 3. 
Verify Random Access (GetRow equivalent) + // Pick a few IDs + idsToCheck := []string{"row_0", "row_50", "row_9999"} + for _, id := range idsToCheck { + loc, err := tbl.LocLookup(id) + if err != nil { + t.Errorf("LocLookup failed for %s: %v", id, err) + continue + } + if loc == nil { + t.Errorf("LocLookup returned nil for %s", id) + continue + } + + // Try to read via Storage + data, err := tbl.Storage.Get(loc) + if err != nil { + t.Errorf("Storage.Get failed for %s (loc=%v): %v", id, loc, err) + continue + } + if len(data) == 0 { + t.Errorf("Storage.Get returned empty data for %s", id) + } + } + } +} + +func loadRowsHelper(tableID uint16, count int) chan *benchtop.Row { + out := make(chan *benchtop.Row) + go func() { + defer close(out) + for i := 0; i < count; i++ { + id := fmt.Sprintf("row_%d", i) + data := map[string]any{ + "data": fmt.Sprintf("value_%d", i), + } + out <- &benchtop.Row{ + Id: []byte(id), + TableID: tableID, + Data: data, + } + } + }() + return out +} diff --git a/jsontable/section/section.go b/jsontable/section/section.go index f8ee286..0fb1f74 100644 --- a/jsontable/section/section.go +++ b/jsontable/section/section.go @@ -6,7 +6,6 @@ import ( "os" "sync" - "github.com/DataDog/zstd" "github.com/bmeg/benchtop" "github.com/edsrzf/mmap-go" ) @@ -39,13 +38,12 @@ func (s *Section) WriteJsonEntryToSection(payload []byte) (*benchtop.RowLoc, err s.Lock.Lock() defer s.Lock.Unlock() - cPayload, err := zstd.Compress(s.CompressScratch[:0], payload) - if err != nil { - return nil, fmt.Errorf("compress failed: %w", err) + dataLen := uint32(len(payload)) + writeEnd64 := uint64(s.LiveBytes) + uint64(benchtop.ROW_HSIZE) + uint64(dataLen) + if writeEnd64 > uint64(^uint32(0)) { + return nil, fmt.Errorf("write offset overflow: live=%d len=%d", s.LiveBytes, dataLen) } - - compressedLen := uint32(len(cPayload)) - writeEnd := s.LiveBytes + benchtop.ROW_HSIZE + compressedLen + writeEnd := uint32(writeEnd64) // Check if write is outside the CURRENT mapped region if 
writeEnd > uint32(len(s.MMap)) { @@ -70,19 +68,36 @@ func (s *Section) WriteJsonEntryToSection(payload []byte) (*benchtop.RowLoc, err } oldLiveBytes := s.LiveBytes - nextOffset := s.LiveBytes + benchtop.ROW_HSIZE + compressedLen + nextOffset64 := uint64(s.LiveBytes) + uint64(benchtop.ROW_HSIZE) + uint64(dataLen) + if nextOffset64 > uint64(^uint32(0)) { + return nil, fmt.Errorf("next offset overflow: live=%d len=%d", s.LiveBytes, dataLen) + } + nextOffset := uint32(nextOffset64) + headerEnd := oldLiveBytes + benchtop.ROW_HSIZE + if headerEnd < oldLiveBytes || headerEnd > uint32(len(s.MMap)) { + return nil, fmt.Errorf("invalid header bounds: off=%d end=%d mmap=%d", oldLiveBytes, headerEnd, len(s.MMap)) + } + payloadStart := headerEnd + payloadEnd := payloadStart + dataLen + if payloadEnd < payloadStart || payloadEnd > uint32(len(s.MMap)) { + return nil, fmt.Errorf("invalid payload bounds: start=%d end=%d mmap=%d", payloadStart, payloadEnd, len(s.MMap)) + } + + headerTarget := s.MMap[oldLiveBytes:headerEnd] + binary.LittleEndian.PutUint32(headerTarget[:4], nextOffset) // next row offset + binary.LittleEndian.PutUint32(headerTarget[4:], dataLen) // data size + copy(s.MMap[payloadStart:payloadEnd], payload) + + // Force flush to ensure visibility to other readers/mappers + // This prevents "stale zeros" issues where header is visible but payload is not + s.MMap.Flush() - headerTarget := s.MMap[oldLiveBytes : oldLiveBytes+benchtop.ROW_HSIZE] - binary.LittleEndian.PutUint32(headerTarget[:4], nextOffset) // next row offset - binary.LittleEndian.PutUint32(headerTarget[4:], compressedLen) // compressed size - copy(s.MMap[oldLiveBytes+benchtop.ROW_HSIZE:], cPayload) s.LiveBytes = nextOffset - // Save the buffer for next time. If the buffer allocated to be larger, use the larger one. 
- s.CompressScratch = cPayload + return &benchtop.RowLoc{ Section: s.ID, Offset: oldLiveBytes, - Size: compressedLen, + Size: dataLen, }, nil } @@ -129,12 +144,9 @@ func (s *Section) RemapReadOnly() error { func (s *Section) GrowAndRemap(newSize int64) error { // 1. Unmap the old region if s.MMap != nil { - // Crucial: ensure any pending data is flushed before unmap + // Ensure data is synced before unmapping/resizing if err := s.MMap.Flush(); err != nil { - return fmt.Errorf("flush before unmap failed: %w", err) - } - if err := s.File.Sync(); err != nil { - return fmt.Errorf("sync failed: %w", err) + return fmt.Errorf("flush failed before resize: %w", err) } if err := s.MMap.Unmap(); err != nil { return fmt.Errorf("unmap failed: %w", err) diff --git a/jsontable/storage/interface.go b/jsontable/storage/interface.go new file mode 100644 index 0000000..41ec525 --- /dev/null +++ b/jsontable/storage/interface.go @@ -0,0 +1,59 @@ +package storage + +import ( + "errors" + "io" + + "github.com/bmeg/benchtop" +) + +// RowStorage abstracts the underlying dense storage mechanism (e.g. mmap files, Parquet). +// implementations must be thread-safe. +type RowStorage interface { + io.Closer + + // AddRow writes data to storage and returns its location. + AddRow(data []byte, id []byte) (*benchtop.RowLoc, error) + + // AddRows writes multiple data blocks to storage and returns their locations. + AddRows(data [][]byte, ids [][]byte) ([]*benchtop.RowLoc, error) + + // Get retrieves data from the specified location. + Get(loc *benchtop.RowLoc) ([]byte, error) + + // GetBatch retrieves multiple data blocks from the specified location. + GetBatch(locs []*benchtop.RowLoc) ([][]byte, []error) + + // MarkDelete marks a row as deleted (tombstone). + MarkDelete(loc *benchtop.RowLoc) error + + // Scan iterates over storage rows, optionally filtering them. + // Returns a channel of raw row bytes. 
+ Scan(concurrency int) chan []byte + + // ScanFull iterates over storage rows, returning both data and locations. + ScanFull(concurrency int) chan benchtop.RowLocData + + // Sync ensures all written data is persisted to disk. + Sync() error + + // Delete removes all underlying storage files. + Delete() error + + // GetPartitionId returns the partition index for a given key. + GetPartitionId(id []byte) int +} + +// ZoneManager handles the lifecycle of storage zones (e.g. one per Project). +type ZoneManager interface { + // GetStorage returns the storage engine for a specific Zone. + GetStorage(zoneId string) (RowStorage, error) + + // CreateZone initializes specialized storage for a new Zone. + CreateZone(zoneId string) (RowStorage, error) + + // DeleteZone performs a bulk-delete of an entire Zone (O(1) operation). + DeleteZone(zoneId string) error +} + +var ErrNotFound = errors.New("storage: entry not found") diff --git a/jsontable/storage/section_storage.go b/jsontable/storage/section_storage.go new file mode 100644 index 0000000..b7c63c4 --- /dev/null +++ b/jsontable/storage/section_storage.go @@ -0,0 +1,616 @@ +package storage + +import ( + "bytes" + "encoding/binary" + "fmt" + "hash/fnv" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/jsontable/section" + "github.com/bmeg/grip/log" + "github.com/edsrzf/mmap-go" + "github.com/hashicorp/go-multierror" +) + +const ( + PART_FILE_SUFFIX = ".partition" +) + +// SectionStorage implements RowStorage using memory-mapped .section files. +// This is the "Legacy" storage mechanism used by benchtop. 
+type SectionStorage struct {
+	basePath              string
+	fileName              string
+	numPartitions         uint32
+	maxConcurrentSections uint8
+
+	sections       map[uint16]*section.Section // Every loaded section, keyed by section ID
+	activeSections map[uint8]*section.Section  // Section currently receiving writes, per partition
+	partitionMap   map[uint8][]uint16          // Section IDs belonging to each partition
+	maxSecId       uint16                      // Highest section ID allocated or seen on disk
+	closed         bool                        // Set once Close has released the section resources
+
+	lock        sync.RWMutex // Protects the maps, maxSecId and closed
+	sectionLock sync.Mutex   // Serializes section creation and rotation
+
+	partitionFunc func(id []byte) uint8
+}
+
+// sectionFilePoolSize is the number of pooled write handles opened per section file.
+const sectionFilePoolSize = 10
+
+// NewSectionStorage builds a SectionStorage whose section files are named
+// "<fileName>.id<sectionID>.part<partitionID>" and registers the matching files
+// already present in fileName's directory.
+//
+// Row ids are mapped to partitions with an FNV-1a hash modulo numPartitions.
+// A numPartitions of 0 is treated as 1, because the modulo would otherwise
+// panic on the first write.
+//
+// A failure to load existing sections is reported on stderr and the storage is
+// returned regardless.
+func NewSectionStorage(basePath string, fileName string, numPartitions uint32) *SectionStorage {
+	if numPartitions == 0 {
+		numPartitions = 1
+	}
+
+	storage := &SectionStorage{
+		basePath:       basePath,
+		fileName:       fileName,
+		numPartitions:  numPartitions,
+		sections:       make(map[uint16]*section.Section),
+		activeSections: make(map[uint8]*section.Section),
+		partitionMap:   make(map[uint8][]uint16),
+		partitionFunc: func(id []byte) uint8 {
+			h := fnv.New32a()
+			h.Write(id)
+			return uint8(h.Sum32() % numPartitions)
+		},
+		maxConcurrentSections: 10,
+	}
+
+	if err := storage.loadExisting(); err != nil {
+		fmt.Fprintf(os.Stderr, "failed to load existing sections: %v\n", err)
+	}
+	return storage
+}
+
+// GetPartitionId returns the partition index that id hashes to.
+func (s *SectionStorage) GetPartitionId(id []byte) int {
+	return int(s.partitionFunc(id))
+}
+
+// sectionForWrite returns the section that should receive `needed` more bytes
+// for the given partition. It creates the partition's first section when none
+// is active and rotates to a fresh section when the active one would exceed
+// section.MAX_SECTION_SIZE.
+func (s *SectionStorage) sectionForWrite(partitionId uint8, needed uint32) (*section.Section, error) {
+	s.lock.RLock()
+	sec := s.activeSections[partitionId]
+	s.lock.RUnlock()
+
+	if sec == nil {
+		return s.createNewSection(partitionId)
+	}
+	if sec.LiveBytes+needed > section.MAX_SECTION_SIZE {
+		return s.rotateSection(partitionId, sec)
+	}
+	return sec, nil
+}
+
+// AddRows writes data[i] under the partition selected by ids[i] and returns one
+// location per input, in input order.
+//
+// Rows are grouped by partition and the rotation check is made once per group,
+// using the combined size of the group. Failures are accumulated rather than
+// aborting the call: when the returned error is non-nil, the entries of the
+// result slice that belong to failed rows are nil.
+func (s *SectionStorage) AddRows(data [][]byte, ids [][]byte) ([]*benchtop.RowLoc, error) {
+	if len(data) == 0 {
+		return []*benchtop.RowLoc{}, nil
+	}
+	if len(data) != len(ids) {
+		return nil, fmt.Errorf("data and ids must have same length")
+	}
+
+	results := make([]*benchtop.RowLoc, len(data))
+
+	// Remember input positions per partition so results can be placed back in order.
+	byPartition := make(map[uint8][]int)
+	for i := range data {
+		pId := s.partitionFunc(ids[i])
+		byPartition[pId] = append(byPartition[pId], i)
+	}
+
+	var errs *multierror.Error
+	for pId, indexes := range byPartition {
+		var groupSize uint32
+		for _, i := range indexes {
+			groupSize += uint32(len(data[i])) + benchtop.ROW_HSIZE
+		}
+
+		sec, err := s.sectionForWrite(pId, groupSize)
+		if err != nil {
+			errs = multierror.Append(errs, err)
+			continue
+		}
+
+		for _, i := range indexes {
+			loc, err := sec.WriteJsonEntryToSection(data[i])
+			if err != nil {
+				// Keep going: one bad row should not drop the rest of the group.
+				errs = multierror.Append(errs, err)
+				continue
+			}
+			sec.TotalRows++
+			results[i] = &benchtop.RowLoc{
+				Section: loc.Section,
+				Offset:  loc.Offset,
+				Size:    loc.Size,
+			}
+		}
+	}
+
+	return results, errs.ErrorOrNil()
+}
+
+// parseSectionFileName extracts the section and partition IDs from a file name
+// of the form "<base>.id<sectionID>.part<partitionID>". ok is false for any
+// name that does not match exactly, including IDs that do not fit in uint16 and
+// uint8 respectively.
+func parseSectionFileName(name, base string) (secId uint16, pId uint8, ok bool) {
+	rest, found := strings.CutPrefix(name, base+".id")
+	if !found {
+		return 0, 0, false
+	}
+	idText, partText, found := strings.Cut(rest, ".part")
+	if !found {
+		return 0, 0, false
+	}
+	id, err := strconv.ParseUint(idText, 10, 16)
+	if err != nil {
+		return 0, 0, false
+	}
+	part, err := strconv.ParseUint(partText, 10, 8)
+	if err != nil {
+		return 0, 0, false
+	}
+	return uint16(id), uint8(part), true
+}
+
+// openSectionFilePool opens sectionFilePoolSize read-write handles on path and
+// returns them in a buffered channel. On failure the handles opened so far are
+// closed again.
+func openSectionFilePool(path string) (chan *os.File, error) {
+	pool := make(chan *os.File, sectionFilePoolSize)
+	for range cap(pool) {
+		f, err := os.OpenFile(path, os.O_RDWR, 0666)
+		if err != nil {
+			closeSectionFilePool(pool) // best-effort cleanup; the open error is what matters
+			return nil, err
+		}
+		pool <- f
+	}
+	return pool, nil
+}
+
+// closeSectionFilePool closes the channel and every handle still queued in it.
+// Handles that are checked out at the time of the call are not closed.
+func closeSectionFilePool(pool chan *os.File) error {
+	close(pool)
+	var errs *multierror.Error
+	for f := range pool {
+		if err := f.Close(); err != nil {
+			errs = multierror.Append(errs, err)
+		}
+	}
+	return errs.ErrorOrNil()
+}
+
+// scanSectionHeaders walks the row headers of a mapped section. Each header
+// holds the offset of the next row followed by the row's byte size; a size of 0
+// marks a deleted row. The walk stops at the first header whose next-offset
+// does not move forward (which includes the zero-filled, unwritten tail).
+// liveBytes is the offset at which the walk stopped.
+func scanSectionHeaders(m []byte) (liveBytes, totalRows, deletedRows uint32) {
+	var offset uint32
+	for offset+benchtop.ROW_HSIZE <= uint32(len(m)) {
+		header := m[offset : offset+benchtop.ROW_HSIZE]
+		nextOffset := binary.LittleEndian.Uint32(header[:benchtop.ROW_OFFSET_HSIZE])
+		bSize := binary.LittleEndian.Uint32(header[benchtop.ROW_OFFSET_HSIZE:benchtop.ROW_HSIZE])
+
+		if nextOffset <= offset {
+			break
+		}
+		if bSize == 0 {
+			deletedRows++
+		}
+		totalRows++
+		offset = nextOffset
+	}
+	return offset, totalRows, deletedRows
+}
+
+// openExistingSection maps an on-disk section file and rebuilds its counters
+// from the row headers. Every resource acquired is released again if a later
+// step fails.
+func openExistingSection(path string, secId uint16, pId uint8) (*section.Section, error) {
+	handle, err := os.OpenFile(path, os.O_RDWR, 0666)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open section file %s: %w", path, err)
+	}
+
+	m, err := mmap.Map(handle, mmap.RDWR, 0)
+	if err != nil {
+		handle.Close()
+		return nil, fmt.Errorf("failed to mmap section %s: %w", path, err)
+	}
+
+	filePool, err := openSectionFilePool(path)
+	if err != nil {
+		m.Unmap()
+		handle.Close()
+		return nil, fmt.Errorf("failed to init file pool: %w", err)
+	}
+
+	liveBytes, totalRows, deletedRows := scanSectionHeaders(m)
+
+	return &section.Section{
+		ID:              secId,
+		PartitionID:     pId,
+		Path:            path,
+		File:            handle,
+		FilePool:        filePool,
+		MMap:            m,
+		LiveBytes:       liveBytes,
+		Active:          true,
+		MMapMode:        mmap.RDWR,
+		TotalRows:       totalRows,
+		DeletedRows:     deletedRows,
+		CompressScratch: make([]byte, 0),
+	}, nil
+}
+
+// loadExisting registers every section file found next to s.fileName and makes
+// the highest-numbered section of each partition the active one.
+//
+// Files that cannot be opened are skipped and reported in the returned error;
+// the remaining files are still loaded.
+func (s *SectionStorage) loadExisting() error {
+	dir := filepath.Dir(s.fileName)
+	base := filepath.Base(s.fileName)
+
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return fmt.Errorf("failed to read directory %s: %w", dir, err)
+	}
+
+	type candidate struct {
+		secId uint16
+		pId   uint8
+		path  string
+	}
+	var found []candidate
+	for _, entry := range entries {
+		if entry.IsDir() {
+			continue
+		}
+		secId, pId, ok := parseSectionFileName(entry.Name(), base)
+		if !ok {
+			continue
+		}
+		found = append(found, candidate{secId, pId, filepath.Join(dir, entry.Name())})
+	}
+
+	s.lock.Lock()
+	defer s.lock.Unlock()
+
+	// Reserve every ID present on disk before opening anything. If a file fails
+	// to load, its ID must still never be handed out again, because creating a
+	// section truncates the file at that path.
+	for _, c := range found {
+		if c.secId > s.maxSecId {
+			s.maxSecId = c.secId
+		}
+	}
+
+	var errs *multierror.Error
+	for _, c := range found {
+		if _, dup := s.sections[c.secId]; dup {
+			errs = multierror.Append(errs, fmt.Errorf("duplicate section id %d in file %s", c.secId, c.path))
+			continue
+		}
+		sec, err := openExistingSection(c.path, c.secId, c.pId)
+		if err != nil {
+			errs = multierror.Append(errs, err)
+			continue
+		}
+		s.sections[c.secId] = sec
+		s.partitionMap[c.pId] = append(s.partitionMap[c.pId], c.secId)
+	}
+
+	// partitionMap entries are only ever created by append, so none is empty.
+	for pId, secIds := range s.partitionMap {
+		newest := secIds[0]
+		for _, sid := range secIds[1:] {
+			if sid > newest {
+				newest = sid
+			}
+		}
+		s.activeSections[pId] = s.sections[newest]
+	}
+
+	return errs.ErrorOrNil()
+}
+
+// AddRow writes a single row into the active section of the partition that id
+// hashes to, creating or rotating the section when required, and returns where
+// the row was stored.
+func (s *SectionStorage) AddRow(data []byte, id []byte) (*benchtop.RowLoc, error) {
+	partitionId := s.partitionFunc(id)
+
+	sec, err := s.sectionForWrite(partitionId, uint32(len(data))+benchtop.ROW_HSIZE)
+	if err != nil {
+		return nil, err
+	}
+
+	loc, err := sec.WriteJsonEntryToSection(data)
+	if err != nil {
+		return nil, err
+	}
+
+	sec.TotalRows++
+
+	return &benchtop.RowLoc{
+		Section: loc.Section,
+		Offset:  loc.Offset,
+		Size:    loc.Size,
+	}, nil
+}
+
+// Get returns a copy of the loc.Size payload bytes that follow the row header
+// at loc.Offset in section loc.Section. It fails when the section is unknown,
+// unmapped, or the requested range lies outside the mapping.
+func (s *SectionStorage) Get(loc *benchtop.RowLoc) ([]byte, error) {
+	if loc == nil {
+		return nil, fmt.Errorf("row location is nil")
+	}
+
+	s.lock.RLock()
+	sec, exists := s.sections[loc.Section]
+	s.lock.RUnlock()
+
+	if !exists {
+		return nil, fmt.Errorf("section %d not found", loc.Section)
+	}
+
+	sec.Lock.RLock()
+	defer sec.Lock.RUnlock()
+
+	if len(sec.MMap) == 0 {
+		return nil, fmt.Errorf("section %d is empty", loc.Section)
+	}
+
+	start := loc.Offset + benchtop.ROW_HSIZE
+	end := start + loc.Size
+	// The first two comparisons catch uint32 wrap-around in the sums above.
+	if start < loc.Offset || end < start || end > uint32(len(sec.MMap)) {
+		return nil, fmt.Errorf("out of bounds for section %d", loc.Section)
+	}
+
+	// Copy so the caller never touches mapped memory after the lock is released.
+	data := make([]byte, loc.Size)
+	copy(data, sec.MMap[start:end])
+	return data, nil
+}
+
+// GetBatch fetches every location with Get. The two returned slices are
+// parallel to locs: for each index exactly one of the payload or the error is set.
+func (s *SectionStorage) GetBatch(locs []*benchtop.RowLoc) ([][]byte, []error) {
+	results := make([][]byte, len(locs))
+	errs := make([]error, len(locs))
+
+	for i, loc := range locs {
+		results[i], errs[i] = s.Get(loc)
+	}
+	return results, errs
+}
+
+// scanSectionRows emits every non-deleted row of sec on out while holding the
+// section's read lock. It stops at the first header that does not advance or
+// whose payload would run past the mapping.
+func scanSectionRows(sec *section.Section, out chan<- benchtop.RowLocData) {
+	sec.Lock.RLock()
+	defer sec.Lock.RUnlock()
+
+	m := sec.MMap
+	var offset uint32
+	for offset+benchtop.ROW_HSIZE <= uint32(len(m)) {
+		header := m[offset : offset+benchtop.ROW_HSIZE]
+		nextOffset := binary.LittleEndian.Uint32(header[:benchtop.ROW_OFFSET_HSIZE])
+		bSize := binary.LittleEndian.Uint32(header[benchtop.ROW_OFFSET_HSIZE:benchtop.ROW_HSIZE])
+
+		// A zero size marks a deleted row (or the unwritten tail): nothing to emit.
+		if bSize != 0 {
+			jsonStart := offset + benchtop.ROW_HSIZE
+			jsonEnd := jsonStart + bSize
+			if jsonEnd < jsonStart || jsonEnd > uint32(len(m)) {
+				break
+			}
+
+			rowData := make([]byte, bSize)
+			copy(rowData, m[jsonStart:jsonEnd])
+			out <- benchtop.RowLocData{
+				Data: rowData,
+				Loc: &benchtop.RowLoc{
+					Section: sec.ID,
+					Offset:  offset,
+					Size:    bSize,
+				},
+			}
+		}
+
+		if nextOffset <= offset {
+			break
+		}
+		offset = nextOffset
+	}
+}
+
+// ScanFull streams every non-deleted row of every section together with its
+// location. Sections are taken from the sections map, so each file is visited
+// once. At most `concurrency` sections (minimum 1) are scanned at a time and
+// rows of different sections may interleave. The channel is closed when all
+// sections have been scanned; the caller must drain it, because the scanning
+// goroutines block on sends.
+func (s *SectionStorage) ScanFull(concurrency int) chan benchtop.RowLocData {
+	s.lock.RLock()
+	allSecs := make([]*section.Section, 0, len(s.sections))
+	for _, sec := range s.sections {
+		allSecs = append(allSecs, sec)
+	}
+	s.lock.RUnlock()
+
+	outChan := make(chan benchtop.RowLocData, 100*len(allSecs))
+	if concurrency <= 0 {
+		concurrency = 1
+	}
+	sem := make(chan struct{}, concurrency)
+
+	go func() {
+		defer close(outChan)
+
+		var wg sync.WaitGroup
+		for _, sec := range allSecs {
+			// Take the slot before spawning so at most `concurrency` workers exist.
+			sem <- struct{}{}
+			wg.Add(1)
+			go func(sec *section.Section) {
+				defer func() { <-sem; wg.Done() }()
+				scanSectionRows(sec, outChan)
+			}(sec)
+		}
+		wg.Wait()
+	}()
+	return outChan
+}
+
+// Scan is ScanFull without the locations: it streams only the row payloads.
+func (s *SectionStorage) Scan(concurrency int) chan []byte {
+	out := make(chan []byte, 100)
+	go func() {
+		defer close(out)
+		for res := range s.ScanFull(concurrency) {
+			out <- res.Data
+		}
+	}()
+	return out
+}
+
+// MarkDelete tombstones the row at loc by overwriting the 4-byte size field of
+// its header with zeros, through one of the section's pooled file handles, and
+// bumps the section's deleted-row counter. It must not run concurrently with
+// Close, which closes the handle pool.
+func (s *SectionStorage) MarkDelete(loc *benchtop.RowLoc) error {
+	if loc == nil {
+		return fmt.Errorf("row location is nil")
+	}
+
+	s.lock.RLock()
+	sec, exists := s.sections[loc.Section]
+	s.lock.RUnlock()
+
+	if !exists {
+		return fmt.Errorf("section %d not found", loc.Section)
+	}
+
+	sec.Lock.RLock()
+	headerInBounds := len(sec.MMap) != 0 && loc.Offset+benchtop.ROW_HSIZE <= uint32(len(sec.MMap))
+	sec.Lock.RUnlock()
+	if !headerInBounds {
+		return fmt.Errorf("invalid offset or empty section")
+	}
+
+	// A closed pool yields ok == false; returning here also avoids the deferred
+	// send below, which would panic on a closed channel.
+	file, ok := <-sec.FilePool
+	if !ok {
+		return fmt.Errorf("section %d is closed", loc.Section)
+	}
+	defer func() { sec.FilePool <- file }()
+
+	if _, err := file.WriteAt(bytes.Repeat([]byte{0x00}, 4), int64(loc.Offset+benchtop.ROW_OFFSET_HSIZE)); err != nil {
+		return err
+	}
+
+	sec.Lock.Lock()
+	sec.DeletedRows++
+	sec.Lock.Unlock()
+	return nil
+}
+
+// Sync fsyncs the main handle and flushes the mapping of every section,
+// collecting all failures. It is a no-op once the storage has been closed.
+func (s *SectionStorage) Sync() error {
+	s.lock.RLock()
+	defer s.lock.RUnlock()
+
+	if s.closed {
+		return nil
+	}
+
+	var errs *multierror.Error
+	for _, sec := range s.sections {
+		if sec.File != nil {
+			if err := sec.File.Sync(); err != nil {
+				errs = multierror.Append(errs, err)
+			}
+		}
+		if sec.MMap != nil {
+			if err := sec.MMap.Flush(); err != nil {
+				errs = multierror.Append(errs, err)
+			}
+		}
+	}
+	return errs.ErrorOrNil()
+}
+
+// Close syncs the storage, then unmaps every section and closes its pooled and
+// main file handles. All failures are collected. Close is idempotent: a second
+// call returns nil without touching the sections again.
+func (s *SectionStorage) Close() error {
+	// Sync takes the read lock itself, so it has to run before the write lock is taken.
+	if err := s.Sync(); err != nil {
+		log.Errorf("Failed to sync on close: %v", err)
+	}
+
+	s.lock.Lock()
+	defer s.lock.Unlock()
+
+	if s.closed {
+		return nil
+	}
+	s.closed = true
+
+	var errs *multierror.Error
+	for _, sec := range s.sections {
+		if sec.MMap != nil {
+			if err := sec.MMap.Unmap(); err != nil {
+				errs = multierror.Append(errs, err)
+			}
+		}
+		if sec.FilePool != nil {
+			if err := closeSectionFilePool(sec.FilePool); err != nil {
+				errs = multierror.Append(errs, err)
+			}
+		}
+		if sec.File != nil {
+			if err := sec.File.Close(); err != nil {
+				errs = multierror.Append(errs, err)
+			}
+		}
+	}
+	return errs.ErrorOrNil()
+}
+
+// Delete closes the storage and removes every section file from disk. The
+// in-memory section bookkeeping is cleared; files that are already gone are not
+// treated as errors.
+func (s *SectionStorage) Delete() error {
+	if err := s.Close(); err != nil {
+		log.Errorf("Close failed during delete: %v", err)
+	}
+
+	s.lock.Lock()
+	defer s.lock.Unlock()
+
+	var errs *multierror.Error
+	for id, sec := range s.sections {
+		if err := os.Remove(sec.Path); err != nil && !os.IsNotExist(err) {
+			errs = multierror.Append(errs, err)
+		}
+		delete(s.sections, id)
+	}
+	// Drop the references to the removed sections as well, so nothing can be
+	// written to a section whose file no longer exists.
+	s.activeSections = make(map[uint8]*section.Section)
+	s.partitionMap = make(map[uint8][]uint16)
+	return errs.ErrorOrNil()
+}
+
+// rotateSection replaces oldSec as the active section of partitionId with a
+// newly created one. If another goroutine already rotated the partition, the
+// section it installed is returned instead. A failure to close the old section
+// is logged and does not prevent the rotation.
+func (s *SectionStorage) rotateSection(partitionId uint8, oldSec *section.Section) (*section.Section, error) {
+	s.sectionLock.Lock()
+	defer s.sectionLock.Unlock()
+
+	s.lock.RLock()
+	current := s.activeSections[partitionId]
+	s.lock.RUnlock()
+
+	// Someone else rotated while we were waiting for sectionLock.
+	if current != nil && current != oldSec {
+		return current, nil
+	}
+
+	if oldSec != nil && current == oldSec {
+		if err := oldSec.CloseSection(); err != nil {
+			log.Errorf("Failed to close section %d during rotation: %v", oldSec.ID, err)
+		}
+	}
+	return s.createNewSectionLocked(partitionId)
+}
+
+// createNewSection returns the active section of partitionId, creating one when
+// the partition has none. The check is repeated under sectionLock so that
+// concurrent first writes to a partition end up sharing a single section.
+func (s *SectionStorage) createNewSection(partitionId uint8) (*section.Section, error) {
+	s.sectionLock.Lock()
+	defer s.sectionLock.Unlock()
+
+	s.lock.RLock()
+	existing := s.activeSections[partitionId]
+	s.lock.RUnlock()
+	if existing != nil {
+		return existing, nil
+	}
+	return s.createNewSectionLocked(partitionId)
+}
+
+// createNewSectionLocked allocates the next section ID, creates and maps the
+// backing file "<fileName>.id<ID>.part<partition>", and installs the section as
+// the partition's active one. The caller must hold sectionLock. If any step
+// after file creation fails, the partially created file is removed again.
+func (s *SectionStorage) createNewSectionLocked(partitionId uint8) (*section.Section, error) {
+	s.lock.Lock()
+	if s.closed {
+		s.lock.Unlock()
+		return nil, fmt.Errorf("storage is closed")
+	}
+	if s.maxSecId == ^uint16(0) {
+		// Incrementing would wrap to 0 and reuse the ID (and file) of an existing section.
+		s.lock.Unlock()
+		return nil, fmt.Errorf("section ids exhausted")
+	}
+	s.maxSecId++
+	secId := s.maxSecId
+	s.lock.Unlock()
+
+	path := fmt.Sprintf("%s.id%d.part%d", s.fileName, secId, partitionId)
+
+	handle, err := os.Create(path)
+	if err != nil {
+		return nil, err
+	}
+	// discard undoes the file creation; its own errors are ignored on purpose
+	// because the original failure is the one reported to the caller.
+	discard := func() {
+		handle.Close()
+		os.Remove(path)
+	}
+
+	if err := handle.Truncate(section.INITIAL_SECTION_SIZE); err != nil {
+		discard()
+		return nil, fmt.Errorf("failed to pre-allocate section %s: %w", path, err)
+	}
+
+	m, err := mmap.Map(handle, mmap.RDWR, 0)
+	if err != nil {
+		discard()
+		return nil, err
+	}
+
+	filePool, err := openSectionFilePool(path)
+	if err != nil {
+		m.Unmap()
+		discard()
+		return nil, err
+	}
+
+	sec := &section.Section{
+		ID:              secId,
+		PartitionID:     partitionId,
+		Path:            path,
+		File:            handle,
+		FilePool:        filePool,
+		MMap:            m,
+		MMapMode:        mmap.RDWR,
+		Active:          true,
+		LiveBytes:       0,
+		CompressScratch: make([]byte, 0),
+	}
+
+	s.lock.Lock()
+	s.sections[secId] = sec
+	s.partitionMap[partitionId] = append(s.partitionMap[partitionId], secId)
+	s.activeSections[partitionId] = sec
+	s.lock.Unlock()
+
+	return sec, nil
+}
diff --git a/jsontable/storage/zone_manager.go b/jsontable/storage/zone_manager.go
new file mode 100644
index 0000000..9b7c9db
--- /dev/null
+++ b/jsontable/storage/zone_manager.go
@@ -0,0 +1,114 @@
+package storage
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+)
+
+// DefaultZoneManager implements ZoneManager using SectionStorage backend.
+// Each zone lives in its own subdirectory of baseDir; the empty zone ID maps
+// to baseDir itself.
+type DefaultZoneManager struct {
+	baseDir string
+	zones   map[string]RowStorage // Zones opened through CreateZone, keyed by zone ID
+	lock    sync.RWMutex          // Protects zones
+}
+
+// NewZoneManager returns a manager rooted at baseDir with no zones loaded.
+// Nothing is created on disk until CreateZone is called.
+func NewZoneManager(baseDir string) *DefaultZoneManager {
+	return &DefaultZoneManager{
+		baseDir: baseDir,
+		zones:   make(map[string]RowStorage),
+	}
+}
+
+// zoneDir resolves the directory backing zoneId and rejects IDs that would
+// resolve outside baseDir (for example "../other"). The empty ID resolves to
+// baseDir itself.
+func (zm *DefaultZoneManager) zoneDir(zoneId string) (string, error) {
+	if zoneId == "" {
+		return zm.baseDir, nil
+	}
+	dir := filepath.Join(zm.baseDir, zoneId)
+	rel, err := filepath.Rel(zm.baseDir, dir)
+	if err != nil {
+		return "", fmt.Errorf("invalid zone id %q: %w", zoneId, err)
+	}
+	if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
+		return "", fmt.Errorf("invalid zone id %q: resolves outside the base directory", zoneId)
+	}
+	return dir, nil
+}
+
+// GetStorage returns the storage of a zone that was opened with CreateZone on
+// this manager. Zones that exist on disk but have not been opened are reported
+// as not found.
+func (zm *DefaultZoneManager) GetStorage(zoneId string) (RowStorage, error) {
+	zm.lock.RLock()
+	defer zm.lock.RUnlock()
+
+	if s, ok := zm.zones[zoneId]; ok {
+		return s, nil
+	}
+	return nil, fmt.Errorf("zone %s not found", zoneId)
+}
+
+// CreateZone opens the storage for zoneId, creating its directory if needed.
+// Calling it again for an already opened zone returns the same storage.
+func (zm *DefaultZoneManager) CreateZone(zoneId string) (RowStorage, error) {
+	zm.lock.Lock()
+	defer zm.lock.Unlock()
+
+	if s, ok := zm.zones[zoneId]; ok {
+		return s, nil
+	}
+
+	zonePath, err := zm.zoneDir(zoneId)
+	if err != nil {
+		return nil, err
+	}
+	if err := os.MkdirAll(zonePath, 0700); err != nil {
+		return nil, fmt.Errorf("failed to create zone directory: %w", err)
+	}
+
+	// SectionStorage takes a file-name prefix for its section files; every zone
+	// uses "data" inside its own directory.
+	filePath := filepath.Join(zonePath, "data")
+
+	// Default to 4 partitions to match legacy benchtop partitioning.
+	s := NewSectionStorage(zonePath, filePath, 4)
+
+	zm.zones[zoneId] = s
+	return s, nil
+}
+
+// DeleteZone closes the zone's storage if it is loaded and removes the zone's
+// directory with everything in it. The root zone (empty ID, or an ID that
+// resolves to baseDir) cannot be deleted.
+func (zm *DefaultZoneManager) DeleteZone(zoneId string) error {
+	// Validate before touching any state so a rejected request has no side effects.
+	zonePath, err := zm.zoneDir(zoneId)
+	if err != nil {
+		return err
+	}
+	if zoneId == "" || filepath.Clean(zonePath) == filepath.Clean(zm.baseDir) {
+		return fmt.Errorf("cannot delete root zone")
+	}
+
+	zm.lock.Lock()
+	defer zm.lock.Unlock()
+
+	// A zone that was never loaded may still exist on disk; remove it regardless.
+	if s, ok := zm.zones[zoneId]; ok {
+		if err := s.Close(); err != nil {
+			return fmt.Errorf("failed to close zone storage: %w", err)
+		}
+		delete(zm.zones, zoneId)
+	}
+
+	return os.RemoveAll(zonePath)
+}
diff --git a/jsontable/stress_persistence_test.go b/jsontable/stress_persistence_test.go
new file mode 100644
index 0000000..a2eb018
--- /dev/null
+++ b/jsontable/stress_persistence_test.go
@@ -0,0 +1,117 @@
+package jsontable
+
+import (
+	"fmt"
+	"math/rand"
+	"os"
+	"testing"
+	"time"
+
+	"github.com/bmeg/benchtop"
+	"github.com/bmeg/benchtop/jsontable/table"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestStressPersistence writes enough data to trigger section growth and rotation,
+// then restarts the driver to verify persistence.
+func TestStressPersistence(t *testing.T) {
+	dir, err := os.MkdirTemp("", "jsontable_stress")
+	require.NoError(t, err)
+	defer os.RemoveAll(dir) // Clean up
+
+	tableName := "t_stress"
+	columns := []benchtop.ColumnDef{{Key: "data"}}
+
+	// 1. Initial Write
+	driver, err := NewJSONDriver(dir)
+	require.NoError(t, err)
+
+	// Write enough data to trigger growth (16MB increment) and rotation (65MB max)
+	// We'll write 100MB of data total.
+	// Each row ~1KB. So 100,000 rows.
+ rowCount := 100000 + payloadSize := 1000 // 1KB payload + + // Generate heavy payload + heavyPayload := make([]byte, payloadSize) + rand.Read(heavyPayload) + + loadRowsHelper := func(tableID uint16, count int) chan *benchtop.Row { + out := make(chan *benchtop.Row, 100) + go func() { + defer close(out) + for i := 0; i < count; i++ { + id := fmt.Sprintf("row_%06d", i) + data := map[string]any{ + "data": heavyPayload, // Reuse same payload for speed, just need size + "_id": id, + } + out <- &benchtop.Row{ + Id: []byte(id), + TableID: tableID, + Data: data, + } + } + }() + return out + } + + done := make(chan error) + go func() { + defer close(done) + tblStore, err := driver.New(tableName, columns) + if err != nil { + done <- err + return + } + tableID := tblStore.(*table.JSONTable).TableId + + err = driver.BulkLoad(tableID, loadRowsHelper(tableID, rowCount)) + if err != nil { + done <- err + return + } + done <- nil + }() + + start := time.Now() + err = <-done + require.NoError(t, err) + t.Logf("Wrote %d rows in %v", rowCount, time.Since(start)) + + // Close driver to flush everything + driver.Close() + + // 2. 
Restart and Verify + driver2, err := NewJSONDriver(dir) + require.NoError(t, err) + defer driver2.Close() + + // Verify table exists + tableID, err := driver2.LookupTableID(tableName) + if err != nil { + t.Fatalf("Failed to lookup table ID after restart: %v", err) + } + + tableStore2, err := driver2.Get(tableID) + require.NoError(t, err) + + // Spot check random rows + checkCount := 1000 + for i := 0; i < checkCount; i++ { + idx := rand.Intn(rowCount) + key := fmt.Sprintf("row_%06d", idx) + + loc, err := tableStore2.GetRowLoc(key) + require.NoError(t, err, "GetRowLoc failed for %s", key) + require.NotNil(t, loc) + + row, err := tableStore2.GetRow(loc) + require.NoError(t, err, "GetRow failed for %s", key) + require.NotNil(t, row) + assert.Equal(t, key, row["_id"]) + } + + t.Logf("Verified %d random rows successfully", checkCount) +} diff --git a/jsontable/table/bLoad.go b/jsontable/table/bLoad.go deleted file mode 100644 index f96ede8..0000000 --- a/jsontable/table/bLoad.go +++ /dev/null @@ -1,190 +0,0 @@ -package table - -import ( - "fmt" - "sync" - - "github.com/bmeg/benchtop" - "github.com/bmeg/benchtop/jsontable/section" - "github.com/bmeg/benchtop/jsontable/tpath" - "github.com/bmeg/grip/log" - "github.com/bytedance/sonic" - "github.com/cockroachdb/pebble" - multierror "github.com/hashicorp/go-multierror" -) - -type FieldKeyElements struct { - Field string - TableName string - Val any - RowId string -} - -type KitchenSink struct { - FieldIndexKeyElements []FieldKeyElements - Metadata map[string]*benchtop.RowLoc - Err error -} - -func (b *JSONTable) StartTableGoroutine( - wg *sync.WaitGroup, - metadataChan chan *KitchenSink, - snapshot *pebble.Snapshot, - batchSize int, -) chan *benchtop.Row { - ch := make(chan *benchtop.Row, batchSize) - wg.Add(1) - go func() { - defer func() { - // --- FINAL FLUSH ON EXIT --- - b.SectionLock.Lock() - for _, sec := range b.ActiveSections { - if sec.LiveBytes > 0 { - if err := sec.MMap.Flush(); err != nil { - log.Errorf("Final 
flush failed for section %d: %v", sec.ID, err) - } - err := sec.File.Sync() - if err != nil { - log.Errorf("File Sync failed in bulk load: %v", err) - } - } - } - b.SectionLock.Unlock() - wg.Done() - }() - - const FLUSH_EVERY = 1000 - var allFieldIndexKeyElements = make([]FieldKeyElements, 0, batchSize*len(b.Fields)) - allMetadata := make(map[string]*benchtop.RowLoc, batchSize) - var localErr *multierror.Error - - var flushCounter uint32 = 0 - for { - batch := make([]*benchtop.Row, 0, batchSize) - for range batchSize { - row, ok := <-ch - if !ok { - break - } - batch = append(batch, row) - } - if len(batch) == 0 { - break - } - - newRows := make([]*benchtop.Row, 0, len(batch)) - for _, row := range batch { - info, err := b.GetTableEntryInfo(snapshot, row.Id) - if err != nil { - localErr = multierror.Append(localErr, fmt.Errorf("error getting entry info for %s: %v", row.Id, err)) - continue - } - if info == nil { - newRows = append(newRows, row) - for field := range b.Fields { - if val := tpath.PathLookup(row.Data, field); val != nil { - allFieldIndexKeyElements = append(allFieldIndexKeyElements, FieldKeyElements{ - Field: field, - TableName: b.Name, - Val: val, - RowId: string(row.Id), - }) - } - } - } - } - - if len(newRows) == 0 { - continue - } - - rowsByPartition := make(map[uint8][]*benchtop.Row) - for _, row := range newRows { - partitionId := b.PartitionFunc(row.Id) - rowsByPartition[partitionId] = append(rowsByPartition[partitionId], row) - } - - for partitionId, rowsInPartition := range rowsByPartition { - if len(rowsInPartition) == 0 { - continue - } - - bDatas := make([][]byte, 0, len(rowsInPartition)) - rowIds := make([]string, 0, len(rowsInPartition)) - var totalUncompressedSize uint32 - - for _, row := range rowsInPartition { - bData, err := sonic.ConfigFastest.Marshal(b.PackData(row.Data, string(row.Id))) - if err != nil { - localErr = multierror.Append(localErr, fmt.Errorf("marshal error for row %s: %v", row.Id, err)) - continue - } - bDatas = 
append(bDatas, bData) - rowIds = append(rowIds, string(row.Id)) - totalUncompressedSize += uint32(len(bData)) + 8 - } - if len(bDatas) == 0 { - continue - } - - sec := b.ActiveSections[partitionId] // This is the section active for writing - if sec == nil { - // This should not happen if Init is correct, but add recovery/guard - var err error - sec, err = b.CreateNewSection(partitionId) - if err != nil { - localErr = multierror.Append(localErr, fmt.Errorf("failed to get or create active section for partition %d: %v", partitionId, err)) - continue - } - } - - // --- ROTATE SECTION IF FULL --- - if sec.LiveBytes+totalUncompressedSize > section.MAX_SECTION_SIZE { - // Flush old section before rotating - if sec.LiveBytes > 0 { - err := sec.CloseSection() - if err != nil { - localErr = multierror.Append(localErr, err) - } - } - - newSec, err := b.CreateNewSection(partitionId) - if err != nil { - localErr = multierror.Append(localErr, fmt.Errorf("failed to create new section for partition %d: %v", partitionId, err)) - continue - } - sec = newSec - } - - for i, bData := range bDatas { - rowLoc, err := sec.WriteJsonEntryToSection(bData) - if err != nil { - localErr = multierror.Append(localErr, fmt.Errorf("write error for row %s in section %d: %v", rowIds[i], sec.ID, err)) - continue - } - rowLoc.TableId = b.TableId - allMetadata[rowIds[i]] = rowLoc - - // --- PERIODIC FLUSH --- - flushCounter++ - /*if flushCounter >= FLUSH_EVERY { - sec.Lock.Lock() - if err := sec.MMap.Flush(); err != nil { - log.Errorf("Periodic flush failed for section %d: %v", sec.ID, err) - } - sec.Lock.Unlock() - flushCounter = 0 - }*/ - } - sec.TotalRows += uint32(len(bDatas)) - } - } - - metadataChan <- &KitchenSink{ - FieldIndexKeyElements: allFieldIndexKeyElements, - Metadata: allMetadata, - Err: localErr.ErrorOrNil(), - } - }() - return ch -} diff --git a/jsontable/table/helpers.go b/jsontable/table/helpers.go index bf98021..e0cc44f 100644 --- a/jsontable/table/helpers.go +++ 
b/jsontable/table/helpers.go @@ -1,6 +1,8 @@ package table import ( + "fmt" + "github.com/bmeg/benchtop" "github.com/cockroachdb/pebble" ) @@ -18,6 +20,12 @@ func (b *JSONTable) PackData(entry map[string]any, key string) *RowData { } func (b *JSONTable) GetTableEntryInfo(snap *pebble.Snapshot, id []byte) (*benchtop.RowLoc, error) { + if b == nil { + return nil, fmt.Errorf("JSONTable is nil") + } + if snap == nil { + return nil, fmt.Errorf("snapshot is nil") + } // Really only want to see if anything was returned or not. Since this doesn't interact // with the pebble indices, keep it in JSONTable _, closer, err := snap.Get(benchtop.NewPosKey(b.TableId, id)) diff --git a/jsontable/table/init.go b/jsontable/table/init.go deleted file mode 100644 index ba7158d..0000000 --- a/jsontable/table/init.go +++ /dev/null @@ -1,160 +0,0 @@ -package table - -import ( - "encoding/binary" - "fmt" - "os" - "path/filepath" - "strconv" - "strings" - "sync" - - "github.com/bmeg/benchtop" - "github.com/bmeg/benchtop/jsontable/section" - "github.com/edsrzf/mmap-go" -) - -func (b *JSONTable) Init(poolSize int) error { - b.NumPartitions = 4 - if b.NumPartitions > 256 { - if uint32(b.NumPartitions)*uint32(SECTION_ID_MULT) > 65536 { - return fmt.Errorf("too many partitions (%d) for section ID multiplier (%d)", b.NumPartitions, SECTION_ID_MULT) - } - } - - b.PartitionFunc = defaultPartitionFunc(b.NumPartitions) - b.Sections = map[uint16]*section.Section{} - - dir := filepath.Dir(b.FileName) - base := filepath.Base(b.FileName) - files, err := os.ReadDir(dir) - if err != nil { - return fmt.Errorf("failed to read directory: %w", err) - } - - type secInfo struct { - pId uint8 - localSecId int - fileName string - } - var secList []secInfo - for _, f := range files { - if strings.HasPrefix(f.Name(), base+PART_FILE_SUFFIX) { - parts := strings.Split(strings.TrimPrefix(f.Name(), base+PART_FILE_SUFFIX), SECTION_FILE_SUFFIX) - - if len(parts) != 2 { - continue - } - pId, err := strconv.Atoi(parts[0]) - 
if err != nil { - continue - } - - localSecId, err := strconv.Atoi(parts[1]) - if err != nil { - continue - } - secList = append(secList, secInfo{ - pId: uint8(pId), - localSecId: localSecId, - fileName: f.Name(), - }) - } - } - - for _, s := range secList { - secId := uint16(s.pId)*SECTION_ID_MULT + uint16(s.localSecId) - secPath := filepath.Join(dir, s.fileName) - - // Open main file handle (for writes) - oFile, err := os.OpenFile(secPath, os.O_RDWR, 0666) - if err != nil { - return fmt.Errorf("failed to open section file %s: %w", secPath, err) - } - m, err := mmap.Map(oFile, mmap.RDWR, 0) - if err != nil { - return fmt.Errorf("failed to mmap section %s: %w", secPath, err) - } - - // Init file pool (for writes) - filePool := make(chan *os.File, poolSize) - for range poolSize { - file, err := os.OpenFile(secPath, os.O_RDWR, 0666) - if err != nil { - // Clean up - m.Unmap() - oFile.Close() - for len(filePool) > 0 { - if f, ok := <-filePool; ok { - f.Close() - } - } - return fmt.Errorf("failed to init file pool for %s: %w", secPath, err) - } - filePool <- file - } - - var liveBytes uint32 = 0 - var totalRows uint32 = 0 - var deletedRows uint32 = 0 - var offset uint32 = 0 - // Loop for Initializing live bytes, deletedRows, totalRows - for offset+benchtop.ROW_HSIZE <= uint32(len(m)) { - header := m[offset : offset+benchtop.ROW_HSIZE] - nextOffset := binary.LittleEndian.Uint32(header[:benchtop.ROW_OFFSET_HSIZE]) - bSize := binary.LittleEndian.Uint32(header[benchtop.ROW_OFFSET_HSIZE:benchtop.ROW_HSIZE]) - if nextOffset == 0 || nextOffset <= offset { - break - } - if bSize == 0 { - deletedRows++ - } - totalRows++ - offset = nextOffset - - } - liveBytes = offset - sec := §ion.Section{ - ID: secId, - PartitionID: s.pId, - Path: secPath, - File: oFile, - FilePool: filePool, - MMap: m, - LiveBytes: liveBytes, - Active: true, - MMapMode: mmap.RDWR, - TotalRows: totalRows, - DeletedRows: deletedRows, - Lock: sync.RWMutex{}, - CompressScratch: make([]byte, 0), - } - - 
b.Sections[secId] = sec - b.PartitionMap[s.pId] = append(b.PartitionMap[s.pId], secId) - - } - - for pId, secIds := range b.PartitionMap { - if len(secIds) > 0 { - latestSecId := secIds[len(secIds)-1] - latestSec := b.Sections[latestSecId] - - // Mark the latest section as active for writing - b.ActiveSections[pId] = latestSec - b.FlushCounter[pId] = 0 // Reset the counter for the newly active section - } - } - - // --- ENSURE ONE SECTION PER PARTITION --- - for pId := uint8(0); pId < uint8(b.NumPartitions); pId++ { - if len(b.PartitionMap[pId]) == 0 { - _, err := b.CreateNewSection(pId) - if err != nil { - return err - } - } - } - - return nil -} diff --git a/jsontable/table/table.go b/jsontable/table/table.go index 4f4a9c6..0cabda7 100644 --- a/jsontable/table/table.go +++ b/jsontable/table/table.go @@ -1,591 +1,422 @@ package table import ( - "bytes" + "context" // Added "encoding/binary" "fmt" - "hash/fnv" - "io" - "os" - "runtime" + "runtime" // Added "strconv" "strings" "sync" - "github.com/DataDog/zstd" "github.com/bmeg/benchtop" - "github.com/bmeg/benchtop/jsontable/section" + "github.com/bmeg/benchtop/jsontable/block" + "github.com/bmeg/benchtop/jsontable/storage" "github.com/bmeg/grip/log" - "github.com/edsrzf/mmap-go" - "github.com/bytedance/sonic" + "github.com/maypok86/otter/v2" ) -const ( - PART_FILE_SUFFIX string = ".partition" - SECTION_FILE_SUFFIX string = ".section" - SECTION_ID_MULT uint16 = 256 - MAX_COMPACT_RATIO = 0.2 // 20% deleted rows triggers compaction - FLUSH_THRESHOLD = 1000 -) +// Helper for cache keys +func makeCacheKey(tableId, section, offset, size uint32) string { + // Include tableId and size to prevent collision between blocks from different tables + // that happen to use the same Section ID and Offset. 
+ return fmt.Sprintf("%d:%d:%d:%d", tableId, section, offset, size) +} type JSONTable struct { - // Artifact arguments Columns []benchtop.ColumnDef ColumnMap map[string]int TableId uint16 - Path string // Base path (for legacy single file) Name string - FileName string // Base name for section files - - Fields map[string]struct{} // Indexing moved to table level + FileName string - Sections map[uint16]*section.Section // sectionId -> Section - PartitionMap map[uint8][]uint16 // partitionId -> []sectionId - SectionLock sync.Mutex // For creating new sections - NumPartitions uint32 // Number of partitions - PartitionFunc func(id []byte) uint8 // Assigns row to partition - MaxConcurrentSections uint8 // Limit for parallel operations + Fields map[string]struct{} + Storage storage.RowStorage + BufferPool sync.Pool - ActiveSections map[uint8]*section.Section // one per partition - FlushCounter map[uint8]int // per-partition flush counter + BlockCache *otter.Cache[string, []byte] + BlockLoader otter.LoaderFunc[string, []byte] + LocLookup func(id string) (*benchtop.RowLoc, error) } -// DefaultPartitionFunc assigns rows to partitions using FNV hash -func defaultPartitionFunc(numPartitions uint32) func(id []byte) uint8 { - return func(id []byte) uint8 { - h := fnv.New32a() - h.Write(id) - return uint8(h.Sum32() % numPartitions) - } +func (b *JSONTable) Close() error { + return b.Storage.Close() } -func (b *JSONTable) Close() error { - for _, sec := range b.Sections { - if sec.MMap != nil { - err := sec.MMap.Unmap() - if err != nil { - fmt.Printf("ERROR ON UNMAP: %s", err) - return err - } - } - if sec.File != nil { - err := sec.File.Sync() - if err != nil { - fmt.Printf("ERROR ON FILE HANDLE SYNC: %s", err) - return err - } - err = sec.File.Close() - if err != nil { - fmt.Printf("ERROR ON FILE HANDLE CLOSE: %s", err) - return err - } - } - if sec.FilePool != nil { - close(sec.FilePool) - for f := range sec.FilePool { - err := f.Close() - if err != nil { - fmt.Printf("ERROR 
ON FILE POOL FILE HANDLE CLOSE: %s", err) - return err - } - } - } +func (b *JSONTable) HasField(field string) bool { + if b.Fields == nil { + return false } - b.Fields = map[string]struct{}{} - return nil + _, ok := b.Fields[field] + return ok } -// AddRow adds a single row to the JSONTable, writing it as zstd-compressed data. func (b *JSONTable) AddRow(elem benchtop.Row) (*benchtop.RowLoc, error) { - partitionId := b.PartitionFunc(elem.Id) - if partitionId >= uint8(b.NumPartitions) { - return nil, fmt.Errorf("invalid partition") + locs, err := b.AddRows([]benchtop.Row{elem}) + if err != nil { + return nil, err } + return locs[0], nil +} - // Get or create active section - sec := b.ActiveSections[partitionId] - if sec == nil { - var err error - sec, err = b.CreateNewSection(partitionId) - if err != nil { - return nil, err - } +const BLOCK_HEADER_SIZE = 2 // uint16 count + +func (b *JSONTable) AddRows(elems []benchtop.Row) ([]*benchtop.RowLoc, error) { + if len(elems) == 0 { + return []*benchtop.RowLoc{}, nil } - bData, err := sonic.ConfigFastest.Marshal(b.PackData(elem.Data, string(elem.Id))) - if err != nil { - return nil, err + // 1. 
Partition rows + type indexedRow struct { + index int + row benchtop.Row } - totalSize := uint32(len(bData)) + benchtop.ROW_HSIZE - - // Check size and rotate if needed - if sec.LiveBytes+totalSize > section.MAX_SECTION_SIZE { - // Close current - err := sec.CloseSection() - if err != nil { - sec.Lock.Unlock() - return nil, err - } - // Create new active - newSec, err := b.CreateNewSection(partitionId) - if err != nil { - return nil, err - } - sec = newSec - b.ActiveSections[partitionId] = sec + byPartition := make(map[int][]indexedRow) + for i, elem := range elems { + pId := b.Storage.GetPartitionId(elem.Id) + byPartition[pId] = append(byPartition[pId], indexedRow{i, elem}) } - loc, err := sec.WriteJsonEntryToSection(bData) - if err != nil { - return nil, err + results := make([]*benchtop.RowLoc, len(elems)) + var blocks [][]byte + var blockIds [][]byte + var blockMap []struct { + startIndex int + count int + partition int + rows []indexedRow } - sec.TotalRows++ - loc.TableId = b.TableId - return loc, nil -} + const BATCH_SIZE = 1 + + // 2. 
Create blocks per partition + for pId, rows := range byPartition { + for i := 0; i < len(rows); i += BATCH_SIZE { + end := i + BATCH_SIZE + if end > len(rows) { + end = len(rows) + } + batch := rows[i:end] -func (b *JSONTable) CreateNewSection(partitionId uint8) (*section.Section, error) { - b.SectionLock.Lock() - defer b.SectionLock.Unlock() + // Create Block using abstraction + block := block.NewBlock(len(batch)) + + for _, r := range batch { + packed := b.PackData(r.row.Data, string(r.row.Id)) + payload, err := sonic.Marshal(packed) + if err != nil { + return nil, fmt.Errorf("marshal failed: %w", err) + } + // Copy payload to avoid sonic buffer reuse + rowCopy := make([]byte, len(payload)) + copy(rowCopy, payload) + block.Add(rowCopy) + } - localSecId := len(b.PartitionMap[partitionId]) - secId := uint16(partitionId)*SECTION_ID_MULT + uint16(localSecId) - if _, exists := b.Sections[secId]; exists { - return nil, fmt.Errorf("section ID conflict: %d", secId) + compressed, err := block.Serialize(&b.BufferPool) + if err != nil { + return nil, err + } + + blocks = append(blocks, compressed) + blockIds = append(blockIds, batch[0].row.Id) // Use first ID for partition routing + + blockMap = append(blockMap, struct { + startIndex int + count int + partition int + rows []indexedRow + }{ + startIndex: len(blocks) - 1, + count: len(batch), + partition: pId, + rows: batch, + }) + } } - path := fmt.Sprintf("%s%s%d.section%d", b.FileName, PART_FILE_SUFFIX, partitionId, localSecId) - handle, err := os.Create(path) + // 3. Write blocks to storage + sLocs, err := b.Storage.AddRows(blocks, blockIds) if err != nil { return nil, err } - handle.Truncate(section.INITIAL_SECTION_SIZE) // pre-allocate - m, err := mmap.Map(handle, mmap.RDWR, 0) - if err != nil { - return nil, fmt.Errorf("mmap failed on new section: %w", err) + // 4. 
Map results + if len(sLocs) != len(blockMap) { + return nil, fmt.Errorf("storage returned wrong number of locations") } - filePool := make(chan *os.File, 10) - for range cap(filePool) { - f, err := os.OpenFile(path, os.O_RDWR, 0666) - if err != nil { - m.Unmap() - handle.Close() - return nil, err + for i, sLoc := range sLocs { + bInfo := blockMap[i] + for j, r := range bInfo.rows { + results[r.index] = &benchtop.RowLoc{ + TableId: b.TableId, + Section: sLoc.Section, + Offset: sLoc.Offset, + Size: sLoc.Size, + Index: uint16(j), + } } - filePool <- f - } - - sec := §ion.Section{ - ID: secId, - PartitionID: partitionId, - Path: path, - File: handle, - FilePool: filePool, - MMap: m, - MMapMode: mmap.RDWR, - Active: true, - LiveBytes: 0, - CompressScratch: make([]byte, 0), } - b.Sections[secId] = sec - b.PartitionMap[partitionId] = append(b.PartitionMap[partitionId], secId) - b.ActiveSections[partitionId] = sec - b.FlushCounter[partitionId] = 0 - return sec, nil + return results, nil } -func (b *JSONTable) GetRow(loc *benchtop.RowLoc) (map[string]any, error) { - sec, exists := b.Sections[loc.Section] - if !exists { - return nil, fmt.Errorf("section %d not found", loc.Section) +func (b *JSONTable) GetRowLoc(id string) (*benchtop.RowLoc, error) { + if b.LocLookup == nil { + return nil, fmt.Errorf("LocLookup not initialized for table %s", b.Name) } + return b.LocLookup(id) +} - if len(sec.MMap) == 0 { - return nil, fmt.Errorf("section %d is empty or not mapped", loc.Section) +func (b *JSONTable) GetRow(loc *benchtop.RowLoc) (map[string]any, error) { + if loc.TableId != b.TableId { + return nil, fmt.Errorf("table ID mismatch: loc has %d, table has %d (stale index?)", loc.TableId, b.TableId) } + cacheKey := makeCacheKey(uint32(b.TableId), uint32(loc.Section), loc.Offset, loc.Size) - start := loc.Offset + benchtop.ROW_HSIZE - end := start + loc.Size - if end > uint32(len(sec.MMap)) { - return nil, fmt.Errorf("row out of bounds: %d > %d", end, len(sec.MMap)) + // Use Cache 
with Loader + decompressed, err := b.BlockCache.Get(context.Background(), cacheKey, b.BlockLoader) + if err != nil { + return nil, err } - compressed := sec.MMap[start:end] - decompressed, err := zstd.Decompress(nil, compressed) + rowBytes, err := block.ExtractRowFromDecompressed(decompressed, loc.Index) if err != nil { - return nil, fmt.Errorf("decompress failed: %w", err) + return nil, err } var m RowData - if err := sonic.ConfigFastest.Unmarshal(decompressed, &m); err != nil { + if err := sonic.Unmarshal(rowBytes, &m); err != nil { return nil, fmt.Errorf("unmarshal failed: %w", err) } + if m.Data != nil { + m.Data["_id"] = m.Key + } return m.Data, nil } -func (b *JSONTable) MarkDeleteTable(loc *benchtop.RowLoc) error { - sec, exists := b.Sections[loc.Section] - if !exists { - return fmt.Errorf("section %d not found", loc.Section) +func (b *JSONTable) GetRows(locs []*benchtop.RowLoc) ([]map[string]any, []error) { + results := make([]map[string]any, len(locs)) + errs := make([]error, len(locs)) + + numWorkers := runtime.NumCPU() + if numWorkers > 8 { + numWorkers = 8 } - file := <-sec.FilePool - defer func() { sec.FilePool <- file }() + var wg sync.WaitGroup + chunkSize := (len(locs) + numWorkers - 1) / numWorkers - _, err := file.WriteAt(bytes.Repeat([]byte{0x00}, 4), int64(loc.Offset+benchtop.ROW_OFFSET_HSIZE)) - if err != nil { - return fmt.Errorf("writeAt failed: %w", err) + for i := 0; i < numWorkers; i++ { + start := i * chunkSize + if start >= len(locs) { + break + } + end := start + chunkSize + if end > len(locs) { + end = len(locs) + } + + wg.Add(1) + go func(s, e int) { + defer wg.Done() + for j := s; j < e; j++ { + loc := locs[j] + if loc.TableId != b.TableId { + log.Errorf("Table lineage mismatch table=%s currentTableId=%d indexLocTableId=%d row=%d/%d: stale index entry?", b.Name, b.TableId, loc.TableId, j, len(locs)) + errs[j] = fmt.Errorf("table ID mismatch: loc has %d, table has %d (stale index?)", loc.TableId, b.TableId) + continue + } + cacheKey 
:= makeCacheKey(uint32(b.TableId), uint32(loc.Section), loc.Offset, loc.Size) + decompressed, err := b.BlockCache.Get(context.Background(), cacheKey, b.BlockLoader) + if err != nil { + log.Errorf("GetRows(%s): block load failed section=%d offset=%d size=%d error=%v", b.Name, loc.Section, loc.Offset, loc.Size, err) + errs[j] = err + continue + } + + // Extract Row + rowBytes, err := block.ExtractRowFromDecompressed(decompressed, loc.Index) + if err != nil { + log.Errorf("GetRows(%s): extract failed index=%d count=%d error=%v", b.Name, loc.Index, binary.LittleEndian.Uint16(decompressed[0:]), err) + errs[j] = err + continue + } + + var m RowData + if err := sonic.Unmarshal(rowBytes, &m); err != nil { + log.Errorf("GetRows(%s): unmarshal failed index=%d count=%d error=%v", b.Name, loc.Index, binary.LittleEndian.Uint16(decompressed[0:]), err) + errs[j] = err + continue + } + + if m.Data != nil { + m.Data["_id"] = m.Key + } + results[j] = m.Data + } + }(start, end) } - sec.Lock.Lock() - sec.DeletedRows++ - sec.LiveBytes -= loc.Size - sec.Lock.Unlock() - return nil + + wg.Wait() + return results, errs } func (b *JSONTable) DeleteRow(loc *benchtop.RowLoc, id []byte) error { - sec, exists := b.Sections[loc.Section] - if !exists { - return fmt.Errorf("section %d not found", loc.Section) - } - - sec.Lock.Lock() - defer sec.Lock.Unlock() + return b.Storage.MarkDelete(loc) +} - _, err := sec.File.Seek(int64(loc.Offset+benchtop.ROW_OFFSET_HSIZE), io.SeekStart) - if err != nil { - return err - } - _, err = sec.File.Write(bytes.Repeat([]byte{0x00}, 4)) - if err != nil { - return fmt.Errorf("writeAt failed: %w", err) - } - sec.DeletedRows++ - sec.LiveBytes -= loc.Size - return nil +func (b *JSONTable) MarkDeleteTable(loc *benchtop.RowLoc) error { + return b.DeleteRow(loc, nil) } func (b *JSONTable) ScanDoc(filter benchtop.RowFilter) chan map[string]any { - outChan := make(chan map[string]any, 100*len(b.Sections)) - var wg sync.WaitGroup - sem := make(chan struct{}, 
b.MaxConcurrentSections) - for pId := uint8(0); pId < uint8(b.NumPartitions); pId++ { - for _, secId := range b.PartitionMap[pId] { - sec, exists := b.Sections[secId] - if !exists || len(sec.MMap) == 0 { - continue + out := make(chan map[string]any, 100) + rawRows := b.Storage.Scan(10) + + go func() { + defer close(out) + for compressed := range rawRows { + // Check if it is a block or single row + // IterateBlock handles both (if we updated it to handle single row as well) + // My previous edit to block.go handles single row. + + err := block.IterateBlock(compressed, &b.BufferPool, func(rowBytes []byte) bool { + process(rowBytes, out, filter, b, nil) + return true + }) + + if err != nil { + log.Errorf("scan block failed: %v", err) } - wg.Add(1) - go func(sec *section.Section) { - sem <- struct{}{} - defer func() { <-sem; wg.Done() }() - m := sec.MMap - var offset uint32 = 0 - for offset+benchtop.ROW_HSIZE <= uint32(len(m)) { - header := m[offset : offset+benchtop.ROW_HSIZE] - nextOffset := binary.LittleEndian.Uint32(header[:benchtop.ROW_OFFSET_HSIZE]) - bSize := binary.LittleEndian.Uint32(header[benchtop.ROW_OFFSET_HSIZE:benchtop.ROW_HSIZE]) - if bSize == 0 { - if nextOffset == 0 || nextOffset <= offset { - break - } - offset = nextOffset - continue - } - jsonStart := offset + benchtop.ROW_HSIZE - jsonEnd := jsonStart + bSize - if jsonEnd > uint32(len(m)) { - break - } - rowData := m[jsonStart:jsonEnd] - if err := b.processJSONRowDataDoc(rowData, filter, outChan); err != nil { - log.Debugf("skip row in section %d: %v", sec.ID, err) - } - if nextOffset == 0 || nextOffset <= offset { - break - } - offset = nextOffset - } - }(sec) } - } - go func() { wg.Wait(); close(outChan) }() - return outChan + }() + return out } -// processJSONRowDataDoc handles parsing of row bytes for ScanDoc, applying filters, and sending RowData to the output channel. 
-func (b *JSONTable) processJSONRowDataDoc(rowData []byte, filter benchtop.RowFilter, outChan chan map[string]any) error { - newData, err := zstd.Decompress(nil, rowData) - if err != nil { - return err - } - if filter != nil && !filter.IsNoOp() { - if !filter.Matches(newData, b.Name) { - return nil +func (b *JSONTable) ScanDocProjected(fields []string, filter benchtop.RowFilter) chan map[string]any { + out := make(chan map[string]any, 100) + go func() { + defer close(out) + if len(fields) == 0 { + for row := range b.ScanDoc(filter) { + out <- row + } + return } - } - var m RowData - err = sonic.ConfigFastest.Unmarshal(newData, &m) - if err != nil { - return err - } - if m.Data != nil { - m.Data["_id"] = m.Key - } - outChan <- m.Data - return nil -} - -// ScanId scans the JSONTable and returns IDs (as string) that match the filter. -func (b *JSONTable) ScanId(filter benchtop.RowFilter) chan string { - outChan := make(chan string, 100*len(b.Sections)) - var wg sync.WaitGroup - sem := make(chan struct{}, b.MaxConcurrentSections) - for pId := uint8(0); pId < uint8(b.NumPartitions); pId++ { - for _, secId := range b.PartitionMap[pId] { - sec, exists := b.Sections[secId] - if !exists || len(sec.MMap) == 0 { - continue + for row := range b.ScanDoc(filter) { + proj := map[string]any{} + if id, ok := row["_id"]; ok { + proj["_id"] = id } - wg.Add(1) - go func(sec *section.Section) { - sem <- struct{}{} - defer func() { <-sem; wg.Done() }() - m := sec.MMap - var offset uint32 = 0 - for offset+benchtop.ROW_HSIZE <= uint32(len(m)) { - header := m[offset : offset+benchtop.ROW_HSIZE] - nextOffset := binary.LittleEndian.Uint32(header[:benchtop.ROW_OFFSET_HSIZE]) - bSize := binary.LittleEndian.Uint32(header[benchtop.ROW_OFFSET_HSIZE:benchtop.ROW_HSIZE]) - if bSize == 0 { - if nextOffset == 0 || nextOffset <= offset { - break - } - offset = nextOffset - continue - } - jsonStart := offset + benchtop.ROW_HSIZE - jsonEnd := jsonStart + bSize - if jsonEnd > uint32(len(m)) { - break - } 
- rowData := m[jsonStart:jsonEnd] - if err := b.processJSONRowDataId(rowData, filter, outChan); err != nil { - log.Debugf("skip row in section %d: %v", sec.ID, err) - } - if nextOffset == 0 || nextOffset <= offset { - break - } - offset = nextOffset + for _, f := range fields { + if f == "_id" { + continue + } + if v, ok := row[f]; ok { + proj[f] = v } - }(sec) + } + out <- proj } - } - go func() { wg.Wait(); close(outChan) }() - return outChan + }() + return out } -// processJSONRowDataId handles parsing of row bytes for ScanId, applying filters, and sending IDs to the output channel. -func (b *JSONTable) processJSONRowDataId(rowData []byte, filter benchtop.RowFilter, outChan chan string) error { - newData, err := zstd.Decompress(nil, rowData) - if err != nil { - return err - } - +func process(rowBytes []byte, out chan map[string]any, filter benchtop.RowFilter, b *JSONTable, pool *sync.Pool) { + // Filter logic if filter != nil && !filter.IsNoOp() { - if !filter.Matches(newData, b.Name) { - return nil + if !filter.Matches(rowBytes, b.Name) { + return } } - - node, err := sonic.Get(newData, "1") - if err != nil { - log.Errorf("Error accessing JSON path for row data %s: %v\n", string(newData), err) - return err + var m RowData + if err := sonic.Unmarshal(rowBytes, &m); err != nil { + log.Errorf("scan unmarshal failed: %v", err) + return } - - ID, err := node.String() - if err != nil { - log.Errorf("Error unmarshaling node: %v\n", err) - return err + if m.Data != nil { + m.Data["_id"] = m.Key } - - outChan <- ID - return nil + out <- m.Data } -/* -func (b *JSONTable) CompactSection(secId uint16) error { - sec, exists := b.Sections[secId] - if !exists { - return fmt.Errorf("section %d not found", secId) - } - sec.Lock.Lock() - defer sec.Lock.Unlock() - - flushCounter := 0 - tempFileName := sec.Path + ".compact" - tempHandle, err := os.Create(tempFileName) - if err != nil { - return fmt.Errorf("failed to create temp file: %w", err) - } - defer tempHandle.Close() - - m, 
err := mmap.Map(sec.File, mmap.RDONLY, 0) - if err != nil { - return fmt.Errorf("failed to map file: %w", err) - } - defer m.Unmap() - - writer := bufio.NewWriterSize(tempHandle, 16*1024*1024) - var newOffset uint32 = 0 - inputChan := make(chan benchtop.Index, 100) - - // todo: figure out how to set indices from the driver instead of the table - var wg sync.WaitGroup - wg.Add(1) +func (b *JSONTable) ScanId(filter benchtop.RowFilter) chan string { + out := make(chan string, 100) go func() { - defer wg.Done() - b.setDataIndices(inputChan) - }() - - var offset uint32 = 0 - for offset+benchtop.ROW_HSIZE <= uint32(len(m)) { - header := m[offset : offset+benchtop.ROW_HSIZE] - nextOffset := binary.LittleEndian.Uint32(header[:benchtop.ROW_OFFSET_HSIZE]) - bSize := binary.LittleEndian.Uint32(header[benchtop.ROW_OFFSET_HSIZE:benchtop.ROW_HSIZE]) - - if bSize == 0 || int64(nextOffset) == int64(benchtop.ROW_HSIZE) { - if int64(nextOffset) > int64(offset) { - offset = nextOffset + defer close(out) + for row := range b.ScanDoc(filter) { + if id, ok := row["_id"].(string); ok { + out <- id } - continue - } - - jsonStart := offset + benchtop.ROW_HSIZE - jsonEnd := jsonStart + bSize - if jsonEnd > uint32(len(m)) { - return fmt.Errorf("incomplete JSON data at section %d, offset %d, size %d", sec.ID, offset, bSize) } + }() + return out +} - rowData := m[jsonStart:jsonEnd] - - rowData, err := zstd.Decompress(nil, rowData) - if err != nil { - log.Debugf("Failed to decompress row at section %d, offset %d: %v", sec.ID, offset, err) - if nextOffset == 0 || nextOffset <= offset { - break - } - offset = nextOffset - continue - } +func (b *JSONTable) ScanFull(filter benchtop.RowFilter) chan benchtop.RowLocData { + out := make(chan benchtop.RowLocData, 100) + rawRows := b.Storage.ScanFull(10) - var mRow RowData - err = sonic.ConfigFastest.Unmarshal(rowData, &mRow) - if err != nil { - if err == io.EOF { - return fmt.Errorf("JSON data for row at section %d, offset %d, size %d was incomplete: 
%w", sec.ID, offset, bSize, err) - } - return fmt.Errorf("failed to decode JSON row at section %d, offset %d, size %d: %w", sec.ID, offset, bSize, err) - } + go func() { + defer close(out) + for rowLocData := range rawRows { + // rowLocData.Data is the compressed block of rows + var rowIndex uint16 = 0 + err := block.IterateBlock(rowLocData.Data, &b.BufferPool, func(rowBytes []byte) bool { + // Filter logic + if filter != nil && !filter.IsNoOp() { + if !filter.Matches(rowBytes, b.Name) { + rowIndex++ + return true // Continue + } + } - node, err := sonic.Get(rowData, "1") - if err != nil { - return fmt.Errorf("failed to access ID field at section %d, offset %d: %w", sec.ID, offset, err) - } - key, err := node.String() - if err != nil { - return fmt.Errorf("failed to unmarshal ID field at section %d, offset %d: %w", sec.ID, offset, err) - } - inputChan <- benchtop.Index{Key: []byte(key), Loc: benchtop.RowLoc{Offset: newOffset, Size: bSize}} + var m RowData + if err := sonic.Unmarshal(rowBytes, &m); err != nil { + log.Errorf("scan unmarshal failed: %v", err) + rowIndex++ + return true + } + if m.Data == nil { + m.Data = make(map[string]any) + } + m.Data["_id"] = m.Key - newOffsetBytes := make([]byte, benchtop.ROW_OFFSET_HSIZE) - binary.LittleEndian.PutUint32(newOffsetBytes, newOffset+bSize+benchtop.ROW_HSIZE) - _, err = writer.Write(newOffsetBytes) - if err != nil { - return fmt.Errorf("failed writing new offset at %d: %w", newOffset, err) - } - _, err = writer.Write(rowData) - if err != nil { - return fmt.Errorf("failed writing JSON row at offset %d: %w", newOffset, err) - } + // Create a copy of the location and set the correct row index and TableId + loc := *rowLocData.Loc + loc.Index = rowIndex + loc.TableId = b.TableId - flushCounter++ - if flushCounter%FLUSH_THRESHOLD == 0 { - if err := writer.Flush(); err != nil { - return fmt.Errorf("failed flushing writer: %w", err) + out <- benchtop.RowLocData{ + Data: rowBytes, + DataMap: m.Data, + Loc: &loc, + } + 
rowIndex++ + return true + }) + if err != nil { + log.Errorf("scan full block failed: %v", err) } } - newOffset += bSize + benchtop.ROW_HSIZE - } - close(inputChan) - //wg.Wait() - - if err := writer.Flush(); err != nil { - return fmt.Errorf("failed final flush: %w", err) - } - if err := tempHandle.Sync(); err != nil { - return fmt.Errorf("failed syncing temp file: %w", err) - } - if err := tempHandle.Close(); err != nil { - return fmt.Errorf("failed closing temp file: %w", err) - } - if err := sec.File.Close(); err != nil { - return fmt.Errorf("failed closing old handle: %w", err) - } - - if err := os.Rename(tempFileName, sec.Path); err != nil { - return fmt.Errorf("failed renaming compacted file: %w", err) - } - - newHandle, err := os.OpenFile(sec.Path, os.O_RDWR, 0644) - if err != nil { - return fmt.Errorf("failed reopening compacted file: %w", err) - } - sec.File = newHandle - - oldPool := sec.FilePool - sec.FilePool = make(chan *os.File, cap(oldPool)) - for range cap(sec.FilePool) { - file, err := os.OpenFile(sec.Path, os.O_RDWR, 0666) - if err != nil { - return fmt.Errorf("failed to refresh file pool: %w", err) - } - sec.FilePool <- file - } - close(oldPool) - for file := range oldPool { - file.Close() - } + }() + return out +} - // Reset stats - stat, _ := os.Stat(sec.Path) - sec.LiveBytes = uint32(stat.Size()) - sec.DeletedRows = 0 - // Note: Could set sec.Active = false and create new section, updating RowLocs in DB, - // but current design reuses same section ID and path - return nil +func (b *JSONTable) GetColumnDefs() []benchtop.ColumnDef { + return b.Columns } -func (b *JSONTable) Compact() error { - var errs *multierror.Error - for secId, sec := range b.Sections { - if float64(sec.DeletedRows)/float64(sec.TotalRows) > MAX_COMPACT_RATIO { - if err := b.CompactSection(secId); err != nil { - errs = multierror.Append(errs, err) - } - } - } - return errs.ErrorOrNil() +func (b *JSONTable) Init(poolSize int) error { + // Storage is already initialized by 
JSONDriver via ZoneManager. + // This remains for interface compatibility but does nothing now. + return nil } -*/ func ConvertJSONPathToArray(path string) ([]any, error) { path = strings.TrimLeft(path, "./") @@ -656,98 +487,3 @@ func ConvertJSONPathToArray(path string) ([]any, error) { return result, nil } - -func (b *JSONTable) GetRows(locs []*benchtop.RowLoc, sectionId uint16) ([]map[string]any, []error) { - results := make([]map[string]any, len(locs)) - errors := make([]error, len(locs)) - sec, exists := b.Sections[sectionId] - if !exists || len(sec.MMap) == 0 { - return nil, []error{fmt.Errorf("sectionId not found in sections: %d", sectionId)} - } - - sec.Lock.RLock() - defer sec.Lock.RUnlock() - var wg sync.WaitGroup - sem := make(chan struct{}, runtime.NumCPU()) // Per-section concurrency - chunkSize := 100 // Adjust based on profiling - for i := 0; i < len(locs); i += chunkSize { - end := i + chunkSize - if end > len(locs) { - end = len(locs) - } - chunk := locs[i:end] - wg.Add(1) - go func(start int, chunk []*benchtop.RowLoc) { - sem <- struct{}{} - defer func() { <-sem; wg.Done() }() - for j, loc := range chunk { - idx := start + j - if loc.Section != sectionId { - errors[idx] = fmt.Errorf("Expected sectionId %d but got %d instead", sectionId, loc.Section) - continue - } - startOffset := loc.Offset + benchtop.ROW_HSIZE - endOffset := startOffset + loc.Size - if endOffset > uint32(len(sec.MMap)) { - errors[idx] = fmt.Errorf("row out of bounds: %d > %d", endOffset, len(sec.MMap)) - continue - } - compressed := sec.MMap[startOffset:endOffset] - decompressed, err := zstd.Decompress(nil, compressed) - if err != nil { - errors[idx] = fmt.Errorf("decompress failed: %w", err) - continue - } - var m RowData - if err := sonic.ConfigFastest.Unmarshal(decompressed, &m); err != nil { - errors[idx] = fmt.Errorf("unmarshal failed: %w", err) - continue - } - results[idx] = m.Data - } - }(i, chunk) - } - wg.Wait() - return results, errors -} - -/*func (b *JSONTable) 
GetRows(locs []*benchtop.RowLoc, sectionId uint16) ([]map[string]any, []error) { - results := make([]map[string]any, len(locs)) - errors := make([]error, len(locs)) - sec, exists := b.Sections[sectionId] - if !exists || len(sec.MMap) == 0 { - return nil, []error{fmt.Errorf("sectionId not found in sections: %d", sectionId)} - } - - sec.Lock.RLock() - defer sec.Lock.RUnlock() - var m RowData - var start, end uint32 = 0, 0 - for i, loc := range locs { - if loc.Section != sectionId { - errors[i] = fmt.Errorf("Expected sectionId %d but got %d instead", sectionId, loc.Section) - continue - } - start = loc.Offset + benchtop.ROW_HSIZE - end = start + loc.Size - if end > uint32(len(sec.MMap)) { - errors[i] = fmt.Errorf("row out of bounds: %d > %d", end, len(sec.MMap)) - continue - } - decompressed, err := zstd.Decompress(nil, sec.MMap[start:end]) - if err != nil { - errors[i] = fmt.Errorf("decompress failed: %w", err) - continue - } - if err := sonic.ConfigFastest.Unmarshal(decompressed, &m); err != nil { - errors[i] = fmt.Errorf("unmarshal failed: %w", err) - continue - } - results[i] = m.Data - } - return results, errors -}*/ - -func (b *JSONTable) GetColumnDefs() []benchtop.ColumnDef { - return b.Columns -} diff --git a/jsontable/tpath/tpath.go b/jsontable/tpath/tpath.go index 6147ea2..ce4457a 100644 --- a/jsontable/tpath/tpath.go +++ b/jsontable/tpath/tpath.go @@ -10,6 +10,11 @@ import ( const CURRENT = "_current" func PathLookup(v map[string]any, path string) any { + // Optimization: if it's a simple top-level field, return it directly. 
+ if !strings.Contains(path, ".") && !strings.Contains(path, "$") { + return v[path] + } + /* Expects that special fields like '_id' and '_label' are added to the map before reaching this function */ @@ -63,5 +68,8 @@ func ToLocalPath(path string) string { if strings.HasPrefix(parts[0], "$") { parts[0] = "$" } - return strings.Join(parts, ".") + local := strings.Join(parts, ".") + // jsonpath expects array indexes as "field[0]" instead of "field.[0]" + local = strings.ReplaceAll(local, ".[", "[") + return local } diff --git a/keys.go b/keys.go index aa116e8..ee8d05c 100644 --- a/keys.go +++ b/keys.go @@ -3,13 +3,13 @@ package benchtop import ( "bytes" "encoding/binary" - "encoding/json" "github.com/bmeg/grip/log" + "github.com/bytedance/sonic" ) const ( - ROW_HSIZE uint32 = 8 // Header size: 8-byte next offset + 4-byte size + ROW_HSIZE uint32 = 8 // Header size: 4-byte next offset + 4-byte size ROW_OFFSET_HSIZE uint32 = 4 // Offset part of header ) @@ -36,49 +36,118 @@ var RFieldPrefix = []byte{'R'} // The '0x1F' invisible character unit seperator not supposed to appear in ASCII text var FieldSep = []byte{0x1F} -// builds a RFieldKey in the format "R 0x1F label 0x1F field 0x1F rowId" -func RFieldKey(label, field, rowID string) []byte { +// builds a RFieldKey in the format "R | TableID | field | rowId" +func RFieldKey(tableID uint16, field, rowID string) []byte { + idBytes := make([]byte, 2) + binary.LittleEndian.PutUint16(idBytes, tableID) return bytes.Join([][]byte{ RFieldPrefix, - []byte(label), + idBytes, []byte(field), []byte(rowID), }, FieldSep) } -func FieldKey(field string, label string, value any, rowID []byte) []byte { +// System Metadata +// key: S +var SystemMetaPrefix = byte('S') + +// MaxTableIDKey stores the global counter for Table IDs +var MaxTableIDKey = []byte{SystemMetaPrefix, 'I'} + +// MaxIDKey stores the global counter for Mapping IDs (uint64) +var MaxIDKey = []byte{SystemMetaPrefix, 'G'} + +// IDMappingPrefix (String -> Uint64) +var 
IDMappingPrefix = byte('I') + +// RIDMappingPrefix (Uint64 -> String) +var RIDMappingPrefix = byte('B') + +func FieldKey(field string, tableID uint16, value any, rowID []byte) []byte { /* creates a full field key for optimizing the beginning of a query */ - valueBytes, err := json.Marshal(value) + valueBytes, err := sonic.ConfigFastest.Marshal(value) if err != nil { log.Infoln("FieldKey Marshal Err: ", err) } + idBytes := make([]byte, 2) + binary.LittleEndian.PutUint16(idBytes, tableID) + // NEW ORDER: F | field | value | tableID | rowID return bytes.Join( [][]byte{ FieldPrefix, // Static prefix - []byte(field), // table field - []byte(label), // label + []byte(field), // field name valueBytes, // JSON-encoded value + idBytes, // table ID rowID, }, FieldSep, ) } -func FieldKeyParse(fieldKey []byte) (field, label string, value any, rowID []byte) { - parts := bytes.Split(fieldKey, FieldSep) - err := json.Unmarshal(parts[3], &value) +func FieldKeyParse(fieldKey []byte) (field string, tableID uint16, value any, rowID []byte) { + // Expected layout: + // F | sep | field | sep | value(json) | sep | tableID(2 bytes) | sep | rowID + // We cannot use bytes.Split here because tableID is raw binary and may contain sep. + if len(fieldKey) < 8 || fieldKey[0] != FieldPrefix[0] || fieldKey[1] != FieldSep[0] { + return "", 0, nil, nil + } + + fieldStart := 2 + fieldEndRel := bytes.IndexByte(fieldKey[fieldStart:], FieldSep[0]) + if fieldEndRel < 0 { + return "", 0, nil, nil + } + fieldEnd := fieldStart + fieldEndRel + + lastSep := bytes.LastIndexByte(fieldKey, FieldSep[0]) + // Need at least 2 tableID bytes and the separator before tableID. 
+ if lastSep < 4 { + return "", 0, nil, nil + } + tableStart := lastSep - 2 + valueEndSep := tableStart - 1 + if valueEndSep <= fieldEnd || fieldKey[valueEndSep] != FieldSep[0] { + return "", 0, nil, nil + } + + valueBytes := fieldKey[fieldEnd+1 : valueEndSep] + err := sonic.ConfigFastest.Unmarshal(valueBytes, &value) if err != nil { log.Infoln("FieldKey Unmarshal Err: ", err) } - return string(parts[1]), string(parts[2]), value, parts[4] + tid := binary.LittleEndian.Uint16(fieldKey[tableStart:lastSep]) + rid := fieldKey[lastSep+1:] + return string(fieldKey[fieldStart:fieldEnd]), tid, value, rid } -func FieldLabelKey(field, label string) []byte { +// FieldValueKey returns a prefix for global seek of a specific field value across all tables +func FieldValueKey(field string, value any) []byte { + valueBytes, err := sonic.ConfigFastest.Marshal(value) + if err != nil { + log.Infoln("FieldValueKey Marshal Err: ", err) + return nil + } + return bytes.Join( + [][]byte{ + FieldPrefix, + []byte(field), + valueBytes, + }, + FieldSep, + ) +} + +func FieldLabelKey(field string, tableID uint16) []byte { + idBytes := make([]byte, 2) + binary.LittleEndian.PutUint16(idBytes, tableID) + // NOTE: This can no longer be used as a simple Prefix for DeletePrefix if value is in the middle. + // But it is still used for individual key construction in some legacy paths. 
return bytes.Join( [][]byte{ FieldPrefix, // Static prefix []byte(field), // table field - []byte(label), // label + idBytes, // table ID (Legacy order compatibility where needed, though primary uses FieldKey) }, FieldSep, ) @@ -132,19 +201,118 @@ Builds a 12 byte row loc encoding 4 bytes for Size */ func EncodeRowLoc(loc *RowLoc) []byte { - var out [12]byte + var out [14]byte binary.LittleEndian.PutUint16(out[0:], loc.TableId) binary.LittleEndian.PutUint16(out[2:], loc.Section) binary.LittleEndian.PutUint32(out[4:], loc.Offset) binary.LittleEndian.PutUint32(out[8:], loc.Size) + binary.LittleEndian.PutUint16(out[12:], loc.Index) return out[:] } func DecodeRowLoc(v []byte) *RowLoc { - return &RowLoc{ + if len(v) < 12 { + return nil + } + + loc := &RowLoc{ TableId: binary.LittleEndian.Uint16(v[0:]), Section: binary.LittleEndian.Uint16(v[2:]), Offset: binary.LittleEndian.Uint32(v[4:]), Size: binary.LittleEndian.Uint32(v[8:]), } + + // Data that is all zeros is considered invalid (not found/legacy) + // Especially if TableId and Section are both 0, it's likely uninitialized. 
+ if loc.TableId == 0 && loc.Section == 0 && loc.Offset == 0 && loc.Size == 0 { + return nil + } + + if len(v) >= 14 { + loc.Index = binary.LittleEndian.Uint16(v[12:]) + } + return loc +} + +// Integrated Helpers for Grids + +// EncodeVertexValue combines label and RowLoc into a single value +func EncodeVertexValue(label string, loc *RowLoc) []byte { + lBytes := []byte(label) + out := make([]byte, len(lBytes)+1+14) + copy(out, lBytes) + out[len(lBytes)] = 0 + if loc != nil { + copy(out[len(lBytes)+1:], EncodeRowLoc(loc)) + } + return out +} + +// DecodeVertexValue splits label and RowLoc from an integrated value +func DecodeVertexValue(v []byte) (string, *RowLoc) { + idx := bytes.IndexByte(v, 0) + if idx < 0 { + return string(v), nil + } + label := string(v[:idx]) + locBytes := v[idx+1:] + if len(locBytes) >= 12 { + return label, DecodeRowLoc(locBytes) + } + return label, nil +} + +// EncodeEdgeValue combines label, RowLoc, and optional inlined JSON into a single value +func EncodeEdgeValue(label string, loc *RowLoc, data map[string]any) []byte { + lBytes := []byte(label) + // Format: [Label]\0[Flags][Payload] + // Flags: 0x01 (HasLoc), 0x02 (HasData) + var flags byte + var payload []byte + if loc != nil { + flags |= 0x01 + payload = append(payload, EncodeRowLoc(loc)...) + } + if data != nil { + flags |= 0x02 + dBytes, _ := sonic.ConfigFastest.Marshal(data) + payload = append(payload, dBytes...) 
+ } + + out := make([]byte, len(lBytes)+1+1+len(payload)) + copy(out, lBytes) + out[len(lBytes)] = 0 + out[len(lBytes)+1] = flags + copy(out[len(lBytes)+2:], payload) + return out +} + +// DecodeEdgeValue splits label, RowLoc, and optional inlined JSON from an integrated edge value +func DecodeEdgeValue(v []byte) (string, *RowLoc, map[string]any) { + idx := bytes.IndexByte(v, 0) + if idx < 0 { + return "", DecodeRowLoc(v), nil + } + label := string(v[:idx]) + if len(v) <= idx+1 { + return label, nil, nil + } + flags := v[idx+1] + payload := v[idx+2:] + + var loc *RowLoc + var data map[string]any + offset := 0 + if flags&0x01 != 0 { + if len(payload) >= 14 { + loc = DecodeRowLoc(payload[:14]) + offset = 14 + } + } + if flags&0x02 != 0 { + if len(payload) > offset { + sonic.ConfigFastest.Unmarshal(payload[offset:], &data) + } + } + return label, loc, data } diff --git a/pebblebulk/bulk.go b/pebblebulk/bulk.go index fd4a8b6..d9256e1 100644 --- a/pebblebulk/bulk.go +++ b/pebblebulk/bulk.go @@ -2,6 +2,7 @@ package pebblebulk import ( "bytes" + "errors" "io" "sync" @@ -10,7 +11,7 @@ import ( ) const ( - maxWriterBuffer = 3 << 30 + maxWriterBuffer = 64 << 20 ) type PebbleBulk struct { @@ -37,24 +38,56 @@ func (pb *PebbleBulk) Set(id []byte, val []byte, opts *pebble.WriteOptions) erro if pb.Lowest == nil || bytes.Compare(id, pb.Lowest) < 0 { pb.Lowest = util.CopyBytes(id) } - err := pb.Batch.Set(id, val, nil) + + if err := pb.Batch.Set(id, val, nil); err != nil { + return err + } + if pb.CurSize > maxWriterBuffer { - pb.Batch.Commit(nil) + if err := pb.Batch.Commit(nil); err != nil { + return err + } pb.Batch.Reset() pb.CurSize = 0 } - return err + return nil } func (pb *PebbleBulk) Get(key []byte) ([]byte, io.Closer, error) { + pb.mu.Lock() + defer pb.mu.Unlock() + if pb.Batch != nil { + val, closer, err := pb.Batch.Get(key) + if err == nil { + return val, closer, nil + } + if !errors.Is(err, pebble.ErrNotFound) { + return nil, nil, err + } + } return pb.Db.Get(key) } 
func (pb *PebbleBulk) Delete(key []byte, opts *pebble.WriteOptions) error { pb.mu.Lock() - err := pb.Db.Delete(key, nil) - pb.mu.Unlock() - return err + defer pb.mu.Unlock() + if pb.Batch == nil { + pb.Batch = pb.Db.NewBatch() + } + + if err := pb.Batch.Delete(key, nil); err != nil { + return err + } + + pb.CurSize += len(key) + if pb.CurSize > maxWriterBuffer { + if err := pb.Batch.Commit(nil); err != nil { + return err + } + pb.Batch.Reset() + pb.CurSize = 0 + } + return nil } func (pb *PebbleBulk) BulkRead(fn func(tx *PebbleBulk) error) error { @@ -62,12 +95,45 @@ func (pb *PebbleBulk) BulkRead(fn func(tx *PebbleBulk) error) error { } func (pb *PebbleBulk) Close() error { + if pb.Batch != nil { + pb.Batch.Commit(nil) + pb.Batch.Close() + } return pb.Db.Close() } func (pb *PebbleBulk) DeletePrefix(prefix []byte) error { - nextPrefix := append(prefix, 0xFF) - return pb.Db.DeleteRange(prefix, nextPrefix, nil) + // Standard way to get range end for prefix deletion in Pebble/LevelDB + var limit []byte + for i := len(prefix) - 1; i >= 0; i-- { + if prefix[i] < 0xff { + limit = make([]byte, i+1) + copy(limit, prefix[:i+1]) + limit[i]++ + break + } + } + + pb.mu.Lock() + defer pb.mu.Unlock() + if pb.Batch == nil { + pb.Batch = pb.Db.NewBatch() + } + + // DeleteRange is [start, end) exclusive. limit is the first key that doesn't start with prefix. 
+ if err := pb.Batch.DeleteRange(prefix, limit, nil); err != nil { + return err + } + + pb.CurSize += len(prefix) + len(limit) + if pb.CurSize > maxWriterBuffer { + if err := pb.Batch.Commit(nil); err != nil { + return err + } + pb.Batch.Reset() + pb.CurSize = 0 + } + return nil } func (pb *PebbleBulk) DeleteRange(start, end []byte, opts *pebble.WriteOptions) error { @@ -89,6 +155,7 @@ func (pb *PebbleBulk) DeleteRange(start, end []byte, opts *pebble.WriteOptions) return err } + pb.CurSize += len(start) + len(end) if pb.CurSize > maxWriterBuffer { if err := pb.Batch.Commit(nil); err != nil { return err diff --git a/pebblebulk/kv.go b/pebblebulk/kv.go index 3193dfc..4e92cec 100644 --- a/pebblebulk/kv.go +++ b/pebblebulk/kv.go @@ -2,9 +2,9 @@ package pebblebulk import ( "io" + "runtime" "sync" - "github.com/bmeg/grip/log" "github.com/cockroachdb/pebble" ) @@ -25,7 +25,27 @@ type PebbleKV struct { } func NewPebbleKV(path string) (*PebbleKV, error) { - db, err := pebble.Open(path, &pebble.Options{}) + // 512 MB cache + cache := pebble.NewCache(512 << 20) + opts := &pebble.Options{ + Cache: cache, + MemTableSize: 256 << 20, + // Keep ingest from hitting aggressive write stalls under bulk load. + L0CompactionThreshold: 8, + L0StopWritesThreshold: 128, + LBaseMaxBytes: 512 << 20, + MaxConcurrentCompactions: func() int { + n := runtime.GOMAXPROCS(0) / 2 + if n < 1 { + return 1 + } + if n > 8 { + return 8 + } + return n + }, + } + db, err := pebble.Open(path, opts) if err != nil { return nil, err } @@ -42,19 +62,30 @@ func (pdb *PebbleKV) Set(id []byte, val []byte, opts *pebble.WriteOptions) error } func (pdb *PebbleKV) BulkWrite(u func(tx *PebbleBulk) error) error { - batch := pdb.Db.NewBatch() + batch := pdb.Db.NewIndexedBatch() ptx := &PebbleBulk{pdb.Db, batch, nil, nil, 0, sync.Mutex{}, 0} + err := u(ptx) - batch.Commit(nil) + if err != nil { + batch.Close() + return err + } + // Only commit if there is uncommitted data remaining in the batch. 
+ // PebbleBulk.Set() does intermediate commit+reset when CurSize exceeds + // the threshold, so the batch may already be empty. + if ptx.CurSize > 0 { + // log.Printf("[BulkWrite] final batch.Commit curSize=%d totalInserts=%d", ptx.CurSize, ptx.totalInserts) + if err := batch.Commit(nil); err != nil { + batch.Close() + return err + } + // log.Printf("[BulkWrite] final batch.Commit DONE") + } else { + // log.Printf("[BulkWrite] skipping final commit, batch already flushed (totalInserts=%d)", ptx.totalInserts) + } batch.Close() - pdb.InsertCount += ptx.totalInserts - if pdb.InsertCount > pdb.CompactLimit { - log.Debugf("Running pebble compact %d > %d", pdb.InsertCount, pdb.CompactLimit) - pdb.Db.Compact([]byte{0x00}, []byte{0xFF}, true) - pdb.InsertCount = 0 - } - return err + return nil } func (pb *PebbleKV) View(u func(tx *PebbleIterator) error) error { diff --git a/query/condition.go b/query/condition.go new file mode 100644 index 0000000..ab51100 --- /dev/null +++ b/query/condition.go @@ -0,0 +1,27 @@ +package query + +// Condition is benchtop's internal filter operator enum. +// Values are kept aligned with GripQL condition numeric values for compatibility. 
+type Condition int32 + +const ( + UNKNOWN_CONDITION Condition = 0 + EQ Condition = 1 + NEQ Condition = 2 + GT Condition = 3 + GTE Condition = 4 + LT Condition = 5 + LTE Condition = 6 + INSIDE Condition = 7 + OUTSIDE Condition = 8 + BETWEEN Condition = 9 + WITHIN Condition = 10 + WITHOUT Condition = 11 + CONTAINS Condition = 12 +) + +type FieldFilter struct { + Field string + Operator Condition + Value any +} diff --git a/test/benchmark/performance_test.go b/test/benchmark/performance_test.go new file mode 100644 index 0000000..8b7851e --- /dev/null +++ b/test/benchmark/performance_test.go @@ -0,0 +1,190 @@ +package test + +import ( + "context" + "fmt" + "os" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/jsontable" + jTable "github.com/bmeg/benchtop/jsontable/table" + "github.com/bmeg/benchtop/pebblebulk" + "github.com/bmeg/benchtop/test/fixtures" + "github.com/bmeg/benchtop/util" + "github.com/cockroachdb/pebble" +) + +const ( + NumRows = 50000 + ValueSize = 1024 // 1KB + BatchSize = 1000 +) + +func setupBenchmarkDB(b *testing.B) (*jsontable.JSONDriver, *jTable.JSONTable, string) { + dbPath := "bench_db_" + util.RandomString(5) + _ = os.RemoveAll(dbPath) // Cleanup potential old run + + driver, err := jsontable.NewJSONDriver(dbPath) + if err != nil { + b.Fatal(err) + } + jDriver, _ := driver.(*jsontable.JSONDriver) + + columns := []benchtop.ColumnDef{{Key: "data"}} + tableName := "bench_table" + + tStore, err := jDriver.New(tableName, columns) + if err != nil { + b.Fatal(err) + } + table, _ := tStore.(*jTable.JSONTable) + + // Populate Data + rows := make([]benchtop.Row, NumRows) + for i := 0; i < NumRows; i++ { + key := []byte(fmt.Sprintf("key_%d", i)) + val := fixtures.GenerateRandomBytes(ValueSize) + rows[i] = benchtop.Row{ + Id: key, + TableID: table.TableId, + Data: map[string]any{"data": val, "id": string(key)}, + } + } + + // Write in batches + for i := 0; i < NumRows; i += 1000 { + end := i + 1000 + if end > NumRows { + end = 
NumRows + } + batch := rows[i:end] + + // This simulates grip's batch load (conceptually) + // But we just use direct table.AddRows which is what grip does under the hood via driver? + // No, grip uses driver.BulkLoad or insert. + // Let's use table.AddRows to be direct. + locs, err := table.AddRows(batch) + if err != nil { + b.Fatal(err) + } + + // Register in Driver (LocCache) mimicking grip's behavior + pk := &pebblebulk.PebbleBulk{Batch: jDriver.Pkv.Db.NewBatch()} + for j, loc := range locs { + r := batch[j] + jDriver.LocCache.Set(string(r.Id), loc) + // Add entry info (skipped for pure read benchmark correctness, assuming we use LocCache) + } + pk.Batch.Commit(pebble.NoSync) + pk.Batch.Close() + } + + return jDriver, table, dbPath +} + +func BenchmarkGetRowSequential(b *testing.B) { + driver, table, path := setupBenchmarkDB(b) + defer func() { + driver.Close() + os.RemoveAll(path) + }() + + b.ResetTimer() + + // We will query keys sequentially + for i := 0; i < b.N; i++ { + idx := i % NumRows + key := fmt.Sprintf("key_%d", idx) + + // 1. LocLookup + loc, err := driver.LocCache.Get(context.Background(), key) + if err != nil { + b.Fatal(err) + } + + // 2. 
Fetch + _, err = table.GetRow(loc) + if err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkGetRowsBatch(b *testing.B) { + driver, table, path := setupBenchmarkDB(b) + defer func() { + driver.Close() + os.RemoveAll(path) + }() + + // Prepare batches of RowLocs + // Sequential batches (simulates scanning) + // 1000 items per batch + + var allLocs []*benchtop.RowLoc + for i := 0; i < NumRows; i++ { + key := fmt.Sprintf("key_%d", i) + loc, _ := driver.LocCache.Get(context.Background(), key) + allLocs = append(allLocs, loc) + } + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + start := (i * BatchSize) % NumRows + end := start + BatchSize + if end > NumRows { + end = NumRows + // Wrap around handling simplified: just cap and continue + } + batchLocs := allLocs[start:end] + + _, errs := table.GetRows(batchLocs) + for _, e := range errs { + if e != nil { + b.Fatal(e) + } + } + } +} + +// Mimics grip's random access pattern if IDs are random +func BenchmarkGetRowsRandomBatch(b *testing.B) { + // Setup with random order? 
+ // Actually, shuffling the locs array simulates random access + driver, table, path := setupBenchmarkDB(b) + defer func() { + driver.Close() + os.RemoveAll(path) + }() + + var allLocs []*benchtop.RowLoc + for i := 0; i < NumRows; i++ { + key := fmt.Sprintf("key_%d", i) + loc, _ := driver.LocCache.Get(context.Background(), key) + allLocs = append(allLocs, loc) + } + + // Shuffle + // (Skipping shuffle implementation for brevity, relying on pseudo-random access via stride) + // Access with stride + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + // Create a "random" batch by taking every 7th element wrapping around + batchLocs := make([]*benchtop.RowLoc, BatchSize) + for j := 0; j < BatchSize; j++ { + idx := ((i * BatchSize) + (j * 17)) % NumRows + batchLocs[j] = allLocs[idx] + } + + _, errs := table.GetRows(batchLocs) + for _, e := range errs { + if e != nil { + b.Fatal(e) + } + } + } +} diff --git a/test/benchmark/repro_test.go b/test/benchmark/repro_test.go new file mode 100644 index 0000000..a6ff78a --- /dev/null +++ b/test/benchmark/repro_test.go @@ -0,0 +1,136 @@ +package test + +import ( + "fmt" + "os" + "sync" + "testing" + + "github.com/bmeg/benchtop" + "github.com/bmeg/benchtop/jsontable" + "github.com/bmeg/benchtop/test/fixtures" + "github.com/bmeg/benchtop/util" +) + +// Simplified structures to mock gdbi/grip without importing everything +type MockVertex struct { + ID string + Label string + Data map[string]any +} +type MockGraphElement struct { + Vertex *MockVertex + Graph string +} + +func BenchmarkGripFullPipeline(b *testing.B) { + dbPath := "bench_repro_full_db_" + util.RandomString(5) + _ = os.RemoveAll(dbPath) + defer os.RemoveAll(dbPath) + + driver, err := jsontable.NewJSONDriver(dbPath) + if err != nil { + b.Fatal(err) + } + // driver_test.go (arrow) + // tid, _ := drv2.LookupTableID("e_knows") // This line is commented out as drv2 is not defined + // t2, err = drv2.Get(tid) // This line is commented out as drv2 is not defined + defer 
driver.Close() + jDriver, _ := driver.(*jsontable.JSONDriver) + + columns := []benchtop.ColumnDef{{Key: "data"}} + tableName := "v_test_label" + jDriver.New(tableName, columns) + + val := fixtures.GenerateRandomBytes(1024) + + b.ResetTimer() + + // scale_test.go + // ch <- benchtop.Row{Id: []byte(k), TableID: tid, Data: v} + // Need to get tid first. + // But channel is consumed by BulkLoad. + // BulkLoad call is AFTER loop? No, concurrently? + // The snippet above shows BulkLoad called with channel. + // We need tableID inside loop? + // "go func() { ... ch <- Row... }" + // Pass tid to goroutine. + // 1. Client Stream (Input) + clientStream := make(chan *MockGraphElement, 100) + + // 2. Server BulkAdd (reads clientStream, validates, pushes to elementStream) + elementStream := make(chan *MockGraphElement, 100) + var serverWG sync.WaitGroup + serverWG.Add(1) + go func() { + defer serverWG.Done() + defer close(elementStream) + for elem := range clientStream { + // Simulate Validation logic + if elem.Vertex.ID == "" { + continue + } + elementStream <- elem + } + }() + + // 3. 
Graph BulkAdd (reads elementStream, splits to insert/index streams) + insertStream := make(chan *MockGraphElement, 100) + indexStream := make(chan *benchtop.Row, 100) + var graphWG sync.WaitGroup + graphWG.Add(2) // Two consumers for the split streams + + // Graph Splitter + go func() { + defer close(insertStream) + defer close(indexStream) + for elem := range elementStream { + insertStream <- elem + if elem.Vertex != nil { + tName := "v_" + elem.Vertex.Label + tid, _ := jDriver.LookupTableID(tName) + row := benchtop.Row{ + Id: []byte(elem.Vertex.ID), + TableID: tid, + Data: elem.Vertex.Data, + } + indexStream <- &row + } + } + }() + + // Consumer 1: InsertVertex (Simulated) + go func() { + defer graphWG.Done() + for range insertStream { + } + }() + + // Consumer 2: Index (BulkLoad) + tid, _ := jDriver.LookupTableID(tableName) + go func() { + defer graphWG.Done() + // scale_test.go + // tid, _ := jsonDriver.LookupTableID(Jsonname) // This line is commented out as jsonDriver and Jsonname are not defined + // err = jsonDriver.BulkLoad(tid, ch) // This line is commented out as jsonDriver and ch are not defined + _ = jDriver.BulkLoad(tid, indexStream) + }() + + // Producer + for i := 0; i < b.N; i++ { + key := fmt.Sprintf("v%010d", i) + elem := &MockGraphElement{ + Vertex: &MockVertex{ + ID: key, + Label: "test_label", + Data: map[string]any{"data": val, "id": key}, + }, + Graph: "test-graph", + } + clientStream <- elem + } + close(clientStream) + + serverWG.Wait() + graphWG.Wait() +} diff --git a/test/benchmark/scale_test.go b/test/benchmark/scale_test.go index 781ad8b..6f71525 100644 --- a/test/benchmark/scale_test.go +++ b/test/benchmark/scale_test.go @@ -3,12 +3,12 @@ package test import ( "fmt" "os" + "sync" "testing" "github.com/bmeg/benchtop" "github.com/bmeg/benchtop/jsontable" jTable "github.com/bmeg/benchtop/jsontable/table" - "github.com/bmeg/benchtop/pebblebulk" "github.com/bmeg/benchtop/test/fixtures" "github.com/bmeg/benchtop/util" 
"github.com/bmeg/grip/log" @@ -57,24 +57,33 @@ func BenchmarkScaleWriteJson(b *testing.B) { b.ResetTimer() - jsonDriver.Pkv.BulkWrite(func(tx *pebblebulk.PebbleBulk) error { - for b.Loop() { - inputChan := make(chan *benchtop.Row, 100) - go func() { - for j := range scalenumKeys { - key := []byte(fmt.Sprintf("key_%d", j)) - value := fixtures.GenerateRandomBytes(scalevalueSize) - inputChan <- &benchtop.Row{Id: key, Data: map[string]any{"data": value}} - } - close(inputChan) - }() - err = jsonDriver.BulkLoad(inputChan, tx) + // Start producer + b.ResetTimer() + + for i := 0; i < b.N; i++ { + var wg sync.WaitGroup + ch := make(chan *benchtop.Row, 100) + + // Start consumer + tid, _ := jsonDriver.LookupTableID(Jsonname) + wg.Add(1) + go func() { + defer wg.Done() + err = jsonDriver.BulkLoad(tid, ch) if err != nil { - b.Fatal(err) + b.Error(err) } + }() + + k := fmt.Sprintf("%016d", i) + v := map[string]interface{}{} + for j := 0; j < 10; j++ { + v[fmt.Sprintf("key_%d", j)] = fmt.Sprintf("value_%d", j) } - return nil - }) + ch <- &benchtop.Row{Id: []byte(k), TableID: tid, Data: v} + close(ch) + wg.Wait() + } } func BenchmarkRandomReadJson(b *testing.B) { @@ -91,7 +100,8 @@ func BenchmarkRandomReadJson(b *testing.B) { } } - ot, err := jsonDriver.Get(Jsonname) + tid, _ := jsonDriver.LookupTableID(Jsonname) + ot, err := jsonDriver.Get(tid) if err != nil { b.Log(err) } @@ -117,6 +127,11 @@ func BenchmarkRandomReadJson(b *testing.B) { loc := benchtop.DecodeRowLoc(val) closer.Close() + // driver_test.go + // The following lines are commented out because 'drv2' is not defined in this scope, + // and 't2' is not declared. This snippet appears to be from a different test file. 
+ // tid, _ := drv2.LookupTableID("e_knows") + // t2, err = drv2.Get(tid) rOw, err := jT.GetRow(loc) if err != nil { b.Fatal(err) @@ -140,7 +155,8 @@ func BenchmarkRandomKeysJson(b *testing.B) { b.Fatal("Failed to assert type *benchtop.JSONDriver") } } - ot, err := jsonDriver.Get(Jsonname) + tid_get, _ := jsonDriver.LookupTableID(Jsonname) + ot, err := jsonDriver.Get(tid_get) if err != nil { b.Log(err) } diff --git a/test/integration/basic_test.go b/test/integration/basic_test.go index 196293b..74005f6 100644 --- a/test/integration/basic_test.go +++ b/test/integration/basic_test.go @@ -52,7 +52,11 @@ func TestOpenClose(t *testing.T) { if err != nil { t.Error(err) } - ot, err := or.Get("table_1") + tid, err := or.LookupTableID("table_1") + if err != nil { + t.Error(err) + } + ot, err := or.Get(tid) if err != nil { t.Error(err) } @@ -80,18 +84,23 @@ func TestInsert(t *testing.T) { } jT, _ := ts.(*jTable.JSONTable) + // jDR, _ := dr.(*jsontable.JSONDriver) // Unused? + jDR, _ := dr.(*jsontable.JSONDriver) for k, r := range data { - loc, err := jT.AddRow(benchtop.Row{Id: []byte(k), TableName: "table_1", Data: r}) + loc, err := jT.AddRow(benchtop.Row{Id: []byte(k), TableID: jT.TableId, Data: r}) if err != nil { t.Error(err) } - err = jDR.AddTableEntryInfo(nil, []byte(k), loc) + + pKey := benchtop.NewPosKey(jT.TableId, []byte(k)) + err = jDR.Pkv.Db.Set(pKey, benchtop.EncodeRowLoc(loc), pebble.Sync) if err != nil { t.Error(err) } } + // for k := range data { ... 
} for k := range data { pKey := benchtop.NewPosKey(jT.TableId, []byte(k)) @@ -101,9 +110,12 @@ func TestInsert(t *testing.T) { log.Errorf("Err on dr.Pb.Get for key %s in CacheLoader: %v", k, err) } log.Errorln("ERR: ", err) + t.Fatal(err) } loc := benchtop.DecodeRowLoc(val) - closer.Close() + if closer != nil { + closer.Close() + } post, err := ts.GetRow(loc) if err != nil { @@ -118,7 +130,9 @@ func TestInsert(t *testing.T) { } } } - keyList, err := dr.ListTableKeys(jT.TableId) + // ListTableKeys is on JSONDriver but not in interface? + // It IS in JSONDriver struct methods. + keyList, err := jDR.ListTableKeys(jT.TableId) if err != nil { t.Error(err) } @@ -153,7 +167,8 @@ func TestDeleteTable(t *testing.T) { t.Error(err) } - err = dr.Delete("table_1") + tid, _ := dr.LookupTableID("table_1") + err = dr.Delete(tid) if err != nil { t.Error(err) } @@ -165,7 +180,14 @@ func TestDeleteTable(t *testing.T) { t.Error(err) } - _, err = or.Get("table_1") + tid2, err := or.LookupTableID("table_1") + if err == nil { + _, err = or.Get(tid2) + // If Lookup succeeded, Get might succeed. + // But Delete should remove it from mapping? + // If Delete works, LookupTableID might fail or return error. + // Let's check Lookup error. + } if err == nil { t.Errorf("expected table to be gone. 
table still exists") } diff --git a/test/integration/compact_test.go b/test/integration/compact_test.go index 8be7ffc..640b0a9 100644 --- a/test/integration/compact_test.go +++ b/test/integration/compact_test.go @@ -34,7 +34,7 @@ func TestCompact(t *testing.T) { jT, _ := ts.(*jTable.JSONTable) for k, r := range fixtures.ScanData { - loc, err := jT.AddRow(benchtop.Row{Id: []byte(k), TableName: "table_1", Data: r}) + loc, err := jT.AddRow(benchtop.Row{Id: []byte(k), TableID: jT.TableId, Data: r}) if err != nil { t.Fatal(err) } diff --git a/test/integration/keys_test.go b/test/integration/keys_test.go index a010430..5eb9405 100644 --- a/test/integration/keys_test.go +++ b/test/integration/keys_test.go @@ -52,3 +52,41 @@ func TestPosValueParse(t *testing.T) { t.Errorf("%d != %d", size, loc.Size) } } + +func TestFieldKeyParse_TableIDContainsFieldSepLowByte(t *testing.T) { + tableID := uint16(31) // 0x1F in low byte + key := benchtop.FieldKey("name", tableID, "value", []byte("row-1")) + + field, parsedTableID, value, rowID := benchtop.FieldKeyParse(key) + if field != "name" { + t.Fatalf("field mismatch: got %q", field) + } + if parsedTableID != tableID { + t.Fatalf("table id mismatch: got %d expected %d", parsedTableID, tableID) + } + if value != "value" { + t.Fatalf("value mismatch: got %#v", value) + } + if string(rowID) != "row-1" { + t.Fatalf("row id mismatch: got %q", string(rowID)) + } +} + +func TestFieldKeyParse_TableIDContainsFieldSepHighByte(t *testing.T) { + tableID := uint16(7936) // 0x1F00 in little-endian high byte + key := benchtop.FieldKey("name", tableID, "value", []byte("row-2")) + + field, parsedTableID, value, rowID := benchtop.FieldKeyParse(key) + if field != "name" { + t.Fatalf("field mismatch: got %q", field) + } + if parsedTableID != tableID { + t.Fatalf("table id mismatch: got %d expected %d", parsedTableID, tableID) + } + if value != "value" { + t.Fatalf("value mismatch: got %#v", value) + } + if string(rowID) != "row-2" { + t.Fatalf("row id 
mismatch: got %q", string(rowID)) + } +} diff --git a/test/integration/scan_test.go b/test/integration/scan_test.go index 88fc6bb..3cefa25 100644 --- a/test/integration/scan_test.go +++ b/test/integration/scan_test.go @@ -10,6 +10,8 @@ import ( "github.com/bmeg/benchtop/jsontable" "github.com/bmeg/benchtop/jsontable/table" jTable "github.com/bmeg/benchtop/jsontable/table" + "github.com/bmeg/benchtop/query" + "github.com/bmeg/benchtop/util" "github.com/bytedance/sonic" "github.com/bytedance/sonic/ast" "google.golang.org/protobuf/types/known/structpb" @@ -17,8 +19,6 @@ import ( "github.com/bmeg/benchtop/test/fixtures" "github.com/bmeg/grip/gripql" "github.com/bmeg/grip/log" - - "github.com/bmeg/benchtop/util" ) type FieldFilters []filters.FieldFilter @@ -71,7 +71,7 @@ func localMatchesHasExpression(row []byte, stmt *gripql.HasExpression, tableName return filters.ApplyFilterCondition( lookupVal, &filters.FieldFilter{ - Operator: cond.Condition, + Operator: query.Condition(cond.Condition), Field: cond.Key, Value: cond.Value.AsInterface(), }, @@ -121,7 +121,7 @@ func (ff FieldFilters) Matches(row []byte, tableStr string) bool { condition := &gripql.HasExpression_Condition{ Condition: &gripql.HasCondition{ Key: filter.Field, - Condition: filter.Operator, + Condition: gripql.Condition(filter.Operator), Value: valuePB, }, } @@ -195,7 +195,7 @@ func TestScan(t *testing.T) { jDr.LocCache.Set(k, loc) } - filters1 := FieldFilters{filters.FieldFilter{Field: "name", Operator: gripql.Condition_EQ, Value: "alice"}} + filters1 := FieldFilters{filters.FieldFilter{Field: "name", Operator: query.Condition(gripql.Condition_EQ), Value: "alice"}} lenscanChan1 := 0 for elem := range jT.ScanDoc(filters1) { lenscanChan1++ @@ -214,7 +214,7 @@ func TestScan(t *testing.T) { // Second test case: "field1" == 0.2 for elem := range jT.ScanDoc( FieldFilters{filters.FieldFilter{ - Field: "field1", Operator: gripql.Condition_EQ, Value: 0.2}, + Field: "field1", Operator: 
query.Condition(gripql.Condition_EQ), Value: 0.2}, }, ) { t.Log("scanChantwo: ", elem) @@ -229,7 +229,7 @@ func TestScan(t *testing.T) { } // Third test case: "field1" > 0.2 - filters3 := FieldFilters{filters.FieldFilter{Field: "field1", Operator: gripql.Condition_GT, Value: 0.2}} + filters3 := FieldFilters{filters.FieldFilter{Field: "field1", Operator: query.Condition(gripql.Condition_GT), Value: 0.2}} scanChan3 := jT.ScanDoc(filters3) scanChanLen3 := 0 @@ -260,7 +260,7 @@ func TestScan(t *testing.T) { FieldFilters{ filters.FieldFilter{ Field: "name", - Operator: gripql.Condition_EQ, + Operator: query.Condition(gripql.Condition_EQ), Value: "mnbv", }, }, diff --git a/util/util.go b/util/util.go index 3c970d3..1a89016 100644 --- a/util/util.go +++ b/util/util.go @@ -1,9 +1,9 @@ package util import ( + "fmt" "os" - "strconv" - "strings" + "reflect" "time" "golang.org/x/exp/rand" @@ -11,11 +11,11 @@ import ( // RandomString generates a random string of length n. func RandomString(n int) string { - rand.NewSource(uint64(time.Now().UnixNano())) + r := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) var letter = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") b := make([]rune, n) for i := range b { - b[i] = letter[rand.Intn(len(letter))] + b[i] = letter[r.Intn(len(letter))] } return string(b) } @@ -46,10 +46,17 @@ func CopyBytes(in []byte) []byte { } func PadToSixDigits(number int) string { - numStr := strconv.Itoa(number) - numZeros := 6 - len(numStr) - if numZeros < 0 { - return numStr + return fmt.Sprintf("%06d", number) +} + +func SliceToAny(v any) []any { + rv := reflect.ValueOf(v) + if rv.Kind() != reflect.Slice && rv.Kind() != reflect.Array { + return []any{v} } - return strings.Repeat("0", numZeros) + numStr + out := make([]any, rv.Len()) + for i := 0; i < rv.Len(); i++ { + out[i] = rv.Index(i).Interface() + } + return out }