diff --git a/benchmark/TTFX_REPORT.md b/benchmark/TTFX_REPORT.md new file mode 100644 index 00000000..b9bc6b8c --- /dev/null +++ b/benchmark/TTFX_REPORT.md @@ -0,0 +1,68 @@ +# TTFX investigation: operator-product specialization + +This note records a time-to-first-execution (TTFX) investigation of `SecondQuantizedAlgebra`, motivated by the first-call latency of `QuantumCumulants.meanfield` + `complete`. It documents where the compile-time cost goes, the change made in this branch, and the larger architectural option for future work. + +## Methodology + +TTFX is reported as **wall-clock time of the first call in a fresh `julia --startup-file=no` process, median of 5 runs**, after precompilation is already done. This isolates first-call latency (inference, native codegen, execution) from precompilation. The `@snoop_inference` instrumented time is used only for *ranking* MethodInstances and packages; its absolute numbers carry roughly ±1.5 s of noise on this machine and are not reliable for A/B magnitude. Measurements were taken on Julia 1.12.6 using the Jaynes-Cummings model (one Fock mode, one two-level atom) as the workload. + +## Where the first-call time goes + +Breakdown of first-call inference for `meanfield` + `complete` (JC model), grouped by the package owning each inferred method: + +| Bucket | Share | What it is | +| :-- | --: | :-- | +| Base / stdlib | ~46% | `Dict` hashing/rehash, array growth, `Broadcast`, specialized on symbolic key/value types | +| SymbolicUtils | ~28% | v4 `Add`/`Mul` worker buffers specialized on many argument-tuple types | +| SecondQuantizedAlgebra | ~16-19% | operator algebra (`*` and the canonicalization passes) | +| Moshi | ~2.5% | the sum-type machinery the SymbolicUtils v4 backend uses | + +There is no single hotspot. About three quarters of the cost is the symbolic coefficient/expression stack (Base plus SymbolicUtils), which is upstream of this package and independent of how operators are represented. The remaining ~16-19% is this package's own operator algebra, which is the part addressable here. + +## Operator specialization inventory + +There are 7 concrete `QSym` types: `Destroy`, `Create`, `Transition`, `Pauli`, `Spin`, `Position`, `Momentum`. The per-pair methods grow as O(n^2): + +| Function | Methods | Growth | +| :-- | --: | :-- | +| `_site_compare` | 12 | per operator pair | +| `_can_commute` | 12 | per operator pair | +| `_commute_pair` | 5 | per operator pair | +| `_reduce_pair` | 3 | per operator pair | +| `adjoint`, `_ground_state_expand`, `_type_order` | ~9 | per operator type | + +Operator products flow through `Vector{QSym}` (in `QTerm.ops` and the canonicalization pipeline) and `QTermDict = Dict{QTerm, CNum}`. Because the element type is the abstract `QSym`, every `Base` container method specializes on the abstract type and element access dispatches dynamically. + +## Change in this branch + +Two source-level changes, both verified to preserve results: + +1. **Function-barrier return-type assertions** in the canonicalization passes (`src/algebra/passes.jl`, `src/algebra/pipelines.jl`, `src/expressions/index.jl`). The leaf functions `_site_compare`, `_can_commute`, `_commute_pair`, `_reduce_pair`, and the `.index` field access each have a concrete return type, but inference widens them to `Any` because they are called on elements of the abstract `Vector{QSym}`. Pinning the true return type at each call site (`::SiteCmp`, `::Bool`, `::Tuple{...}`, `::Index`) stops `Any` from cascading into the downstream code. This does not remove the dynamic dispatch into those functions (that is inherent to the heterogeneous product), but it removes the secondary dispatch cascade: `@report_opt` on a product goes from 15 to 5 runtime-dispatch sites. + +2. **`@nospecialize` / `Base.@nospecializeinfer`** on the three operator-product methods `*(::QSym, ::QSym)`, `*(::QAdd, ::QSym)`, `*(::QSym, ::QAdd)`. These are single methods that Julia specialized once per concrete operator pair (`*(::Destroy, ::QAdd)`, `*(::Create, ::Create)`, and so on), even though their bodies only box the operand into a `Vector{QSym}` and run the pipeline, so the concrete operator type is irrelevant to the body. Collapsing those specializations cuts native codegen for the product entry points. + +### Measured effect + +Wall-clock first-call (`meanfield` + `complete`, JC model), median of 5 fresh runs: + +| State | build | meanfield | complete | total first-exec | +| :-- | --: | --: | --: | --: | +| baseline | 2.49 s | 13.10 s | 1.01 s | **17.13 s** | +| this branch | 1.92 s | 12.29 s | 0.70 s | **15.40 s** | + +A reproducible **~1.7 s (~10%)** reduction in first-call latency (run-to-run spread ~0.4 s). Operator-product results and runtime are unchanged (for example `a*a'` still gives `1 + a'*a`, product runtime ~1.5 us). Note that `@nospecialize` reduces codegen rather than inference, so this win shows in wall-clock first-call but not in `@snoop_inference` totals. + +## Architectural option (future work) + +The largest remaining lever on this package's share is to collapse the 7 `QSym` subtypes into one concrete sum type, for example via Moshi's `@data` (the same mechanism SymbolicUtils v4 uses for `BasicSymbolicImpl`). This would: + +- make `Vector{QSym}` a concrete-element vector, so `Base` container methods specialize once and element access stops being dynamic dispatch; and +- collapse the ~32 per-pair O(n^2) methods into a handful of functions that branch on a tag (via `@match`), turning `_site_compare(::QSym, ...)` runtime dispatch into static calls. + +This targets the SecondQuantizedAlgebra share fully plus a slice of the Base share. It does not touch the SymbolicUtils coefficient stack, which is the remaining floor. It is a substantial refactor (rewriting the 7 operator types and the per-pair methods as tagged variants and `@match` branches, with a closed-world variant set), and the per-pair methods encode real algebraic correctness (Pauli epsilon, Transition composition, Spin commutators), so each branch must be preserved and guarded by the test suite. A reference datapoint for the magnitude: a 100-variant stress test compiles in ~6.7 s as a sum type versus ~56 s as a `Union`. + +## Summary + +- The dominant first-call cost is the upstream symbolic stack (~74%), not the operator algebra. +- This branch removes the operator-product codegen explosion for a measured ~10% first-call reduction, with no change to results or runtime. +- A sum-type collapse of the operator hierarchy is the principled next step for this package's remaining share, scoped as its own project. diff --git a/src/algebra/passes.jl b/src/algebra/passes.jl index ef03bc2f..a0e4c720 100644 --- a/src/algebra/passes.jl +++ b/src/algebra/passes.jl @@ -17,7 +17,9 @@ function _partial_sort!(ops::Vector{QSym}, ne::Vector{NonEqualPair}) for i in 2:n j = i while j > 1 - cmp = _site_compare(ops[j - 1], ops[j], ne) + # ops is Vector{QSym} (abstract elt), so leaf-pass dispatch is dynamic; + # pin the concrete return type so it cannot widen to Any and cascade. + cmp = _site_compare(ops[j - 1], ops[j], ne)::SiteCmp cmp === Greater || break ops[j - 1], ops[j] = ops[j], ops[j - 1] j -= 1 @@ -55,7 +57,7 @@ same-site pair `(a, b)` for which `_reduce_pair(a, b)` returns non-`nothing`. i = 1 while i < length(ops) a, b = ops[i], ops[i + 1] - kind, new_op, factor = _reduce_pair(a, b) + kind, new_op, factor = _reduce_pair(a, b)::Tuple{ReduceKind, QSym, CNum} if kind === NoReduction i += 1 else @@ -96,9 +98,10 @@ function _commute_ops_at(sink::F, ops::Vector{QSym}, c::CNum, start::Int) where i = start while i < length(ops) a, b = ops[i], ops[i + 1] - cmp = _site_compare(a, b, _EMPTY_NE) - if cmp === Equal && !_can_commute(a, b) - sw_b, sw_a, residual_coeff, residual_ops = _commute_pair(a, b) + cmp = _site_compare(a, b, _EMPTY_NE)::SiteCmp + if cmp === Equal && !(_can_commute(a, b)::Bool) + sw_b, sw_a, residual_coeff, residual_ops = + _commute_pair(a, b)::Tuple{QSym, QSym, CNum, Vector{QSym}} # Swap branch: stepping back to i-1 because ops[i-1] may now form # a new non-canonical pair with sw_b. swapped = copy(ops) diff --git a/src/algebra/pipelines.jl b/src/algebra/pipelines.jl index 72276d78..c3fff3eb 100644 --- a/src/algebra/pipelines.jl +++ b/src/algebra/pipelines.jl @@ -74,7 +74,7 @@ end function _depends_on_index_ops(c::CNum, ops::Vector{QSym}, idx::Index) for op in ops - op.index == idx && return true + op.index::Index == idx && return true end return _depends_on_index_term(c, ops, idx) end @@ -129,7 +129,7 @@ function _accumulate_with_diag!( _canonicalize!(out, copy(ops), c, augmented_ne) for (sum_idx, ext_idx) in diag_pairs - sub_ops = QSym[change_index(o, sum_idx, ext_idx) for o in ops] + sub_ops = QSym[change_index(o, sum_idx, ext_idx)::QSym for o in ops] sub_c = change_index(c, sum_idx, ext_idx) sub_ne = _drop_ne_with(ne, sum_idx) # Sibling diagonal slices for the same sum must be mutually exclusive; @@ -178,7 +178,7 @@ function _emit_scaled_by_scope!( return nothing end -function Base.:*(a::QSym, b::QSym) +Base.@nospecializeinfer function Base.:*(@nospecialize(a::QSym), @nospecialize(b::QSym)) out = QTermDict() _emit_product!( out, @@ -189,7 +189,7 @@ function Base.:*(a::QSym, b::QSym) return QAdd(out, _EMPTY_INDICES) end -function Base.:*(a::QAdd, b::QSym) +Base.@nospecializeinfer function Base.:*(a::QAdd, @nospecialize(b::QSym)) out = QTermDict() needs = !isempty(a.indices) for (ta, ca) in a @@ -202,7 +202,7 @@ function Base.:*(a::QAdd, b::QSym) return QAdd(out, _absorb_pinned_sums(a.indices, a, b)) end -function Base.:*(a::QSym, b::QAdd) +Base.@nospecializeinfer function Base.:*(@nospecialize(a::QSym), b::QAdd) out = QTermDict() needs = !isempty(b.indices) for (tb, cb) in b @@ -270,7 +270,7 @@ function _every_term_has_op_index(q::QAdd, idx::Index) @inbounds for t in keys(q.arguments) found = false for op in t.ops - if op.index == idx + if op.index::Index == idx found = true break end diff --git a/src/expressions/index.jl b/src/expressions/index.jl index d233073c..ec309a40 100644 --- a/src/expressions/index.jl +++ b/src/expressions/index.jl @@ -270,7 +270,7 @@ _check_not_identical(result::Number) = Num(result) function _depends_on_index_term(c::CNum, ops::Vector{QSym}, idx::Index) for op in ops - op.index == idx && return true + op.index::Index == idx && return true end isym = SymbolicUtils.unwrap(idx.sym) for part in (real(c), imag(c))