diff --git a/.gitignore b/.gitignore
index 9847c84..6e783ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ Manifest.toml
 docs/build
 docs/site
 docs/Manifest.toml
+bench/
diff --git a/src/Primes.jl b/src/Primes.jl
index b95fdac..0bbf285 100644
--- a/src/Primes.jl
+++ b/src/Primes.jl
@@ -8,103 +8,11 @@ using Base: BitSigned
 using Base.Checked: checked_neg
 using IntegerMathUtils
 
-export isprime, primes, primesmask, factor, eachfactor, divisors, ismersenneprime, isrieselprime,
+export isprime, primes, primesmask, eachprime, factor, eachfactor, divisors, ismersenneprime, isrieselprime,
        nextprime, nextprimes, prevprime, prevprimes, prime, prodfactors, radical, totient
 
 include("factorization.jl")
-
-# Primes generating functions
-#     https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes
-#     https://en.wikipedia.org/wiki/Wheel_factorization
-#     http://primesieve.org
-#     Jonathan Sorenson, "An analysis of two prime number sieves", Computer Science Technical Report Vol. 1028, 1991
-const wheel         = [4,  2,  4,  2,  4,  6,  2,  6]
-const wheel_primes  = [7, 11, 13, 17, 19, 23, 29, 31]
-const wheel_indices = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7]
-
-@inline function wheel_index(n)
-    d, r = divrem(n - 1, 30)
-    return 8d + wheel_indices[r + 2]
-end
-@inline function wheel_prime(n)
-    d, r = (n - 1) >>> 3, (n - 1) & 7
-    return 30d + wheel_primes[r + 1]
-end
-
-function _primesmask(limit::Int)
-    limit < 7 && throw(ArgumentError("The condition limit ≥ 7 must be met."))
-    n = wheel_index(limit)
-    m = wheel_prime(n)
-    sieve = ones(Bool, n)
-    @inbounds for i = 1:wheel_index(isqrt(limit))
-        if sieve[i]
-            p = wheel_prime(i)
-            q = p^2
-            j = (i - 1) & 7 + 1
-            while q ≤ m
-                sieve[wheel_index(q)] = false
-                q += wheel[j] * p
-                j = j & 7 + 1
-            end
-        end
-    end
-    return sieve
-end
-
-function _primesmask(lo::Int, hi::Int)
-    7 ≤ lo ≤ hi || throw(ArgumentError("The condition 7 ≤ lo ≤ hi must be met."))
-    lo == 7 && return _primesmask(hi)
-    wlo, whi = wheel_index(lo - 1), wheel_index(hi)
-    m = wheel_prime(whi)
-    sieve = ones(Bool, whi - wlo)
-    hi < 49 && return sieve
-    small_sieve = _primesmask(isqrt(hi))
-    @inbounds for i = 1:length(small_sieve)  # don't use eachindex here
-        if small_sieve[i]
-            p = wheel_prime(i)
-            j = wheel_index(2 * div(lo - p - 1, 2p) + 1)
-            r = widemul(p, wheel_prime(j + 1))
-            r > m && continue # use widemul to avoid r <= m caused by overflow
-            j = j & 7 + 1
-            q = Int(r)
-            # q < 0 indicates overflow when incrementing q inside loop
-            while 0 ≤ q ≤ m
-                sieve[wheel_index(q) - wlo] = false
-                q += wheel[j] * p
-                j = j & 7 + 1
-            end
-        end
-    end
-    return sieve
-end
-
-"""
-    primesmask([lo,] hi)
-
-Returns a prime sieve, as a `BitArray`, of the positive integers (from `lo`, if specified)
-up to `hi`. Useful when working with either primes or composite numbers.
-"""
-function primesmask(lo::Int, hi::Int)
-    0 < lo ≤ hi || throw(ArgumentError("The condition 0 < lo ≤ hi must be met."))
-    sieve = falses(hi - lo + 1)
-    lo ≤ 2 ≤ hi && (sieve[3 - lo] = true)
-    lo ≤ 3 ≤ hi && (sieve[4 - lo] = true)
-    lo ≤ 5 ≤ hi && (sieve[6 - lo] = true)
-    hi < 7 && return sieve
-    wheel_sieve = _primesmask(max(7, lo), hi)
-    lsi = lo - 1
-    lwi = wheel_index(lsi)
-    @inbounds for i = 1:length(wheel_sieve)   # don't use eachindex here
-        sieve[wheel_prime(i + lwi) - lsi] = wheel_sieve[i]
-    end
-    return sieve
-end
-primesmask(lo::Integer, hi::Integer) = lo ≤ hi ≤ typemax(Int) ? primesmask(Int(lo), Int(hi)) :
-    throw(ArgumentError("Both endpoints of the interval to sieve must be ≤ $(typemax(Int)), got $lo and $hi."))
-
-primesmask(limit::Int) = primesmask(1, limit)
-primesmask(n::Integer) = n ≤ typemax(Int) ? primesmask(Int(n)) :
-    throw(ArgumentError("Requested number of primes must be ≤ $(typemax(Int)), got $n."))
+include("sieve.jl")
 
 """
     primes([lo,] hi)
@@ -120,10 +28,11 @@ function primes(lo::Int, hi::Int)
     hi < 7 && return list
     lo = max(2, lo)
     sizehint!(list, 5 + floor(Int, hi / (log(hi) - 1.12) - lo / (log(lo) - 1.12 * (lo > 7))) ) # http://projecteuclid.org/euclid.rmjm/1181070157
-    sieve = _primesmask(max(7, lo), hi)
-    lwi = wheel_index(lo - 1)
-    @inbounds for i = 1:length(sieve)   # don't use eachindex here
-        sieve[i] && push!(list, wheel_prime(i + lwi))
+    # collect window by window; the sieve masks values < max(7, lo) and > hi
+    for s in SegmentedSieve(max(7, lo), hi)
+        each_lane_prime(s) do p
+            push!(list, p)
+        end
     end
     return list
 end
diff --git a/src/sieve.jl b/src/sieve.jl
new file mode 100644
index 0000000..22c93f4
--- /dev/null
+++ b/src/sieve.jl
@@ -0,0 +1,336 @@
+const wheel = (1, 7, 11, 13, 17, 19, 23, 29)              # residues coprime to 30, ascending
+
+# INV_MOD_30[r + 1] = inverse of r modulo 30 (for r coprime to 30)
+const INV_MOD_30 = let table = fill(0, 30)
+    for r in 1:29
+        gcd(r, 30) == 1 && (table[r + 1] = invmod(r, 30))
+    end
+    Tuple(table)
+end
+
+# Residue → lane index, the inverse of `wheel`: WHEEL_IDX[r + 1] = i with wheel[i] == r.
+const WHEEL_IDX = let table = fill(0, 30)
+    for (i, r) in enumerate(wheel)
+        table[r + 1] = i
+    end
+    Tuple(table)
+end
+
+# Per (residue-class pri, lane li) stride constants, so the large-prime loop places a multiple
+# without dividing by 30. For a prime p ≡ wheel[pri] (mod 30) with pq = p÷30, its smallest
+# lane-li multiple sits in block  pq·STRIDE_KW[pri][li] + STRIDE_C[pri][li].  (STRIDE_KW[pri][li]
+# is the least k with p·k ≡ wheel[li] (mod 30); STRIDE_C folds in the ÷30.)
+const STRIDE_KW = ntuple(pri -> ntuple(li -> mod(wheel[li] * INV_MOD_30[wheel[pri] + 1], 30), Val(8)), Val(8))
+const STRIDE_C  = ntuple(pri -> ntuple(li -> (wheel[pri] * STRIDE_KW[pri][li] - wheel[li]) ÷ 30, Val(8)), Val(8))
+# p² depends only on p's residue class mod 30, so p²÷30 = 30·pq² + 2·pq·wheel[pri] + STRIDE_PSQ_DIV[pri]
+# and p²%30 = STRIDE_PSQ_RES[pri]; together cld(p²-w, 30) = p²÷30 + (STRIDE_PSQ_RES[pri] > w).
+const STRIDE_PSQ_DIV = ntuple(pri -> wheel[pri]^2 ÷ 30, Val(8))
+const STRIDE_PSQ_RES = ntuple(pri -> wheel[pri]^2 % 30, Val(8))
+
+# build_combtab(p)[lane][(abs_chunk % p) + 1] is the mask that clears, in residue `lane`, the
+# multiples of p. The pattern has period p chunks, since 30·64·p ≡ 0 (mod p).
+function build_combtab(p::Int)
+    ntuple(Val(8)) do lane
+        res = wheel[lane]
+        mask = fill(~UInt64(0), p)
+        for chunk in 0:(p - 1), bit in 0:63
+            if (30 * (64 * chunk + bit) + res) % p == 0
+                mask[chunk + 1] &= ~(UInt64(1) << bit)
+            end
+        end
+        mask
+    end
+end
+
+# Small primes are cheaper to sieve word at a time rather than prime at a time.
+const COMB_THRESH = 256
+const COMB_PRIMES = Int[p for p in 7:(COMB_THRESH - 1) if all(p % d != 0 for d in 2:isqrt(p))]
+const COMB_TABLE   = NTuple{8,Vector{UInt64}}[build_combtab(p) for p in COMB_PRIMES]
+
+# AND the lane against its period-p comb, realigned to the window's absolute phase so the
+# inner loop is a contiguous (vectorizable) slice-AND — no `%p` per chunk.
+@inline function comb_clear!(chunks::Vector{UInt64}, comb::Vector{UInt64}, p::Int, first_chunk::Int, nchunks::Int)
+    chunk = 1
+    phase = first_chunk % p
+    @inbounds while chunk ≤ nchunks
+        run = min(p - phase, nchunks - chunk + 1)
+        @simd for j in 0:(run - 1)
+            chunks[chunk + j] &= comb[phase + 1 + j]
+        end
+        chunk += run
+        phase = 0
+    end
+end
+
+# Clear every `stride`-th bit of the lane, from bit `startbit` up to `nbits`.
+@inline function stride_clear!(chunks::Vector{UInt64}, startbit::Int, stride::Int, nbits::Int)
+    bit = startbit
+    @inbounds while bit < nbits
+        chunks[(bit >>> 6) + 1] &= ~(UInt64(1) << (bit & 63))
+        bit += stride
+    end
+end
+
+"""
+Segmented mod-30 lane sieve: one BitVector per residue coprime to 30,
+rather than 8 bits packed per 30-block.
+Within a lane the multiples of a prime p form a constant-stride-p progression.
+As an optimization:
+    p < COMB_THRESH : AND the lane with a periodic stride-p comb
+    p ≥ COMB_THRESH : scalar stride-p bit-clear
+Lanes are indexed in absolute block space, so a comb mask is a fixed function of
+(abs_chunk % p) and can be built once per prime.
+Output byte-transposes the 8 lane words into packed block-major words and scans those.
+
+A `SegmentedSieve` is an iterator of windows: iterating fills the lanes for the next window
+and yields the sieve itself. `each_lane_prime`, `primes`, `primesmask`, and `eachprime`
+consume those windows.
+
+2, 3, 5 are the caller's responsibility (the wheel skips them).
+"""
+mutable struct SegmentedSieve
+    lo::Int
+    hi::Int
+    seg_chunks::Int          # lane size (64-block chunks per window), so windows stay chunk-aligned
+    # Large base primes (≥ COMB_THRESH) as (p÷30, residue-class index); p and p² reconstruct from
+    # these, so the raw prime values are never stored. Small primes are the COMB_PRIMES const.
+    stride_primes::Vector{Tuple{Int32,Int8}}
+    lanes::Vector{BitVector} # the current window: 8 lanes, ≤ 64·seg_chunks bits each
+    win_block::Int           # current window: first absolute block
+    nchunks::Int             # current window: number of valid chunks (padding bits are zeroed)
+end
+
+# Optimal number of chunks per window. Low cap is 32 KiB/lane allowing for sieving within L1.
+# As hi grows, we start sieving by bigger primes so we grow the window, but cap at MiB/lane to stay within L2.
+_seg_chunks(hi::Integer) = clamp(isqrt(hi) >>> 3, 1 << 12, 1 << 18)
+
+# Chunk holding `lo` (window starts are chunk-aligned)
+_first_chunk(lo::Integer) = fld(lo, 30) >>> 6
+
+function SegmentedSieve(lo::Int, hi::Int; seg_chunks::Int = _seg_chunks(hi))
+    lo = max(7, lo)
+    # Decompose the stride primes (≥ COMB_THRESH; smaller ones are the comb) into (p÷30, residue
+    # class) once. Stream them with eachprime so we never allocate the full base-prime vector.
+    stride_primes = Tuple{Int32,Int8}[]
+    sq = isqrt(hi)
+    if sq ≥ COMB_THRESH
+        for p in eachprime(COMB_THRESH, sq)
+            pq, pr = divrem(p, 30)
+            push!(stride_primes, (pq % Int32, WHEEL_IDX[pr + 1] % Int8))
+        end
+    end
+    num_chunks = min(seg_chunks, cld(fld(hi, 30) + 1, 64) - _first_chunk(lo))
+    lanes = [BitVector(undef, num_chunks << 6) for _ in 1:8]
+    return SegmentedSieve(lo, hi, seg_chunks, stride_primes, lanes, 0, 0)
+end
+
+Base.IteratorSize(::Type{SegmentedSieve}) = Base.SizeUnknown()
+Base.eltype(::Type{SegmentedSieve}) = SegmentedSieve
+
+# Fill the next window into the lanes and yield the sieve itself (with `win_block`/`nchunks` set
+# to the current window). Boundary bits (< lo, > hi, padding) are masked, so consumers need
+# no per-prime range checks.
+function Base.iterate(ss::SegmentedSieve, win_chunk::Int = _first_chunk(ss.lo))
+    win_block = win_chunk << 6
+    last_block = fld(ss.hi, 30)
+    win_block > last_block && return nothing
+    lanes = ss.lanes
+    nblocks = min(ss.seg_chunks << 6, last_block - win_block + 1)
+    nchunks = cld(nblocks, 64)
+    win_last_block = win_block + nblocks - 1
+    win_max = win_last_block == last_block ? ss.hi : 30 * win_last_block + 29
+    max_prime = isqrt(win_max)
+
+    # Loop-invariant boundary conditions (applied per lane below to make consumers range-check-free).
+    lo_block, lo_res = divrem(ss.lo, 30)               # values < lo occupy blocks ≤ lo_block
+    tail_local = last_block - win_block                # local block of hi (if hi falls in this window)
+    hi_res = ss.hi - 30 * last_block                   # hi % 30, overflow-safe threshold
+    tail_bits = nblocks - (nchunks - 1) * 64           # valid bits in the final chunk (64 ⇒ no padding)
+
+    # One lane fully before the next, so it stays cache-hot across all its primes: fill it, comb
+    # the small primes (restoring each prime's own bit), stride the large ones, then mask boundaries.
+    @inbounds for lane in 1:8
+        chunks = lanes[lane].chunks
+        fill!(chunks, ~UInt64(0))
+        w = wheel[lane]
+        for ci in eachindex(COMB_PRIMES)               # small primes: periodic comb AND
+            p = COMB_PRIMES[ci]
+            p > max_prime && break
+            comb_clear!(chunks, COMB_TABLE[ci][lane], p, win_chunk, nchunks)
+            if win_block == 0 && p % 30 == w           # comb cleared p's own bit (p·1) and this is p's lane
+                b = fld(p, 30)
+                chunks[b >>> 6 + 1] |= UInt64(1) << (b & 63)
+            end
+        end
+        for (pq32, pri) in ss.stride_primes            # large primes: scalar stride clear
+            # Reconstruct p and p²÷30 from (pq, residue class) — no division. res_block is the phase:
+            # lane-w multiples sit at blocks ≡ res_block (mod p). The only division left is mod p.
+            pq = Int(pq32)
+            r = wheel[pri]
+            p = 30 * pq + r
+            p > max_prime && break                     # primes are ascending; rest are too big for this window
+            res_block = pq * STRIDE_KW[pri][lane] + STRIDE_C[pri][lane]
+            min_block = 30 * pq * pq + 2 * pq * r + STRIDE_PSQ_DIV[pri] + (STRIDE_PSQ_RES[pri] > w)
+            from_block = max(min_block, win_block)      # min_block = cld(p²-w, 30): first block ≥ p²
+            start_block = from_block + mod(res_block - from_block, p)
+            local_start = start_block - win_block
+            local_start < nblocks && stride_clear!(chunks, local_start, p, nblocks)
+        end
+        if win_block ≤ lo_block                        # first window: holds values < lo (incl. value 1)
+            for block in win_block:(lo_block - 1)      # whole blocks below lo
+                loc = block - win_block
+                chunks[loc >>> 6 + 1] &= ~(UInt64(1) << (loc & 63))
+            end
+            if w < lo_res                              # partial block holding lo
+                loc = lo_block - win_block
+                chunks[loc >>> 6 + 1] &= ~(UInt64(1) << (loc & 63))
+            end
+        end
+        if tail_local < nblocks && w > hi_res          # clear values > hi in the tail block
+            chunks[tail_local >>> 6 + 1] &= ~(UInt64(1) << (tail_local & 63))
+        end
+        if tail_bits < 64                              # zero the final chunk's padding
+            chunks[nchunks] &= (UInt64(1) << tail_bits) - 1
+        end
+    end
+
+    ss.win_block = win_block
+    ss.nchunks = nchunks
+    return (ss, win_chunk + ss.seg_chunks)
+end
+
+# Exchange the bit-blocks of `a` and `b` selected by `mask` / `shift`
+@inline function delta_swap(a::UInt64, b::UInt64, shift::Int, mask::UInt64)
+    t = (a ⊻ (b << shift)) & mask
+    return a ⊻ t, b ⊻ (t >> shift)
+end
+
+# transpose 8 Int64s viewed as a matrix of 8x8 bytes
+# A 3-stage delta-swap network at byte distances 4, 2, 1.
+@inline function transpose_lanes(w::NTuple{8,UInt64})
+    a1, a2, a3, a4, a5, a6, a7, a8 = w
+    a1, a5 = delta_swap(a1, a5, 32, 0xFFFFFFFF00000000)
+    a2, a6 = delta_swap(a2, a6, 32, 0xFFFFFFFF00000000)
+    a3, a7 = delta_swap(a3, a7, 32, 0xFFFFFFFF00000000)
+    a4, a8 = delta_swap(a4, a8, 32, 0xFFFFFFFF00000000)
+    a1, a3 = delta_swap(a1, a3, 16, 0xFFFF0000FFFF0000)
+    a2, a4 = delta_swap(a2, a4, 16, 0xFFFF0000FFFF0000)
+    a5, a7 = delta_swap(a5, a7, 16, 0xFFFF0000FFFF0000)
+    a6, a8 = delta_swap(a6, a8, 16, 0xFFFF0000FFFF0000)
+    a1, a2 = delta_swap(a1, a2, 8,  0xFF00FF00FF00FF00)
+    a3, a4 = delta_swap(a3, a4, 8,  0xFF00FF00FF00FF00)
+    a5, a6 = delta_swap(a5, a6, 8,  0xFF00FF00FF00FF00)
+    a7, a8 = delta_swap(a7, a8, 8,  0xFF00FF00FF00FF00)
+    return (a1, a2, a3, a4, a5, a6, a7, a8)
+end
+
+# transpose of Int64 viewed as a matrix of 8x8 bits: bit (8i+j) ↔ bit (8j+i).
+@inline function transpose8(x::UInt64)
+    t = (x ⊻ (x >> 7))  & 0x00AA00AA00AA00AA
+    x ⊻= t ⊻ (t << 7)
+    t = (x ⊻ (x >> 14)) & 0x0000CCCC0000CCCC
+    x ⊻= t ⊻ (t << 14)
+    t = (x ⊻ (x >> 28)) & 0x00000000F0F0F0F0
+    x ⊻= t ⊻ (t << 28)
+    return x
+end
+
+# Call f(prime) for each prime in the filled window, in ascending order.
+function each_lane_prime(f, ss::SegmentedSieve)
+    win_block = ss.win_block
+    chunks = ntuple(i -> ss.lanes[i].chunks, Val(8))            # the 8 lanes' word arrays
+    nchunks = ss.nchunks
+    @inbounds for chunk in 1:nchunks
+        w = ntuple(i -> chunks[i][chunk], Val(8))               # one word from each lane
+        reduce(|, w) == 0 && continue                           # no primes in these 64 blocks
+        packed = transpose_lanes(w)                             # group g now holds 8 blocks × 8 residues
+        chunk_base = 30 * (win_block + (chunk - 1) * 64)
+        for g in 1:8                                            # each group is 8 consecutive blocks
+            word = transpose8(packed[g])                        # within-word: bit 8·block + (lane-1)
+            group_base = chunk_base + 30 * 8 * (g - 1)
+            while word != 0
+                pos = trailing_zeros(word)
+                f(group_base + 30 * (pos >> 3) + wheel[(pos & 7) + 1])
+                word &= word - 1
+            end
+        end
+    end
+end
+
+"""
+    primesmask([lo,] hi)
+
+Returns a prime sieve, as a `BitArray`, of the positive integers (from `lo`, if specified)
+up to `hi`. Useful when working with either primes or composite numbers.
+"""
+function primesmask(lo::Int, hi::Int)
+    0 < lo ≤ hi || throw(ArgumentError("The condition 0 < lo ≤ hi must be met."))
+    sieve = falses(hi - lo + 1)
+    lo ≤ 2 ≤ hi && (sieve[3 - lo] = true)
+    lo ≤ 3 ≤ hi && (sieve[4 - lo] = true)
+    lo ≤ 5 ≤ hi && (sieve[6 - lo] = true)
+    hi < 7 && return sieve
+    offset = lo - 1
+    for window in SegmentedSieve(max(7, lo), hi)
+        each_lane_prime(window) do p
+            sieve[p - offset] = true
+        end
+    end
+    return sieve
+end
+primesmask(lo::Integer, hi::Integer) = lo ≤ hi ≤ typemax(Int) ? primesmask(Int(lo), Int(hi)) :
+    throw(ArgumentError("Both endpoints of the interval to sieve must be ≤ $(typemax(Int)), got $lo and $hi."))
+
+primesmask(limit::Int) = primesmask(1, limit)
+primesmask(n::Integer) = n ≤ typemax(Int) ? primesmask(Int(n)) :
+    throw(ArgumentError("Requested number of primes must be ≤ $(typemax(Int)), got $n."))
+
+# Lazy prime stream over [lo, hi], backed by a SegmentedSieve.
+# Buffers one window's primes at a time (refilling on exhaustion); 2, 3, 5 are yielded first.
+# All iteration state lives in the (mutable) struct, so the iteration `state` is a dummy.
+mutable struct EachPrime
+    lo::Int
+    hi::Int
+    sieve::Union{SegmentedSieve,Nothing}  # nothing when hi < 7
+    next_chunk::Int                       # first chunk of the next window to fetch
+    buffer::Vector{Int}                   # the current window's primes
+    pos::Int                              # next index into buffer
+    phase::Int                            # 0,1,2 ⇒ emit 2,3,5; ≥3 ⇒ window mode
+end
+
+"""
+    eachprime([lo,] hi)
+
+Lazily iterate the primes in `[lo, hi]` (from 2 if `lo` is omitted) in increasing order.
+"""
+function eachprime(lo::Integer, hi::Integer)
+    lo ≤ hi || throw(ArgumentError("The condition lo ≤ hi must be met."))
+    lo, hi = Int(lo), Int(hi)
+    sieve = hi < 7 ? nothing : SegmentedSieve(max(7, lo), hi)
+    next_chunk = sieve === nothing ? 0 : _first_chunk(sieve.lo)
+    return EachPrime(lo, hi, sieve, next_chunk, Int[], 1, 0)
+end
+eachprime(hi::Integer) = eachprime(1, hi)
+
+Base.IteratorSize(::Type{EachPrime}) = Base.SizeUnknown()
+Base.eltype(::Type{EachPrime}) = Int
+
+function Base.iterate(ep::EachPrime, ::Any = nothing)
+    while ep.phase < 3                          # 2, 3, 5 precede every wheel prime
+        p = (2, 3, 5)[ep.phase + 1]
+        ep.phase += 1
+        ep.lo ≤ p ≤ ep.hi && return (p, nothing)
+    end
+    while ep.pos > length(ep.buffer)            # refill from the next window(s)
+        ep.sieve === nothing && return nothing
+        next = iterate(ep.sieve, ep.next_chunk)
+        next === nothing && return nothing
+        window, ep.next_chunk = next            # window === ep.sieve (it yields itself)
+        empty!(ep.buffer)
+        each_lane_prime(v -> push!(ep.buffer, v), window)
+        ep.pos = 1
+    end
+    p = ep.buffer[ep.pos]
+    ep.pos += 1
+    return (p, nothing)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index f63e59a..0d9d049 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -111,6 +111,16 @@ for n = 100:100:1000
     @test primesmask(n, 10n) == primesmask(10n)[n:end]
 end
 
+# Window advance must stay chunk-aligned for any seg_chunks (odd sizes force many misalignable
+# window boundaries); compare multi-window output against the single-window default.
+for seg_chunks in (1, 3, 7)
+    ps = Int[]
+    for ss in Primes.SegmentedSieve(7, 300_000; seg_chunks)
+        Primes.each_lane_prime(p -> push!(ps, p), ss)
+    end
+    @test ps == primes(7, 300_000)
+end
+
 for T in [Int, BigInt], n = [1:1000;1000000]
     n = convert(T, n)
     f = factor(n)
@@ -511,3 +521,5 @@ end
     end
     @test primes(2^31-20, 2^31-1) == [2147483629, 2147483647]
 end
+
+include("sieve_tests.jl")
diff --git a/test/sieve_tests.jl b/test/sieve_tests.jl
new file mode 100644
index 0000000..2773244
--- /dev/null
+++ b/test/sieve_tests.jl
@@ -0,0 +1,62 @@
+# Sieve tests, split out for a fast standalone dev loop:
+#   julia --project -e 'using TestEnv; TestEnv.activate(); include("test/sieve_tests.jl")'
+# Also included from runtests.jl so CI runs them too. Self-contained imports make
+# both entry points work (re-importing when run from runtests.jl is harmless).
+using Primes
+using Test
+import Primes: isprime, primes, primesmask
+
+@testset "segmented sieve" begin
+    # brute-force reference independent of the sieve
+    bruteprimes(lo, hi) = [n for n in max(2, lo):hi if isprime(n)]
+    brutemask(lo, hi)   = Bool[isprime(n) for n in lo:hi]
+
+    # small ranges, lo==7 boundary, sub-49 ranges
+    for (lo, hi) in [(1, 100), (7, 100), (2, 7), (7, 7), (10, 48), (7, 49), (1, 1000)]
+        @test primes(lo, hi) == bruteprimes(lo, hi)
+        @test primesmask(lo, hi) == brutemask(lo, hi)
+    end
+
+    # ranges that span multiple segment windows (a window covers ~10^6 integers)
+    for (lo, hi) in [(1, 3_000_000), (2_000_000, 3_000_000), (999_983, 2_500_000)]
+        @test primes(lo, hi) == bruteprimes(lo, hi)
+    end
+    @test primesmask(2_000_000, 2_100_000) == brutemask(2_000_000, 2_100_000)
+
+    # consistency: primes(n) and primesmask(n) agree
+    let pm = primesmask(2_000_000)
+        @test primes(2_000_000) == [n for n in 2:2_000_000 if pm[n]]
+    end
+
+    # direct core tests: the SegmentedSieve iterator (yields itself per filled window) and
+    # the each_lane_prime consumer. The sieve masks values < max(7, lo) and > hi.
+    function collect_sieve(lo, hi)
+        out = Int[]
+        for s in Primes.SegmentedSieve(max(7, lo), hi)
+            Primes.each_lane_prime(s) do p
+                push!(out, p)
+            end
+        end
+        out
+    end
+    @test collect_sieve(7, 1000) == [n for n in 7:1000 if isprime(n)]
+    # a range crossing several segment windows
+    @test collect_sieve(1_000_000, 1_500_000) == [n for n in 1_000_000:1_500_000 if isprime(n)]
+    # a window starting mid-chunk (lo not 64-block aligned)
+    @test collect_sieve(999_983, 1_100_000) == [n for n in 999_983:1_100_000 if isprime(n)]
+
+    # eachprime streams the same primes as primes(), and is lazily takeable
+    @test eltype(eachprime(100)) == Int
+    @test collect(eachprime(100)) == primes(100)
+    for (lo, hi) in [(1, 100), (7, 100), (2, 7), (10, 48), (8, 10), (1, 3_000_000), (999_983, 1_100_000)]
+        @test collect(eachprime(lo, hi)) == primes(lo, hi)
+    end
+    @test collect(eachprime(6)) == [2, 3, 5]
+    @test isempty(collect(eachprime(8, 10)))
+    @test collect(Iterators.take(eachprime(10^6), 6)) == [2, 3, 5, 7, 11, 13]
+    @test_throws ArgumentError eachprime(10, 5)
+    # NOTE: the widemul / `q < 0` overflow guards only trigger for hi near
+    # typemax(Int), which needs base primes up to ~isqrt(typemax) ≈ 3e9 — far too
+    # expensive to sieve in a test. Those guards are carried over verbatim from the
+    # existing implementation and are not unit-tested here.
+end