diff --git a/.gitignore b/.gitignore index 9847c84..6e783ae 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ Manifest.toml docs/build docs/site docs/Manifest.toml +bench/ diff --git a/src/Primes.jl b/src/Primes.jl index b95fdac..0bbf285 100644 --- a/src/Primes.jl +++ b/src/Primes.jl @@ -8,103 +8,11 @@ using Base: BitSigned using Base.Checked: checked_neg using IntegerMathUtils -export isprime, primes, primesmask, factor, eachfactor, divisors, ismersenneprime, isrieselprime, +export isprime, primes, primesmask, eachprime, factor, eachfactor, divisors, ismersenneprime, isrieselprime, nextprime, nextprimes, prevprime, prevprimes, prime, prodfactors, radical, totient include("factorization.jl") - -# Primes generating functions -# https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes -# https://en.wikipedia.org/wiki/Wheel_factorization -# http://primesieve.org -# Jonathan Sorenson, "An analysis of two prime number sieves", Computer Science Technical Report Vol. 1028, 1991 -const wheel = [4, 2, 4, 2, 4, 6, 2, 6] -const wheel_primes = [7, 11, 13, 17, 19, 23, 29, 31] -const wheel_indices = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7] - -@inline function wheel_index(n) - d, r = divrem(n - 1, 30) - return 8d + wheel_indices[r + 2] -end -@inline function wheel_prime(n) - d, r = (n - 1) >>> 3, (n - 1) & 7 - return 30d + wheel_primes[r + 1] -end - -function _primesmask(limit::Int) - limit < 7 && throw(ArgumentError("The condition limit ≥ 7 must be met.")) - n = wheel_index(limit) - m = wheel_prime(n) - sieve = ones(Bool, n) - @inbounds for i = 1:wheel_index(isqrt(limit)) - if sieve[i] - p = wheel_prime(i) - q = p^2 - j = (i - 1) & 7 + 1 - while q ≤ m - sieve[wheel_index(q)] = false - q += wheel[j] * p - j = j & 7 + 1 - end - end - end - return sieve -end - -function _primesmask(lo::Int, hi::Int) - 7 ≤ lo ≤ hi || throw(ArgumentError("The condition 7 ≤ lo ≤ hi must be met.")) - lo == 7 && return _primesmask(hi) - wlo, whi = wheel_index(lo - 1), wheel_index(hi) - m = wheel_prime(whi) - sieve = ones(Bool, whi - wlo) - hi < 49 && return sieve - small_sieve = _primesmask(isqrt(hi)) - @inbounds for i = 1:length(small_sieve) # don't use eachindex here - if small_sieve[i] - p = wheel_prime(i) - j = wheel_index(2 * div(lo - p - 1, 2p) + 1) - r = widemul(p, wheel_prime(j + 1)) - r > m && continue # use widemul to avoid r <= m caused by overflow - j = j & 7 + 1 - q = Int(r) - # q < 0 indicates overflow when incrementing q inside loop - while 0 ≤ q ≤ m - sieve[wheel_index(q) - wlo] = false - q += wheel[j] * p - j = j & 7 + 1 - end - end - end - return sieve -end - -""" - primesmask([lo,] hi) - -Returns a prime sieve, as a `BitArray`, of the positive integers (from `lo`, if specified) -up to `hi`. Useful when working with either primes or composite numbers. -""" -function primesmask(lo::Int, hi::Int) - 0 < lo ≤ hi || throw(ArgumentError("The condition 0 < lo ≤ hi must be met.")) - sieve = falses(hi - lo + 1) - lo ≤ 2 ≤ hi && (sieve[3 - lo] = true) - lo ≤ 3 ≤ hi && (sieve[4 - lo] = true) - lo ≤ 5 ≤ hi && (sieve[6 - lo] = true) - hi < 7 && return sieve - wheel_sieve = _primesmask(max(7, lo), hi) - lsi = lo - 1 - lwi = wheel_index(lsi) - @inbounds for i = 1:length(wheel_sieve) # don't use eachindex here - sieve[wheel_prime(i + lwi) - lsi] = wheel_sieve[i] - end - return sieve -end -primesmask(lo::Integer, hi::Integer) = lo ≤ hi ≤ typemax(Int) ? primesmask(Int(lo), Int(hi)) : - throw(ArgumentError("Both endpoints of the interval to sieve must be ≤ $(typemax(Int)), got $lo and $hi.")) - -primesmask(limit::Int) = primesmask(1, limit) -primesmask(n::Integer) = n ≤ typemax(Int) ? primesmask(Int(n)) : - throw(ArgumentError("Requested number of primes must be ≤ $(typemax(Int)), got $n.")) +include("sieve.jl") """ primes([lo,] hi) @@ -120,10 +28,11 @@ function primes(lo::Int, hi::Int) hi < 7 && return list lo = max(2, lo) sizehint!(list, 5 + floor(Int, hi / (log(hi) - 1.12) - lo / (log(lo) - 1.12 * (lo > 7))) ) # http://projecteuclid.org/euclid.rmjm/1181070157 - sieve = _primesmask(max(7, lo), hi) - lwi = wheel_index(lo - 1) - @inbounds for i = 1:length(sieve) # don't use eachindex here - sieve[i] && push!(list, wheel_prime(i + lwi)) + # collect window by window; the sieve masks values < max(7, lo) and > hi + for s in SegmentedSieve(max(7, lo), hi) + each_lane_prime(s) do p + push!(list, p) + end end return list end diff --git a/src/sieve.jl b/src/sieve.jl new file mode 100644 index 0000000..22c93f4 --- /dev/null +++ b/src/sieve.jl @@ -0,0 +1,336 @@ +const wheel = (1, 7, 11, 13, 17, 19, 23, 29) # residues coprime to 30, ascending + +# INV_MOD_30[r + 1] = inverse of r modulo 30 (for r coprime to 30) +const INV_MOD_30 = let table = fill(0, 30) + for r in 1:29 + gcd(r, 30) == 1 && (table[r + 1] = invmod(r, 30)) + end + Tuple(table) +end + +# Residue → lane index, the inverse of `wheel`: WHEEL_IDX[r + 1] = i with wheel[i] == r. +const WHEEL_IDX = let table = fill(0, 30) + for (i, r) in enumerate(wheel) + table[r + 1] = i + end + Tuple(table) +end + +# Per (residue-class pri, lane li) stride constants, so the large-prime loop places a multiple +# without dividing by 30. For a prime p ≡ wheel[pri] (mod 30) with pq = p÷30, its smallest +# lane-li multiple sits in block pq·STRIDE_KW[pri][li] + STRIDE_C[pri][li]. (STRIDE_KW[pri][li] +# is the least k with p·k ≡ wheel[li] (mod 30); STRIDE_C folds in the ÷30.) +const STRIDE_KW = ntuple(pri -> ntuple(li -> mod(wheel[li] * INV_MOD_30[wheel[pri] + 1], 30), Val(8)), Val(8)) +const STRIDE_C = ntuple(pri -> ntuple(li -> (wheel[pri] * STRIDE_KW[pri][li] - wheel[li]) ÷ 30, Val(8)), Val(8)) +# p² depends only on p's residue class mod 30, so p²÷30 = 30·pq² + 2·pq·wheel[pri] + STRIDE_PSQ_DIV[pri] +# and p²%30 = STRIDE_PSQ_RES[pri]; together cld(p²-w, 30) = p²÷30 + (STRIDE_PSQ_RES[pri] > w). +const STRIDE_PSQ_DIV = ntuple(pri -> wheel[pri]^2 ÷ 30, Val(8)) +const STRIDE_PSQ_RES = ntuple(pri -> wheel[pri]^2 % 30, Val(8)) + +# build_combtab(p)[lane][(abs_chunk % p) + 1] is the mask that clears, in residue `lane`, the +# multiples of p. The pattern has period p chunks, since 30·64·p ≡ 0 (mod p). +function build_combtab(p::Int) + ntuple(Val(8)) do lane + res = wheel[lane] + mask = fill(~UInt64(0), p) + for chunk in 0:(p - 1), bit in 0:63 + if (30 * (64 * chunk + bit) + res) % p == 0 + mask[chunk + 1] &= ~(UInt64(1) << bit) + end + end + mask + end +end + +# Small primes are cheaper to sieve word at a time rather than prime at a time. +const COMB_THRESH = 256 +const COMB_PRIMES = Int[p for p in 7:(COMB_THRESH - 1) if all(p % d != 0 for d in 2:isqrt(p))] +const COMB_TABLE = NTuple{8,Vector{UInt64}}[build_combtab(p) for p in COMB_PRIMES] + +# AND the lane against its period-p comb, realigned to the window's absolute phase so the +# inner loop is a contiguous (vectorizable) slice-AND — no `%p` per chunk. +@inline function comb_clear!(chunks::Vector{UInt64}, comb::Vector{UInt64}, p::Int, first_chunk::Int, nchunks::Int) + chunk = 1 + phase = first_chunk % p + @inbounds while chunk ≤ nchunks + run = min(p - phase, nchunks - chunk + 1) + @simd for j in 0:(run - 1) + chunks[chunk + j] &= comb[phase + 1 + j] + end + chunk += run + phase = 0 + end +end + +# Clear every `stride`-th bit of the lane, from bit `startbit` up to `nbits`. +@inline function stride_clear!(chunks::Vector{UInt64}, startbit::Int, stride::Int, nbits::Int) + bit = startbit + @inbounds while bit < nbits + chunks[(bit >>> 6) + 1] &= ~(UInt64(1) << (bit & 63)) + bit += stride + end +end + +""" +Segmented mod-30 lane sieve: one BitVector per residue coprime to 30, +rather than 8 bits packed per 30-block. +Within a lane the multiples of a prime p form a constant-stride-p progression. +As an optimization: + p < COMB_THRESH : AND the lane with a periodic stride-p comb + p ≥ COMB_THRESH : scalar stride-p bit-clear +Lanes are indexed in absolute block space, so a comb mask is a fixed function of +(abs_chunk % p) and can be built once per prime. +Output byte-transposes the 8 lane words into packed block-major words and scans those. + +A `SegmentedSieve` is an iterator of windows: iterating fills the lanes for the next window +and yields the sieve itself. `each_lane_prime`, `primes`, `primesmask`, and `eachprime` +consume those windows. + +2, 3, 5 are the caller's responsibility (the wheel skips them). +""" +mutable struct SegmentedSieve + lo::Int + hi::Int + seg_chunks::Int # lane size (64-block chunks per window), so windows stay chunk-aligned + # Large base primes (≥ COMB_THRESH) as (p÷30, residue-class index); p and p² reconstruct from + # these, so the raw prime values are never stored. Small primes are the COMB_PRIMES const. + stride_primes::Vector{Tuple{Int32,Int8}} + lanes::Vector{BitVector} # the current window: 8 lanes, ≤ 64·seg_chunks bits each + win_block::Int # current window: first absolute block + nchunks::Int # current window: number of valid chunks (padding bits are zeroed) +end + +# Optimal number of chunks per window. Low cap is 32 KiB/lane allowing for sieving within L1. +# As hi grows, we start sieving by bigger primes so we grow the window, but cap at MiB/lane to stay within L2. +_seg_chunks(hi::Integer) = clamp(isqrt(hi) >>> 3, 1 << 12, 1 << 18) + +# Chunk holding `lo` (window starts are chunk-aligned) +_first_chunk(lo::Integer) = fld(lo, 30) >>> 6 + +function SegmentedSieve(lo::Int, hi::Int; seg_chunks::Int = _seg_chunks(hi)) + lo = max(7, lo) + # Decompose the stride primes (≥ COMB_THRESH; smaller ones are the comb) into (p÷30, residue + # class) once. Stream them with eachprime so we never allocate the full base-prime vector. + stride_primes = Tuple{Int32,Int8}[] + sq = isqrt(hi) + if sq ≥ COMB_THRESH + for p in eachprime(COMB_THRESH, sq) + pq, pr = divrem(p, 30) + push!(stride_primes, (pq % Int32, WHEEL_IDX[pr + 1] % Int8)) + end + end + num_chunks = min(seg_chunks, cld(fld(hi, 30) + 1, 64) - _first_chunk(lo)) + lanes = [BitVector(undef, num_chunks << 6) for _ in 1:8] + return SegmentedSieve(lo, hi, seg_chunks, stride_primes, lanes, 0, 0) +end + +Base.IteratorSize(::Type{SegmentedSieve}) = Base.SizeUnknown() +Base.eltype(::Type{SegmentedSieve}) = SegmentedSieve + +# Fill the next window into the lanes and yield the sieve itself (with `win_block`/`nchunks` set +# to the current window). Boundary bits (< lo, > hi, padding) are masked, so consumers need +# no per-prime range checks. +function Base.iterate(ss::SegmentedSieve, win_chunk::Int = _first_chunk(ss.lo)) + win_block = win_chunk << 6 + last_block = fld(ss.hi, 30) + win_block > last_block && return nothing + lanes = ss.lanes + nblocks = min(ss.seg_chunks << 6, last_block - win_block + 1) + nchunks = cld(nblocks, 64) + win_last_block = win_block + nblocks - 1 + win_max = win_last_block == last_block ? ss.hi : 30 * win_last_block + 29 + max_prime = isqrt(win_max) + + # Loop-invariant boundary conditions (applied per lane below to make consumers range-check-free). + lo_block, lo_res = divrem(ss.lo, 30) # values < lo occupy blocks ≤ lo_block + tail_local = last_block - win_block # local block of hi (if hi falls in this window) + hi_res = ss.hi - 30 * last_block # hi % 30, overflow-safe threshold + tail_bits = nblocks - (nchunks - 1) * 64 # valid bits in the final chunk (64 ⇒ no padding) + + # One lane fully before the next, so it stays cache-hot across all its primes: fill it, comb + # the small primes (restoring each prime's own bit), stride the large ones, then mask boundaries. + @inbounds for lane in 1:8 + chunks = lanes[lane].chunks + fill!(chunks, ~UInt64(0)) + w = wheel[lane] + for ci in eachindex(COMB_PRIMES) # small primes: periodic comb AND + p = COMB_PRIMES[ci] + p > max_prime && break + comb_clear!(chunks, COMB_TABLE[ci][lane], p, win_chunk, nchunks) + if win_block == 0 && p % 30 == w # comb cleared p's own bit (p·1) and this is p's lane + b = fld(p, 30) + chunks[b >>> 6 + 1] |= UInt64(1) << (b & 63) + end + end + for (pq32, pri) in ss.stride_primes # large primes: scalar stride clear + # Reconstruct p and p²÷30 from (pq, residue class) — no division. res_block is the phase: + # lane-w multiples sit at blocks ≡ res_block (mod p). The only division left is mod p. + pq = Int(pq32) + r = wheel[pri] + p = 30 * pq + r + p > max_prime && break # primes are ascending; rest are too big for this window + res_block = pq * STRIDE_KW[pri][lane] + STRIDE_C[pri][lane] + min_block = 30 * pq * pq + 2 * pq * r + STRIDE_PSQ_DIV[pri] + (STRIDE_PSQ_RES[pri] > w) + from_block = max(min_block, win_block) # min_block = cld(p²-w, 30): first block ≥ p² + start_block = from_block + mod(res_block - from_block, p) + local_start = start_block - win_block + local_start < nblocks && stride_clear!(chunks, local_start, p, nblocks) + end + if win_block ≤ lo_block # first window: holds values < lo (incl. value 1) + for block in win_block:(lo_block - 1) # whole blocks below lo + loc = block - win_block + chunks[loc >>> 6 + 1] &= ~(UInt64(1) << (loc & 63)) + end + if w < lo_res # partial block holding lo + loc = lo_block - win_block + chunks[loc >>> 6 + 1] &= ~(UInt64(1) << (loc & 63)) + end + end + if tail_local < nblocks && w > hi_res # clear values > hi in the tail block + chunks[tail_local >>> 6 + 1] &= ~(UInt64(1) << (tail_local & 63)) + end + if tail_bits < 64 # zero the final chunk's padding + chunks[nchunks] &= (UInt64(1) << tail_bits) - 1 + end + end + + ss.win_block = win_block + ss.nchunks = nchunks + return (ss, win_chunk + ss.seg_chunks) +end + +# Exchange the bit-blocks of `a` and `b` selected by `mask` / `shift` +@inline function delta_swap(a::UInt64, b::UInt64, shift::Int, mask::UInt64) + t = (a ⊻ (b << shift)) & mask + return a ⊻ t, b ⊻ (t >> shift) +end + +# transpose 8 Int64s viewed as a matrix of 8x8 bytes +# A 3-stage delta-swap network at byte distances 4, 2, 1. +@inline function transpose_lanes(w::NTuple{8,UInt64}) + a1, a2, a3, a4, a5, a6, a7, a8 = w + a1, a5 = delta_swap(a1, a5, 32, 0xFFFFFFFF00000000) + a2, a6 = delta_swap(a2, a6, 32, 0xFFFFFFFF00000000) + a3, a7 = delta_swap(a3, a7, 32, 0xFFFFFFFF00000000) + a4, a8 = delta_swap(a4, a8, 32, 0xFFFFFFFF00000000) + a1, a3 = delta_swap(a1, a3, 16, 0xFFFF0000FFFF0000) + a2, a4 = delta_swap(a2, a4, 16, 0xFFFF0000FFFF0000) + a5, a7 = delta_swap(a5, a7, 16, 0xFFFF0000FFFF0000) + a6, a8 = delta_swap(a6, a8, 16, 0xFFFF0000FFFF0000) + a1, a2 = delta_swap(a1, a2, 8, 0xFF00FF00FF00FF00) + a3, a4 = delta_swap(a3, a4, 8, 0xFF00FF00FF00FF00) + a5, a6 = delta_swap(a5, a6, 8, 0xFF00FF00FF00FF00) + a7, a8 = delta_swap(a7, a8, 8, 0xFF00FF00FF00FF00) + return (a1, a2, a3, a4, a5, a6, a7, a8) +end + +# transpose of Int64 viewed as a matrix of 8x8 bits: bit (8i+j) ↔ bit (8j+i). +@inline function transpose8(x::UInt64) + t = (x ⊻ (x >> 7)) & 0x00AA00AA00AA00AA + x ⊻= t ⊻ (t << 7) + t = (x ⊻ (x >> 14)) & 0x0000CCCC0000CCCC + x ⊻= t ⊻ (t << 14) + t = (x ⊻ (x >> 28)) & 0x00000000F0F0F0F0 + x ⊻= t ⊻ (t << 28) + return x +end + +# Call f(prime) for each prime in the filled window, in ascending order. +function each_lane_prime(f, ss::SegmentedSieve) + win_block = ss.win_block + chunks = ntuple(i -> ss.lanes[i].chunks, Val(8)) # the 8 lanes' word arrays + nchunks = ss.nchunks + @inbounds for chunk in 1:nchunks + w = ntuple(i -> chunks[i][chunk], Val(8)) # one word from each lane + reduce(|, w) == 0 && continue # no primes in these 64 blocks + packed = transpose_lanes(w) # group g now holds 8 blocks × 8 residues + chunk_base = 30 * (win_block + (chunk - 1) * 64) + for g in 1:8 # each group is 8 consecutive blocks + word = transpose8(packed[g]) # within-word: bit 8·block + (lane-1) + group_base = chunk_base + 30 * 8 * (g - 1) + while word != 0 + pos = trailing_zeros(word) + f(group_base + 30 * (pos >> 3) + wheel[(pos & 7) + 1]) + word &= word - 1 + end + end + end +end + +""" + primesmask([lo,] hi) + +Returns a prime sieve, as a `BitArray`, of the positive integers (from `lo`, if specified) +up to `hi`. Useful when working with either primes or composite numbers. +""" +function primesmask(lo::Int, hi::Int) + 0 < lo ≤ hi || throw(ArgumentError("The condition 0 < lo ≤ hi must be met.")) + sieve = falses(hi - lo + 1) + lo ≤ 2 ≤ hi && (sieve[3 - lo] = true) + lo ≤ 3 ≤ hi && (sieve[4 - lo] = true) + lo ≤ 5 ≤ hi && (sieve[6 - lo] = true) + hi < 7 && return sieve + offset = lo - 1 + for window in SegmentedSieve(max(7, lo), hi) + each_lane_prime(window) do p + sieve[p - offset] = true + end + end + return sieve +end +primesmask(lo::Integer, hi::Integer) = lo ≤ hi ≤ typemax(Int) ? primesmask(Int(lo), Int(hi)) : + throw(ArgumentError("Both endpoints of the interval to sieve must be ≤ $(typemax(Int)), got $lo and $hi.")) + +primesmask(limit::Int) = primesmask(1, limit) +primesmask(n::Integer) = n ≤ typemax(Int) ? primesmask(Int(n)) : + throw(ArgumentError("Requested number of primes must be ≤ $(typemax(Int)), got $n.")) + +# Lazy prime stream over [lo, hi], backed by a SegmentedSieve. +# Buffers one window's primes at a time (refilling on exhaustion); 2, 3, 5 are yielded first. +# All iteration state lives in the (mutable) struct, so the iteration `state` is a dummy. +mutable struct EachPrime + lo::Int + hi::Int + sieve::Union{SegmentedSieve,Nothing} # nothing when hi < 7 + next_chunk::Int # first chunk of the next window to fetch + buffer::Vector{Int} # the current window's primes + pos::Int # next index into buffer + phase::Int # 0,1,2 ⇒ emit 2,3,5; ≥3 ⇒ window mode +end + +""" + eachprime([lo,] hi) + +Lazily iterate the primes in `[lo, hi]` (from 2 if `lo` is omitted) in increasing order. +""" +function eachprime(lo::Integer, hi::Integer) + lo ≤ hi || throw(ArgumentError("The condition lo ≤ hi must be met.")) + lo, hi = Int(lo), Int(hi) + sieve = hi < 7 ? nothing : SegmentedSieve(max(7, lo), hi) + next_chunk = sieve === nothing ? 0 : _first_chunk(sieve.lo) + return EachPrime(lo, hi, sieve, next_chunk, Int[], 1, 0) +end +eachprime(hi::Integer) = eachprime(1, hi) + +Base.IteratorSize(::Type{EachPrime}) = Base.SizeUnknown() +Base.eltype(::Type{EachPrime}) = Int + +function Base.iterate(ep::EachPrime, ::Any = nothing) + while ep.phase < 3 # 2, 3, 5 precede every wheel prime + p = (2, 3, 5)[ep.phase + 1] + ep.phase += 1 + ep.lo ≤ p ≤ ep.hi && return (p, nothing) + end + while ep.pos > length(ep.buffer) # refill from the next window(s) + ep.sieve === nothing && return nothing + next = iterate(ep.sieve, ep.next_chunk) + next === nothing && return nothing + window, ep.next_chunk = next # window === ep.sieve (it yields itself) + empty!(ep.buffer) + each_lane_prime(v -> push!(ep.buffer, v), window) + ep.pos = 1 + end + p = ep.buffer[ep.pos] + ep.pos += 1 + return (p, nothing) +end diff --git a/test/runtests.jl b/test/runtests.jl index f63e59a..0d9d049 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -111,6 +111,16 @@ for n = 100:100:1000 @test primesmask(n, 10n) == primesmask(10n)[n:end] end +# Window advance must stay chunk-aligned for any seg_chunks (odd sizes force many misalignable +# window boundaries); compare multi-window output against the single-window default. +for seg_chunks in (1, 3, 7) + ps = Int[] + for ss in Primes.SegmentedSieve(7, 300_000; seg_chunks) + Primes.each_lane_prime(p -> push!(ps, p), ss) + end + @test ps == primes(7, 300_000) +end + for T in [Int, BigInt], n = [1:1000;1000000] n = convert(T, n) f = factor(n) @@ -511,3 +521,5 @@ end end @test primes(2^31-20, 2^31-1) == [2147483629, 2147483647] end + +include("sieve_tests.jl") diff --git a/test/sieve_tests.jl b/test/sieve_tests.jl new file mode 100644 index 0000000..2773244 --- /dev/null +++ b/test/sieve_tests.jl @@ -0,0 +1,62 @@ +# Sieve tests, split out for a fast standalone dev loop: +# julia --project -e 'using TestEnv; TestEnv.activate(); include("test/sieve_tests.jl")' +# Also included from runtests.jl so CI runs them too. Self-contained imports make +# both entry points work (re-importing when run from runtests.jl is harmless). +using Primes +using Test +import Primes: isprime, primes, primesmask + +@testset "segmented sieve" begin + # brute-force reference independent of the sieve + bruteprimes(lo, hi) = [n for n in max(2, lo):hi if isprime(n)] + brutemask(lo, hi) = Bool[isprime(n) for n in lo:hi] + + # small ranges, lo==7 boundary, sub-49 ranges + for (lo, hi) in [(1, 100), (7, 100), (2, 7), (7, 7), (10, 48), (7, 49), (1, 1000)] + @test primes(lo, hi) == bruteprimes(lo, hi) + @test primesmask(lo, hi) == brutemask(lo, hi) + end + + # ranges that span multiple segment windows (a window covers ~10^6 integers) + for (lo, hi) in [(1, 3_000_000), (2_000_000, 3_000_000), (999_983, 2_500_000)] + @test primes(lo, hi) == bruteprimes(lo, hi) + end + @test primesmask(2_000_000, 2_100_000) == brutemask(2_000_000, 2_100_000) + + # consistency: primes(n) and primesmask(n) agree + let pm = primesmask(2_000_000) + @test primes(2_000_000) == [n for n in 2:2_000_000 if pm[n]] + end + + # direct core tests: the SegmentedSieve iterator (yields itself per filled window) and + # the each_lane_prime consumer. The sieve masks values < max(7, lo) and > hi. + function collect_sieve(lo, hi) + out = Int[] + for s in Primes.SegmentedSieve(max(7, lo), hi) + Primes.each_lane_prime(s) do p + push!(out, p) + end + end + out + end + @test collect_sieve(7, 1000) == [n for n in 7:1000 if isprime(n)] + # a range crossing several segment windows + @test collect_sieve(1_000_000, 1_500_000) == [n for n in 1_000_000:1_500_000 if isprime(n)] + # a window starting mid-chunk (lo not 64-block aligned) + @test collect_sieve(999_983, 1_100_000) == [n for n in 999_983:1_100_000 if isprime(n)] + + # eachprime streams the same primes as primes(), and is lazily takeable + @test eltype(eachprime(100)) == Int + @test collect(eachprime(100)) == primes(100) + for (lo, hi) in [(1, 100), (7, 100), (2, 7), (10, 48), (8, 10), (1, 3_000_000), (999_983, 1_100_000)] + @test collect(eachprime(lo, hi)) == primes(lo, hi) + end + @test collect(eachprime(6)) == [2, 3, 5] + @test isempty(collect(eachprime(8, 10))) + @test collect(Iterators.take(eachprime(10^6), 6)) == [2, 3, 5, 7, 11, 13] + @test_throws ArgumentError eachprime(10, 5) + # NOTE: the widemul / `q < 0` overflow guards only trigger for hi near + # typemax(Int), which needs base primes up to ~isqrt(typemax) ≈ 3e9 — far too + # expensive to sieve in a test. Those guards are carried over verbatim from the + # existing implementation and are not unit-tested here. +end