diff --git a/swift/Sources/CoreAILanguageModels/Bundle/LanguageBundle.swift b/swift/Sources/CoreAILanguageModels/Bundle/LanguageBundle.swift index 5fd1b5e..9962eb7 100644 --- a/swift/Sources/CoreAILanguageModels/Bundle/LanguageBundle.swift +++ b/swift/Sources/CoreAILanguageModels/Bundle/LanguageBundle.swift @@ -21,6 +21,7 @@ public struct LanguageBundle: Sendable { public let bundle: ModelBundle public let modelAssetPath: String public let language: LanguageConfig + public let visionConfig: VisionConfig? public init(from path: String) throws { let expanded = (path as NSString).expandingTildeInPath @@ -45,6 +46,11 @@ public struct LanguageBundle: Sendable { } self.modelAssetPath = main self.language = language + self.visionConfig = payload.vision + + if bundle.kind == .vlm && self.visionConfig == nil { + throw ModelBundle.BundleError.missingField("vision") + } } // MARK: - Convenience accessors @@ -98,6 +104,7 @@ extension LanguageBundle { fileprivate struct LanguagePayload: Decodable { let assets: Assets let language: LanguageConfig? + let vision: VisionConfig? struct Assets: Decodable { let main: String? diff --git a/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift b/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift index 83d9955..71d5461 100644 --- a/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift +++ b/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift @@ -22,18 +22,23 @@ public struct LanguageConfig: Codable, Sendable, Equatable { /// known role conventions (`main`, `extend_`, `load_embeddings`, ...). public let functionMap: FunctionMap? + /// Vision-specific configuration. Nil for text-only language models. + public let vision: VisionConfig? + public init( tokenizer: String, vocabSize: Int, maxContextLength: Int, embeddedTokenizer: Bool = true, - functionMap: FunctionMap? = nil + functionMap: FunctionMap? = nil, + vision: VisionConfig? 
/// Vision-specific configuration for VLM bundles.
///
/// Describes the image geometry, the placeholder token, and the pixel normalization
/// constants. Holders of this type store it as an optional that is `nil` for text-only
/// language models.
public struct VisionConfig: Codable, Sendable, Equatable {
    /// Input image size (square). Vision encoder expects this resolution.
    public let imageSize: Int

    /// Patch size for the vision transformer.
    public let patchSize: Int

    /// Number of embedding tokens produced per image after projection.
    public let imageTokenCount: Int

    /// Token ID used as a placeholder in the text sequence for image positions.
    public let imageTokenId: Int32

    /// Per-channel normalization mean (RGB). Defaults to CLIP values when omitted.
    public let imageMean: [Double]

    /// Per-channel normalization std (RGB). Defaults to CLIP values when omitted.
    public let imageStd: [Double]

    /// Pixel rescale factor applied before normalization. Defaults to 1.0 when omitted.
    public let rescaleFactor: Double

    /// CLIP normalization (Qwen VL, Pixtral, InternVL, Phi-3.5-vision).
    public static let clipMean: [Double] = [0.48145466, 0.4578275, 0.40821073]
    public static let clipStd: [Double] = [0.26862954, 0.26130258, 0.27577711]

    /// Number of entries expected in `imageMean` / `imageStd` (one per RGB channel).
    private static let channelCount = 3

    /// Creates a configuration in code.
    ///
    /// - Parameters:
    ///   - imageSize: Edge length of the square input image.
    ///   - patchSize: Vision-transformer patch size.
    ///   - imageTokenCount: Embedding tokens produced per image.
    ///   - imageTokenId: Placeholder token ID marking image positions.
    ///   - imageMean: Per-channel mean; `nil` selects `clipMean`. Callers should pass one
    ///     value per RGB channel — this initializer does not validate the length.
    ///   - imageStd: Per-channel std; `nil` selects `clipStd`. Same length expectation.
    ///   - rescaleFactor: Pixel rescale factor; `nil` selects `1.0`.
    public init(
        imageSize: Int,
        patchSize: Int,
        imageTokenCount: Int,
        imageTokenId: Int32,
        imageMean: [Double]? = nil,
        imageStd: [Double]? = nil,
        rescaleFactor: Double? = nil
    ) {
        self.imageSize = imageSize
        self.patchSize = patchSize
        self.imageTokenCount = imageTokenCount
        self.imageTokenId = imageTokenId
        self.imageMean = imageMean ?? Self.clipMean
        self.imageStd = imageStd ?? Self.clipStd
        self.rescaleFactor = rescaleFactor ?? 1.0
    }

    enum CodingKeys: String, CodingKey {
        case imageSize = "image_size"
        case patchSize = "patch_size"
        case imageTokenCount = "image_token_count"
        case imageTokenId = "image_token_id"
        case imageMean = "image_mean"
        case imageStd = "image_std"
        case rescaleFactor = "rescale_factor"
    }

    /// Decodes the configuration from bundle metadata.
    ///
    /// `image_mean`, `image_std` and `rescale_factor` are optional and fall back to the CLIP
    /// constants / `1.0`. When present, the two normalization arrays must hold exactly one
    /// value per RGB channel.
    ///
    /// - Throws: `DecodingError` for missing required keys, type mismatches, or a
    ///   normalization array whose length is not 3.
    public init(from decoder: Swift.Decoder) throws {
        let c = try decoder.container(keyedBy: CodingKeys.self)
        self.imageSize = try c.decode(Int.self, forKey: .imageSize)
        self.patchSize = try c.decode(Int.self, forKey: .patchSize)
        self.imageTokenCount = try c.decode(Int.self, forKey: .imageTokenCount)
        self.imageTokenId = try c.decode(Int32.self, forKey: .imageTokenId)
        // Bundle JSON is external input: reject malformed channel arrays here rather than
        // letting a consumer trap later when it subscripts channels 0...2.
        self.imageMean = try Self.decodeChannelValues(
            forKey: .imageMean, in: c, orDefault: Self.clipMean)
        self.imageStd = try Self.decodeChannelValues(
            forKey: .imageStd, in: c, orDefault: Self.clipStd)
        self.rescaleFactor = try c.decodeIfPresent(Double.self, forKey: .rescaleFactor) ?? 1.0
    }

    /// Decodes an optional per-channel array, substituting `fallback` when the key is absent
    /// (or null) and throwing when the resulting array does not have `channelCount` entries.
    private static func decodeChannelValues(
        forKey key: CodingKeys,
        in container: KeyedDecodingContainer<CodingKeys>,
        orDefault fallback: [Double]
    ) throws -> [Double] {
        let values = try container.decodeIfPresent([Double].self, forKey: key) ?? fallback
        guard values.count == channelCount else {
            throw DecodingError.dataCorruptedError(
                forKey: key,
                in: container,
                debugDescription:
                    "\(key.stringValue) must contain exactly \(channelCount) values (RGB), "
                    + "got \(values.count)"
            )
        }
        return values
    }
}
/// Engine-facing configuration for a Vision-Language Model.
///
/// Pairs the text-side `ModelConfig` with the `VisionConfig` that describes the image
/// pathway. Every computed property below is a plain forward to `base`.
public struct VLMModelConfig: InferenceConfiguration, Codable, Sendable {
    /// Text-model configuration that the forwarded accessors read from.
    public let base: ModelConfig

    /// Vision parameters carried alongside the base configuration.
    public let visionConfig: VisionConfig

    /// Creates a VLM configuration from its two parts.
    ///
    /// - Parameters:
    ///   - base: Configuration of the underlying language model.
    ///   - visionConfig: Vision-specific parameters.
    public init(base: ModelConfig, visionConfig: VisionConfig) {
        self.base = base
        self.visionConfig = visionConfig
    }

    // MARK: Forwarded from `base`

    /// `base.maxContextLength`.
    public var maxContextLength: Int {
        return base.maxContextLength
    }

    /// `base.vocabSize`.
    public var vocabSize: Int {
        return base.vocabSize
    }

    /// `base.function`.
    public var function: String {
        return base.function
    }

    /// `base.name`.
    public var name: String {
        return base.name
    }

    /// Prefill chunk size, taken from `base`.
    public var prefillChunkSize: Int {
        return base.prefillChunkSize
    }

    /// `base.chunkThreshold`.
    public var chunkThreshold: Int {
        return base.chunkThreshold
    }
}
**Vision encoder** (`encode_image`): +/// - Input: `pixel_values` (Float32, shape `[1, 3, H, W]`) +/// - Output: encoder hidden states (Float32, shape `[1, num_patches, vision_hidden_dim]`) +/// +/// 2. **Vision projector** (`project`; not run separately when the vision asset exports a single fused `main` function): +/// - Input: encoder hidden states +/// - Output: projected embeddings (Float16/BFloat16, shape `[1, image_token_count, hidden_dim]`) +/// +/// 3. **Embedding lookup** (`embed_tokens`): +/// - Input: `input_ids` (Int32, shape `[1, seq_len]`) +/// - Output: token embeddings (Float16/BFloat16, shape `[1, seq_len, hidden_dim]`) +/// +/// 4. **LLM decoder** (`main`): +/// - Inputs: `in_embeddings` (Float16/BFloat16), `position_ids` (Int32) +/// - States: `keyCache`, `valueCache` (persistent KV cache) +/// - Output: `logits` (Float16/BFloat16, shape `[1, seq_len, vocab_size]`) +/// +/// ## Inference Flow +/// +/// 1. `encodeImage(at:)` — preprocess image, run vision encoder + projector, return `EmbeddedInput` +/// 2. `generate(with: EmbeddedInput, tokens:, ...)` — embed tokens, scatter-merge with vision +/// embeddings at placeholder positions, run LLM prefill, then standard autoregressive decode +/// +/// KV cache is managed identically to `CoreAISequentialEngine`: starts small and grows +/// dynamically with 2x expansion. 
+public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchecked Sendable { + public typealias ConfigType = VLMModelConfig + public typealias OutputSequence = GenerationSequence + + public var supportsLogits: Bool { true } + public var vocabSize: Int { config.vocabSize } + public let config: VLMModelConfig + + // MARK: - Vision Model Handles + + private let visionFunction: InferenceFunction + private let visionFunctionDescriptor: InferenceFunctionDescriptor + private let projectFunction: InferenceFunction + private let projectFunctionDescriptor: InferenceFunctionDescriptor + private let visionProjectorFused: Bool + + // MARK: - Embed Model Handle + + private let embedFunction: InferenceFunction + private let embedFunctionDescriptor: InferenceFunctionDescriptor + + // MARK: - LLM Model Handle + + private let llmFunction: InferenceFunction + private let llmFunctionDescriptor: InferenceFunctionDescriptor + + // LLM I/O names from descriptor + private let embeddingsInputName: String + private let positionIdsName: String + private let keyCacheName: String + private let valueCacheName: String + private let logitsName: String + + // LLM descriptors for dynamic shape resolution + private let embeddingsInputDescriptor: NDArrayDescriptor + private let positionIdsDescriptor: NDArrayDescriptor + private let logitsDescriptor: NDArrayDescriptor + + // MARK: - Persistent State + + private var keyCache: NDArray + private var valueCache: NDArray + private var logitsArray: NDArray + private var cachedLogitsBatchSize: Int + private var currentKVCapacity: Int + private let keyCacheDescriptor: NDArrayDescriptor + private let valueCacheDescriptor: NDArrayDescriptor + + // Track processed tokens for incremental inference + public private(set) var processedTokenCount: Int = 0 + + // MARK: - Image Preprocessor + + private let imagePreprocessor: ImagePreprocessor + + // MARK: - Generation Token + + private let _activeToken = Mutex(nil) + public var isBusy: Bool { 
/// Clears the active generation token, but only when `token` is the very instance that is
/// currently stored (identity comparison); any other token leaves the state untouched.
func clearTokenIfActive(_ token: GenerationToken) {
    _activeToken.withLock { if $0 === token { $0 = nil } }
}

// MARK: - Init

/// Initialize the VLM engine with separate model assets for vision, embed, and LLM.
///
/// Resolves and loads every model function, validates the LLM function's I/O layout
/// (2 inputs, 2 states, at least 1 output, float16/bfloat16 logits), allocates the KV cache
/// and a one-token logits buffer, and builds the image preprocessor.
///
/// - Parameters:
///   - config: VLM model configuration (includes vision config)
///   - visionModel: Prepared model containing either `encode_image` and `project`
///     functions, or a single fused `main` function
///   - embedModel: Prepared model containing an `embed_tokens` function (falls back to `main`)
///   - llmModel: Prepared model containing the function named by `config.function`
///     (embedding-input decoder)
///   - options: Engine options including KV cache strategy
/// - Throws: `InferenceRuntimeError` when a function or descriptor is missing, the LLM
///   I/O layout or logits type is unsupported, or the vision normalization arrays have fewer
///   than three entries; also rethrows errors from `loadFunction(named:)`.
public init(
    config: VLMModelConfig,
    visionModel: PreparedModel,
    embedModel: PreparedModel,
    llmModel: PreparedModel,
    options: EngineOptions = EngineOptions()
) async throws {
    // Fail fast on malformed normalization constants: the preprocessor set-up at the end of
    // this initializer subscripts channels 0...2 and would otherwise trap on a short array.
    let vc = config.visionConfig
    guard vc.imageMean.count >= 3, vc.imageStd.count >= 3 else {
        throw InferenceRuntimeError.genericError(
            "VisionConfig image_mean/image_std need at least 3 values (RGB), "
                + "got \(vc.imageMean.count) and \(vc.imageStd.count)")
    }

    self.config = config

    let modelLoadSignpost = InstrumentsProfiler.beginCustomInterval(
        name: "CoreAIVLMModelLoading",
        details: "Loading VLM \(config.name)"
    )
    // Close the interval on every exit path, including the many `throw`s below.
    defer {
        InstrumentsProfiler.endCustomInterval(
            name: "CoreAIVLMModelLoading",
            signpostID: modelLoadSignpost
        )
    }

    // --- Vision pipeline ---
    // Vision model may have separate "encode_image"+"project" functions (internal export)
    // or a single fused "main" function (public export). Support both.

    // Remembered for the final log line so it reports the layout that was actually loaded.
    let visionFunctionsDescription: String
    if let visionDesc = visionModel.model.functionDescriptor(for: "encode_image") {
        self.visionProjectorFused = false
        self.visionFunctionDescriptor = visionDesc

        guard let visionFn = try visionModel.model.loadFunction(named: "encode_image") else {
            throw InferenceRuntimeError.functionNotFound("encode_image")
        }
        self.visionFunction = visionFn

        guard let projectDesc = visionModel.model.functionDescriptor(for: "project") else {
            throw InferenceRuntimeError.functionNotFound("project")
        }
        self.projectFunctionDescriptor = projectDesc

        guard let projectFn = try visionModel.model.loadFunction(named: "project") else {
            throw InferenceRuntimeError.functionNotFound("project")
        }
        self.projectFunction = projectFn

        visionFunctionsDescription = "encode_image+project"
    } else {
        // Fused vision+projector as single "main" function; the projector handles simply
        // alias the encoder's so every stored property is initialized.
        self.visionProjectorFused = true

        guard let visionDesc = visionModel.model.functionDescriptor(for: "main") else {
            throw InferenceRuntimeError.functionNotFound(
                "Vision model needs 'encode_image' or 'main' function")
        }
        self.visionFunctionDescriptor = visionDesc
        self.projectFunctionDescriptor = visionDesc

        guard let visionFn = try visionModel.model.loadFunction(named: "main") else {
            throw InferenceRuntimeError.functionNotFound("main (vision)")
        }
        self.visionFunction = visionFn
        self.projectFunction = visionFn

        visionFunctionsDescription = "main (fused encoder+projector)"
    }

    // --- Embed pipeline ---

    // embed_tokens may be named "main" or "embed_tokens" depending on the asset
    let embedFunctionName =
        embedModel.model.functionDescriptor(for: "embed_tokens") != nil
        ? "embed_tokens" : "main"

    guard let embedDesc = embedModel.model.functionDescriptor(for: embedFunctionName) else {
        throw InferenceRuntimeError.functionNotFound("embed_tokens (tried 'embed_tokens' and 'main')")
    }
    self.embedFunctionDescriptor = embedDesc

    guard let embedFn = try embedModel.model.loadFunction(named: embedFunctionName) else {
        throw InferenceRuntimeError.functionNotFound(embedFunctionName)
    }
    self.embedFunction = embedFn

    // --- LLM pipeline ---

    guard let llmDesc = llmModel.model.functionDescriptor(for: config.function) else {
        throw InferenceRuntimeError.functionNotFound(config.function)
    }
    self.llmFunctionDescriptor = llmDesc

    // Validate LLM architecture: expects inputs for embeddings + position_ids, states for KV cache
    guard llmDesc.inputNames.count == 2 else {
        throw InferenceRuntimeError.invalidInputType(
            "VLM LLM function expected 2 inputs (in_embeddings, position_ids), "
                + "got \(llmDesc.inputNames.count): \(llmDesc.inputNames)")
    }
    guard llmDesc.stateNames.count == 2 else {
        throw InferenceRuntimeError.invalidOutputType(
            "VLM LLM function expected 2 states (KV cache), "
                + "got \(llmDesc.stateNames.count): \(llmDesc.stateNames)")
    }
    guard llmDesc.outputNames.count >= 1 else {
        throw InferenceRuntimeError.invalidOutputType(
            "VLM LLM function expected at least 1 output (logits), "
                + "got \(llmDesc.outputNames.count): \(llmDesc.outputNames)")
    }

    // Extract I/O names. Roles are assigned purely by position in the descriptor's lists.
    self.embeddingsInputName = llmDesc.inputNames[0]
    self.positionIdsName = llmDesc.inputNames[1]
    self.keyCacheName = llmDesc.stateNames[0]
    self.valueCacheName = llmDesc.stateNames[1]
    self.logitsName = llmDesc.outputNames[0]

    // Extract and validate descriptors
    guard case .ndArray(let embedsDesc) = llmDesc.inputDescriptor(of: embeddingsInputName) else {
        throw InferenceRuntimeError.invalidInputType(
            "Cannot get descriptor for '\(embeddingsInputName)'")
    }
    self.embeddingsInputDescriptor = embedsDesc

    guard case .ndArray(let posIdsDesc) = llmDesc.inputDescriptor(of: positionIdsName) else {
        throw InferenceRuntimeError.invalidInputType(
            "Cannot get descriptor for '\(positionIdsName)'")
    }
    self.positionIdsDescriptor = posIdsDesc

    guard case .ndArray(let logitsDesc) = llmDesc.outputDescriptor(of: logitsName) else {
        throw InferenceRuntimeError.invalidOutputType(
            "Cannot get descriptor for '\(logitsName)'")
    }
    guard logitsDesc.scalarType == .float16 || logitsDesc.scalarType == .bfloat16 else {
        throw InferenceRuntimeError.unsupportedLogitsType(
            "Only float16/bfloat16 logits supported, got \(logitsDesc.scalarType)")
    }
    self.logitsDescriptor = logitsDesc

    // Extract KV cache state descriptors
    guard case .ndArray(let keyCacheDesc) = llmDesc.stateDescriptor(of: keyCacheName),
        case .ndArray(let valueCacheDesc) = llmDesc.stateDescriptor(of: valueCacheName)
    else {
        throw InferenceRuntimeError.invalidOutputType("Cannot get KV cache state descriptors")
    }
    self.keyCacheDescriptor = keyCacheDesc
    self.valueCacheDescriptor = valueCacheDesc

    // Allocate KV cache. A negative entry in the key-cache shape is treated as a dynamic
    // dimension; fixed-size strategy or a fully static shape allocates the full context
    // length up front, otherwise the cache starts at min(256, maxContextLength).
    let isDynamic = keyCacheDesc.shape.contains(where: { $0 < 0 })
    let initialCapacity: Int
    if options.kvCacheStrategy == .fixedSize || !isDynamic {
        initialCapacity = config.maxContextLength
    } else {
        initialCapacity = min(256, config.maxContextLength)
    }
    self.currentKVCapacity = initialCapacity

    // Every negative dimension in either cache shape is resolved to `initialCapacity`.
    let resolvedKeyDesc = keyCacheDesc.resolvingDynamicDimensions(
        keyCacheDesc.shape.map { $0 < 0 ? initialCapacity : $0 })
    let resolvedValueDesc = valueCacheDesc.resolvingDynamicDimensions(
        valueCacheDesc.shape.map { $0 < 0 ? initialCapacity : $0 })
    self.keyCache = NDArray(descriptor: resolvedKeyDesc)
    self.valueCache = NDArray(descriptor: resolvedValueDesc)

    CLILogger.log(
        "VLM KV cache: dynamic=\(isDynamic), initial=\(initialCapacity), key=\(keyCacheDesc.shape) -> \(resolvedKeyDesc.shape)"
    )

    // Allocate initial logits (1 token)
    let initLogitsDesc = logitsDesc.resolvingDynamicDimensions([1, 1, config.vocabSize])
    self.logitsArray = NDArray(descriptor: initLogitsDesc)
    self.cachedLogitsBatchSize = 1

    // Load LLM inference function
    guard let llmFn = try llmModel.model.loadFunction(named: config.function) else {
        throw InferenceRuntimeError.genericError(
            "Cannot load function '\(config.function)'")
    }
    self.llmFunction = llmFn

    // Build image preprocessor from vision config normalization fields.
    // Subscripts 0...2 are safe: the array lengths were checked at the top of this initializer.
    self.imagePreprocessor = ImagePreprocessor(
        targetSize: CGSize(width: vc.imageSize, height: vc.imageSize),
        mean: (CGFloat(vc.imageMean[0]), CGFloat(vc.imageMean[1]), CGFloat(vc.imageMean[2])),
        std: (CGFloat(vc.imageStd[0]), CGFloat(vc.imageStd[1]), CGFloat(vc.imageStd[2])),
        rescaleFactor: CGFloat(vc.rescaleFactor)
    )

    CLILogger.log(
        "CoreAI VLM engine initialized — vision: \(visionFunctionsDescription), "
            + "embed: \(embedFunctionName), llm: \(config.function)"
    )
}
+ /// - Returns: `EmbeddedInput` containing projected embeddings and token positions + public func encodeImage(at url: URL) async throws -> EmbeddedInput { + let encodeSignpost = InstrumentsProfiler.beginCustomInterval( + name: "CoreAIVLM EncodeImage", + details: url.lastPathComponent + ) + + // Step 1: Preprocess image to CHW Float32 + guard let ciImage = CIImage(contentsOf: url) else { + throw ImagePreprocessorError.loadFailed(url) + } + guard let cgImage = CIContext().createCGImage(ciImage, from: ciImage.extent) else { + throw ImagePreprocessorError.renderFailed + } + let chwPixels = try imagePreprocessor.preprocessCHW(cgImage: cgImage) + + // Step 2: Run encode_image + let encoderOutput = try await runVisionEncoder(pixels: chwPixels) + + // Step 3: Run projector (skip if fused with encoder) + let projectedEmbeddings = + visionProjectorFused ? encoderOutput : try await runProjector(encoderOutput: encoderOutput) + + InstrumentsProfiler.endCustomInterval( + name: "CoreAIVLM EncodeImage", + signpostID: encodeSignpost + ) + + // The image token positions will be determined during generate() when we know the + // full token sequence. For now, record the expected token count. + // Use a placeholder range; the actual positions are resolved at generate() time + // by scanning for imageTokenId in the token sequence. + let tokenCount = config.visionConfig.imageTokenCount + let placeholderRange = 0..