Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public struct LanguageBundle: Sendable {
public let bundle: ModelBundle
public let modelAssetPath: String
public let language: LanguageConfig
public let visionConfig: VisionConfig?

public init(from path: String) throws {
let expanded = (path as NSString).expandingTildeInPath
Expand All @@ -45,6 +46,7 @@ public struct LanguageBundle: Sendable {
}
self.modelAssetPath = main
self.language = language
self.visionConfig = payload.vision
}

// MARK: - Convenience accessors
Expand Down Expand Up @@ -98,6 +100,7 @@ extension LanguageBundle {
fileprivate struct LanguagePayload: Decodable {
let assets: Assets
let language: LanguageConfig?
let vision: VisionConfig?

struct Assets: Decodable {
let main: String?
Expand Down
44 changes: 43 additions & 1 deletion swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,23 @@ public struct LanguageConfig: Codable, Sendable, Equatable {
/// known role conventions (`main`, `extend_<N>`, `load_embeddings`, ...).
public let functionMap: FunctionMap?

/// Vision-specific configuration. Nil for text-only language models.
public let vision: VisionConfig?

public init(
tokenizer: String,
vocabSize: Int,
maxContextLength: Int,
embeddedTokenizer: Bool = true,
functionMap: FunctionMap? = nil
functionMap: FunctionMap? = nil,
vision: VisionConfig? = nil
) {
self.tokenizer = tokenizer
self.vocabSize = vocabSize
self.maxContextLength = maxContextLength
self.embeddedTokenizer = embeddedTokenizer
self.functionMap = functionMap
self.vision = vision
}

enum CodingKeys: String, CodingKey {
Expand All @@ -42,6 +47,7 @@ public struct LanguageConfig: Codable, Sendable, Equatable {
case maxContextLength = "max_context_length"
case embeddedTokenizer = "embedded_tokenizer"
case functionMap = "function_map"
case vision
}

public init(from decoder: Swift.Decoder) throws {
Expand All @@ -51,6 +57,7 @@ public struct LanguageConfig: Codable, Sendable, Equatable {
self.maxContextLength = try c.decode(Int.self, forKey: .maxContextLength)
self.embeddedTokenizer = try c.decodeIfPresent(Bool.self, forKey: .embeddedTokenizer) ?? true
self.functionMap = try c.decodeIfPresent(FunctionMap.self, forKey: .functionMap)
self.vision = try c.decodeIfPresent(VisionConfig.self, forKey: .vision)
}

// MARK: - Additional Stop Tokens
Expand Down Expand Up @@ -137,3 +144,38 @@ public struct LanguageConfig: Codable, Sendable, Equatable {
return Array(result)
}
}

/// Settings describing the vision side of a VLM bundle.
///
/// Absent (`nil`) for text-only language models.
///
/// The encoded representation uses snake_case keys: `image_size`,
/// `patch_size`, `image_token_count`, and `image_token_id`.
public struct VisionConfig: Codable, Sendable, Equatable {
    // MARK: - Serialized keys

    /// Maps each stored property to its snake_case key in the encoded form.
    enum CodingKeys: String, CodingKey {
        case imageSize = "image_size"
        case patchSize = "patch_size"
        case imageTokenCount = "image_token_count"
        case imageTokenId = "image_token_id"
    }

    // MARK: - Stored properties

    /// Side length of the square input image; the resolution the vision
    /// encoder expects.
    public let imageSize: Int

    /// Patch size used by the vision transformer.
    public let patchSize: Int

    /// How many embedding tokens a single image yields after projection.
    public let imageTokenCount: Int

    /// ID of the placeholder token that marks image positions in the text
    /// sequence.
    public let imageTokenId: Int32

    // MARK: - Initialization

    /// Creates a vision configuration from explicit values.
    ///
    /// - Parameters:
    ///   - imageSize: Side length of the square input image.
    ///   - patchSize: Patch size for the vision transformer.
    ///   - imageTokenCount: Embedding tokens produced per image after projection.
    ///   - imageTokenId: Placeholder token ID for image positions in the text sequence.
    public init(
        imageSize: Int,
        patchSize: Int,
        imageTokenCount: Int,
        imageTokenId: Int32
    ) {
        self.imageTokenId = imageTokenId
        self.imageTokenCount = imageTokenCount
        self.patchSize = patchSize
        self.imageSize = imageSize
    }
}
Loading
Loading